├── .envrc ├── .github ├── release.yml └── workflows │ ├── ci.yml │ ├── docker-publish.yml │ └── publish.yml ├── .gitignore ├── .vscode └── launch.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benchmark ├── .gitignore ├── Cargo.toml ├── README.md ├── bench_server.rs ├── clickbench │ ├── answers │ │ └── clickbench_0 │ │ │ ├── Q0.parquet │ │ │ ├── Q1.parquet │ │ │ ├── Q10.parquet │ │ │ ├── Q11.parquet │ │ │ ├── Q12.parquet │ │ │ ├── Q13.parquet │ │ │ ├── Q14.parquet │ │ │ ├── Q15.parquet │ │ │ ├── Q16.parquet │ │ │ ├── Q17.parquet │ │ │ ├── Q18.parquet │ │ │ ├── Q2.parquet │ │ │ ├── Q20.parquet │ │ │ ├── Q21.parquet │ │ │ ├── Q22.parquet │ │ │ ├── Q23.parquet │ │ │ ├── Q24.parquet │ │ │ ├── Q25.parquet │ │ │ ├── Q26.parquet │ │ │ ├── Q27.parquet │ │ │ ├── Q28.parquet │ │ │ ├── Q29.parquet │ │ │ ├── Q3.parquet │ │ │ ├── Q30.parquet │ │ │ ├── Q31.parquet │ │ │ ├── Q32.parquet │ │ │ ├── Q33.parquet │ │ │ ├── Q34.parquet │ │ │ ├── Q35.parquet │ │ │ ├── Q36.parquet │ │ │ ├── Q37.parquet │ │ │ ├── Q38.parquet │ │ │ ├── Q39.parquet │ │ │ ├── Q4.parquet │ │ │ ├── Q40.parquet │ │ │ ├── Q42.parquet │ │ │ ├── Q5.parquet │ │ │ ├── Q6.parquet │ │ │ ├── Q7.parquet │ │ │ ├── Q8.parquet │ │ │ └── Q9.parquet │ ├── clickbench_client.rs │ ├── data │ │ └── .gitkeep │ └── queries │ │ ├── queries.sql │ │ └── query_select.sql ├── src │ ├── bin │ │ ├── create_nano_hits.rs │ │ └── encoding.rs │ ├── lib.rs │ ├── observability.rs │ ├── runner.rs │ └── utils.rs └── tpch │ ├── answers │ └── .gitkeep │ ├── data │ ├── .gitkeep │ └── sf0.001 │ │ ├── customer.parquet │ │ ├── lineitem.parquet │ │ ├── nation.parquet │ │ ├── orders.parquet │ │ ├── part.parquet │ │ ├── partsupp.parquet │ │ ├── region.parquet │ │ └── supplier.parquet │ ├── queries │ ├── q1.sql │ ├── q10.sql │ ├── q11.sql │ ├── q12.sql │ ├── q13.sql │ ├── q14.sql │ ├── q15.sql │ ├── q16.sql │ ├── q17.sql │ ├── q18.sql │ ├── q19.sql │ ├── q2.sql │ ├── q20.sql │ ├── q21.sql │ ├── q22.sql │ ├── q3.sql │ ├── q4.sql │ ├── q5.sql │ ├── q6.sql │ ├── q7.sql │ ├── q8.sql │ └── q9.sql │ ├── tpch_client.rs │ └── tpch_gen.py ├── dev ├── README.md ├── doc │ ├── arch.png │ ├── arch.svg │ ├── liquid-cache-vldb.pdf │ └── logo.png ├── git-hooks │ └── pre-push ├── install-git-hooks.sh ├── liquid_cache_server.dockerfile └── thoughts │ ├── architecture.md │ ├── artifact-eval.md │ ├── debugging-tips.md │ └── thoughts.md ├── examples ├── Cargo.toml ├── README.md ├── example_client.rs ├── example_server.rs └── nano_hits.parquet ├── flake.lock ├── flake.nix ├── rust-toolchain.toml └── src ├── client ├── Cargo.toml └── src │ ├── client_exec.rs │ ├── lib.rs │ ├── metrics.rs │ ├── optimizer.rs │ └── tests │ ├── mod.rs │ └── snapshots │ ├── liquid_cache_client__tests__tpch_q1.snap │ ├── liquid_cache_client__tests__tpch_q10.snap │ ├── liquid_cache_client__tests__tpch_q11.snap │ ├── liquid_cache_client__tests__tpch_q12.snap │ ├── liquid_cache_client__tests__tpch_q13.snap │ ├── liquid_cache_client__tests__tpch_q14.snap │ ├── liquid_cache_client__tests__tpch_q16.snap │ ├── liquid_cache_client__tests__tpch_q17.snap │ ├── liquid_cache_client__tests__tpch_q18.snap │ ├── liquid_cache_client__tests__tpch_q19.snap │ ├── liquid_cache_client__tests__tpch_q2.snap │ ├── liquid_cache_client__tests__tpch_q20.snap │ ├── liquid_cache_client__tests__tpch_q21.snap │ ├── liquid_cache_client__tests__tpch_q22.snap │ ├── liquid_cache_client__tests__tpch_q3.snap │ ├── liquid_cache_client__tests__tpch_q4.snap │ ├── liquid_cache_client__tests__tpch_q5.snap │ ├── 
liquid_cache_client__tests__tpch_q6.snap │ ├── liquid_cache_client__tests__tpch_q7.snap │ ├── liquid_cache_client__tests__tpch_q8.snap │ └── liquid_cache_client__tests__tpch_q9.snap ├── common ├── Cargo.toml └── src │ ├── lib.rs │ ├── rpc.rs │ └── utils.rs ├── liquid_parquet ├── Cargo.toml ├── bench │ ├── bench_eviction.rs │ ├── bitpacking.rs │ ├── boolean_and_then.rs │ ├── eviction_cache.rs │ ├── fsstarray.rs │ └── liquid_float_array.rs ├── clippy.toml └── src │ ├── cache │ ├── budget.rs │ ├── mod.rs │ ├── policies.rs │ ├── stats.rs │ ├── store.rs │ ├── tracer.rs │ ├── transcode.rs │ └── utils.rs │ ├── lib.rs │ ├── liquid_array │ ├── byte_array.rs │ ├── fix_len_byte_array.rs │ ├── float_array.rs │ ├── ipc.rs │ ├── mod.rs │ ├── primitive_array.rs │ ├── raw │ │ ├── bit_pack_array.rs │ │ ├── fsst_array.rs │ │ └── mod.rs │ └── utils.rs │ ├── reader │ ├── mod.rs │ ├── plantime │ │ ├── mod.rs │ │ ├── opener.rs │ │ ├── row_filter.rs │ │ ├── row_group_filter.rs │ │ └── source.rs │ ├── runtime │ │ ├── in_memory_rg.rs │ │ ├── liquid_stream.rs │ │ ├── mod.rs │ │ ├── parquet_bridge.rs │ │ ├── reader │ │ │ ├── cached_array_reader.rs │ │ │ ├── cached_page.rs │ │ │ ├── liquid_batch_reader.rs │ │ │ ├── mod.rs │ │ │ └── tests.rs │ │ └── utils.rs │ └── utils │ │ ├── boolean_selection.rs │ │ └── mod.rs │ ├── sync.rs │ └── utils.rs └── server ├── Cargo.toml └── src ├── admin_server ├── flamegraph.rs ├── handlers.rs ├── mod.rs └── models.rs ├── errors.rs ├── lib.rs ├── local_cache.rs ├── service.rs ├── tests ├── cases.rs ├── mod.rs └── snapshots │ ├── liquid_cache_server__tests__cases__parquet_with_page_index.snap │ ├── liquid_cache_server__tests__min_max.snap │ ├── liquid_cache_server__tests__os.snap │ ├── liquid_cache_server__tests__referer.snap │ ├── liquid_cache_server__tests__title.snap │ ├── liquid_cache_server__tests__url.snap │ └── liquid_cache_server__tests__url_prefix.snap └── utils.rs /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # Configuration for GitHub's automatic release notes 2 | # See: https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes 3 | 4 | changelog: 5 | exclude: 6 | labels: 7 | - ignore-for-release 8 | - dependencies 9 | categories: 10 | - title: 🚀 Features 11 | labels: 12 | - feature 13 | - enhancement 14 | - Semver-Minor 15 | - title: 🐛 Bug Fixes 16 | labels: 17 | - bug 18 | - bugfix 19 | - fix 20 | - title: ⚠️ Breaking Changes 21 | labels: 22 | - breaking-change 23 | - Semver-Major 24 | - title: 📚 Documentation 25 | labels: 26 | - documentation 27 | - docs 28 | - title: 🔧 Maintenance 29 | labels: 30 | - chore 31 | - refactor 32 | - test 33 | - title: 📦 Dependencies 34 | labels: 35 | - dependencies 36 | - title: 🌱 Other Changes 37 | labels: 38 | - "*" -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | packages: write 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: nightly-2025-03-03 18 | - uses: Swatinem/rust-cache@v2 19 | - 
name: Build binary 20 | run: cargo build --release --bin bench_server 21 | 22 | - name: Log in to the Container registry 23 | uses: docker/login-action@v3 24 | with: 25 | registry: ghcr.io 26 | username: ${{ github.actor }} 27 | password: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Set lower case owner name 30 | run: | 31 | echo "OWNER_LC=${OWNER,,}" >> ${GITHUB_ENV} 32 | env: 33 | OWNER: '${{ github.repository_owner }}' 34 | 35 | - name: Build and push Docker image 36 | uses: docker/build-push-action@v4 37 | with: 38 | context: . 39 | push: true 40 | tags: ghcr.io/${{ env.OWNER_LC }}/liquid-cache/liquid-cache-server:latest,ghcr.io/${{ env.OWNER_LC }}/liquid-cache/liquid-cache-server:${{ github.event.release.tag_name }} 41 | file: dev/liquid_cache_server.dockerfile 42 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version_bump: 7 | description: 'Version bump type' 8 | required: true 9 | default: 'patch' 10 | type: 'choice' 11 | options: 12 | - patch 13 | - minor 14 | - major 15 | custom_version: 16 | description: 'Custom version (optional, overrides version_bump if provided)' 17 | required: false 18 | type: 'string' 19 | 20 | jobs: 21 | release-and-publish: 22 | name: Release and Publish 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: write 26 | pull-requests: write 27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | token: ${{ secrets.GITHUB_TOKEN }} 33 | 34 | - name: Set up Rust 35 | uses: dtolnay/rust-toolchain@master 36 | with: 37 | toolchain: nightly-2025-04-29 38 | 39 | - name: Install cargo-release 40 | run: cargo install cargo-release 41 | 42 | - name: Configure Git 43 | run: | 44 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 45 | git config --local user.name "github-actions[bot]" 46 | 47 | - name: Generate branch name 48 | id: branch-name 49 | run: | 50 | echo "release_branch=release/$(date +'%Y%m%d')" >> $GITHUB_OUTPUT 51 | 52 | - name: Execute Release 53 | id: release 54 | run: | 55 | # Determine version bump level 56 | LEVEL="${{ github.event.inputs.version_bump }}" 57 | CUSTOM_VERSION="${{ github.event.inputs.custom_version }}" 58 | 59 | if [ -n "$CUSTOM_VERSION" ]; then 60 | # Use custom version 61 | cargo release "$CUSTOM_VERSION" --execute --no-confirm --no-push --no-tag --no-publish 62 | echo "new_version=$CUSTOM_VERSION" >> $GITHUB_OUTPUT 63 | else 64 | # Use version bump type 65 | cargo release $LEVEL --execute --no-confirm --no-push --no-tag --no-publish 66 | # Get the new version 67 | NEW_VERSION=$(grep -m 1 'version = ' Cargo.toml | sed 's/version = "\(.*\)"/\1/') 68 | echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT 69 | fi 70 | 71 | - name: Publish crates 72 | env: 73 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 74 | run: | 75 | # Publish in dependency order 76 | cargo publish -p liquid-cache-common 77 | sleep 30 # Wait for the registry to update 78 | 79 | cargo publish -p liquid-cache-parquet 80 | sleep 30 81 | 82 | cargo publish -p liquid-cache-client 83 | sleep 30 84 | 85 | cargo publish -p liquid-cache-server 86 | sleep 30 87 | 88 | - name: Create GitHub Release 89 | uses: ncipollo/release-action@v1 90 | with: 91 | tag: v${{ steps.release.outputs.new_version }} 92 | name: Release v${{ steps.release.outputs.new_version }} 93 
| generateReleaseNotes: true 94 | draft: false 95 | prerelease: false 96 | token: ${{ secrets.GITHUB_TOKEN }} 97 | 98 | - name: Create Pull Request 99 | id: create_pr 100 | uses: peter-evans/create-pull-request@v7 101 | with: 102 | token: ${{ secrets.GITHUB_TOKEN }} 103 | branch: ${{ steps.branch-name.outputs.release_branch }} 104 | title: "Release v${{ steps.release.outputs.new_version }}" 105 | body: | 106 | This PR prepares release v${{ steps.release.outputs.new_version }}. 107 | 108 | **IMPORTANT**: This PR MUST be merged as the crates have already been published to crates.io with this version. 109 | commit-message: "chore: prepare release v${{ steps.release.outputs.new_version }}" 110 | delete-branch: true 111 | 112 | - name: Output PR URL 113 | run: echo "Pull request created at ${{ steps.create_pr.outputs.pull-request-url }}" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .DS_Store 3 | .idea 4 | profile.json 5 | data/ 6 | .direnv 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "launch", 10 | "name": "bench_server", 11 | "cargo": { 12 | "args": [ 13 | "build", 14 | "--bin=bench_server", 15 | ], 16 | "filter": { 17 | "name": "bench_server", 18 | "kind": "bin" 19 | } 20 | }, 21 | "env": { 22 | "RUST_LOG": "info" 23 | }, 24 | "args": [ 25 | "--max-cache-mb", 26 | "10" 27 | ], 28 | "cwd": "${workspaceFolder}", 29 | }, 30 | { 31 | "type": "lldb", 32 | "request": "launch", 33 | "name": "clickbench_client", 34 | "cargo": { 35 | "args": [ 36 | "build", 37 | "--bin=clickbench_client", 38 | ], 39 | "filter": { 40 | "name": "clickbench_client", 41 | "kind": "bin" 42 | } 43 | }, 44 | "env": { 45 | "RUST_LOG": "info" 46 | }, 47 | "args": [ 48 | "--query-path", 49 | "benchmark/clickbench/queries/queries.sql", 50 | "--file", 51 | "benchmark/clickbench/data/hits_0.parquet", 52 | "--query", 53 | "24" 54 | ], 55 | "cwd": "${workspaceFolder}" 56 | }, 57 | { 58 | "type": "lldb", 59 | "request": "launch", 60 | "name": "tpch_client", 61 | "cargo": { 62 | "args": [ 63 | "build", 64 | "--bin=tpch_client", 65 | ], 66 | "filter": { 67 | "name": "tpch_client", 68 | "kind": "bin" 69 | } 70 | }, 71 | "env": { 72 | "RUST_LOG": "info" 73 | }, 74 | "args": [ 75 | "--query-dir", 76 | "benchmark/tpch/queries", 77 | "--data-dir", 78 | "benchmark/tpch/data/sf100.0", 79 | "--query", 80 | "6", 81 | ], 82 | "cwd": "${workspaceFolder}" 83 | } 84 | ] 85 | } -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace.package] 2 | version = "0.1.4" 3 | edition = "2024" 4 | repository = "https://github.com/XiangpengHao/liquid-cache" 5 | authors = ["XiangpengHao "] 6 | license = "Apache-2.0 OR MIT" 7 | readme = "README.md" 8 | description = "10x lower latency for cloud-native DataFusion" 9 | 10 | 11 | [workspace] 12 | members = [ 13 | "benchmark", 14 | "src/server", 15 | "src/client", 16 | "src/liquid_parquet", 17 | "src/common", 
18 | "examples", 19 | ] 20 | resolver = "3" 21 | 22 | [workspace.dependencies] 23 | liquid-cache-server = { path = "src/server", version = "0.1.4" } 24 | liquid-cache-client = { path = "src/client", version = "0.1.4" } 25 | liquid-cache-parquet = { path = "src/liquid_parquet", version = "0.1.4" } 26 | liquid-cache-common = { path = "src/common", version = "0.1.4" } 27 | arrow = { version = "55.1.0", default-features = false, features = [ 28 | "prettyprint", 29 | ] } 30 | arrow-flight = { version = "55.1.0", features = ["flight-sql-experimental"] } 31 | arrow-schema = { version = "55.1.0", features = ["serde"] } 32 | parquet = { version = "55.1.0", features = ["async", "experimental"] } 33 | datafusion = { version = "47.0.0" } 34 | datafusion-proto = { version = "47.0.0" } 35 | async-trait = "0.1.88" 36 | futures = { version = "0.3.31", default-features = false, features = ["std"] } 37 | tokio = { version = "1.45.0", features = ["rt-multi-thread"] } 38 | log = "0.4.27" 39 | tonic = { version = "0.12" } 40 | url = "2.5.4" 41 | itertools = "0.14.0" 42 | bytes = { version = "1.10.1", default-features = false } 43 | ahash = "0.8.12" 44 | prost = "0.13.5" 45 | object_store = { version = "0.12.1", default-features = false } 46 | serde = { version = "1.0", default-features = false, features = ["derive"] } 47 | serde_json = { version = "1.0", default-features = false, features = ["std"] } 48 | tempfile = "3.20.0" 49 | uuid = { version = "1.16.0", features = ["v4"] } 50 | fastrace = "0.7" 51 | fastrace-futures = "0.7" 52 | fastrace-tonic = "0.1" 53 | congee = "0.4.1" 54 | 55 | 56 | [profile.dev.package] 57 | insta.opt-level = 3 58 | 59 | [patch.crates-io] 60 | # datafusion = { path = "../datafusion/datafusion/core" } 61 | # datafusion-proto = { path = "../datafusion/datafusion/proto" } 62 | -------------------------------------------------------------------------------- /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | tpch/answers 3 | -------------------------------------------------------------------------------- /benchmark/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-benchmarks" 3 | description = "LiquidCache Benchmarks" 4 | edition = { workspace = true } 5 | publish = false 6 | 7 | [dependencies] 8 | liquid-cache-server = { workspace = true } 9 | liquid-cache-client = { workspace = true } 10 | liquid-cache-parquet = { workspace = true } 11 | liquid-cache-common = { workspace = true } 12 | async-trait = { workspace = true } 13 | futures = { workspace = true } 14 | datafusion = { workspace = true } 15 | tokio = { workspace = true } 16 | log = { workspace = true } 17 | arrow-flight = { workspace = true } 18 | tonic = { workspace = true } 19 | clap = { version = "4.5.38", features = ["derive"] } 20 | url = { workspace = true } 21 | mimalloc = "0.1.46" 22 | serde_json.workspace = true 23 | serde.workspace = true 24 | sysinfo = { version = "0.35.1", default-features = false, features = [ 25 | "network", 26 | ] } 27 | object_store = { workspace = true, features = ["http"] } 28 | bytes = { workspace = true } 29 | prost = { workspace = true } 30 | fsst-rs = "0.5.2" 31 | parquet = { workspace = true } 32 | fastrace = { version = "0.7.9", features = ["enable"] } 33 | fastrace-tonic = { workspace = true } 34 | fastrace-opentelemetry = "0.10" 35 | opentelemetry = "0.29.1" 36 | opentelemetry_sdk = "0.29.0" 37 | opentelemetry-otlp = { version = "0.29.0", 
features = ["trace", "grpc-tonic"] } 38 | tower = "0.5.2" 39 | logforth = { version = "0.25.0", features = ["opentelemetry"] } 40 | reqwest = { version = "0.12.15", default-features = false, features = ["json"] } 41 | uuid = { version = "1.13.0", features = ["v4"] } 42 | 43 | [[bin]] 44 | name = "clickbench_client" 45 | path = "clickbench/clickbench_client.rs" 46 | 47 | [[bin]] 48 | name = "tpch_client" 49 | path = "tpch/tpch_client.rs" 50 | 51 | [[bin]] 52 | name = "bench_server" 53 | path = "bench_server.rs" 54 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Guide 2 | 3 | ## [ClickBench](https://github.com/ClickHouse/ClickBench) 4 | 5 | ### Download dataset 6 | To download partitioned dataset (~100MB): 7 | ```bash 8 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet -O benchmark/data/hits_0.parquet 9 | ``` 10 | 11 | To download the entire dataset (~15GB): 12 | 13 | ```bash 14 | wget https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet -O benchmark/clickbench/data/hits.parquet 15 | ``` 16 | 17 | To download the partitioned dataset (100 files, ~150MB each): 18 | ```bash 19 | for i in (seq 0 99) 20 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_$i.parquet -O benchmark/clickbench/data/partitioned/hits_$i.parquet 21 | end 22 | ``` 23 | Or bash : 24 | ```bash 25 | for i in {0..99}; do 26 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_$i.parquet -O benchmark/clickbench/data/partitioned/hits_$i.parquet 27 | done 28 | ``` 29 | 30 | ### Run benchmarks 31 | 32 | #### Minimal 33 | 34 | ```bash 35 | cargo run --release --bin bench_server 36 | cargo run --release --bin clickbench_client -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet 37 | ``` 38 | 39 | #### Advanced 40 | 41 | ```bash 42 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin bench_server -- --cache-mode liquid_eager_transcode 43 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin clickbench_client -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet --query 42 44 | ``` 45 | 46 | ## TPCH 47 | 48 | ### Generate data 49 | 50 | (make sure you have [uv](https://docs.astral.sh/uv/getting-started/installation/) installed) 51 | 52 | ```bash 53 | cd benchmark/tpch 54 | uvx --from duckdb python tpch_gen.py --scale 0.01 55 | ``` 56 | 57 | In NixOS, you want to set `env LD_LIBRARY_PATH=$NIX_LD_LIBRARY_PATH` 58 | 59 | ### Run server (same as ClickBench) 60 | 61 | ```bash 62 | cargo run --release --bin bench_server -- --cache-mode liquid_eager_transcode 63 | ``` 64 | 65 | ### Run client 66 | 67 | ```bash 68 | env RUST_LOG=info,clickbench_client=debug RUSTFLAGS='-C target-cpu=native' cargo run --release --bin tpch_client -- --query-dir benchmark/tpch/queries/ --data-dir benchmark/tpch/data/sf0.1 --iteration 3 --answer-dir benchmark/tpch/answers/sf0.1 69 | ``` 70 | 71 | 72 | 73 | ## Profile 74 | 75 | ### Flamegraph 76 | 77 | To collect flamegraph from server side, simply add `--flamegraph-dir benchmark/data/flamegraph` to the server command, for example: 78 | ```bash 79 | cargo run --release --bin bench_server -- --flamegraph-dir benchmark/data/flamegraph 80 | ``` 81 | It will generate flamegraph 
for each query that the server executed. 82 | 83 | ### Cache stats 84 | 85 | To collect cache stats, add `--stats-dir benchmark/data/cache_stats` to the server command, for example: 86 | ```bash 87 | cargo run --release --bin bench_server -- --stats-dir benchmark/data/cache_stats 88 | ``` 89 | It will generate a parquet file that contains the cache stats for each query that the server executed. 90 | You can use [`parquet-viewer`](https://parquet-viewer.xiangpeng.systems) to view the stats in the browser. 91 | 92 | ### Collect cache trace 93 | 94 | To collect a cache trace, add `--cache-trace-dir benchmark/data/cache_trace` to the client command, for example: 95 | ```bash 96 | env RUST_LOG=info cargo run --bin clickbench_client --release -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet --query 20 --iteration 2 --partitions 8 --cache-trace-dir benchmark/data/ 97 | ``` 98 | It will generate a parquet file that contains the cache trace for each query that the server executed. 99 | 100 | 101 | ### Run encoding benchmarks 102 | 103 | ```bash 104 | RUST_LOG=info RUSTFLAGS='-C target-cpu=native' cargo run --release --bin encoding -- --file benchmark/clickbench/data/hits.parquet --column 2 105 | ``` 106 | This will benchmark the encoding time of the `URL` column. 107 | -------------------------------------------------------------------------------- /benchmark/bench_server.rs: -------------------------------------------------------------------------------- 1 | use arrow_flight::flight_service_server::FlightServiceServer; 2 | use clap::Parser; 3 | use fastrace_tonic::FastraceServerLayer; 4 | use liquid_cache_benchmarks::setup_observability; 5 | use liquid_cache_common::{CacheEvictionStrategy, CacheMode}; 6 | use liquid_cache_server::{LiquidCacheService, run_admin_server}; 7 | use log::info; 8 | use mimalloc::MiMalloc; 9 | use std::{net::SocketAddr, path::PathBuf, sync::Arc}; 10 | use tonic::transport::Server; 11 | 12 | #[global_allocator] 13 | static GLOBAL: MiMalloc = MiMalloc; 14 | 15 | #[derive(Parser)] 16 | #[command(name = "ClickBench Benchmark Server")] 17 | struct CliArgs { 18 | /// Address to listen on 19 | #[arg(long, default_value = "127.0.0.1:15214")] 20 | address: SocketAddr, 21 | 22 | /// HTTP address for admin endpoint 23 | #[arg(long = "admin-address", default_value = "127.0.0.1:53703")] 24 | admin_address: SocketAddr, 25 | 26 | /// Abort the server if any thread panics 27 | #[arg(long = "abort-on-panic")] 28 | abort_on_panic: bool, 29 | 30 | /// Maximum cache size in MB 31 | #[arg(long = "max-cache-mb")] 32 | max_cache_mb: Option<usize>, 33 | 34 | /// Path to disk cache directory 35 | #[arg(long = "disk-cache-dir")] 36 | disk_cache_dir: Option<PathBuf>, 37 | 38 | /// Cache mode 39 | #[arg(long = "cache-mode", default_value = "liquid_eager_transcode")] 40 | cache_mode: CacheMode, 41 | 42 | /// OpenObserve auth token 43 | #[arg(long)] 44 | openobserve_auth: Option<String>, 45 | } 46 | 47 | #[tokio::main] 48 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 49 | let args = CliArgs::parse(); 50 | setup_observability( 51 | "liquid-cache-server", 52 | opentelemetry::trace::SpanKind::Server, 53 | args.openobserve_auth.as_deref(), 54 | ); 55 | 56 | let max_cache_bytes = args.max_cache_mb.map(|size| size * 1024 * 1024); 57 | 58 | if args.abort_on_panic { 59 | // Crash loudly if any thread panics. 60 | // This stops the whole server on the first panic, 61 | // but it also prevents a debugger from breaking on panics.
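// (By default a panic unwinds only its own thread, and Tokio catches task panics, so without this hook a panicked worker would leave the server running in a degraded state.)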
62 | std::panic::set_hook(Box::new(|info| { 63 | eprintln!("Some thread panicked: {info:?}"); 64 | std::process::exit(1); 65 | })); 66 | } 67 | 68 | let ctx = LiquidCacheService::context()?; 69 | let liquid_cache_server = LiquidCacheService::new( 70 | ctx, 71 | max_cache_bytes, 72 | args.disk_cache_dir.clone(), 73 | args.cache_mode, 74 | CacheEvictionStrategy::Discard, 75 | )?; 76 | 77 | let liquid_cache_server = Arc::new(liquid_cache_server); 78 | let flight = FlightServiceServer::from_arc(liquid_cache_server.clone()); 79 | 80 | info!("LiquidCache server listening on {}", args.address); 81 | info!("Admin server listening on {}", args.admin_address); 82 | info!( 83 | "Dashboard: https://liquid-cache-admin.xiangpeng.systems/?host=http://{}", 84 | args.admin_address 85 | ); 86 | 87 | // Run both servers concurrently 88 | tokio::select! { 89 | result = Server::builder().layer(FastraceServerLayer).add_service(flight).serve(args.address) => { 90 | result?; 91 | }, 92 | result = run_admin_server(args.admin_address, liquid_cache_server) => { 93 | result?; 94 | }, 95 | } 96 | 97 | fastrace::flush(); 98 | Ok(()) 99 | } 100 | -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q0.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q1.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q10.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q10.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q11.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q11.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q12.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q12.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q13.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q13.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q14.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q14.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q15.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q15.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q16.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q16.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q17.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q17.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q18.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q18.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q2.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q20.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q20.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q21.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q21.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q22.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q22.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q23.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q23.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q24.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q24.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q25.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q25.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q26.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q26.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q27.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q27.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q28.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q28.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q29.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q29.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q3.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q3.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q30.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q30.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q31.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q31.parquet 
-------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q32.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q32.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q33.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q33.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q34.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q34.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q35.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q35.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q36.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q36.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q37.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q37.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q38.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q38.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q39.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q39.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q4.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q40.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q40.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q42.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q42.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q5.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q5.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q6.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q6.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q7.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q8.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q9.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q9.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/data/.gitkeep -------------------------------------------------------------------------------- /benchmark/clickbench/queries/query_select.sql: -------------------------------------------------------------------------------- 1 | SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; 2 | SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; 3 | SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; 4 | SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; 5 | SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM 
hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; 6 | SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; 7 | SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; 8 | -------------------------------------------------------------------------------- /benchmark/src/bin/create_nano_hits.rs: -------------------------------------------------------------------------------- 1 | use parquet::arrow::ArrowWriter; 2 | use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; 3 | use parquet::file::properties::WriterProperties; 4 | use std::fs::File; 5 | 6 | fn main() { 7 | let file = File::open("benchmark/clickbench/data/hits.parquet").unwrap(); 8 | let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); 9 | 10 | let compression_alg = builder.metadata().row_groups()[0].columns()[0].compression(); 11 | 12 | let mut arrow_reader = builder.with_batch_size(8192 * 3).build().unwrap(); 13 | 14 | let batch_one = arrow_reader.next().unwrap().unwrap(); 15 | let batch_two = arrow_reader.next().unwrap().unwrap(); 16 | 17 | let props = WriterProperties::builder() 18 | .set_compression(compression_alg) 19 | .set_max_row_group_size(8192 * 3) 20 | .build(); 21 | 22 | let file = File::create("examples/nano_hits.parquet").unwrap(); 23 | 24 | let mut writer = ArrowWriter::try_new(file, batch_one.schema(), Some(props)).unwrap(); 25 | 26 | writer 27 | .write(&batch_one) 28 | .expect("Writing batch 1 (Full Batch)"); 29 | writer 30 | .write(&batch_two.slice(0, 10)) 31 | .expect("Writing batch 2 (Small Batch)"); 32 | 33 | writer.close().unwrap(); 34 | } 35 | -------------------------------------------------------------------------------- /benchmark/src/observability.rs: -------------------------------------------------------------------------------- 1 | use fastrace_opentelemetry::OpenTelemetryReporter; 2 | use logforth::append::opentelemetry::OpentelemetryLogBuilder; 3 | use logforth::filter::EnvFilter; 4 | use opentelemetry::InstrumentationScope; 5 | use opentelemetry::KeyValue; 6 | use opentelemetry::trace::SpanKind; 7 | use opentelemetry_otlp::LogExporter; 8 | use opentelemetry_otlp::WithExportConfig; 9 | use opentelemetry_otlp::{SpanExporter, WithTonicConfig}; 10 | use opentelemetry_sdk::Resource; 11 | use std::borrow::Cow; 12 | use tonic::metadata::MetadataMap; 13 | 14 | fn otl_metadata(auth: &str) -> MetadataMap { 15 | let mut map = MetadataMap::with_capacity(3); 16 | map.insert("authorization", format!("Basic {auth}").parse().unwrap()); 17 | map.insert("organization", "default".parse().unwrap()); 18 | map.insert("stream-name", "default".parse().unwrap()); 19 | map 20 | } 21 | 22 | pub fn setup_observability(service_name: &str, kind: SpanKind, auth: Option<&str>) { 23 | let Some(auth) = auth else { 24 | logforth::builder() 25 | .dispatch(|d| { 26 | d.filter(EnvFilter::from_default_env()) 27 | .append(logforth::append::Stdout::default()) 28 | }) 29 | .apply(); 30 | return; 31 | }; 32 | 33 | // Setup logging with logforth 34 | let log_exporter = LogExporter::builder() 35 | .with_tonic() 36 | .with_endpoint("http://localhost:5081/api/development".to_string()) 37 | .with_protocol(opentelemetry_otlp::Protocol::Grpc) 38 | .with_metadata(otl_metadata(auth)) 39 | .build() 40 | .unwrap(); 41 | logforth::builder() 42 | .dispatch(|d| { 43 | 
d.filter(EnvFilter::from_default_env()) 44 | .append(logforth::append::Stdout::default()) 45 | }) 46 | .dispatch(|d| { 47 | let otl_appender = OpentelemetryLogBuilder::new(service_name, log_exporter) 48 | .build() 49 | .unwrap(); 50 | d.filter(EnvFilter::from_default_env()).append(otl_appender) 51 | }) 52 | .apply(); 53 | 54 | let trace_exporter = OpenTelemetryReporter::new( 55 | SpanExporter::builder() 56 | .with_tonic() 57 | .with_endpoint("http://localhost:5081/api/development".to_string()) 58 | .with_metadata(otl_metadata(auth)) 59 | .with_protocol(opentelemetry_otlp::Protocol::Grpc) 60 | .build() 61 | .expect("initialize otlp exporter"), 62 | kind, 63 | Cow::Owned( 64 | Resource::builder() 65 | .with_attributes([KeyValue::new("service.name", service_name.to_string())]) 66 | .build(), 67 | ), 68 | InstrumentationScope::builder(env!("CARGO_PKG_NAME")) 69 | .with_version(env!("CARGO_PKG_VERSION")) 70 | .build(), 71 | ); 72 | fastrace::set_reporter(trace_exporter, fastrace::collector::Config::default()); 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/src/runner.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | BenchmarkResult, CommonBenchmarkArgs, IterationResult, Query, QueryResult, setup_observability, 3 | }; 4 | use datafusion::{ 5 | arrow::{array::RecordBatch, util::pretty}, 6 | error::Result, 7 | physical_plan::{ExecutionPlan, display::DisplayableExecutionPlan}, 8 | prelude::SessionContext, 9 | }; 10 | use fastrace::prelude::*; 11 | use log::{debug, info}; 12 | use serde::Serialize; 13 | use std::{fs::File, sync::Arc, time::Instant}; 14 | use sysinfo::Networks; 15 | use uuid::Uuid; 16 | 17 | /// Trait that benchmarks must implement 18 | #[allow(async_fn_in_trait)] 19 | pub trait Benchmark: Serialize + Clone { 20 | type Args: Serialize + Clone; 21 | 22 | /// Get the common benchmark arguments 23 | fn common_args(&self) -> &CommonBenchmarkArgs; 24 | 25 | /// Get the benchmark-specific arguments 26 | fn args(&self) -> &Self::Args; 27 | 28 | /// Setup the session context for this benchmark 29 | async fn setup_context(&self) -> Result<Arc<SessionContext>>; 30 | 31 | /// Get all queries to run for this benchmark 32 | async fn get_queries(&self) -> Result<Vec<Query>>; 33 | 34 | /// Validate query results against expected answers (optional) 35 | async fn validate_result(&self, query: &Query, results: &[RecordBatch]) -> Result<()>; 36 | 37 | /// Custom query execution logic (optional, for special cases like TPCH Q15) 38 | async fn execute_query( 39 | &self, 40 | ctx: &Arc<SessionContext>, 41 | query: &Query, 42 | ) -> Result<( 43 | Vec<RecordBatch>, 44 | Arc<dyn ExecutionPlan>, 45 | Vec<Uuid>, 46 | )>; 47 | 48 | /// Get the benchmark name for tracing 49 | fn benchmark_name(&self) -> &'static str; 50 | } 51 | 52 | /// Generic benchmark runner that handles the common execution logic 53 | pub struct BenchmarkRunner; 54 | 55 | impl BenchmarkRunner { 56 | /// Run a benchmark using the provided benchmark implementation 57 | pub async fn run<B: Benchmark>(benchmark: B) -> Result<BenchmarkResult<B::Args>> { 58 | let common = benchmark.common_args(); 59 | 60 | setup_observability( 61 | benchmark.benchmark_name(), 62 | opentelemetry::trace::SpanKind::Client, 63 | common.openobserve_auth.as_deref(), 64 | ); 65 | 66 | let ctx = benchmark.setup_context().await?; 67 | let queries = benchmark.get_queries().await?; 68 | let queries = if let Some(query) = common.query { 69 | vec![queries.into_iter().find(|q| q.id == query).unwrap()] 70 | } else { 71 | queries 72 | }; 73 | 74 | let mut benchmark_result = BenchmarkResult { 75 | args:
benchmark.args().clone(), 76 | results: Vec::new(), 77 | }; 78 | 79 | std::fs::create_dir_all("benchmark/data/results")?; 80 | 81 | let mut networks = Networks::new_with_refreshed_list(); 82 | let bench_start_time = Instant::now(); 83 | 84 | for query in queries { 85 | let mut query_result = QueryResult::new(query.id, query.sql.clone()); 86 | 87 | for it in 0..common.iteration { 88 | let iteration_result = Self::run_single_iteration( 89 | &benchmark, 90 | &ctx, 91 | &query, 92 | it, 93 | &mut networks, 94 | bench_start_time, 95 | ) 96 | .await?; 97 | 98 | query_result.add(iteration_result); 99 | } 100 | 101 | if common.reset_cache { 102 | common.reset_cache().await?; 103 | } 104 | 105 | benchmark_result.results.push(query_result); 106 | } 107 | 108 | if let Some(output_path) = &common.output { 109 | let output_file = File::create(output_path)?; 110 | serde_json::to_writer_pretty(output_file, &benchmark_result).unwrap(); 111 | } 112 | 113 | fastrace::flush(); 114 | Ok(benchmark_result) 115 | } 116 | 117 | async fn run_single_iteration<B: Benchmark>( 118 | benchmark: &B, 119 | ctx: &Arc<SessionContext>, 120 | query: &Query, 121 | iteration: u32, 122 | networks: &mut Networks, 123 | bench_start_time: Instant, 124 | ) -> Result<IterationResult> { 125 | let common = benchmark.common_args(); 126 | 127 | info!("Running query {}: \n{}", query.id, query.sql); 128 | 129 | common.start_trace().await; 130 | common.start_flamegraph().await; 131 | 132 | let root = Span::root( 133 | format!("{}-{}-{}", benchmark.benchmark_name(), query.id, iteration), 134 | SpanContext::random(), 135 | ); 136 | let _g = root.set_local_parent(); 137 | 138 | let now = Instant::now(); 139 | let starting_timestamp = bench_start_time.elapsed(); 140 | 141 | let (results, physical_plan, plan_uuid) = benchmark.execute_query(ctx, query).await?; 142 | let elapsed = now.elapsed(); 143 | 144 | networks.refresh(true); 145 | let network_info = networks 146 | .get("lo0") 147 | .or_else(|| networks.get("lo")) 148 | .expect("No loopback interface found in networks"); 149 | 150 | let flamegraph = if !plan_uuid.is_empty() { 151 | common.stop_flamegraph().await 152 | } else { 153 | None 154 | }; 155 | common.stop_trace().await; 156 | 157 | let physical_plan_with_metrics = 158 | DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()); 159 | debug!( 160 | "Physical plan: \n{}", 161 | physical_plan_with_metrics.indent(true) 162 | ); 163 | let result_str = pretty::pretty_format_batches(&results).unwrap(); 164 | debug!("Query result: \n{result_str}"); 165 | 166 | benchmark.validate_result(query, &results).await?; 167 | 168 | common.get_cache_stats().await; 169 | let network_traffic = network_info.received(); 170 | 171 | if !plan_uuid.is_empty() { 172 | common 173 | .set_execution_stats( 174 | plan_uuid, 175 | flamegraph, 176 | format!("{}-q{}-{}", benchmark.benchmark_name(), query.id, iteration), 177 | network_traffic, 178 | elapsed.as_millis() as u64, 179 | query.sql.clone(), 180 | ) 181 | .await; 182 | } 183 | 184 | let metrics_response = common.get_execution_metrics(&physical_plan).await; 185 | 186 | let result = IterationResult { 187 | network_traffic, 188 | time_millis: elapsed.as_millis() as u64, 189 | cache_cpu_time: metrics_response.pushdown_eval_time, 190 | cache_memory_usage: metrics_response.cache_memory_usage, 191 | liquid_cache_usage: metrics_response.liquid_cache_usage, 192 | starting_timestamp, 193 | }; 194 | 195 | result.log(); 196 | Ok(result) 197 | } 198 | } 199 | --------------------------------------------------------------------------------
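The `Benchmark` trait above is the extension point that `BenchmarkRunner::run` drives; `clickbench_client` and `tpch_client` are the two in-tree implementors. Below is a minimal sketch of what a third implementor could look like, assuming `Benchmark`, `CommonBenchmarkArgs`, and `Query` are importable from the crate root (as the `crate::{...}` import in runner.rs suggests) and that `Query` exposes public `id`/`sql` fields; `SimpleBench`, `SimpleArgs`, and the query literal are hypothetical:

```rust
use std::sync::Arc;

use datafusion::{
    arrow::array::RecordBatch,
    error::Result,
    physical_plan::ExecutionPlan,
    prelude::{ParquetReadOptions, SessionContext},
};
use liquid_cache_benchmarks::{Benchmark, CommonBenchmarkArgs, Query};
use serde::Serialize;
use uuid::Uuid;

#[derive(Serialize, Clone)]
struct SimpleBench {
    common: CommonBenchmarkArgs,
    args: SimpleArgs,
}

#[derive(Serialize, Clone)]
struct SimpleArgs {
    file: String,
}

impl Benchmark for SimpleBench {
    type Args = SimpleArgs;

    fn common_args(&self) -> &CommonBenchmarkArgs {
        &self.common
    }

    fn args(&self) -> &Self::Args {
        &self.args
    }

    fn benchmark_name(&self) -> &'static str {
        "simple"
    }

    async fn setup_context(&self) -> Result<Arc<SessionContext>> {
        // Plain DataFusion context; a real implementor would register the
        // LiquidCache client table provider instead.
        let ctx = SessionContext::new();
        ctx.register_parquet("hits", &self.args.file, ParquetReadOptions::default())
            .await?;
        Ok(Arc::new(ctx))
    }

    async fn get_queries(&self) -> Result<Vec<Query>> {
        // Assumes `Query` has public `id` and `sql` fields, as read in runner.rs.
        Ok(vec![Query {
            id: 0,
            sql: "SELECT COUNT(*) FROM hits".to_string(),
        }])
    }

    async fn validate_result(&self, _query: &Query, _results: &[RecordBatch]) -> Result<()> {
        Ok(()) // this sketch ships no answer files
    }

    async fn execute_query(
        &self,
        ctx: &Arc<SessionContext>,
        query: &Query,
    ) -> Result<(Vec<RecordBatch>, Arc<dyn ExecutionPlan>, Vec<Uuid>)> {
        let df = ctx.sql(&query.sql).await?;
        let (state, logical_plan) = df.into_parts();
        let physical_plan = state.create_physical_plan(&logical_plan).await?;
        let batches =
            datafusion::physical_plan::collect(physical_plan.clone(), state.task_ctx()).await?;
        // No LiquidCacheClientExec in this local plan, hence no plan UUIDs.
        Ok((batches, physical_plan, Vec::new()))
    }
}
```

`BenchmarkRunner::run(bench).await` then handles iteration, tracing, cache resets, and result serialization exactly as in the loop above.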
/benchmark/src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use datafusion::arrow; 4 | use datafusion::arrow::array::Array; 5 | use datafusion::arrow::compute::kernels::numeric::{div, sub_wrapping}; 6 | use datafusion::arrow::datatypes::Float64Type; 7 | use datafusion::arrow::{ 8 | array::{AsArray, RecordBatch}, 9 | datatypes::DataType, 10 | }; 11 | use datafusion::common::tree_node::TreeNode; 12 | use datafusion::physical_plan::ExecutionPlan; 13 | use liquid_cache_client::LiquidCacheClientExec; 14 | use log::warn; 15 | use uuid::Uuid; 16 | 17 | pub(crate) fn get_plan_uuids(plan: &Arc<dyn ExecutionPlan>) -> Vec<Uuid> { 18 | let mut uuids = Vec::new(); 19 | plan.apply(|plan| { 20 | if let Some(plan) = plan.as_any().downcast_ref::<LiquidCacheClientExec>() { 21 | uuids.push(plan.get_uuid()); 22 | } 23 | Ok(datafusion::common::tree_node::TreeNodeRecursion::Continue) 24 | }) 25 | .unwrap(); 26 | uuids 27 | } 28 | 29 | fn float_eq_helper(left: &dyn Array, right: &dyn Array, tol: f64) -> bool { 30 | let diff = sub_wrapping(&left, &right).unwrap(); 31 | let diff = arrow::compute::kernels::cast(&diff, &DataType::Float64).unwrap(); 32 | let diff = diff.as_primitive_opt::<Float64Type>().unwrap(); 33 | 34 | // Check if all differences are within tolerance 35 | if diff.iter().flatten().all(|v| v.abs() <= tol) { 36 | return true; 37 | } 38 | 39 | let scale = div(&diff, &left).unwrap(); 40 | let scale = arrow::compute::kernels::cast(&scale, &DataType::Float64).unwrap(); 41 | let scale = scale.as_primitive_opt::<Float64Type>().unwrap(); 42 | for d in scale.iter().flatten() { 43 | if d.abs() > tol { 44 | warn!("scale: {scale:?}"); 45 | return false; 46 | } 47 | } 48 | true 49 | } 50 | 51 | pub fn assert_batch_eq(expected: &RecordBatch, actual: &RecordBatch) { 52 | use datafusion::arrow::compute::*; 53 | 54 | if expected.num_rows() != actual.num_rows() { 55 | panic!( 56 | "Left (answer) had {} rows, but right (result) had {} rows", 57 | expected.num_rows(), 58 | actual.num_rows() 59 | ); 60 | } 61 | if expected.columns().len() != actual.columns().len() { 62 | panic!( 63 | "Left (answer) had {} cols, but right (result) had {} cols", 64 | expected.columns().len(), 65 | actual.columns().len() 66 | ); 67 | } 68 | for (i, (c_expected, c_actual)) in expected 69 | .columns() 70 | .iter() 71 | .zip(actual.columns().iter()) 72 | .enumerate() 73 | { 74 | let casted_expected = cast(c_expected, c_actual.data_type()).unwrap(); 75 | let sorted_expected = sort(&casted_expected, None).unwrap(); 76 | let sorted_actual = sort(c_actual, None).unwrap(); 77 | 78 | let data_type = c_expected.data_type(); 79 | let tol: f64 = 1e-4; 80 | let ok = match data_type { 81 | DataType::Float16 => { 82 | unreachable!() 83 | } 84 | DataType::Float32 | DataType::Float64 => { 85 | float_eq_helper(&sorted_expected, &sorted_actual, tol) 86 | } 87 | _ => { 88 | let eq = 89 | arrow::compute::kernels::cmp::eq(&sorted_expected, &sorted_actual).unwrap(); 90 | eq.false_count() == 0 91 | } 92 | }; 93 | assert!( 94 | ok, 95 | "Column {} answer does not match result\nExpected: {:?}\n Actual: {:?}", 96 | expected.schema().field(i).name(), 97 | sorted_expected, 98 | sorted_actual 99 | ); 100 | } 101 | } 102 | 103 | #[cfg(test)] 104 | mod tests { 105 | 106 | use super::*; 107 | use datafusion::arrow::array::Float64Array; 108 | 109 | #[test] 110 | fn test_float_eq() { 111 | let left = Float64Array::from(vec![ 112 | 1.9481948778949233e18, 113 | 1.9481948778941111e18, 114 | 1.9481948778949233e18, 115 | 0.00, 116 | ]); 117 | let right =
Float64Array::from(vec![ 118 | 1.948194877894922e18, 119 | 1.9481948778942222e18, 120 | 1.948194877894922e18, 121 | 0.00, 122 | ]); 123 | assert!(float_eq_helper(&left, &right, 1e-9)); 124 | 125 | let left = Float64Array::from(vec![0.00]); 126 | let right = Float64Array::from(vec![0.00]); 127 | assert!(float_eq_helper(&left, &right, 1e-9)); 128 | } 129 | 130 | #[should_panic] 131 | #[test] 132 | fn test_float_eq_helper() { 133 | let left = Float64Array::from(vec![0.00]); 134 | let right = Float64Array::from(vec![1.00]); 135 | assert!(float_eq_helper(&left, &right, 1e-9)); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /benchmark/tpch/answers/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/answers/.gitkeep -------------------------------------------------------------------------------- /benchmark/tpch/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/.gitkeep -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/customer.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/customer.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/lineitem.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/lineitem.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/nation.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/nation.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/orders.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/orders.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/part.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/part.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/partsupp.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/partsupp.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/region.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/region.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/supplier.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/supplier.parquet -------------------------------------------------------------------------------- /benchmark/tpch/queries/q1.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_returnflag, 3 | l_linestatus, 4 | sum(l_quantity) as sum_qty, 5 | sum(l_extendedprice) as sum_base_price, 6 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 7 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 8 | avg(l_quantity) as avg_qty, 9 | avg(l_extendedprice) as avg_price, 10 | avg(l_discount) as avg_disc, 11 | count(*) as count_order 12 | from 13 | lineitem 14 | where 15 | l_shipdate <= date '1998-09-02' 16 | group by 17 | l_returnflag, 18 | l_linestatus 19 | order by 20 | l_returnflag, 21 | l_linestatus; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q10.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_custkey, 3 | c_name, 4 | sum(l_extendedprice * (1 - l_discount)) as revenue, 5 | c_acctbal, 6 | n_name, 7 | c_address, 8 | c_phone, 9 | c_comment 10 | from 11 | customer, 12 | orders, 13 | lineitem, 14 | nation 15 | where 16 | c_custkey = o_custkey 17 | and l_orderkey = o_orderkey 18 | and o_orderdate >= date '1993-10-01' 19 | and o_orderdate < date '1994-01-01' 20 | and l_returnflag = 'R' 21 | and c_nationkey = n_nationkey 22 | group by 23 | c_custkey, 24 | c_name, 25 | c_acctbal, 26 | c_phone, 27 | n_name, 28 | c_address, 29 | c_comment 30 | order by 31 | revenue desc 32 | LIMIT 20; 33 | -------------------------------------------------------------------------------- /benchmark/tpch/queries/q11.sql: -------------------------------------------------------------------------------- 1 | select 2 | ps_partkey, 3 | sum(ps_supplycost * ps_availqty) as value 4 | from 5 | partsupp, 6 | supplier, 7 | nation 8 | where 9 | ps_suppkey = s_suppkey 10 | and s_nationkey = n_nationkey 11 | and n_name = 'GERMANY' 12 | group by 13 | ps_partkey having 14 | sum(ps_supplycost * ps_availqty) > ( 15 | select 16 | sum(ps_supplycost * ps_availqty) * 0.0001 17 | from 18 | partsupp, 19 | supplier, 20 | nation 21 | where 22 | ps_suppkey = s_suppkey 23 | and s_nationkey = n_nationkey 24 | and n_name = 'GERMANY' 25 | ) 26 | order by 27 | value desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q12.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_shipmode, 3 | sum(case 4 | when o_orderpriority = '1-URGENT' 5 | or o_orderpriority = '2-HIGH' 6 | then 1 7 | else 0 8 | end) as high_line_count, 9 | sum(case 10 | when o_orderpriority <> '1-URGENT' 11 | and o_orderpriority <> '2-HIGH' 12 | then 1 13 | else 0 14 | end) as low_line_count 15 | from 16 | lineitem 17 | join 18 | orders 19 | on 20 | l_orderkey = o_orderkey 21 | where 22 | l_shipmode in ('MAIL', 'SHIP') 23 | and l_commitdate < 
l_receiptdate 24 | and l_shipdate < l_commitdate 25 | and l_receiptdate >= date '1994-01-01' 26 | and l_receiptdate < date '1995-01-01' 27 | group by 28 | l_shipmode 29 | order by 30 | l_shipmode; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q13.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_count, 3 | count(*) as custdist 4 | from 5 | ( 6 | select 7 | c_custkey, 8 | count(o_orderkey) 9 | from 10 | customer left outer join orders on 11 | c_custkey = o_custkey 12 | and o_comment not like '%special%requests%' 13 | group by 14 | c_custkey 15 | ) as c_orders (c_custkey, c_count) 16 | group by 17 | c_count 18 | order by 19 | custdist desc, 20 | c_count desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q14.sql: -------------------------------------------------------------------------------- 1 | select 2 | 100.00 * sum(case 3 | when p_type like 'PROMO%' 4 | then l_extendedprice * (1 - l_discount) 5 | else 0 6 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 7 | from 8 | lineitem, 9 | part 10 | where 11 | l_partkey = p_partkey 12 | and l_shipdate >= date '1995-09-01' 13 | and l_shipdate < date '1995-10-01'; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q15.sql: -------------------------------------------------------------------------------- 1 | create view revenue0 (supplier_no, total_revenue) as 2 | select 3 | l_suppkey, 4 | sum(l_extendedprice * (1 - l_discount)) 5 | from 6 | lineitem 7 | where 8 | l_shipdate >= date '1996-01-01' 9 | and l_shipdate < date '1996-01-01' + interval '3' month 10 | group by 11 | l_suppkey; 12 | 13 | 14 | select 15 | s_suppkey, 16 | s_name, 17 | s_address, 18 | s_phone, 19 | total_revenue 20 | from 21 | supplier, 22 | revenue0 23 | where 24 | s_suppkey = supplier_no 25 | and total_revenue = ( 26 | select 27 | max(total_revenue) 28 | from 29 | revenue0 30 | ) 31 | order by 32 | s_suppkey; 33 | 34 | drop view revenue0; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q16.sql: -------------------------------------------------------------------------------- 1 | select 2 | p_brand, 3 | p_type, 4 | p_size, 5 | count(distinct ps_suppkey) as supplier_cnt 6 | from 7 | partsupp, 8 | part 9 | where 10 | p_partkey = ps_partkey 11 | and p_brand <> 'Brand#45' 12 | and p_type not like 'MEDIUM POLISHED%' 13 | and p_size in (49, 14, 23, 45, 19, 3, 36, 9) 14 | and ps_suppkey not in ( 15 | select 16 | s_suppkey 17 | from 18 | supplier 19 | where 20 | s_comment like '%Customer%Complaints%' 21 | ) 22 | group by 23 | p_brand, 24 | p_type, 25 | p_size 26 | order by 27 | supplier_cnt desc, 28 | p_brand, 29 | p_type, 30 | p_size; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q17.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice) / 7.0 as avg_yearly 3 | from 4 | lineitem, 5 | part 6 | where 7 | p_partkey = l_partkey 8 | and p_brand = 'Brand#23' 9 | and p_container = 'MED BOX' 10 | and l_quantity < ( 11 | select 12 | 0.2 * avg(l_quantity) 13 | from 14 | lineitem 15 | where 16 | l_partkey = p_partkey 17 | ); -------------------------------------------------------------------------------- /benchmark/tpch/queries/q18.sql: 
-------------------------------------------------------------------------------- 1 | select 2 | c_name, 3 | c_custkey, 4 | o_orderkey, 5 | o_orderdate, 6 | o_totalprice, 7 | sum(l_quantity) 8 | from 9 | customer, 10 | orders, 11 | lineitem 12 | where 13 | o_orderkey in ( 14 | select 15 | l_orderkey 16 | from 17 | lineitem 18 | group by 19 | l_orderkey having 20 | sum(l_quantity) > 300 21 | ) 22 | and c_custkey = o_custkey 23 | and o_orderkey = l_orderkey 24 | group by 25 | c_name, 26 | c_custkey, 27 | o_orderkey, 28 | o_orderdate, 29 | o_totalprice 30 | order by 31 | o_totalprice desc, 32 | o_orderdate; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q19.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice* (1 - l_discount)) as revenue 3 | from 4 | lineitem, 5 | part 6 | where 7 | ( 8 | p_partkey = l_partkey 9 | and p_brand = 'Brand#12' 10 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 11 | and l_quantity >= 1 and l_quantity <= 1 + 10 12 | and p_size between 1 and 5 13 | and l_shipmode in ('AIR', 'AIR REG') 14 | and l_shipinstruct = 'DELIVER IN PERSON' 15 | ) 16 | or 17 | ( 18 | p_partkey = l_partkey 19 | and p_brand = 'Brand#23' 20 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 21 | and l_quantity >= 10 and l_quantity <= 10 + 10 22 | and p_size between 1 and 10 23 | and l_shipmode in ('AIR', 'AIR REG') 24 | and l_shipinstruct = 'DELIVER IN PERSON' 25 | ) 26 | or 27 | ( 28 | p_partkey = l_partkey 29 | and p_brand = 'Brand#34' 30 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 31 | and l_quantity >= 20 and l_quantity <= 20 + 10 32 | and p_size between 1 and 15 33 | and l_shipmode in ('AIR', 'AIR REG') 34 | and l_shipinstruct = 'DELIVER IN PERSON' 35 | ); -------------------------------------------------------------------------------- /benchmark/tpch/queries/q2.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_acctbal, 3 | s_name, 4 | n_name, 5 | p_partkey, 6 | p_mfgr, 7 | s_address, 8 | s_phone, 9 | s_comment 10 | from 11 | part, 12 | supplier, 13 | partsupp, 14 | nation, 15 | region 16 | where 17 | p_partkey = ps_partkey 18 | and s_suppkey = ps_suppkey 19 | and p_size = 15 20 | and p_type like '%BRASS' 21 | and s_nationkey = n_nationkey 22 | and n_regionkey = r_regionkey 23 | and r_name = 'EUROPE' 24 | and ps_supplycost = ( 25 | select 26 | min(ps_supplycost) 27 | from 28 | partsupp, 29 | supplier, 30 | nation, 31 | region 32 | where 33 | p_partkey = ps_partkey 34 | and s_suppkey = ps_suppkey 35 | and s_nationkey = n_nationkey 36 | and n_regionkey = r_regionkey 37 | and r_name = 'EUROPE' 38 | ) 39 | order by 40 | s_acctbal desc, 41 | n_name, 42 | s_name, 43 | p_partkey; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q20.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | s_address 4 | from 5 | supplier, 6 | nation 7 | where 8 | s_suppkey in ( 9 | select 10 | ps_suppkey 11 | from 12 | partsupp 13 | where 14 | ps_partkey in ( 15 | select 16 | p_partkey 17 | from 18 | part 19 | where 20 | p_name like 'forest%' 21 | ) 22 | and ps_availqty > ( 23 | select 24 | 0.5 * sum(l_quantity) 25 | from 26 | lineitem 27 | where 28 | l_partkey = ps_partkey 29 | and l_suppkey = ps_suppkey 30 | and l_shipdate >= date '1994-01-01' 31 | and l_shipdate < date 
'1994-01-01' + interval '1' year 32 | ) 33 | ) 34 | and s_nationkey = n_nationkey 35 | and n_name = 'CANADA' 36 | order by 37 | s_name; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q21.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | count(*) as numwait 4 | from 5 | supplier, 6 | lineitem l1, 7 | orders, 8 | nation 9 | where 10 | s_suppkey = l1.l_suppkey 11 | and o_orderkey = l1.l_orderkey 12 | and o_orderstatus = 'F' 13 | and l1.l_receiptdate > l1.l_commitdate 14 | and exists ( 15 | select 16 | * 17 | from 18 | lineitem l2 19 | where 20 | l2.l_orderkey = l1.l_orderkey 21 | and l2.l_suppkey <> l1.l_suppkey 22 | ) 23 | and not exists ( 24 | select 25 | * 26 | from 27 | lineitem l3 28 | where 29 | l3.l_orderkey = l1.l_orderkey 30 | and l3.l_suppkey <> l1.l_suppkey 31 | and l3.l_receiptdate > l3.l_commitdate 32 | ) 33 | and s_nationkey = n_nationkey 34 | and n_name = 'SAUDI ARABIA' 35 | group by 36 | s_name 37 | order by 38 | numwait desc, 39 | s_name; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q22.sql: -------------------------------------------------------------------------------- 1 | select 2 | cntrycode, 3 | count(*) as numcust, 4 | sum(c_acctbal) as totacctbal 5 | from 6 | ( 7 | select 8 | substring(c_phone from 1 for 2) as cntrycode, 9 | c_acctbal 10 | from 11 | customer 12 | where 13 | substring(c_phone from 1 for 2) in 14 | ('13', '31', '23', '29', '30', '18', '17') 15 | and c_acctbal > ( 16 | select 17 | avg(c_acctbal) 18 | from 19 | customer 20 | where 21 | c_acctbal > 0.00 22 | and substring(c_phone from 1 for 2) in 23 | ('13', '31', '23', '29', '30', '18', '17') 24 | ) 25 | and not exists ( 26 | select 27 | * 28 | from 29 | orders 30 | where 31 | o_custkey = c_custkey 32 | ) 33 | ) as custsale 34 | group by 35 | cntrycode 36 | order by 37 | cntrycode; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q3.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_orderkey, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue, 4 | o_orderdate, 5 | o_shippriority 6 | from 7 | customer, 8 | orders, 9 | lineitem 10 | where 11 | c_mktsegment = 'BUILDING' 12 | and c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and o_orderdate < date '1995-03-15' 15 | and l_shipdate > date '1995-03-15' 16 | group by 17 | l_orderkey, 18 | o_orderdate, 19 | o_shippriority 20 | order by 21 | revenue desc, 22 | o_orderdate 23 | LIMIT 10; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q4.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_orderpriority, 3 | count(*) as order_count 4 | from 5 | orders 6 | where 7 | o_orderdate >= '1993-07-01' 8 | and o_orderdate < date '1993-07-01' + interval '3' month 9 | and exists ( 10 | select 11 | * 12 | from 13 | lineitem 14 | where 15 | l_orderkey = o_orderkey 16 | and l_commitdate < l_receiptdate 17 | ) 18 | group by 19 | o_orderpriority 20 | order by 21 | o_orderpriority; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q5.sql: -------------------------------------------------------------------------------- 1 | select 2 | n_name, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue 4 | from 5 | customer, 6 | 
orders, 7 | lineitem, 8 | supplier, 9 | nation, 10 | region 11 | where 12 | c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and l_suppkey = s_suppkey 15 | and c_nationkey = s_nationkey 16 | and s_nationkey = n_nationkey 17 | and n_regionkey = r_regionkey 18 | and r_name = 'ASIA' 19 | and o_orderdate >= date '1994-01-01' 20 | and o_orderdate < date '1995-01-01' 21 | group by 22 | n_name 23 | order by 24 | revenue desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q6.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice * l_discount) as revenue 3 | from 4 | lineitem 5 | where 6 | l_shipdate >= date '1994-01-01' 7 | and l_shipdate < date '1995-01-01' 8 | and l_discount between 0.06 - 0.01 and 0.06 + 0.01 9 | and l_quantity < 24; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q7.sql: -------------------------------------------------------------------------------- 1 | select 2 | supp_nation, 3 | cust_nation, 4 | l_year, 5 | sum(volume) as revenue 6 | from 7 | ( 8 | select 9 | n1.n_name as supp_nation, 10 | n2.n_name as cust_nation, 11 | extract(year from l_shipdate) as l_year, 12 | l_extendedprice * (1 - l_discount) as volume 13 | from 14 | supplier, 15 | lineitem, 16 | orders, 17 | customer, 18 | nation n1, 19 | nation n2 20 | where 21 | s_suppkey = l_suppkey 22 | and o_orderkey = l_orderkey 23 | and c_custkey = o_custkey 24 | and s_nationkey = n1.n_nationkey 25 | and c_nationkey = n2.n_nationkey 26 | and ( 27 | (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') 28 | or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') 29 | ) 30 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 31 | ) as shipping 32 | group by 33 | supp_nation, 34 | cust_nation, 35 | l_year 36 | order by 37 | supp_nation, 38 | cust_nation, 39 | l_year; 40 | -------------------------------------------------------------------------------- /benchmark/tpch/queries/q8.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_year, 3 | sum(case 4 | when nation = 'BRAZIL' then volume 5 | else 0 6 | end) / sum(volume) as mkt_share 7 | from 8 | ( 9 | select 10 | extract(year from o_orderdate) as o_year, 11 | l_extendedprice * (1 - l_discount) as volume, 12 | n2.n_name as nation 13 | from 14 | part, 15 | supplier, 16 | lineitem, 17 | orders, 18 | customer, 19 | nation n1, 20 | nation n2, 21 | region 22 | where 23 | p_partkey = l_partkey 24 | and s_suppkey = l_suppkey 25 | and l_orderkey = o_orderkey 26 | and o_custkey = c_custkey 27 | and c_nationkey = n1.n_nationkey 28 | and n1.n_regionkey = r_regionkey 29 | and r_name = 'AMERICA' 30 | and s_nationkey = n2.n_nationkey 31 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 32 | and p_type = 'ECONOMY ANODIZED STEEL' 33 | ) as all_nations 34 | group by 35 | o_year 36 | order by 37 | o_year; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q9.sql: -------------------------------------------------------------------------------- 1 | select 2 | nation, 3 | o_year, 4 | sum(amount) as sum_profit 5 | from 6 | ( 7 | select 8 | n_name as nation, 9 | extract(year from o_orderdate) as o_year, 10 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 11 | from 12 | part, 13 | supplier, 14 | lineitem, 15 | partsupp, 16 | orders, 17 | nation 18 | where 19 | 
s_suppkey = l_suppkey 20 | and ps_suppkey = l_suppkey 21 | and ps_partkey = l_partkey 22 | and p_partkey = l_partkey 23 | and o_orderkey = l_orderkey 24 | and s_nationkey = n_nationkey 25 | and p_name like '%green%' 26 | ) as profit 27 | group by 28 | nation, 29 | o_year 30 | order by 31 | nation, 32 | o_year desc; -------------------------------------------------------------------------------- /benchmark/tpch/tpch_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import duckdb 7 | from pathlib import Path 8 | 9 | def ensure_dir(dir_path): 10 | """Ensure directory exists""" 11 | Path(dir_path).mkdir(parents=True, exist_ok=True) 12 | 13 | def generate_tpch_data(scale_factor, data_dir, answers_dir): 14 | """Generate TPCH data at specified scale factor and save to parquet files""" 15 | print(f"Generating TPCH data with scale factor {scale_factor}") 16 | 17 | ensure_dir(data_dir) 18 | ensure_dir(answers_dir) 19 | 20 | conn = duckdb.connect(database=':memory:') 21 | 22 | try: 23 | conn.execute("INSTALL tpch") 24 | conn.execute("LOAD tpch") 25 | 26 | conn.execute(f"CALL dbgen(sf={scale_factor})") 27 | 28 | tables = ['lineitem', 'orders', 'customer', 'part', 'partsupp', 'supplier', 'nation', 'region'] 29 | 30 | for table in tables: 31 | output_path = os.path.join(data_dir, f"{table}.parquet") 32 | print(f"Saving {table} to {output_path}") 33 | conn.execute(f"COPY {table} TO '{output_path}' (FORMAT 'PARQUET')") 34 | 35 | print(f"Retrieving answers for scale factor {scale_factor}") 36 | 37 | conn.execute(f"CREATE TEMPORARY TABLE tpch_answers_temp AS SELECT * FROM tpch_answers() WHERE scale_factor = {scale_factor}") 38 | 39 | result = conn.execute("SELECT query_nr FROM tpch_answers_temp ORDER BY query_nr").fetchall() 40 | 41 | for row in result: 42 | query_nr = row[0] 43 | 44 | answer_csv_result = conn.execute(f"SELECT answer FROM tpch_answers_temp WHERE query_nr = {query_nr}").fetchone() 45 | if not answer_csv_result: 46 | print(f"Warning: No answer found for query {query_nr}, skipping") 47 | continue 48 | 49 | answer_csv = answer_csv_result[0] 50 | if not answer_csv or answer_csv.isspace(): 51 | print(f"Warning: Empty answer for query {query_nr}, skipping") 52 | continue 53 | 54 | output_path = os.path.join(answers_dir, f"q{query_nr}.parquet") 55 | print(f"Processing answer for query {query_nr} and saving to {output_path}") 56 | 57 | temp_csv = os.path.join(answers_dir, f"q{query_nr}_temp.csv") 58 | with open(temp_csv, 'w') as f: 59 | f.write(answer_csv) 60 | 61 | conn.execute(f"CREATE OR REPLACE TABLE q{query_nr}_temp AS SELECT * FROM read_csv('{temp_csv}', delim='|', header=true)") 62 | conn.execute(f"COPY q{query_nr}_temp TO '{output_path}' (FORMAT 'PARQUET')") 63 | 64 | os.remove(temp_csv) 65 | 66 | except Exception as e: 67 | print(f"Error: {e}") 68 | sys.exit(1) 69 | finally: 70 | conn.close() 71 | 72 | print("TPCH data and query answers generation completed successfully") 73 | 74 | def main(): 75 | parser = argparse.ArgumentParser(description='Generate TPCH data and query answers using DuckDB') 76 | parser.add_argument('--scale', type=float, default=0.01, help='Scale factor (default: 0.01)') 77 | parser.add_argument('--data-dir', type=str, default='data', help='Directory to store data parquet files') 78 | parser.add_argument('--answers-dir', type=str, default='answers', help='Directory to store query answers parquet files') 79 | 80 | args = parser.parse_args() 81 | 
data_dir = os.path.join(args.data_dir, f"sf{args.scale}") 82 | answers_dir = os.path.join(args.answers_dir, f"sf{args.scale}") 83 | 84 | generate_tpch_data(args.scale, data_dir, answers_dir) 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /dev/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Development Setup 3 | 4 | **Engineering is art; it has to be beautiful.** 5 | 6 | ### Install Rust toolchain 7 | 8 | ```bash 9 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 10 | ``` 11 | 12 | Run tests: 13 | 14 | ```bash 15 | cargo test 16 | ``` 17 | 18 | ### Observability 19 | 20 | LiquidCache exports OpenTelemetry metrics. 21 | 22 | First, start an [OpenObserve](https://openobserve.ai/) instance: 23 | ```bash 24 | docker run -d \ 25 | --name openobserve \ 26 | -v $PWD/data:/data \ 27 | -p 5080:5080 \ 28 | -p 5081:5081 \ 29 | -e ZO_ROOT_USER_EMAIL="root@example.com" \ 30 | -e ZO_ROOT_USER_PASSWORD="Complexpass#123" \ 31 | public.ecr.aws/zinclabs/openobserve:latest 32 | ``` 33 | 34 | Then, get the auth token from the instance: http://localhost:5080/web/ingestion/recommended/traces 35 | 36 | You will see a token like this: 37 | ``` 38 | cm9vdEBleGFtcGxlLmNvbTpGT01qZ3NRUlNmelNoNzJQ 39 | ``` 40 | 41 | Then, run the server/client with the auth token: 42 | ```bash 43 | cargo run --release --bin bench_server -- --openobserve-auth cm9vdEBleGFtcGxlLmNvbTpGT01qZ3NRUlNmelNoNzJQ 44 | ``` 45 | 46 | Then open http://localhost:5080 to view the traces. 47 | 48 | 49 | ### Deploy a LiquidCache server with Docker 50 | 51 | ```bash 52 | docker run -p 15214:15214 -p 53793:53793 ghcr.io/xiangpenghao/liquid-cache/liquid-cache-server:latest 53 | ``` 54 | 55 | ### Git hooks 56 | 57 | After cloning the repository, run the following command to set up git hooks: 58 | 59 | ```bash 60 | ./dev/install-git-hooks.sh 61 | ``` 62 | 63 | This will set up a pre-push hook that checks formatting, runs clippy, and verifies documentation. 64 | -------------------------------------------------------------------------------- /dev/doc/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/arch.png -------------------------------------------------------------------------------- /dev/doc/liquid-cache-vldb.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/liquid-cache-vldb.pdf -------------------------------------------------------------------------------- /dev/doc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/logo.png -------------------------------------------------------------------------------- /dev/git-hooks/pre-push: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Running pre-push checks..." 4 | 5 | # Check formatting 6 | echo "Checking formatting..." 7 | if ! cargo fmt -- --check; then 8 | echo "❌ Formatting check failed. Please run 'cargo fmt' to fix formatting issues." 9 | exit 1 10 | fi 11 | 12 | # Run clippy 13 | echo "Running clippy..." 14 | if !
cargo clippy -- -D warnings; then 15 | echo "❌ Clippy check failed. Please fix the warnings before pushing." 16 | exit 1 17 | fi 18 | 19 | # Check documentation 20 | echo "Checking documentation..." 21 | if ! RUSTDOCFLAGS="-D warnings" cargo doc --no-deps --document-private-items --all-features; then 22 | echo "❌ Documentation check failed. Please fix documentation issues." 23 | exit 1 24 | fi 25 | 26 | echo "✅ All checks passed!" 27 | exit 0 28 | -------------------------------------------------------------------------------- /dev/install-git-hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create symbolic link for pre-push hook 4 | ln -sf "../../dev/git-hooks/pre-push" ".git/hooks/pre-push" 5 | chmod +x "dev/git-hooks/pre-push" 6 | 7 | echo "Git hooks installed successfully!" -------------------------------------------------------------------------------- /dev/liquid_cache_server.dockerfile: -------------------------------------------------------------------------------- 1 | # Use a minimal Ubuntu base image 2 | FROM ubuntu:24.04 3 | 4 | # Install minimal runtime dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | ca-certificates \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | ENV RUST_BACKTRACE=1 10 | ENV RUST_LOG=info 11 | 12 | WORKDIR /app 13 | 14 | COPY ./target/release/bench_server /app/bench_server 15 | 16 | EXPOSE 15214 17 | EXPOSE 53793 18 | 19 | # Run the server when the container starts 20 | CMD ["/app/bench_server", "--address", "0.0.0.0:15214", "--admin-address", "0.0.0.0:53793"] 21 | -------------------------------------------------------------------------------- /dev/thoughts/architecture.md: -------------------------------------------------------------------------------- 1 | # LiquidCache Architecture 2 | 3 | LiquidCache consists of three parts: 4 | - Cache: a server that caches data and evaluates the predicates. 5 | - Compute: the DataFusion instance that executes user queries. 6 | - LiquidParquet: the cache-specific file format used by the server. 7 | 8 | ## Cache 9 | 10 | ## Compute 11 | The compute node is stateless. 12 | 13 | ## LiquidParquet -------------------------------------------------------------------------------- /dev/thoughts/artifact-eval.md: -------------------------------------------------------------------------------- 1 | ### Configure network bandwidth 2 | 3 | Clean up the config: 4 | ```bash 5 | sudo tc qdisc del dev lo root 6 | ``` 7 | 8 | Set the bandwidth limit to 20Gbps: 9 | ```bash 10 | sudo tc qdisc add dev lo root tbf rate 20gbit burst 32mb limit 1000000 11 | ``` 12 | 13 | (Very rarely, the network will drop some connections, causing the client to panic; if that happens, either restart the benchmark or increase the limit a bit.) 14 | 15 | ### Ablation study 16 | 17 | 1. Change src/liquid_parquet/src/lib.rs: 18 | ```rust 19 | const ABLATION_STUDY_MODE: AblationStudyMode = AblationStudyMode::FullDecoding; 20 | ``` 21 | 22 | 2. Start server with: 23 | ```bash 24 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin bench_server -- --address 127.0.0.1:5001 --abort-on-panic 25 | ``` 26 | 27 | 3. 
Run client with: 28 | ```bash 29 | env RUST_LOG=info,clickbench_client=debug RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin clickbench_client -- --query-path benchmark/query_select.sql --file benchmark/data/hits.parquet --bench-mode liquid-eager-transcode --server http://127.0.0.1:5001 --iteration 5 --output benchmark/data/liquid_eager_transcode.json --reset-cache 30 | ``` 31 | 32 | 33 | ### Start server with limited memory 34 | ```bash 35 | systemd-run --scope -p MemoryMax=16G ./target/release/bench_server --address 127.0.0.1:5001 --max-cache-mb 12288 36 | ``` 37 | -------------------------------------------------------------------------------- /dev/thoughts/debugging-tips.md: -------------------------------------------------------------------------------- 1 | ### Use memory sanitizer to find memory issues 2 | 3 | We have to use unsafe code. 4 | Sometimes it creates invalid memory accesses, which are very hard to debug. 5 | 6 | Be sure to disable `mimalloc` in the benchmark. 7 | 8 | ```bash 9 | env RUSTFLAGS="-Z sanitizer=address" RUST_LOG=info cargo run -Zbuild-std --target x86_64-unknown-linux-gnu --bin bench_server 10 | ``` 11 | 12 | ### Use RUST_LOG filtering 13 | 14 | ```bash 15 | env RUST_LOG=clickbench_client=debug,info 16 | ``` 17 | 18 | This will show debug logs for `clickbench_client` but only info logs for other modules. 19 | 20 | -------------------------------------------------------------------------------- /dev/thoughts/thoughts.md: -------------------------------------------------------------------------------- 1 | ### Inventing a new file format is not a good idea. 2 | One of the reasons is that a file format needs to be well supported by the query engine. 3 | The Parquet format itself is only x lines of code, but DataFusion has xxx lines of code to support Parquet. The effort to support Parquet is x% of the effort to implement a new file format, where x is a very large number. 4 | If we simply invent a new file format, we are likely to have worse performance than Parquet. 5 | 6 | ### Predicate pushdown can be slower 7 | This is because the output of predicate pushdown is in CSV/JSON format, meaning that the data is not compressed. 8 | It can result in much larger network traffic if the filter is not selective enough. 9 | 10 | Why can't we compress the output or re-encode the data in Parquet? 11 | That's a lot of CPU cost. (Is that true?) 12 | 13 | 14 | ### TableProvider vs LiquidParquetExec 15 | We currently pack our system as a `TableProvider`. 16 | This does not work for people who already have their own `TableProvider`. 17 | In that case, they might want to re-implement many parts of our system. 18 | 19 | ### In-process mode 20 | While LiquidCache is a one-stop comprehensive solution for disaggregated caching, many people might want to use only part of our components. 21 | For example, some people might want the disaggregated cache, while others might want only `LiquidParquetExec`. 22 | 23 | ### Rigorous testing 24 | We need a lot of systematic testing to ensure our system is correct. 25 | Especially the `LiquidArray` part. Fuzzing is planned.
26 | -------------------------------------------------------------------------------- /examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "examples" 3 | edition = { workspace = true } 4 | publish = false 5 | 6 | [[bin]] 7 | name = "example_server" 8 | path = "example_server.rs" 9 | 10 | [[bin]] 11 | name = "example_client" 12 | path = "example_client.rs" 13 | 14 | [dependencies] 15 | datafusion = { workspace = true } 16 | liquid-cache-server = { workspace = true } 17 | liquid-cache-client = { workspace = true } 18 | liquid-cache-common = { workspace = true } 19 | async-trait = { workspace = true } 20 | futures = { workspace = true } 21 | tokio = { workspace = true } 22 | log = { workspace = true } 23 | arrow-flight = { workspace = true } 24 | tonic = { workspace = true } 25 | env_logger = "0.11.8" 26 | url = { workspace = true } 27 | tempfile = "3.20.0" 28 | clap = { version = "4.5.38", features = ["derive"] } 29 | object_store = { workspace = true, features = ["http"] } 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | DataFusion Cache examples 2 | 3 | ## Server 4 | 5 | ```bash 6 | cargo run --bin example_server 7 | ``` 8 | 9 | ## Client 10 | 11 | ```bash 12 | cargo run --bin example_client 13 | ``` 14 | -------------------------------------------------------------------------------- /examples/example_client.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use clap::{Parser, command}; 19 | use datafusion::{error::Result, execution::object_store::ObjectStoreUrl, prelude::*}; 20 | use liquid_cache_client::LiquidCacheBuilder; 21 | use liquid_cache_common::CacheMode; 22 | use std::path::Path; 23 | use std::sync::Arc; 24 | use url::Url; 25 | 26 | #[derive(Parser, Clone)] 27 | #[command(name = "Example Client")] 28 | struct CliArgs { 29 | /// SQL query to execute 30 | #[arg( 31 | long, 32 | default_value = "SELECT COUNT(*) FROM \"aws-edge-locations\" WHERE \"countryCode\" = 'US';" 33 | )] 34 | query: String, 35 | 36 | /// URL of the table to query 37 | #[arg( 38 | long, 39 | default_value = "https://raw.githubusercontent.com/tobilg/aws-edge-locations/main/data/aws-edge-locations.parquet" 40 | )] 41 | file: String, 42 | 43 | /// Server URL 44 | #[arg(long, default_value = "http://localhost:15214")] 45 | cache_server: String, 46 | } 47 | 48 | #[tokio::main] 49 | pub async fn main() -> Result<()> { 50 | let args = CliArgs::parse(); 51 | let url = Url::parse(&args.file).unwrap(); 52 | let object_store_url = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); 53 | 54 | let ctx = LiquidCacheBuilder::new(args.cache_server.clone()) 55 | .with_object_store(ObjectStoreUrl::parse(object_store_url.as_str())?, None) 56 | .with_cache_mode(CacheMode::Liquid) 57 | .build(SessionConfig::from_env()?)?; 58 | let ctx = Arc::new(ctx); 59 | 60 | let table_name = Path::new(url.path()) 61 | .file_stem() 62 | .unwrap_or_default() 63 | .to_str() 64 | .unwrap_or("default"); 65 | let sql = args.query; 66 | let object_store = object_store::http::HttpBuilder::new() 67 | .with_url(object_store_url.as_str()) 68 | .build() 69 | .unwrap(); 70 | let object_store_url = ObjectStoreUrl::parse(object_store_url.as_str()).unwrap(); 71 | ctx.register_object_store(object_store_url.as_ref(), Arc::new(object_store)); 72 | ctx.register_parquet(table_name, url.as_ref(), Default::default()) 73 | .await?; 74 | 75 | ctx.sql(&sql).await?.show().await?; 76 | 77 | Ok(()) 78 | } 79 | -------------------------------------------------------------------------------- /examples/example_server.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use arrow_flight::flight_service_server::FlightServiceServer; 19 | use datafusion::prelude::SessionContext; 20 | use liquid_cache_common::{CacheEvictionStrategy, CacheMode}; 21 | use liquid_cache_server::LiquidCacheService; 22 | use tonic::transport::Server; 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 26 | let liquid_cache = LiquidCacheService::new( 27 | SessionContext::new(), 28 | Some(1024 * 1024 * 1024), // max memory cache size 1GB 29 | Some(tempfile::tempdir()?.keep()), // disk cache dir 30 | CacheMode::LiquidEagerTranscode, 31 | CacheEvictionStrategy::Discard, 32 | )?; 33 | 34 | let flight = FlightServiceServer::new(liquid_cache); 35 | 36 | Server::builder() 37 | .add_service(flight) 38 | .serve("0.0.0.0:15214".parse()?) 39 | .await?; 40 | 41 | Ok(()) 42 | } 43 | -------------------------------------------------------------------------------- /examples/nano_hits.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/examples/nano_hits.parquet --------------------------------------------------------------------------------
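The example server blocks in `serve` until the process is killed. A possible refinement, sketched under the assumption that everything above the `Server::builder()` call stays as in `example_server.rs` and that tokio's `signal` feature is enabled: use tonic's `serve_with_shutdown` so the Flight endpoint drains cleanly on Ctrl-C.

```rust
// Hypothetical drop-in replacement for the final block of example_server.rs;
// not part of the repository.
Server::builder()
    .add_service(flight)
    .serve_with_shutdown("0.0.0.0:15214".parse()?, async {
        // Resolves when the process receives SIGINT (Ctrl-C),
        // letting in-flight requests finish before the server exits.
        tokio::signal::ctrl_c().await.ok();
    })
    .await?;
```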
/flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flake-utils": { 4 | "inputs": { 5 | "systems": "systems" 6 | }, 7 | "locked": { 8 | "lastModified": 1731533236, 9 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 10 | "owner": "numtide", 11 | "repo": "flake-utils", 12 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 13 | "type": "github" 14 | }, 15 | "original": { 16 | "owner": "numtide", 17 | "repo": "flake-utils", 18 | "type": "github" 19 | } 20 | }, 21 | "nixpkgs": { 22 | "locked": { 23 | "lastModified": 1747744144, 24 | "narHash": "sha256-W7lqHp0qZiENCDwUZ5EX/lNhxjMdNapFnbErcbnP11Q=", 25 | "owner": "NixOS", 26 | "repo": "nixpkgs", 27 | "rev": "2795c506fe8fb7b03c36ccb51f75b6df0ab2553f", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "NixOS", 32 | "ref": "nixos-unstable", 33 | "repo": "nixpkgs", 34 | "type": "github" 35 | } 36 | }, 37 | "nixpkgs_2": { 38 | "locked": { 39 | "lastModified": 1744536153, 40 | "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", 41 | "owner": "NixOS", 42 | "repo": "nixpkgs", 43 | "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", 44 | "type": "github" 45 | }, 46 | "original": { 47 | "owner": "NixOS", 48 | "ref": "nixpkgs-unstable", 49 | "repo": "nixpkgs", 50 | "type": "github" 51 | } 52 | }, 53 | "root": { 54 | "inputs": { 55 | "flake-utils": "flake-utils", 56 | "nixpkgs": "nixpkgs", 57 | "rust-overlay": "rust-overlay" 58 | } 59 | }, 60 | "rust-overlay": { 61 | "inputs": { 62 | "nixpkgs": "nixpkgs_2" 63 | }, 64 | "locked": { 65 | "lastModified": 1747881408, 66 | "narHash": "sha256-LmpQ28JNi5OPqRamih6+QvVQE1DurLOgKUlyM4fRiRU=", 67 | "owner": "oxalica", 68 | "repo": "rust-overlay", 69 | "rev": "6e322a70e8a6c15bab8a5e3cf690fd65414b9d81", 70 | "type": "github" 71 | }, 72 | "original": { 73 | "owner": "oxalica", 74 | "repo": "rust-overlay", 75 | "type": "github" 76 | } 77 | }, 78 | "systems": { 79 | "locked": { 80 | "lastModified": 1681028828, 81 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 82 | "owner": "nix-systems", 83 | "repo": "default", 84 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 85 | "type": "github" 86 | }, 87 | "original": { 88 | "owner": "nix-systems", 89 | "repo": "default", 90 | "type": "github" 91 | } 92 | } 93 | }, 94 | "root": "root", 95 | "version": 7 96 | } 97 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "Liquid Cache Flake Configuration"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; 6 | rust-overlay.url = "github:oxalica/rust-overlay"; 7 | flake-utils.url = "github:numtide/flake-utils"; 8 | }; 9 | 10 | outputs = 11 | { nixpkgs 12 | , rust-overlay 13 | , flake-utils 14 | , ... 15 | }: 16 | flake-utils.lib.eachDefaultSystem ( 17 | system: 18 | let 19 | overlays = [ (import rust-overlay) ]; 20 | pkgs = import nixpkgs { 21 | inherit system overlays; 22 | }; 23 | in 24 | { 25 | devShells.default = with pkgs; 26 | mkShell { 27 | buildInputs = [ 28 | openssl 29 | pkg-config 30 | eza 31 | fd 32 | llvmPackages.bintools 33 | (rust-bin.fromRustupToolchainFile (./rust-toolchain.toml)) 34 | ]; 35 | }; 36 | } 37 | ); 38 | } 39 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | # I really don't want to use nightly, but we have to, until: 3 | # https://github.com/spiraldb/fastlanes/issues/45 is fixed. 4 | channel = "nightly-2025-05-20" 5 | components = ["rustfmt", "clippy", "rust-src", "miri"] 6 | -------------------------------------------------------------------------------- /src/client/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-client" 3 | authors = { workspace = true } 4 | edition = { workspace = true } 5 | version = { workspace = true } 6 | license = { workspace = true } 7 | readme = { workspace = true } 8 | description = { workspace = true } 9 | repository = { workspace = true } 10 | 11 | [dependencies] 12 | arrow = { workspace = true } 13 | arrow-flight = { workspace = true } 14 | datafusion = { workspace = true } 15 | datafusion-proto = { workspace = true } 16 | arrow-schema = { workspace = true } 17 | futures = { workspace = true } 18 | serde = { workspace = true } 19 | tonic = { workspace = true } 20 | async-trait = { workspace = true } 21 | log = { workspace = true } 22 | prost = { workspace = true } 23 | uuid = { workspace = true } 24 | liquid-cache-common = { workspace = true } 25 | fastrace = { workspace = true } 26 | fastrace-tonic = { workspace = true } 27 | fastrace-futures = { workspace = true } 28 | tower = "0.5.2" 29 | tokio = { workspace = true } 30 | 31 | [dev-dependencies] 32 | insta = { version = "1.43.1" } 33 | -------------------------------------------------------------------------------- /src/client/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![cfg_attr(not(doctest), doc = include_str!(concat!("../", std::env!("CARGO_PKG_README"))))] 3 | use std::collections::HashMap; 4 | use std::error::Error; 5 | use std::sync::Arc; 6 | use std::time::Duration; 7 | mod client_exec; 8 | mod metrics; 9 | mod optimizer; 10 | pub use client_exec::LiquidCacheClientExec; 11 | use datafusion::{ 12 | error::{DataFusionError, Result}, 13 | execution::{SessionStateBuilder, object_store::ObjectStoreUrl, runtime_env::RuntimeEnv}, 14 | prelude::*, 15 | }; 16 | use fastrace_tonic::FastraceClientService; 17 | use liquid_cache_common::CacheMode; 18 | pub use optimizer::PushdownOptimizer; 19 | use 
tonic::transport::Channel; 20 | 21 | #[cfg(test)] 22 | mod tests; 23 | 24 | /// The builder for LiquidCache client state. 25 | /// 26 | /// # Example 27 | /// 28 | /// ```ignore 29 | /// use liquid_cache_client::LiquidCacheBuilder; 30 | /// let ctx = LiquidCacheBuilder::new("localhost:15214") 31 | ///     .with_object_store("s3://my_bucket", None) 32 | ///     .with_cache_mode(CacheMode::Liquid) 33 | ///     .build(SessionConfig::from_env().unwrap()) 34 | ///     .unwrap(); 35 | /// 36 | /// ctx.register_parquet("my_table", "s3://my_bucket/my_table.parquet", Default::default()) 37 | ///     .await?; 38 | /// let df = ctx.sql("SELECT * FROM my_table").await?.show().await?; 39 | /// println!("{:?}", df); 40 | /// ``` 41 | pub struct LiquidCacheBuilder { 42 | object_stores: Vec<(ObjectStoreUrl, HashMap<String, String>)>, 43 | cache_mode: CacheMode, 44 | cache_server: String, 45 | } 46 | 47 | impl LiquidCacheBuilder { 48 | /// Create a new builder for LiquidCache client state. 49 | pub fn new(cache_server: impl AsRef<str>) -> Self { 50 | Self { 51 | object_stores: vec![], 52 | cache_mode: CacheMode::Liquid, 53 | cache_server: cache_server.as_ref().to_string(), 54 | } 55 | } 56 | 57 | /// Add an object store to the builder. 58 | pub fn with_object_store( 59 | mut self, 60 | url: ObjectStoreUrl, 61 | object_store_options: Option<HashMap<String, String>>, 62 | ) -> Self { 63 | self.object_stores 64 | .push((url, object_store_options.unwrap_or_default())); 65 | self 66 | } 67 | 68 | /// Set the cache mode for the builder. 69 | pub fn with_cache_mode(mut self, cache_mode: CacheMode) -> Self { 70 | self.cache_mode = cache_mode; 71 | self 72 | } 73 | 74 | /// Build the [SessionContext]. 75 | pub fn build(self, config: SessionConfig) -> Result<SessionContext> { 76 | let mut session_config = config; 77 | session_config 78 | .options_mut() 79 | .execution 80 | .parquet 81 | .pushdown_filters = true; 82 | session_config 83 | .options_mut() 84 | .execution 85 | .parquet 86 | .schema_force_view_types = false; 87 | session_config 88 | .options_mut() 89 | .execution 90 | .parquet 91 | .binary_as_string = true; 92 | session_config.options_mut().execution.batch_size = 8192 * 2; 93 | let session_state = SessionStateBuilder::new() 94 | .with_config(session_config) 95 | .with_runtime_env(Arc::new(RuntimeEnv::default())) 96 | .with_default_features() 97 | .with_physical_optimizer_rule(Arc::new(PushdownOptimizer::new( 98 | self.cache_server.clone(), 99 | self.cache_mode, 100 | self.object_stores.clone(), 101 | ))) 102 | .build(); 103 | Ok(SessionContext::new_with_state(session_state)) 104 | } 105 | } 106 | 107 | pub(crate) fn to_df_err<E: Error + Send + Sync + 'static>(err: E) -> DataFusionError { 108 | DataFusionError::External(Box::new(err)) 109 | } 110 | 111 | pub(crate) async fn flight_channel( 112 | source: impl Into<String>, 113 | ) -> Result<FastraceClientService<Channel>> { 114 | use fastrace_tonic::FastraceClientLayer; 115 | use tower::ServiceBuilder; 116 | 117 | // No TLS here, to avoid the overhead of TLS: 118 | // we assume both server and client run on a trusted network. 119 | let endpoint = Channel::from_shared(source.into()) 120 | .map_err(to_df_err)? 121 | .tcp_keepalive(Some(Duration::from_secs(10))); 122 | 123 | let channel = endpoint.connect().await.map_err(to_df_err)?; 124 | let channel = ServiceBuilder::new() 125 | .layer(FastraceClientLayer) 126 | .service(channel); 127 | Ok(channel) 128 | } 129 | --------------------------------------------------------------------------------
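Because `build` only installs the `PushdownOptimizer` rule and adjusts the session config, constructing a context does not contact the server. A minimal sketch (the address is a placeholder and this snippet is not part of the crate's tests) showing one of the settings `build` applies:

```rust
use datafusion::prelude::SessionConfig;
use liquid_cache_client::LiquidCacheBuilder;

fn main() -> datafusion::error::Result<()> {
    // No connection is opened here; the server address is only recorded
    // for use by the PushdownOptimizer at planning time.
    let ctx = LiquidCacheBuilder::new("http://localhost:15214")
        .build(SessionConfig::new())?;

    // `build` force-enables parquet filter pushdown on the session config.
    assert!(
        ctx.state()
            .config()
            .options()
            .execution
            .parquet
            .pushdown_filters
    );
    Ok(())
}
```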
/src/client/src/metrics.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements.  See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership.  The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License.  You may obtain a copy of the License at 8 | // 9 | //   http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied.  See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::{ 19 | common::instant::Instant, 20 | physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder, Time}, 21 | }; 22 | 23 | /// A timer that can be started and stopped. 24 | pub struct StartableTime { 25 | pub(crate) metrics: Time, 26 | // Records the start of each timed part; the elapsed time is eventually added into `metrics`. 27 | pub(crate) start: Option<Instant>, 28 | } 29 | 30 | impl StartableTime { 31 | pub(crate) fn start(&mut self) { 32 | assert!(self.start.is_none()); 33 | self.start = Some(Instant::now()); 34 | } 35 | 36 | pub(crate) fn stop(&mut self) { 37 | if let Some(start) = self.start.take() { 38 | self.metrics.add_elapsed(start); 39 | } 40 | } 41 | } 42 | 43 | pub(crate) struct FlightStreamMetrics { 44 | pub time_processing: StartableTime, 45 | pub time_reading_total: StartableTime, 46 | pub poll_count: Count, 47 | pub output_rows: Count, 48 | pub bytes_decoded: Count, 49 | } 50 | 51 | impl FlightStreamMetrics { 52 | pub(crate) fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { 53 | Self { 54 | time_processing: StartableTime { 55 | metrics: MetricBuilder::new(metrics).subset_time("time_processing", partition), 56 | start: None, 57 | }, 58 | time_reading_total: StartableTime { 59 | metrics: MetricBuilder::new(metrics).subset_time("time_reading_total", partition), 60 | start: None, 61 | }, 62 | output_rows: MetricBuilder::new(metrics).output_rows(partition), 63 | poll_count: MetricBuilder::new(metrics).counter("poll_count", partition), 64 | bytes_decoded: MetricBuilder::new(metrics).counter("bytes_decoded", partition), 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q1.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ l_returnflag ASC NULLS │ 9 | │ LAST, l_linestatus │ 10 | │ ASC NULLS LAST │ 11 | └─────────────┬─────────────┘ 12 | ┌─────────────┴─────────────┐ 13 | │ SortExec │ 14 | │ 
-------------------- │ 15 | │ l_returnflag@0 ASC NULLS │ 16 | │ LAST, l_linestatus@1 │ 17 | │ ASC NULLS LAST │ 18 | └─────────────┬─────────────┘ 19 | ┌─────────────┴─────────────┐ 20 | │ ProjectionExec │ 21 | │ -------------------- │ 22 | │ avg_disc: │ 23 | │ avg(lineitem.l_discount) │ 24 | │ │ 25 | │ avg_price: │ 26 | │ avg(lineitem │ 27 | │ .l_extendedp │ 28 | │ rice) │ 29 | │ │ 30 | │ avg_qty: │ 31 | │ avg(lineitem.l_quantity) │ 32 | │ │ 33 | │ count_order: │ 34 | │ count(Int64(1)) │ 35 | │ │ 36 | │ l_linestatus: │ 37 | │ l_linestatus │ 38 | │ │ 39 | │ l_returnflag: │ 40 | │ l_returnflag │ 41 | │ │ 42 | │ sum_base_price: │ 43 | │ sum(lineitem │ 44 | │ .l_extendedp │ 45 | │ rice) │ 46 | │ │ 47 | │ sum_charge: │ 48 | │ sum(lineitem │ 49 | │ .l_extendedp │ 50 | │ rice * Int64(1) - lineitem│ 51 | │ ... │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: │ 57 | │ sum(lineitem.l_quantity), │ 58 | │ sum(lineitem │ 59 | │ .l_extendedpric │ 60 | │ e), , , avg(lineitem │ 61 | │ .l_quantity), avg │ 62 | │ (lineitem │ 63 | │ .l_extendedp │ 64 | │ rice), avg(lineitem │ 65 | │ .l_discount), │ 66 | │ count(1) │ 67 | │ │ 68 | │ group_by: │ 69 | │ l_returnflag, l_linestatus│ 70 | │ │ 71 | │ mode: │ 72 | │ FinalPartitioned │ 73 | └─────────────┬─────────────┘ 74 | ┌─────────────┴─────────────┐ 75 | │ CoalesceBatchesExec │ 76 | │ -------------------- │ 77 | │ target_batch_size: │ 78 | │ 16384 │ 79 | └─────────────┬─────────────┘ 80 | ┌─────────────┴─────────────┐ 81 | │ RepartitionExec │ 82 | │ -------------------- │ 83 | │ partition_count(in->out): │ 84 | │ 8 -> 8 │ 85 | │ │ 86 | │ partitioning_scheme: │ 87 | │ Hash([l_returnflag@0, │ 88 | │ l_linestatus@1], 8) │ 89 | └─────────────┬─────────────┘ 90 | ┌─────────────┴─────────────┐ 91 | │ AggregateExec │ 92 | │ -------------------- │ 93 | │ aggr: │ 94 | │ sum(lineitem.l_quantity), │ 95 | │ sum(lineitem │ 96 | │ .l_extendedpric │ 97 | │ e), , , avg(lineitem │ 98 | │ .l_quantity), avg │ 99 | │ (lineitem │ 100 | │ .l_extendedp │ 101 | │ rice), avg(lineitem │ 102 | │ .l_discount), │ 103 | │ count(1) │ 104 | │ │ 105 | │ group_by: │ 106 | │ l_returnflag, l_linestatus│ 107 | │ │ 108 | │ mode: Partial │ 109 | └─────────────┬─────────────┘ 110 | ┌─────────────┴─────────────┐ 111 | │ ProjectionExec │ 112 | │ -------------------- │ 113 | │ __common_expr_1: │ 114 | │ l_extendedprice * (Some(1)│ 115 | │ ,20,0 - l_discount) │ 116 | │ │ 117 | │ l_discount: │ 118 | │ l_discount │ 119 | │ │ 120 | │ l_extendedprice: │ 121 | │ l_extendedprice │ 122 | │ │ 123 | │ l_linestatus: │ 124 | │ l_linestatus │ 125 | │ │ 126 | │ l_quantity: │ 127 | │ l_quantity │ 128 | │ │ 129 | │ l_returnflag: │ 130 | │ l_returnflag │ 131 | │ │ 132 | │ l_tax: l_tax │ 133 | └─────────────┬─────────────┘ 134 | ┌─────────────┴─────────────┐ 135 | │ LiquidCacheClientExec │ 136 | │ -------------------- │ 137 | │ server: │ 138 | │ http://localhost:50051, │ 139 | │ mode=liquid, │ 140 | │ object_stores=[] │ 141 | └─────────────┬─────────────┘ 142 | ┌─────────────┴─────────────┐ 143 | │ RepartitionExec │ 144 | │ -------------------- │ 145 | │ partition_count(in->out): │ 146 | │ 1 -> 8 │ 147 | │ │ 148 | │ partitioning_scheme: │ 149 | │ RoundRobinBatch(8) │ 150 | └─────────────┬─────────────┘ 151 | ┌─────────────┴─────────────┐ 152 | │ DataSourceExec │ 153 | │ -------------------- │ 154 | │ files: 1 │ 155 | │ format: parquet │ 156 | │ │ 157 | │ predicate: │ 158 | │ l_shipdate <= 1998-09-02 │ 159 | 
└───────────────────────────┘ 160 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q12.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ l_shipmode ASC NULLS LAST │ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │l_shipmode@0 ASC NULLS LAST│ 14 | └─────────────┬─────────────┘ 15 | ┌─────────────┴─────────────┐ 16 | │ ProjectionExec │ 17 | │ -------------------- │ 18 | │ high_line_count: │ 19 | │ sum(CASE WHEN orders │ 20 | │ .o_orderpriority = │ 21 | │ Utf8("1-URGENT") OR │ 22 | │ orders.o_orderpriorit │ 23 | │ y = Utf8("2-HIGH") THEN │ 24 | │ Int64(1) ELSE Int64(0 │ 25 | │ ) END) │ 26 | │ │ 27 | │ l_shipmode: │ 28 | │ l_shipmode │ 29 | │ │ 30 | │ low_line_count: │ 31 | │ sum(CASE WHEN orders │ 32 | │ .o_orderpriority != │ 33 | │ Utf8("1-URGENT") AND │ 34 | │ orders.o_orderpriorit │ 35 | │ y != Utf8("2-HIGH") THEN │ 36 | │ Int64(1) ELSE Int64(0) │ 37 | │ END) │ 38 | └─────────────┬─────────────┘ 39 | ┌─────────────┴─────────────┐ 40 | │ AggregateExec │ 41 | │ -------------------- │ 42 | │ aggr: │ 43 | │ sum(CASE WHEN orders │ 44 | │ .o_orderpriority = │ 45 | │ 1-URGENT OR orders │ 46 | │ .o_orderpriority = 2 │ 47 | │ -HIGH THEN 1 ELSE 0 END), │ 48 | │ sum(CASE WHEN orders │ 49 | │ .o_orderpriority != 1 │ 50 | │ -URGENT AND orders │ 51 | │ .o_orderpriority ! │ 52 | │ = 2-HIGH THEN 1 ELSE 0 │ 53 | │ END) │ 54 | │ │ 55 | │ group_by: l_shipmode │ 56 | │ │ 57 | │ mode: │ 58 | │ FinalPartitioned │ 59 | └─────────────┬─────────────┘ 60 | ┌─────────────┴─────────────┐ 61 | │ CoalesceBatchesExec │ 62 | │ -------------------- │ 63 | │ target_batch_size: │ 64 | │ 16384 │ 65 | └─────────────┬─────────────┘ 66 | ┌─────────────┴─────────────┐ 67 | │ RepartitionExec │ 68 | │ -------------------- │ 69 | │ partition_count(in->out): │ 70 | │ 8 -> 8 │ 71 | │ │ 72 | │ partitioning_scheme: │ 73 | │ Hash([l_shipmode@0], 8) │ 74 | └─────────────┬─────────────┘ 75 | ┌─────────────┴─────────────┐ 76 | │ AggregateExec │ 77 | │ -------------------- │ 78 | │ aggr: │ 79 | │ sum(CASE WHEN orders │ 80 | │ .o_orderpriority = │ 81 | │ 1-URGENT OR orders │ 82 | │ .o_orderpriority = 2 │ 83 | │ -HIGH THEN 1 ELSE 0 END), │ 84 | │ sum(CASE WHEN orders │ 85 | │ .o_orderpriority != 1 │ 86 | │ -URGENT AND orders │ 87 | │ .o_orderpriority ! 
│ 88 | │ = 2-HIGH THEN 1 ELSE 0 │ 89 | │ END) │ 90 | │ │ 91 | │ group_by: l_shipmode │ 92 | │ mode: Partial │ 93 | └─────────────┬─────────────┘ 94 | ┌─────────────┴─────────────┐ 95 | │ ProjectionExec │ 96 | │ -------------------- │ 97 | │ l_shipmode: │ 98 | │ l_shipmode │ 99 | │ │ 100 | │ o_orderpriority: │ 101 | │ o_orderpriority │ 102 | └─────────────┬─────────────┘ 103 | ┌─────────────┴─────────────┐ 104 | │ CoalesceBatchesExec │ 105 | │ -------------------- │ 106 | │ target_batch_size: │ 107 | │ 16384 │ 108 | └─────────────┬─────────────┘ 109 | ┌─────────────┴─────────────┐ 110 | │ HashJoinExec │ 111 | │ -------------------- │ 112 | │ on: ├──────────────┐ 113 | │ (o_orderkey = l_orderkey) │ │ 114 | └─────────────┬─────────────┘ │ 115 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 116 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 117 | │ -------------------- ││ -------------------- │ 118 | │ server: ││ server: │ 119 | │ http://localhost:50051, ││ http://localhost:50051, │ 120 | │ mode=liquid, ││ mode=liquid, │ 121 | │ object_stores=[] ││ object_stores=[] │ 122 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 123 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 124 | │ DataSourceExec ││ RepartitionExec │ 125 | │ -------------------- ││ -------------------- │ 126 | │ files: 1 ││ partition_count(in->out): │ 127 | │ format: parquet ││ 1 -> 8 │ 128 | │ ││ │ 129 | │ ││ partitioning_scheme: │ 130 | │ ││ RoundRobinBatch(8) │ 131 | └───────────────────────────┘└─────────────┬─────────────┘ 132 | ┌─────────────┴─────────────┐ 133 | │ DataSourceExec │ 134 | │ -------------------- │ 135 | │ files: 1 │ 136 | │ format: parquet │ 137 | │ │ 138 | │ predicate: │ 139 | │ (l_shipmode = MAIL OR │ 140 | │ l_shipmode = SHIP) │ 141 | │ AND l_receiptdate > │ 142 | │ l_commitdate AND │ 143 | │ l_shipdate < │ 144 | │ l_commitdate AND │ 145 | │ l_receiptdate >= 1994-01 │ 146 | │ -01 AND l_receiptdate < │ 147 | │ 1995-01-01 │ 148 | └───────────────────────────┘ 149 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q13.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │custdist DESC, c_count DESC│ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │ custdist@1 DESC, c_count@0│ 14 | │ DESC │ 15 | └─────────────┬─────────────┘ 16 | ┌─────────────┴─────────────┐ 17 | │ ProjectionExec │ 18 | │ -------------------- │ 19 | │ c_count: c_count │ 20 | │ │ 21 | │ custdist: │ 22 | │ count(Int64(1)) │ 23 | └─────────────┬─────────────┘ 24 | ┌─────────────┴─────────────┐ 25 | │ AggregateExec │ 26 | │ -------------------- │ 27 | │ aggr: count(1) │ 28 | │ group_by: c_count │ 29 | │ │ 30 | │ mode: │ 31 | │ FinalPartitioned │ 32 | └─────────────┬─────────────┘ 33 | ┌─────────────┴─────────────┐ 34 | │ CoalesceBatchesExec │ 35 | │ -------------------- │ 36 | │ target_batch_size: │ 37 | │ 16384 │ 38 | └─────────────┬─────────────┘ 39 | ┌─────────────┴─────────────┐ 40 | │ RepartitionExec │ 41 | │ -------------------- │ 42 | │ partition_count(in->out): │ 43 | │ 8 -> 8 │ 44 | │ │ 45 | │ partitioning_scheme: │ 46 | │ Hash([c_count@0], 8) │ 47 | └─────────────┬─────────────┘ 48 | 
┌─────────────┴─────────────┐ 49 | │ AggregateExec │ 50 | │ -------------------- │ 51 | │ aggr: count(1) │ 52 | │ group_by: c_count │ 53 | │ mode: Partial │ 54 | └─────────────┬─────────────┘ 55 | ┌─────────────┴─────────────┐ 56 | │ ProjectionExec │ 57 | │ -------------------- │ 58 | │ c_count: │ 59 | │ count(orders.o_orderkey) │ 60 | └─────────────┬─────────────┘ 61 | ┌─────────────┴─────────────┐ 62 | │ AggregateExec │ 63 | │ -------------------- │ 64 | │ aggr: │ 65 | │ count(orders.o_orderkey) │ 66 | │ │ 67 | │ group_by: c_custkey │ 68 | │ │ 69 | │ mode: │ 70 | │ FinalPartitioned │ 71 | └─────────────┬─────────────┘ 72 | ┌─────────────┴─────────────┐ 73 | │ CoalesceBatchesExec │ 74 | │ -------------------- │ 75 | │ target_batch_size: │ 76 | │ 16384 │ 77 | └─────────────┬─────────────┘ 78 | ┌─────────────┴─────────────┐ 79 | │ RepartitionExec │ 80 | │ -------------------- │ 81 | │ partition_count(in->out): │ 82 | │ 8 -> 8 │ 83 | │ │ 84 | │ partitioning_scheme: │ 85 | │ Hash([c_custkey@0], 8) │ 86 | └─────────────┬─────────────┘ 87 | ┌─────────────┴─────────────┐ 88 | │ AggregateExec │ 89 | │ -------------------- │ 90 | │ aggr: │ 91 | │ count(orders.o_orderkey) │ 92 | │ │ 93 | │ group_by: c_custkey │ 94 | │ mode: Partial │ 95 | └─────────────┬─────────────┘ 96 | ┌─────────────┴─────────────┐ 97 | │ CoalesceBatchesExec │ 98 | │ -------------------- │ 99 | │ target_batch_size: │ 100 | │ 16384 │ 101 | └─────────────┬─────────────┘ 102 | ┌─────────────┴─────────────┐ 103 | │ HashJoinExec │ 104 | │ -------------------- │ 105 | │ join_type: Left │ 106 | │ ├──────────────┐ 107 | │ on: │ │ 108 | │ (c_custkey = o_custkey) │ │ 109 | └─────────────┬─────────────┘ │ 110 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 111 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 112 | │ -------------------- ││ -------------------- │ 113 | │ server: ││ server: │ 114 | │ http://localhost:50051, ││ http://localhost:50051, │ 115 | │ mode=liquid, ││ mode=liquid, │ 116 | │ object_stores=[] ││ object_stores=[] │ 117 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 118 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 119 | │ DataSourceExec ││ RepartitionExec │ 120 | │ -------------------- ││ -------------------- │ 121 | │ files: 1 ││ partition_count(in->out): │ 122 | │ format: parquet ││ 1 -> 8 │ 123 | │ ││ │ 124 | │ ││ partitioning_scheme: │ 125 | │ ││ RoundRobinBatch(8) │ 126 | └───────────────────────────┘└─────────────┬─────────────┘ 127 | ┌─────────────┴─────────────┐ 128 | │ DataSourceExec │ 129 | │ -------------------- │ 130 | │ files: 1 │ 131 | │ format: parquet │ 132 | │ │ 133 | │ predicate: │ 134 | │ o_comment NOT LIKE │ 135 | │ %special%requests% │ 136 | └───────────────────────────┘ 137 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q14.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ promo_revenue: │ 9 | │ 100 * CAST(sum(CASE WHEN │ 10 | │ part.p_type LIKE Utf8( │ 11 | │ "PROMO%") THEN lineitem │ 12 | │ .l_extendedprice * │ 13 | │ Int64(1) - lineitem │ 14 | │ .l_discount ELSE Int64(0) │ 15 | │ END) AS Float64) / CAST │ 16 | │ (sum(lineitem │ 17 | │ .l_extendedpric │ 18 | │ e * Int64(1) - lineitem │ 19 | │ .l_discount) AS │ 20 | │ 
Float64) │ 21 | └─────────────┬─────────────┘ 22 | ┌─────────────┴─────────────┐ 23 | │ AggregateExec │ 24 | │ -------------------- │ 25 | │ aggr: , │ 26 | │ mode: Final │ 27 | └─────────────┬─────────────┘ 28 | ┌─────────────┴─────────────┐ 29 | │ CoalescePartitionsExec │ 30 | └─────────────┬─────────────┘ 31 | ┌─────────────┴─────────────┐ 32 | │ AggregateExec │ 33 | │ -------------------- │ 34 | │ aggr: , │ 35 | │ mode: Partial │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ ProjectionExec │ 39 | │ -------------------- │ 40 | │ __common_expr_1: │ 41 | │ l_extendedprice * (Some(1)│ 42 | │ ,20,0 - l_discount) │ 43 | │ │ 44 | │ p_type: p_type │ 45 | └─────────────┬─────────────┘ 46 | ┌─────────────┴─────────────┐ 47 | │ CoalesceBatchesExec │ 48 | │ -------------------- │ 49 | │ target_batch_size: │ 50 | │ 16384 │ 51 | └─────────────┬─────────────┘ 52 | ┌─────────────┴─────────────┐ 53 | │ HashJoinExec │ 54 | │ -------------------- │ 55 | │ on: ├──────────────┐ 56 | │ (p_partkey = l_partkey) │ │ 57 | └─────────────┬─────────────┘ │ 58 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 59 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 60 | │ -------------------- ││ -------------------- │ 61 | │ server: ││ server: │ 62 | │ http://localhost:50051, ││ http://localhost:50051, │ 63 | │ mode=liquid, ││ mode=liquid, │ 64 | │ object_stores=[] ││ object_stores=[] │ 65 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 66 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 67 | │ DataSourceExec ││ RepartitionExec │ 68 | │ -------------------- ││ -------------------- │ 69 | │ files: 1 ││ partition_count(in->out): │ 70 | │ format: parquet ││ 1 -> 8 │ 71 | │ ││ │ 72 | │ ││ partitioning_scheme: │ 73 | │ ││ RoundRobinBatch(8) │ 74 | └───────────────────────────┘└─────────────┬─────────────┘ 75 | ┌─────────────┴─────────────┐ 76 | │ DataSourceExec │ 77 | │ -------------------- │ 78 | │ files: 1 │ 79 | │ format: parquet │ 80 | │ │ 81 | │ predicate: │ 82 | │ l_shipdate >= 1995-09-01 │ 83 | │ AND l_shipdate < 1995 │ 84 | │ -10-01 │ 85 | └───────────────────────────┘ 86 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q17.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ avg_yearly: │ 9 | │ CAST(sum(lineitem │ 10 | │ .l_extendedprice │ 11 | │ ) AS Float64) / 7 │ 12 | └─────────────┬─────────────┘ 13 | ┌─────────────┴─────────────┐ 14 | │ AggregateExec │ 15 | │ -------------------- │ 16 | │ aggr: │ 17 | │ sum(lineitem │ 18 | │ .l_extendedp │ 19 | │ rice) │ 20 | │ │ 21 | │ mode: Final │ 22 | └─────────────┬─────────────┘ 23 | ┌─────────────┴─────────────┐ 24 | │ CoalescePartitionsExec │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: │ 30 | │ sum(lineitem │ 31 | │ .l_extendedp │ 32 | │ rice) │ 33 | │ │ 34 | │ mode: Partial │ 35 | └─────────────┬─────────────┘ 36 | ┌─────────────┴─────────────┐ 37 | │ CoalesceBatchesExec │ 38 | │ -------------------- │ 39 | │ target_batch_size: │ 40 | │ 16384 │ 41 | └─────────────┬─────────────┘ 42 | ┌─────────────┴─────────────┐ 43 | │ HashJoinExec │ 44 | │ -------------------- │ 45 | │ on: 
├──────────────┐ 46 | │ (l_partkey = p_partkey) │ │ 47 | └─────────────┬─────────────┘ │ 48 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 49 | │ CoalescePartitionsExec ││ RepartitionExec │ 50 | │ ││ -------------------- │ 51 | │ ││ partition_count(in->out): │ 52 | │ ││ 1 -> 8 │ 53 | │ ││ │ 54 | │ ││ partitioning_scheme: │ 55 | │ ││ RoundRobinBatch(8) │ 56 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 57 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 58 | │ ProjectionExec ││ ProjectionExec │ 59 | │ -------------------- ││ -------------------- │ 60 | │ Float64(0.2) * avg ││ l_extendedprice: │ 61 | │ (lineitem ││ l_extendedprice │ 62 | │ .l_quantity): ││ │ 63 | │ CAST(0.2 * CAST(avg ││ l_quantity: │ 64 | │ (lineitem ││ l_quantity │ 65 | │ .l_quantity) AS ││ │ 66 | │ Float64) AS Decimal128 ││ p_partkey: p_partkey │ 67 | │ (30, 15)) ││ │ 68 | │ ││ │ 69 | │ l_partkey: l_partkey ││ │ 70 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 71 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 72 | │ AggregateExec ││ CoalesceBatchesExec │ 73 | │ -------------------- ││ -------------------- │ 74 | │ aggr: ││ target_batch_size: │ 75 | │ avg(lineitem.l_quantity) ││ 16384 │ 76 | │ ││ │ 77 | │ group_by: l_partkey ││ │ 78 | │ ││ │ 79 | │ mode: ││ │ 80 | │ FinalPartitioned ││ │ 81 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 82 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 83 | │ CoalesceBatchesExec ││ HashJoinExec │ 84 | │ -------------------- ││ -------------------- │ 85 | │ target_batch_size: ││ on: ├──────────────┐ 86 | │ 16384 ││ (p_partkey = l_partkey) │ │ 87 | └─────────────┬─────────────┘└─────────────┬─────────────┘ │ 88 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 89 | │ RepartitionExec ││ LiquidCacheClientExec ││ LiquidCacheClientExec │ 90 | │ -------------------- ││ -------------------- ││ -------------------- │ 91 | │ partition_count(in->out): ││ server: ││ server: │ 92 | │ 8 -> 8 ││ http://localhost:50051, ││ http://localhost:50051, │ 93 | │ ││ mode=liquid, ││ mode=liquid, │ 94 | │ partitioning_scheme: ││ object_stores=[] ││ object_stores=[] │ 95 | │ Hash([l_partkey@0], 8) ││ ││ │ 96 | └─────────────┬─────────────┘└─────────────┬─────────────┘└─────────────┬─────────────┘ 97 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 98 | │ RepartitionExec ││ DataSourceExec ││ DataSourceExec │ 99 | │ -------------------- ││ -------------------- ││ -------------------- │ 100 | │ partition_count(in->out): ││ files: 1 ││ files: 1 │ 101 | │ 1 -> 8 ││ format: parquet ││ format: parquet │ 102 | │ ││ ││ │ 103 | │ partitioning_scheme: ││ predicate: ││ │ 104 | │ RoundRobinBatch(8) ││ p_brand = Brand#23 AND ││ │ 105 | │ ││ p_container = MED BOX ││ │ 106 | └─────────────┬─────────────┘└───────────────────────────┘└───────────────────────────┘ 107 | ┌─────────────┴─────────────┐ 108 | │ AggregateExec │ 109 | │ -------------------- │ 110 | │ aggr: │ 111 | │ avg(lineitem.l_quantity) │ 112 | │ │ 113 | │ group_by: l_partkey │ 114 | │ mode: Partial │ 115 | └─────────────┬─────────────┘ 116 | ┌─────────────┴─────────────┐ 117 | │ LiquidCacheClientExec │ 118 | │ -------------------- │ 119 | │ server: │ 120 | │ http://localhost:50051, │ 121 | │ mode=liquid, │ 122 | │ object_stores=[] │ 123 | └─────────────┬─────────────┘ 124 | ┌─────────────┴─────────────┐ 125 | │ DataSourceExec │ 126 | │ -------------------- │ 127 | │ files: 1 │ 128 | │ format: 
parquet │ 129 | └───────────────────────────┘ 130 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q19.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ revenue: │ 9 | │ sum(lineitem │ 10 | │ .l_extendedp │ 11 | │ rice * Int64(1) - lineitem│ 12 | │ .l_discount) │ 13 | └─────────────┬─────────────┘ 14 | ┌─────────────┴─────────────┐ 15 | │ AggregateExec │ 16 | │ -------------------- │ 17 | │ aggr │ 18 | │ │ 19 | │ mode: Final │ 20 | └─────────────┬─────────────┘ 21 | ┌─────────────┴─────────────┐ 22 | │ CoalescePartitionsExec │ 23 | └─────────────┬─────────────┘ 24 | ┌─────────────┴─────────────┐ 25 | │ AggregateExec │ 26 | │ -------------------- │ 27 | │ aggr │ 28 | │ │ 29 | │ mode: Partial │ 30 | └─────────────┬─────────────┘ 31 | ┌─────────────┴─────────────┐ 32 | │ CoalesceBatchesExec │ 33 | │ -------------------- │ 34 | │ target_batch_size: │ 35 | │ 16384 │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ HashJoinExec │ 39 | │ -------------------- │ 40 | │ on: ├──────────────┐ 41 | │ (p_partkey = l_partkey) │ │ 42 | └─────────────┬─────────────┘ │ 43 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 44 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 45 | │ -------------------- ││ -------------------- │ 46 | │ server: ││ server: │ 47 | │ http://localhost:50051, ││ http://localhost:50051, │ 48 | │ mode=liquid, ││ mode=liquid, │ 49 | │ object_stores=[] ││ object_stores=[] │ 50 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 51 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 52 | │ DataSourceExec ││ RepartitionExec │ 53 | │ -------------------- ││ -------------------- │ 54 | │ files: 1 ││ partition_count(in->out): │ 55 | │ format: parquet ││ 1 -> 8 │ 56 | │ ││ │ 57 | │ predicate: ││ partitioning_scheme: │ 58 | │ p_size >= 1 AND (p_brand =││ RoundRobinBatch(8) │ 59 | │ Brand#12 AND p_container ││ │ 60 | │ IN (SM CASE, SM BOX, SM ││ │ 61 | │ PACK, SM PKG) AND ││ │ 62 | │ p_size <= 5 OR ││ │ 63 | │ p_brand = Brand#23 ││ │ 64 | │ AND p_container IN ││ │ 65 | │ (MED BAG, MED BOX, MED ││ │ 66 | │ PKG, MED PACK) AND ││ │ 67 | │ p_size <= 10 OR ││ │ 68 | │ p_brand = Brand#34 ││ │ 69 | │ AND p_container IN ││ │ 70 | │ (LG CASE, LG BOX, LG ││ │ 71 | │ PACK, LG PKG) AND ││ │ 72 | │ p_size <= 15) ││ │ 73 | └───────────────────────────┘└─────────────┬─────────────┘ 74 | ┌─────────────┴─────────────┐ 75 | │ DataSourceExec │ 76 | │ -------------------- │ 77 | │ files: 1 │ 78 | │ format: parquet │ 79 | │ │ 80 | │ predicate: │ 81 | │ (l_shipmode = AIR OR │ 82 | │ l_shipmode = AIR │ 83 | │ REG) AND │ 84 | │ l_shipinstr │ 85 | │ uct = DELIVER IN PERSON │ 86 | │ AND (l_quantity >= │ 87 | │ Some(100),15,2 AND │ 88 | │ l_quantity <= Some │ 89 | │ (1100),15,2 OR │ 90 | │ l_quantity >= │ 91 | │ Some(1000),15,2 AND │ 92 | │ l_quantity <= Some │ 93 | │ (2000),15,2 OR │ 94 | │ l_quantity >= │ 95 | │ Some(2000),15,2 AND │ 96 | │ l_quantity <= Some(3000 │ 97 | │ ),15,2) │ 98 | └───────────────────────────┘ 99 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q22.snap: 
-------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ cntrycode ASC NULLS LAST │ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │ cntrycode@0 ASC NULLS LAST│ 14 | └─────────────┬─────────────┘ 15 | ┌─────────────┴─────────────┐ 16 | │ ProjectionExec │ 17 | │ -------------------- │ 18 | │ cntrycode: cntrycode │ 19 | │ │ 20 | │ numcust: │ 21 | │ count(Int64(1)) │ 22 | │ │ 23 | │ totacctbal: │ 24 | │ sum(custsale.c_acctbal) │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: │ 30 | │ count(1), sum(custsale │ 31 | │ .c_acctbal) │ 32 | │ │ 33 | │ group_by: cntrycode │ 34 | │ │ 35 | │ mode: │ 36 | │ FinalPartitioned │ 37 | └─────────────┬─────────────┘ 38 | ┌─────────────┴─────────────┐ 39 | │ CoalesceBatchesExec │ 40 | │ -------------------- │ 41 | │ target_batch_size: │ 42 | │ 16384 │ 43 | └─────────────┬─────────────┘ 44 | ┌─────────────┴─────────────┐ 45 | │ RepartitionExec │ 46 | │ -------------------- │ 47 | │ partition_count(in->out): │ 48 | │ 8 -> 8 │ 49 | │ │ 50 | │ partitioning_scheme: │ 51 | │ Hash([cntrycode@0], 8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: │ 57 | │ count(1), sum(custsale │ 58 | │ .c_acctbal) │ 59 | │ │ 60 | │ group_by: cntrycode │ 61 | │ mode: Partial │ 62 | └─────────────┬─────────────┘ 63 | ┌─────────────┴─────────────┐ 64 | │ ProjectionExec │ 65 | │ -------------------- │ 66 | │ c_acctbal: c_acctbal │ 67 | │ │ 68 | │ cntrycode: │ 69 | │ substr(c_phone, 1, 2) │ 70 | └─────────────┬─────────────┘ 71 | ┌─────────────┴─────────────┐ 72 | │ NestedLoopJoinExec ├──────────────┐ 73 | └─────────────┬─────────────┘ │ 74 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 75 | │ AggregateExec ││ RepartitionExec │ 76 | │ -------------------- ││ -------------------- │ 77 | │ aggr: ││ partition_count(in->out): │ 78 | │ avg(customer.c_acctbal) ││ 1 -> 8 │ 79 | │ ││ │ 80 | │ mode: Final ││ partitioning_scheme: │ 81 | │ ││ RoundRobinBatch(8) │ 82 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 83 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 84 | │ CoalescePartitionsExec ││ CoalesceBatchesExec │ 85 | │ ││ -------------------- │ 86 | │ ││ target_batch_size: │ 87 | │ ││ 16384 │ 88 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 89 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 90 | │ LiquidCacheClientExec ││ HashJoinExec │ 91 | │ -------------------- ││ -------------------- │ 92 | │ server: ││ join_type: LeftAnti │ 93 | │ http://localhost:50051, ││ ├──────────────┐ 94 | │ mode=liquid, ││ on: │ │ 95 | │ object_stores=[] ││ (c_custkey = o_custkey) │ │ 96 | └─────────────┬─────────────┘└─────────────┬─────────────┘ │ 97 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 98 | │ AggregateExec ││ LiquidCacheClientExec ││ LiquidCacheClientExec │ 99 | │ -------------------- ││ -------------------- ││ -------------------- │ 100 | │ aggr: ││ server: ││ server: │ 101 | │ avg(customer.c_acctbal) ││ http://localhost:50051, ││ http://localhost:50051, │ 102 | │ ││ mode=liquid, ││ mode=liquid, │ 103 | │ mode: 
Partial ││ object_stores=[] ││ object_stores=[] │ 104 | └─────────────┬─────────────┘└─────────────┬─────────────┘└─────────────┬─────────────┘ 105 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 106 | │ RepartitionExec ││ DataSourceExec ││ DataSourceExec │ 107 | │ -------------------- ││ -------------------- ││ -------------------- │ 108 | │ partition_count(in->out): ││ files: 1 ││ files: 1 │ 109 | │ 1 -> 8 ││ format: parquet ││ format: parquet │ 110 | │ ││ ││ │ 111 | │ partitioning_scheme: ││ predicate: ││ │ 112 | │ RoundRobinBatch(8) ││ substr(c_phone, 1, 2) IN ││ │ 113 | │ ││ (13, 31, 23, 29, 30, 18, ││ │ 114 | │ ││ 17) AND true ││ │ 115 | └─────────────┬─────────────┘└───────────────────────────┘└───────────────────────────┘ 116 | ┌─────────────┴─────────────┐ 117 | │ DataSourceExec │ 118 | │ -------------------- │ 119 | │ files: 1 │ 120 | │ format: parquet │ 121 | │ │ 122 | │ predicate: │ 123 | │ c_acctbal > Some(0),15,2 │ 124 | │ AND substr(c_phone, 1, │ 125 | │ 2) IN (13, 31, 23, 29, │ 126 | │ 30, 18, 17) │ 127 | └───────────────────────────┘ 128 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q4.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ o_orderpriority ASC NULLS │ 9 | │ LAST │ 10 | └─────────────┬─────────────┘ 11 | ┌─────────────┴─────────────┐ 12 | │ SortExec │ 13 | │ -------------------- │ 14 | │ o_orderpriority@0 ASC │ 15 | │ NULLS LAST │ 16 | └─────────────┬─────────────┘ 17 | ┌─────────────┴─────────────┐ 18 | │ ProjectionExec │ 19 | │ -------------------- │ 20 | │ o_orderpriority: │ 21 | │ o_orderpriority │ 22 | │ │ 23 | │ order_count: │ 24 | │ count(Int64(1)) │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: count(1) │ 30 | │ │ 31 | │ group_by: │ 32 | │ o_orderpriority │ 33 | │ │ 34 | │ mode: │ 35 | │ FinalPartitioned │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ CoalesceBatchesExec │ 39 | │ -------------------- │ 40 | │ target_batch_size: │ 41 | │ 16384 │ 42 | └─────────────┬─────────────┘ 43 | ┌─────────────┴─────────────┐ 44 | │ RepartitionExec │ 45 | │ -------------------- │ 46 | │ partition_count(in->out): │ 47 | │ 8 -> 8 │ 48 | │ │ 49 | │ partitioning_scheme: │ 50 | │ Hash([o_orderpriority@0], │ 51 | │ 8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: count(1) │ 57 | │ │ 58 | │ group_by: │ 59 | │ o_orderpriority │ 60 | │ │ 61 | │ mode: Partial │ 62 | └─────────────┬─────────────┘ 63 | ┌─────────────┴─────────────┐ 64 | │ CoalesceBatchesExec │ 65 | │ -------------------- │ 66 | │ target_batch_size: │ 67 | │ 16384 │ 68 | └─────────────┬─────────────┘ 69 | ┌─────────────┴─────────────┐ 70 | │ HashJoinExec │ 71 | │ -------------------- │ 72 | │ join_type: LeftSemi │ 73 | │ ├──────────────┐ 74 | │ on: │ │ 75 | │ (o_orderkey = l_orderkey) │ │ 76 | └─────────────┬─────────────┘ │ 77 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 78 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 79 | │ -------------------- ││ -------------------- │ 80 | │ server: ││ server: 
│ 81 | │ http://localhost:50051, ││ http://localhost:50051, │ 82 | │ mode=liquid, ││ mode=liquid, │ 83 | │ object_stores=[] ││ object_stores=[] │ 84 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 85 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 86 | │ DataSourceExec ││ RepartitionExec │ 87 | │ -------------------- ││ -------------------- │ 88 | │ files: 1 ││ partition_count(in->out): │ 89 | │ format: parquet ││ 1 -> 8 │ 90 | │ ││ │ 91 | │ predicate: ││ partitioning_scheme: │ 92 | │ o_orderdate >= 1993-07-01 ││ RoundRobinBatch(8) │ 93 | │ AND o_orderdate < 1993 ││ │ 94 | │ -10-01 ││ │ 95 | └───────────────────────────┘└─────────────┬─────────────┘ 96 | ┌─────────────┴─────────────┐ 97 | │ DataSourceExec │ 98 | │ -------------------- │ 99 | │ files: 1 │ 100 | │ format: parquet │ 101 | │ │ 102 | │ predicate: │ 103 | │ l_receiptdate > │ 104 | │ l_commitdate │ 105 | └───────────────────────────┘ 106 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q6.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ revenue: │ 9 | │ sum(lineitem │ 10 | │ .l_extendedp │ 11 | │rice * lineitem.l_discount)│ 12 | └─────────────┬─────────────┘ 13 | ┌─────────────┴─────────────┐ 14 | │ AggregateExec │ 15 | │ -------------------- │ 16 | │ aggr: │ 17 | │ sum(lineitem │ 18 | │ .l_extendedp │ 19 | │rice * lineitem.l_discount)│ 20 | │ │ 21 | │ mode: Final │ 22 | └─────────────┬─────────────┘ 23 | ┌─────────────┴─────────────┐ 24 | │ CoalescePartitionsExec │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ LiquidCacheClientExec │ 28 | │ -------------------- │ 29 | │ server: │ 30 | │ http://localhost:50051, │ 31 | │ mode=liquid, │ 32 | │ object_stores=[] │ 33 | └─────────────┬─────────────┘ 34 | ┌─────────────┴─────────────┐ 35 | │ AggregateExec │ 36 | │ -------------------- │ 37 | │ aggr: │ 38 | │ sum(lineitem │ 39 | │ .l_extendedp │ 40 | │rice * lineitem.l_discount)│ 41 | │ │ 42 | │ mode: Partial │ 43 | └─────────────┬─────────────┘ 44 | ┌─────────────┴─────────────┐ 45 | │ RepartitionExec │ 46 | │ -------------------- │ 47 | │ partition_count(in->out): │ 48 | │ 1 -> 8 │ 49 | │ │ 50 | │ partitioning_scheme: │ 51 | │ RoundRobinBatch(8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ DataSourceExec │ 55 | │ -------------------- │ 56 | │ files: 1 │ 57 | │ format: parquet │ 58 | │ │ 59 | │ predicate: │ 60 | │ l_shipdate >= 1994-01-01 │ 61 | │ AND l_shipdate < 1995 │ 62 | │ -01-01 AND l_discount >= │ 63 | │ Some(5),15,2 AND │ 64 | │ l_discount <= │ 65 | │ Some(7),15,2 AND │ 66 | │ l_quantity < Some │ 67 | │ (2400),15,2 │ 68 | └───────────────────────────┘ 69 | -------------------------------------------------------------------------------- /src/common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-common" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | [dependencies] 11 | arrow-flight = { workspace = true } 12 | arrow-schema = { workspace = true } 13 | 
arrow = { workspace = true } 14 | prost = { workspace = true } 15 | bytes = { workspace = true } 16 | url = { workspace = true } 17 | serde = { workspace = true } 18 | 19 | [dev-dependencies] 20 | tempfile = { workspace = true } 21 | -------------------------------------------------------------------------------- /src/common/src/utils.rs: -------------------------------------------------------------------------------- 1 | use url::Url; 2 | 3 | /// Sanitize an object store URL for use as a directory name. 4 | pub fn sanitize_object_store_url_for_dirname(url: &Url) -> String { 5 | let mut parts = vec![url.scheme()]; 6 | 7 | if let Some(host) = url.host_str() { 8 | parts.push(host); 9 | } 10 | 11 | let dirname = parts.join("_"); 12 | 13 | dirname.replace(['/', ':', '?', '&', '=', '\\'], "_") 14 | } 15 | 16 | /// Sanitize a path for use as a directory name. 17 | pub fn sanitize_path_for_dirname(path: &str) -> String { 18 | path.replace(['/', ':', '?', '&', '=', '\\'], "_") 19 | } 20 | 21 | #[cfg(test)] 22 | mod tests { 23 | use super::*; 24 | use std::fs; 25 | use tempfile::TempDir; 26 | use url::Url; 27 | 28 | #[test] 29 | fn test_can_create_directories_with_sanitized_names() { 30 | // Create a temporary directory for testing 31 | let temp_dir = TempDir::new().expect("Failed to create temp directory"); 32 | 33 | // Array of problematic URLs to test 34 | let test_urls = [ 35 | "http://example.com/path/to/resource", 36 | "https://example.com?param1=value1&param2=value2", 37 | "s3://bucket-name/object/key", 38 | "https://user:password@example.com:8080/path?query=value#fragment", 39 | "file:///C:/Windows/System32/", 40 | "https://example.com/path/with/special?chars=%20%26%3F", 41 | "http://192.168.1.1:8080/admin?debug=true", 42 | "ftp://files.example.com/pub/file.txt", 43 | // Unicode characters in URL 44 | "https://例子.测试", 45 | // Very long URL 46 | &format!("https://example.com/{}", "a".repeat(200)), 47 | ]; 48 | 49 | // Test each URL 50 | for url_str in test_urls { 51 | let url = Url::parse(url_str).expect("Failed to parse URL"); 52 | let dirname = sanitize_object_store_url_for_dirname(&url); 53 | 54 | // Create a directory using the sanitized name 55 | let dir_path = temp_dir.path().join(dirname); 56 | fs::create_dir(&dir_path).expect("Failed to create directory"); 57 | 58 | // Verify the directory exists 59 | assert!(dir_path.exists()); 60 | assert!(dir_path.is_dir()); 61 | 62 | // Clean up 63 | fs::remove_dir(&dir_path).expect("Failed to remove test directory"); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/liquid_parquet/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-parquet" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | [dependencies] 11 | arrow = { workspace = true } 12 | arrow-schema = { workspace = true } 13 | parquet = { workspace = true } 14 | datafusion = { workspace = true } 15 | async-trait = { workspace = true } 16 | futures = { workspace = true } 17 | tokio = { workspace = true } 18 | url = { workspace = true } 19 | ahash = { workspace = true } 20 | fsst-rs = "0.5.2" 21 | serde = { workspace = true } 22 | bytes = { workspace = true } 23 | log = { workspace = true } 24 | object_store = { workspace = true } 25 | itertools = { workspace = true } 26 | fastlanes = 
"0.1.8" 27 | num-traits = "0.2.19" 28 | zerocopy = { version = "0.8.25", features = ["derive"] } 29 | liquid-cache-common = { workspace = true } 30 | fastrace = { workspace = true } 31 | fastrace-futures = { workspace = true } 32 | congee = { workspace = true } 33 | 34 | [dev-dependencies] 35 | tempfile = "3.20.0" 36 | criterion = "0.6.0" 37 | rand = "0.9.1" 38 | shuttle = "0.8.0" 39 | tracing-subscriber = "0.3.19" 40 | paste = "1.0.15" 41 | 42 | [features] 43 | shuttle = [] 44 | 45 | 46 | [[bench]] 47 | name = "bitpacking" 48 | path = "bench/bitpacking.rs" 49 | harness = false 50 | 51 | [[bench]] 52 | name = "liquid_float_array" 53 | harness = false 54 | path = "bench/liquid_float_array.rs" 55 | 56 | [[bench]] 57 | name = "boolean_and_then" 58 | harness = false 59 | path = "bench/boolean_and_then.rs" 60 | 61 | [[bench]] 62 | name = "bench_eviction" 63 | path = "bench/bench_eviction.rs" 64 | 65 | [[bench]] 66 | name = "fsstarray" 67 | path = "bench/fsstarray.rs" 68 | harness = false 69 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/bench_eviction.rs: -------------------------------------------------------------------------------- 1 | mod eviction_cache; 2 | use eviction_cache::{Cache, ClockCache, FifoCache, LfuCache, LruCache}; 3 | use std::collections::HashSet; 4 | use std::fs::File; 5 | use std::io::{BufRead, BufReader}; 6 | 7 | fn pack_u16s(a: u16, b: u16, c: u16) -> u64 { 8 | ((a as u64) << 32) | ((b as u64) << 16) | (c as u64) 9 | } 10 | 11 | fn bench(total_size: u64, create: impl Fn(u64) -> C, name: String) { 12 | let mut cache_size = total_size; 13 | 14 | while cache_size > 0 { 15 | let mut cache = create(cache_size); 16 | let file = File::open("./cache_trace.csv").expect("Failed to reopen cache_trace.csv"); 17 | let reader = BufReader::new(file); 18 | 19 | for line in reader.lines().skip(1) { 20 | let line = line.expect("Failed to read line"); 21 | let fields: Vec<&str> = line.split(',').collect(); 22 | if fields.len() == 6 { 23 | let file_id: u16 = fields[0].parse().expect("Failed to parse file_id"); 24 | let row_group: u16 = fields[1].parse().expect("Failed to parse row_group"); 25 | let col: u16 = fields[2].parse().expect("Failed to parse col"); 26 | let size: u64 = fields[4].parse().expect("Failed to parse size"); 27 | 28 | let key = pack_u16s(file_id, row_group, col); 29 | 30 | cache.get(key, size); 31 | } 32 | } 33 | 34 | let (hits, total) = cache.result(); 35 | println!("{},{},{},{}", name, cache_size, hits, total); 36 | 37 | cache_size /= 10; 38 | } 39 | } 40 | 41 | fn main() { 42 | // Read and parse the cache trace file 43 | let file = File::open("./cache_trace.csv").expect("Failed to open cache_trace.csv"); 44 | let reader = BufReader::new(file); 45 | 46 | let mut total_size: u64 = 0; 47 | let mut count = 0; 48 | let mut cols = HashSet::new(); 49 | 50 | for line in reader.lines().skip(1) { 51 | let line = line.expect("Failed to read line"); 52 | let fields: Vec<&str> = line.split(',').collect(); 53 | if fields.len() == 6 { 54 | let file_id: u16 = fields[0].parse().expect("Failed to parse file_id"); 55 | let row_group: u16 = fields[1].parse().expect("Failed to parse row_group"); 56 | let col: u16 = fields[2].parse().expect("Failed to parse col"); 57 | let size: u64 = fields[4].parse().expect("Failed to parse size"); 58 | 59 | let new = cols.insert(pack_u16s(file_id, row_group, col)); 60 | if new { 61 | total_size += size; 62 | } 63 | count += 1; 64 | } 65 | } 66 | 67 | println!("Read {} inserts, total size: {}", 
count, total_size); 68 | 69 | bench(total_size, LruCache::new, "LRU".to_string()); 70 | bench(total_size, ClockCache::new, "CLOCK".to_string()); 71 | bench(total_size, LfuCache::new, "LFU".to_string()); 72 | bench(total_size, FifoCache::new, "FIFO".to_string()); 73 | } 74 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/bitpacking.rs: -------------------------------------------------------------------------------- 1 | use criterion::Throughput; 2 | use criterion::*; 3 | 4 | use std::num::NonZero; 5 | 6 | use arrow::array::PrimitiveArray; 7 | use liquid_cache_parquet::liquid_array::raw::BitPackedArray; 8 | use rand::Rng; 9 | 10 | const MAX_BIT_WIDTH: u8 = 32; 11 | const MAX_ARRAY_MULTIPLIER: usize = 8; 12 | const BASE_ARRAY_SIZE: usize = 8192; 13 | 14 | // Function to create a random vector of u32 values with a given size and bit width 15 | fn create_random_vec(array_size: usize, bit_width: u8) -> Vec<u32> { 16 | let max_value = if bit_width >= 32 { u32::MAX } else { (1u32 << bit_width) - 1 }; // guard: `1u32 << 32` would overflow the shift 17 | let mut rng = rand::rng(); 18 | let values: Vec<u32> = (0..array_size) 19 | .map(|_| rng.random_range(0..=max_value)) 20 | .collect(); 21 | values 22 | } 23 | 24 | // Benchmark function to measure the performance of from_primitive 25 | fn from_primitive_benchmark(c: &mut Criterion) { 26 | use arrow::datatypes::UInt32Type; 27 | 28 | // `bit_widths` represents the range of bit widths to test (1 through MAX_BIT_WIDTH). 29 | // Each bit width determines the maximum value that can be represented in the random vector. 30 | // For example, a bit width of 8 allows values in the range [0, 255]. 31 | let bit_widths: Vec<u8> = (1..=MAX_BIT_WIDTH).step_by(4).collect(); 32 | for bit_width in bit_widths { 33 | // `array_sizes` represents the range of array sizes to test. 34 | // Each size is a multiple of BASE_ARRAY_SIZE (e.g., 8192, 16384, etc.)
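// With BASE_ARRAY_SIZE = 8192 and MAX_ARRAY_MULTIPLIER = 8, that is 8192, 16384, ..., 65536 elements.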
35 | let array_sizes: Vec<usize> = (1..=MAX_ARRAY_MULTIPLIER) 36 | .map(|i| BASE_ARRAY_SIZE * i) 37 | .collect(); 38 | for array_size in array_sizes { 39 | let values: Vec<u32> = create_random_vec(array_size, bit_width); 40 | 41 | // Convert the random vector into a PrimitiveArray 42 | let array = PrimitiveArray::<UInt32Type>::from(values); 43 | let bit_width = NonZero::new(bit_width).unwrap(); 44 | 45 | // Benchmark from_primitive() - the conversion from PrimitiveArray to BitPackedArray 46 | let mut group = c.benchmark_group(format!("from_primitive_bw_{}", bit_width)); 47 | group.throughput(Throughput::Bytes( 48 | (array_size * std::mem::size_of::<u32>()) as u64, 49 | )); 50 | group.bench_function(format!("size_{}", array_size), |b| { 51 | b.iter(|| { 52 | std::hint::black_box(BitPackedArray::from_primitive(array.clone(), bit_width)) 53 | }) 54 | }); 55 | group.finish(); 56 | } 57 | } 58 | } 59 | 60 | // Benchmark function to measure the performance of to_primitive 61 | fn to_primitive_benchmark(c: &mut Criterion) { 62 | use arrow::datatypes::UInt32Type; 63 | 64 | let bit_widths: Vec<u8> = (1..=MAX_BIT_WIDTH).collect(); 65 | for bit_width in bit_widths { 66 | let array_sizes: Vec<usize> = (1..=MAX_ARRAY_MULTIPLIER) 67 | .map(|i| BASE_ARRAY_SIZE * i) 68 | .collect(); 69 | for array_size in array_sizes { 70 | let values: Vec<u32> = create_random_vec(array_size, bit_width); 71 | 72 | // Convert the random vector into a PrimitiveArray 73 | let array = PrimitiveArray::<UInt32Type>::from(values); 74 | let bit_width = NonZero::new(bit_width).unwrap(); 75 | let bit_packed = BitPackedArray::from_primitive(array, bit_width); 76 | 77 | // Benchmark to_primitive() - the conversion from a BitPackedArray to PrimitiveArray 78 | let mut group = c.benchmark_group(format!("to_primitive_bw_{}", bit_width)); 79 | group.throughput(Throughput::Bytes( 80 | (array_size * std::mem::size_of::<u32>()) as u64, 81 | )); 82 | group.bench_function(format!("size_{}", array_size), |b| { 83 | b.iter(|| std::hint::black_box(bit_packed.to_primitive())) 84 | }); 85 | group.finish(); 86 | } 87 | } 88 | } 89 | 90 | criterion_group!(benches, from_primitive_benchmark, to_primitive_benchmark); 91 | 92 | // Entry point for Criterion benchmarking 93 | criterion_main!(benches); 94 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/boolean_and_then.rs: -------------------------------------------------------------------------------- 1 | use arrow::{array::BooleanBufferBuilder, buffer::BooleanBuffer}; 2 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 3 | use liquid_cache_parquet::boolean_buffer_and_then; 4 | 5 | use rand::Rng; 6 | 7 | const BUFFER_SIZE: usize = 8192 * 2; // 16384 8 | 9 | /// Generate a BooleanBuffer with specified selectivity (percentage of true bits) 10 | fn generate_boolean_buffer(size: usize, selectivity: f64) -> BooleanBuffer { 11 | let mut rng = rand::rng(); 12 | let mut builder = BooleanBufferBuilder::new(size); 13 | 14 | for _ in 0..size { 15 | let should_set = rng.random_bool(selectivity); 16 | builder.append(should_set); 17 | } 18 | 19 | builder.finish() 20 | } 21 | 22 | /// Generate a right BooleanBuffer that has exactly `count_set_bits` bits 23 | fn generate_right_boolean_buffer(count_set_bits: usize, selectivity: f64) -> BooleanBuffer { 24 | let mut rng = rand::rng(); 25 | let mut builder = BooleanBufferBuilder::new(count_set_bits); 26 | 27 | for _ in 0..count_set_bits { 28 | let should_set = rng.random_bool(selectivity); 29 | builder.append(should_set); 30 | } 31 | 32 | 
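// `boolean_buffer_and_then` applies the right-hand mask only to positions already selected by the left-hand one, so the right buffer needs exactly one bit per set bit of the left (an inference from how the benchmark below wires the two buffers together).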
builder.finish() 33 | } 34 | 35 | fn benchmark_boolean_and_then(c: &mut Criterion) { 36 | // Three selectivity levels: low (10%), medium (50%), high (90%) 37 | let selectivities = [0.1, 0.5, 0.9]; 38 | 39 | for left_selectivity in selectivities { 40 | for right_selectivity in selectivities { 41 | let group_name = format!( 42 | "boolean_and_then_left_{:.0}%_right_{:.0}%", 43 | left_selectivity * 100.0, 44 | right_selectivity * 100.0 45 | ); 46 | 47 | let mut group = c.benchmark_group(&group_name); 48 | 49 | // Set throughput based on the buffer size in bytes 50 | // Each boolean buffer uses approximately size/8 bytes 51 | group.throughput(Throughput::Bytes((BUFFER_SIZE / 8) as u64)); 52 | 53 | group.bench_function("size_16384", |b| { 54 | // Pre-generate test data 55 | let left = generate_boolean_buffer(BUFFER_SIZE, left_selectivity); 56 | let count_set_bits = left.count_set_bits(); 57 | let right = generate_right_boolean_buffer(count_set_bits, right_selectivity); 58 | 59 | b.iter(|| std::hint::black_box(boolean_buffer_and_then(&left, &right))) 60 | }); 61 | 62 | group.finish(); 63 | } 64 | } 65 | } 66 | 67 | criterion_group!(benches, benchmark_boolean_and_then); 68 | criterion_main!(benches); 69 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/fsstarray.rs: -------------------------------------------------------------------------------- 1 | use criterion::*; 2 | use std::sync::Arc; 3 | use std::time::Duration; 4 | 5 | extern crate arrow; 6 | 7 | use arrow::{ 8 | array::{Array, StringArray, StringBuilder}, 9 | datatypes::Utf8Type, 10 | }; 11 | use liquid_cache_parquet::liquid_array::raw::FsstArray; 12 | use std::fs; 13 | 14 | const CHUNK_SIZE: [usize; 5] = [12, 32, 64, 128, 256]; 15 | 16 | fn create_string_arrays_from_file() -> Vec<(usize, StringArray)> { 17 | const TEST_FILE_PATH: &str = "../../README.md"; 18 | const LICENSE_FILE_PATH: &str = "../../LICENSE"; 19 | 20 | let readme = fs::read_to_string(TEST_FILE_PATH).expect("Failed to read file"); 21 | let license = fs::read_to_string(LICENSE_FILE_PATH).expect("Failed to read file"); 22 | let content = format!("{}\n\n{}", readme, license); 23 | 24 | let mut result = Vec::new(); 25 | 26 | let chars: Vec<char> = content.chars().collect(); 27 | 28 | for &chunk_size in &CHUNK_SIZE { 29 | let mut builder = StringBuilder::new(); 30 | for chunk in chars.chunks(chunk_size) { 31 | let chunk_str: String = chunk.iter().collect(); 32 | builder.append_value(chunk_str); 33 | } 34 | result.push((chunk_size, builder.finish())); 35 | } 36 | 37 | result 38 | } 39 | 40 | // Benchmark for training the FSST compressor 41 | fn compressor_benchmark(c: &mut Criterion) { 42 | let string_arrays = create_string_arrays_from_file(); 43 | 44 | let mut group = c.benchmark_group("fsst"); 45 | for (chunk_size, string_array) in string_arrays { 46 | let total_size = chunk_size * string_array.len(); 47 | // Set the measurement time for the benchmark 48 | group.measurement_time(Duration::new(10, 0)); 49 | 50 | // Set the throughput for the benchmark 51 | group.throughput(Throughput::Bytes(total_size as u64)); 52 | 53 | // Benchmark the FSST compressor training 54 | group.bench_function( 55 | format!("train_compressor - chunk_size: {}", chunk_size), 56 | |b| { 57 | b.iter(|| { 58 | let input = std::hint::black_box( 59 | string_array.iter().flat_map(|s| s.map(|a| a.as_bytes())), 60 | ); 61 | FsstArray::train_compressor(input) 62 | }); 63 | }, 64 | ); 65 | } 66 | group.finish(); 67 | } 68 | 69 | // Benchmark for creating an 
FSST array from a byte array using a pre-trained compressor 70 | fn from_byte_array_with_compressor_benchmark(c: &mut Criterion) { 71 | let string_arrays = create_string_arrays_from_file(); 72 | 73 | let mut group = c.benchmark_group("fsst"); 74 | for (chunk_size, string_array) in string_arrays { 75 | // Train the FSST compressor 76 | let compressor = 77 | FsstArray::train_compressor(string_array.iter().flat_map(|s| s.map(|s| s.as_bytes()))); 78 | 79 | let compressed = 80 | FsstArray::from_byte_array_with_compressor(&string_array, Arc::new(compressor.clone())); 81 | let compressed_size = compressed.get_array_memory_size(); 82 | let uncompressed_size = chunk_size * string_array.len(); 83 | println!( 84 | "compressed_size: {}, uncompressed_size: {}, compression_ratio: {}", 85 | compressed_size, 86 | uncompressed_size, 87 | compressed_size as f64 / uncompressed_size as f64 88 | ); 89 | 90 | // Set the throughput for the benchmark 91 | group.throughput(Throughput::Bytes(uncompressed_size as u64)); 92 | 93 | // Benchmark the creation of an FSST array from a byte array 94 | group.bench_function(format!("compress - chunk_size: {}", chunk_size), |b| { 95 | b.iter(|| { 96 | std::hint::black_box(FsstArray::from_byte_array_with_compressor( 97 | &string_array, 98 | Arc::new(compressor.clone()), 99 | )) 100 | }); 101 | }); 102 | } 103 | group.finish(); 104 | } 105 | 106 | // Benchmark for converting an FSST array to an Arrow byte array 107 | fn to_arrow_byte_array_benchmark(c: &mut Criterion) { 108 | let string_arrays = create_string_arrays_from_file(); 109 | 110 | let mut group = c.benchmark_group("fsst"); 111 | for (chunk_size, string_array) in string_arrays { 112 | // Train the FSST compressor 113 | let compressor = 114 | FsstArray::train_compressor(string_array.iter().flat_map(|s| s.map(|s| s.as_bytes()))); 115 | 116 | // Create an FSST array using the trained compressor 117 | let fsst_values = 118 | FsstArray::from_byte_array_with_compressor(&string_array, Arc::new(compressor)); 119 | 120 | let total_size = chunk_size * string_array.len(); 121 | 122 | // Set the throughput for the benchmark 123 | group.throughput(Throughput::Bytes(total_size as u64)); 124 | 125 | // Benchmark the conversion of FSST array to Arrow byte array 126 | group.bench_function(format!("decompress - chunk_size: {}", chunk_size), |b| { 127 | b.iter(|| std::hint::black_box(fsst_values.to_arrow_byte_array::<Utf8Type>())); 128 | }); 129 | } 130 | group.finish(); 131 | } 132 | 133 | // Define the benchmark group 134 | criterion_group!( 135 | benches, 136 | compressor_benchmark, 137 | from_byte_array_with_compressor_benchmark, 138 | to_arrow_byte_array_benchmark 139 | ); 140 | 141 | // Entry point for Criterion benchmarking 142 | criterion_main!(benches); 143 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/liquid_float_array.rs: -------------------------------------------------------------------------------- 1 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 2 | use datafusion::arrow::{ 3 | array::PrimitiveArray, 4 | buffer::ScalarBuffer, 5 | datatypes::{Float32Type, Float64Type}, 6 | }; 7 | use liquid_cache_parquet::liquid_array::{LiquidArray, LiquidFloatArray}; 8 | use rand::Rng; 9 | 10 | fn criterion_benchmark(c: &mut Criterion) { 11 | // Encoding benchmarks for float32 12 | let bench_sizes = [8192, 16384, 24576]; 13 | for size in bench_sizes { 14 | let mut group = c.benchmark_group("float32_liquid_encode"); 15 | 
group.throughput(Throughput::Bytes( 16 | (size * std::mem::size_of::<f32>()) as u64, 17 | )); 18 | group.bench_function(format!("size_{}", size), |b| { 19 | let mut rng = rand::rng(); 20 | let mut array: Vec<f32> = vec![]; 21 | for _ in 0..size { 22 | array.push(rng.random_range(-1.3e3..1.3e3)); 23 | } 24 | let arrow_array = PrimitiveArray::new(ScalarBuffer::from(array), None); 25 | b.iter(|| { 26 | let _x = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array.clone()); 27 | }) 28 | }); 29 | group.finish(); 30 | } 31 | 32 | for size in bench_sizes { 33 | let mut group = c.benchmark_group("float64_liquid_encode"); 34 | group.throughput(Throughput::Bytes( 35 | (size * std::mem::size_of::<f64>()) as u64, 36 | )); 37 | group.bench_function(format!("size_{}", size), |b| { 38 | let mut rng = rand::rng(); 39 | let mut array: Vec<f64> = vec![]; 40 | for _ in 0..size { 41 | array.push(rng.random_range(-1.3e3..1.3e3)); 42 | } 43 | let arrow_array = PrimitiveArray::new(ScalarBuffer::from(array), None); 44 | b.iter(|| { 45 | let _x = LiquidFloatArray::<Float64Type>::from_arrow_array(arrow_array.clone()); 46 | }) 47 | }); 48 | group.finish(); 49 | } 50 | 51 | // Decoding benchmarks for float32 52 | for size in bench_sizes { 53 | let mut rng = rand::rng(); 54 | let mut array: Vec<f32> = vec![]; 55 | for _ in 0..size { 56 | array.push(rng.random_range(-1.3e3..1.3e3)); 57 | } 58 | let arrow_array = PrimitiveArray::<Float32Type>::new(ScalarBuffer::from(array), None); 59 | let liquid_array = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array); 60 | 61 | let mut group = c.benchmark_group("float32_liquid_decode"); 62 | group.throughput(Throughput::Bytes( 63 | (size * std::mem::size_of::<f32>()) as u64, 64 | )); 65 | group.bench_function(format!("size_{}", size), |b| { 66 | b.iter(|| { 67 | let _x = liquid_array.to_arrow_array(); 68 | }) 69 | }); 70 | group.finish(); 71 | } 72 | 73 | // Decoding benchmarks for float64 74 | for size in bench_sizes { 75 | let mut rng = rand::rng(); 76 | let mut array: Vec<f64> = vec![]; 77 | for _ in 0..size { 78 | array.push(rng.random_range(-1.3e3..1.3e3)); 79 | } 80 | let arrow_array = PrimitiveArray::<Float64Type>::new(ScalarBuffer::from(array), None); 81 | let liquid_array = LiquidFloatArray::<Float64Type>::from_arrow_array(arrow_array); 82 | 83 | let mut group = c.benchmark_group("float64_liquid_decode"); 84 | group.throughput(Throughput::Bytes( 85 | (size * std::mem::size_of::<f64>()) as u64, 86 | )); 87 | group.bench_function(format!("size_{}", size), |b| { 88 | b.iter(|| { 89 | let _x = liquid_array.to_arrow_array(); 90 | }) 91 | }); 92 | group.finish(); 93 | } 94 | } 95 | 96 | criterion_group!(benches, criterion_benchmark); 97 | criterion_main!(benches); 98 | -------------------------------------------------------------------------------- /src/liquid_parquet/clippy.toml: -------------------------------------------------------------------------------- 1 | disallowed-methods = [] 2 | 3 | disallowed-types = [ 4 | { path = "dashmap::DashMap", reason = "DashMap can easily lead to deadlocks, use RwLock with shuttle tests instead" }, 5 | ] 6 | 7 | # Lowering the threshold to help prevent stack overflows (default is 16384) 8 | # See: https://rust-lang.github.io/rust-clippy/master/index.html#/large_futures 9 | future-size-threshold = 10000 10 | too-many-lines-threshold = 50 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/cache/budget.rs: -------------------------------------------------------------------------------- 1 | use crate::sync::atomic::{AtomicUsize, Ordering}; 2 | 3 | use 
log::warn; 4 | 5 | #[derive(Debug)] 6 | pub(super) struct BudgetAccounting { 7 | max_memory_bytes: usize, 8 | used_memory_bytes: AtomicUsize, 9 | used_disk_bytes: AtomicUsize, 10 | } 11 | 12 | impl BudgetAccounting { 13 | pub(super) fn new(max_memory_bytes: usize) -> Self { 14 | Self { 15 | max_memory_bytes, 16 | used_memory_bytes: AtomicUsize::new(0), 17 | used_disk_bytes: AtomicUsize::new(0), 18 | } 19 | } 20 | 21 | pub(super) fn reset_usage(&self) { 22 | self.used_memory_bytes.store(0, Ordering::Relaxed); 23 | self.used_disk_bytes.store(0, Ordering::Relaxed); 24 | } 25 | 26 | /// Try to reserve space in the cache. 27 | /// Returns `Ok(())` if the space was reserved, `Err(())` if the request would exceed the memory budget. 28 | pub(super) fn try_reserve_memory(&self, request_bytes: usize) -> Result<(), ()> { 29 | let used = self.used_memory_bytes.load(Ordering::Relaxed); 30 | if used + request_bytes > self.max_memory_bytes { 31 | return Err(()); 32 | } 33 | 34 | match self.used_memory_bytes.compare_exchange( 35 | used, 36 | used + request_bytes, 37 | Ordering::Relaxed, 38 | Ordering::Relaxed, 39 | ) { 40 | Ok(_) => Ok(()), 41 | Err(_) => self.try_reserve_memory(request_bytes), // lost the CAS race: retry with a fresh load 42 | } 43 | } 44 | 45 | /// Adjust the cache size after transcoding. 46 | /// Returns `Ok(())` if the usage was updated, `Err(())` if growing from `old_size` to `new_size` would exceed the memory budget. 47 | pub(super) fn try_update_memory_usage( 48 | &self, 49 | old_size: usize, 50 | new_size: usize, 51 | ) -> Result<(), ()> { 52 | if old_size < new_size { 53 | let diff = new_size - old_size; 54 | if diff > 1024 * 1024 { 55 | warn!( 56 | "Transcoding increased the size of the array by at least 1MB, previous size: {old_size}, new size: {new_size}, double check this is correct" 57 | ); 58 | } 59 | 60 | self.try_reserve_memory(diff)?; 61 | Ok(()) 62 | } else { 63 | self.used_memory_bytes 64 | .fetch_sub(old_size - new_size, Ordering::Relaxed); 65 | Ok(()) 66 | } 67 | } 68 | 69 | pub fn memory_usage_bytes(&self) -> usize { 70 | self.used_memory_bytes.load(Ordering::Relaxed) 71 | } 72 | 73 | pub fn disk_usage_bytes(&self) -> usize { 74 | self.used_disk_bytes.load(Ordering::Relaxed) 75 | } 76 | 77 | pub fn add_used_disk_bytes(&self, bytes: usize) { 78 | self.used_disk_bytes.fetch_add(bytes, Ordering::Relaxed); 79 | } 80 | } 81 | 82 | #[cfg(test)] 83 | mod tests { 84 | use super::*; 85 | use crate::sync::{Arc, Barrier, thread}; 86 | 87 | #[test] 88 | fn test_memory_reservation_and_accounting() { 89 | let config = BudgetAccounting::new(1000); 90 | 91 | assert_eq!(config.memory_usage_bytes(), 0); 92 | 93 | assert!(config.try_reserve_memory(500).is_ok()); 94 | assert_eq!(config.memory_usage_bytes(), 500); 95 | 96 | assert!(config.try_reserve_memory(300).is_ok()); 97 | assert_eq!(config.memory_usage_bytes(), 800); 98 | 99 | assert!(config.try_reserve_memory(300).is_err()); 100 | assert_eq!(config.memory_usage_bytes(), 800); 101 | 102 | config.reset_usage(); 103 | assert_eq!(config.memory_usage_bytes(), 0); 104 | } 105 | 106 | #[test] 107 | fn test_concurrent_memory_operations() { 108 | test_concurrent_memory_budget(); 109 | } 110 | 111 | #[cfg(feature = "shuttle")] 112 | #[test] 113 | fn shuttle_memory_budget_operations() { 114 | crate::utils::shuttle_test(test_concurrent_memory_budget); 115 | } 116 | 117 | fn test_concurrent_memory_budget() { 118 | let num_threads = 3; 119 | let max_memory = 10000; 120 | let operations_per_thread = 100; 121 | 122 | let budget = Arc::new(BudgetAccounting::new(max_memory)); 123 | let barrier = Arc::new(Barrier::new(num_threads)); 124 | 125 | 
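// Workers race reservations and size adjustments against the shared budget; each thread returns its surviving reservations so the main thread can recompute the expected final usage.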
let mut thread_handles = vec![]; 126 | 127 | for _ in 0..num_threads { 128 | let budget_clone = budget.clone(); 129 | let barrier_clone = barrier.clone(); 130 | 131 | let handle = thread::spawn(move || { 132 | let mut successful_reservations = Vec::new(); 133 | 134 | barrier_clone.wait(); 135 | 136 | for i in 0..operations_per_thread { 137 | let reserve_size = 10 + (i % 20) * 5; // 10 to 105 bytes 138 | if budget_clone.try_reserve_memory(reserve_size).is_ok() { 139 | successful_reservations.push(reserve_size); 140 | } 141 | 142 | if i % 5 == 0 && !successful_reservations.is_empty() { 143 | let idx = i % successful_reservations.len(); 144 | let old_size = successful_reservations[idx]; 145 | let new_size = if i % 2 == 0 { 146 | old_size + 5 // Grow 147 | } else { 148 | old_size.saturating_sub(5) // Shrink 149 | }; 150 | 151 | if budget_clone 152 | .try_update_memory_usage(old_size, new_size) 153 | .is_ok() 154 | { 155 | successful_reservations[idx] = new_size; 156 | } 157 | } 158 | } 159 | successful_reservations 160 | }); 161 | 162 | thread_handles.push(handle); 163 | } 164 | 165 | let mut expected_memory_usage = 0; 166 | for handle in thread_handles { 167 | let reservations = handle.join().unwrap(); 168 | for size in reservations { 169 | expected_memory_usage += size; 170 | } 171 | } 172 | 173 | assert_eq!(budget.memory_usage_bytes(), expected_memory_usage); 174 | assert!(budget.memory_usage_bytes() <= max_memory); 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![cfg_attr(not(doctest), doc = include_str!(concat!("../", std::env!("CARGO_PKG_README"))))] 3 | 4 | mod cache; 5 | pub mod liquid_array; 6 | mod reader; 7 | mod sync; 8 | pub use cache::policies; 9 | pub use cache::{LiquidCache, LiquidCacheRef, LiquidCachedFileRef}; 10 | pub use reader::LiquidParquetSource; 11 | pub use reader::LiquidPredicate; 12 | pub(crate) mod utils; 13 | pub use utils::boolean_buffer_and_then; 14 | 15 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)] 16 | #[allow(unused)] 17 | enum AblationStudyMode { 18 | FullDecoding = 0, 19 | SelectiveDecoding = 1, 20 | SelectiveWithLateMaterialization = 2, 21 | EvaluateOnEncodedData = 3, 22 | EvaluateOnPartialEncodedData = 4, 23 | } 24 | 25 | // This is deliberately made const to avoid the overhead of runtime branching. 26 | const ABLATION_STUDY_MODE: AblationStudyMode = AblationStudyMode::EvaluateOnPartialEncodedData; 27 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/mod.rs: -------------------------------------------------------------------------------- 1 | //! LiquidArray is the core data structure of LiquidCache. 2 | //! You should not use this module directly. 3 | //! Instead, use `liquid_cache_server` or `liquid_cache_client` to interact with LiquidCache. 
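//!
//! A minimal encode/decode sketch, patterned on this repo's float benchmark (illustrative only):
//!
//! ```ignore
//! use arrow::{array::{Array, PrimitiveArray}, buffer::ScalarBuffer, datatypes::Float32Type};
//! use liquid_cache_parquet::liquid_array::{LiquidArray, LiquidFloatArray};
//!
//! // Encode an Arrow float array into its Liquid representation...
//! let arrow_array = PrimitiveArray::<Float32Type>::new(ScalarBuffer::from(vec![1.0f32, 2.0, 3.0]), None);
//! let liquid = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array);
//! // ...and decode it back to an Arrow ArrayRef.
//! let roundtrip = liquid.to_arrow_array();
//! assert_eq!(roundtrip.len(), 3);
//! ```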
4 | mod byte_array; 5 | mod fix_len_byte_array; 6 | mod float_array; 7 | pub(crate) mod ipc; 8 | mod primitive_array; 9 | pub mod raw; 10 | pub(crate) mod utils; 11 | 12 | use std::{any::Any, num::NonZero, sync::Arc}; 13 | 14 | use arrow::array::{ArrayRef, BooleanArray}; 15 | pub use byte_array::LiquidByteArray; 16 | pub use fix_len_byte_array::LiquidFixedLenByteArray; 17 | use float_array::LiquidFloatType; 18 | pub use float_array::{LiquidFloat32Array, LiquidFloat64Array, LiquidFloatArray}; 19 | pub use primitive_array::{ 20 | LiquidDate32Array, LiquidDate64Array, LiquidI8Array, LiquidI16Array, LiquidI32Array, 21 | LiquidI64Array, LiquidPrimitiveArray, LiquidPrimitiveType, LiquidU8Array, LiquidU16Array, 22 | LiquidU32Array, LiquidU64Array, 23 | }; 24 | 25 | /// Liquid data type is only logical type 26 | #[derive(Debug, Clone, Copy)] 27 | #[repr(u16)] 28 | pub enum LiquidDataType { 29 | /// A byte array. 30 | ByteArray = 0, 31 | /// An integer. 32 | Integer = 1, 33 | /// A float. 34 | Float = 2, 35 | /// A fixed length byte array. 36 | FixedLenByteArray = 3, 37 | } 38 | 39 | impl From<u16> for LiquidDataType { 40 | fn from(value: u16) -> Self { 41 | match value { 42 | 0 => LiquidDataType::ByteArray, 43 | 1 => LiquidDataType::Integer, 44 | 2 => LiquidDataType::Float, 45 | 3 => LiquidDataType::FixedLenByteArray, 46 | _ => panic!("Invalid liquid data type: {value}"), 47 | } 48 | } 49 | } 50 | 51 | /// A trait to access the underlying Liquid array. 52 | pub trait AsLiquidArray { 53 | /// Get the underlying string array. 54 | fn as_string_array_opt(&self) -> Option<&LiquidByteArray>; 55 | 56 | /// Get the underlying string array. 57 | fn as_string(&self) -> &LiquidByteArray { 58 | self.as_string_array_opt().expect("liquid string array") 59 | } 60 | 61 | /// Get the underlying binary array. 62 | fn as_binary_array_opt(&self) -> Option<&LiquidByteArray>; 63 | 64 | /// Get the underlying binary array. 65 | fn as_binary(&self) -> &LiquidByteArray { 66 | self.as_binary_array_opt().expect("liquid binary array") 67 | } 68 | 69 | /// Get the underlying primitive array. 70 | fn as_primitive_array_opt<T: LiquidPrimitiveType>(&self) -> Option<&LiquidPrimitiveArray<T>>; 71 | 72 | /// Get the underlying primitive array. 73 | fn as_primitive<T: LiquidPrimitiveType>(&self) -> &LiquidPrimitiveArray<T> { 74 | self.as_primitive_array_opt() 75 | .expect("liquid primitive array") 76 | } 77 | 78 | /// Get the underlying float array. 79 | fn as_float_array_opt<T: LiquidFloatType>(&self) -> Option<&LiquidFloatArray<T>>; 80 | 81 | /// Get the underlying float array. 82 | fn as_float<T: LiquidFloatType>(&self) -> &LiquidFloatArray<T> { 83 | self.as_float_array_opt().expect("liquid float array") 84 | } 85 | } 86 | 87 | impl AsLiquidArray for dyn LiquidArray + '_ { 88 | fn as_string_array_opt(&self) -> Option<&LiquidByteArray> { 89 | self.as_any().downcast_ref() 90 | } 91 | 92 | fn as_primitive_array_opt<T: LiquidPrimitiveType>(&self) -> Option<&LiquidPrimitiveArray<T>> { 93 | self.as_any().downcast_ref() 94 | } 95 | 96 | fn as_binary_array_opt(&self) -> Option<&LiquidByteArray> { 97 | self.as_any().downcast_ref() 98 | } 99 | 100 | fn as_float_array_opt<T: LiquidFloatType>(&self) -> Option<&LiquidFloatArray<T>> { 101 | self.as_any().downcast_ref() 102 | } 103 | } 104 | 105 | /// A Liquid array. 106 | pub trait LiquidArray: std::fmt::Debug + Send + Sync { 107 | /// Get the underlying any type. 108 | fn as_any(&self) -> &dyn Any; 109 | 110 | /// Get the memory size of the Liquid array. 111 | fn get_array_memory_size(&self) -> usize; 112 | 113 | /// Get the length of the Liquid array. 114 | fn len(&self) -> usize; 115 | 116 | /// Check if the Liquid array is empty. 
105 | /// A Liquid array. 106 | pub trait LiquidArray: std::fmt::Debug + Send + Sync { 107 | /// Get the underlying any type. 108 | fn as_any(&self) -> &dyn Any; 109 | 110 | /// Get the memory size of the Liquid array. 111 | fn get_array_memory_size(&self) -> usize; 112 | 113 | /// Get the length of the Liquid array. 114 | fn len(&self) -> usize; 115 | 116 | /// Check if the Liquid array is empty. 117 | fn is_empty(&self) -> bool { 118 | self.len() == 0 119 | } 120 | 121 | /// Convert the Liquid array to an Arrow array. 122 | fn to_arrow_array(&self) -> ArrayRef; 123 | 124 | /// Convert the Liquid array to an Arrow array, 125 | /// except that it picks the best encoding for the resulting Arrow array, 126 | /// so the result may not obey the data type of the original Arrow array. 127 | fn to_best_arrow_array(&self) -> ArrayRef { 128 | self.to_arrow_array() 129 | } 130 | 131 | /// Get the logical data type of the Liquid array. 132 | fn data_type(&self) -> LiquidDataType; 133 | 134 | /// Serialize the Liquid array to a byte array. 135 | fn to_bytes(&self) -> Vec<u8>; 136 | 137 | /// Filter the Liquid array with a boolean array. 138 | fn filter(&self, selection: &BooleanArray) -> LiquidArrayRef; 139 | } 140 | 141 | /// A reference to a Liquid array. 142 | pub type LiquidArrayRef = Arc<dyn LiquidArray>; 143 | 144 | pub(crate) fn get_bit_width(max_value: u64) -> NonZero<u8> { 145 | if max_value <= 1 { 146 | // todo: here we actually should return 0, as we should just use constant encoding. 147 | // but that's not implemented yet. 148 | NonZero::new(1).unwrap() 149 | } else { 150 | NonZero::new(64 - max_value.leading_zeros() as u8).unwrap() 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/raw/mod.rs: -------------------------------------------------------------------------------- 1 | //! Low-level array primitives. 2 | //! You should not use this module directly. 3 | //! Instead, use `liquid_cache_server` or `liquid_cache_client` to interact with LiquidCache. 4 | pub(super) mod bit_pack_array; 5 | pub(super) mod fsst_array; 6 | pub use bit_pack_array::BitPackedArray; 7 | pub use fsst_array::FsstArray; 8 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/utils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub(crate) fn gen_test_decimal_array<T: arrow::datatypes::DecimalType>( 3 | data_type: arrow_schema::DataType, 4 | ) -> arrow::array::PrimitiveArray<T> { 5 | use arrow::{ 6 | array::{AsArray, Int64Builder}, 7 | compute::kernels::cast, 8 | }; 9 | 10 | let mut builder = Int64Builder::new(); 11 | for i in 0..4096i64 { 12 | if i % 97 == 0 { 13 | builder.append_null(); 14 | } else { 15 | let value = if i % 5 == 0 { 16 | i * 1000 + 123 17 | } else if i % 3 == 0 { 18 | 42 19 | } else if i % 7 == 0 { 20 | i * 1_000_000 + 456789 21 | } else { 22 | i * 100 + 42 23 | }; 24 | builder.append_value(value); 25 | } 26 | } 27 | let array = builder.finish(); 28 | cast(&array, &data_type) 29 | .unwrap() 30 | .as_primitive::<T>() 31 | .clone() 32 | } 33 |
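// Illustrative usage of the helper above (not part of the original test
// suite); `Decimal128(18, 2)` comfortably holds the largest generated value:
#[cfg(test)]
mod gen_test_decimal_array_example {
    #[test]
    fn builds_a_decimal128_array() {
        use arrow::array::Array;
        use arrow::datatypes::{DataType, Decimal128Type};
        let array =
            super::gen_test_decimal_array::<Decimal128Type>(DataType::Decimal128(18, 2));
        assert_eq!(array.len(), 4096);
    }
}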
-------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/mod.rs: -------------------------------------------------------------------------------- 1 | /// Everything that happens during query planning time. 2 | mod plantime; 3 | 4 | /// Everything that happens during query execution time. 5 | mod runtime; 6 | 7 | mod utils; 8 | 9 | pub use plantime::LiquidParquetSource; 10 | pub use runtime::LiquidPredicate; 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/plantime/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub(crate) use source::CachedMetaReaderFactory; 3 | pub use source::LiquidParquetSource; 4 | pub(crate) use source::ParquetMetadataCacheReader; 5 | 6 | // This is copied verbatim from DataFusion. 7 | // We should ask DataFusion to make this public. 8 | mod row_filter; 9 | 10 | // This is copied verbatim from DataFusion. 11 | // We should ask DataFusion to make this public. 12 | mod opener; 13 | mod row_group_filter; 14 | mod source; 15 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/runtime/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::liquid_array::LiquidArrayRef; 2 | use arrow::array::{BooleanArray, RecordBatch}; 3 | use arrow_schema::ArrowError; 4 | use in_memory_rg::InMemoryRowGroup; 5 | use parquet::arrow::arrow_reader::ArrowPredicate; 6 | pub(crate) use parquet_bridge::ArrowReaderBuilderBridge; 7 | use parquet_bridge::get_predicate_column_id; 8 | 9 | mod in_memory_rg; 10 | mod liquid_stream; 11 | mod parquet_bridge; 12 | mod reader; 13 | mod utils; 14 | 15 | /// A predicate that can be evaluated on a liquid array. 16 | pub trait LiquidPredicate: ArrowPredicate { 17 | /// Evaluates the predicate on a liquid array. 18 | /// Returns `None` if the predicate is not applicable to the array. 19 | fn evaluate_liquid( 20 | &mut self, 21 | array: &LiquidArrayRef, 22 | ) -> Result<Option<BooleanArray>, ArrowError>; 23 | 24 | /// Evaluates the predicate on an arrow record batch. 25 | fn evaluate_arrow(&mut self, array: RecordBatch) -> Result<BooleanArray, ArrowError> { 26 | self.evaluate(array) 27 | } 28 | 29 | /// Returns the column ids of the predicate. 30 | fn predicate_column_ids(&self) -> Vec<usize> { 31 | let projection = self.projection(); 32 | get_predicate_column_id(projection) 33 | } 34 | } 35 | 36 | pub struct LiquidRowFilter { 37 | pub(crate) predicates: Vec<Box<dyn LiquidPredicate>>, 38 | } 39 | 40 | impl LiquidRowFilter { 41 | pub fn new(predicates: Vec<Box<dyn LiquidPredicate>>) -> Self { 42 | Self { predicates } 43 | } 44 | } 45 |
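// Illustrative only: a minimal `LiquidPredicate` implementation. `AlwaysTrue`
// is hypothetical and passes every row; the real predicates are built at plan
// time and evaluate on the encoded data instead:
#[allow(dead_code)]
struct AlwaysTrue {
    projection: parquet::arrow::ProjectionMask,
}

impl ArrowPredicate for AlwaysTrue {
    fn projection(&self) -> &parquet::arrow::ProjectionMask {
        &self.projection
    }

    fn evaluate(&mut self, batch: RecordBatch) -> Result<BooleanArray, ArrowError> {
        Ok(BooleanArray::from(vec![true; batch.num_rows()]))
    }
}

impl LiquidPredicate for AlwaysTrue {
    fn evaluate_liquid(
        &mut self,
        array: &LiquidArrayRef,
    ) -> Result<Option<BooleanArray>, ArrowError> {
        // `Some(..)` signals that the predicate could be evaluated on the liquid array.
        Ok(Some(BooleanArray::from(vec![true; array.len()])))
    }
}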
-------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/runtime/reader/mod.rs: -------------------------------------------------------------------------------- 1 | use super::LiquidRowFilter; 2 | 3 | mod cached_array_reader; 4 | mod liquid_batch_reader; 5 | pub(crate) use cached_array_reader::build_cached_array_reader; 6 | pub(crate) use liquid_batch_reader::LiquidBatchReader; 7 | pub(super) mod cached_page; 8 | 9 | #[cfg(test)] 10 | mod tests; 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod boolean_selection; 2 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/sync.rs: -------------------------------------------------------------------------------- 1 | #[cfg(all(feature = "shuttle", test))] 2 | #[allow(unused_imports)] 3 | pub use shuttle::{sync::*, thread}; 4 | #[cfg(not(all(feature = "shuttle", test)))] 5 | #[allow(unused_imports)] 6 | pub use std::{sync::*, thread}; 7 | -------------------------------------------------------------------------------- /src/server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-server" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | 11 | [dependencies] 12 | datafusion = { workspace = true } 13 | datafusion-proto = { workspace = true } 14 | futures = { workspace = true } 15 | arrow = { workspace = true } 16 | arrow-flight = { workspace = true } 17 | arrow-schema = { workspace = true } 18 | log = { workspace = true } 19 | prost = { workspace = true } 20 | tonic = { workspace = true } 21 | tokio = { workspace = true } 22 | url = { workspace = true } 23 | liquid-cache-parquet = { workspace = true } 24 | object_store = { workspace = true, features = ["aws", "http"] } 25 | liquid-cache-common = { workspace = true } 26 | async-trait = { workspace = true } 27 | bytes = { workspace = true } 28 | async-stream = "0.3" 29 | tempfile = { workspace = true } 30 | axum = "0.8.4" 31 | serde = { workspace = true } 32 | serde_json = { workspace = true } 33 | tower-http = { version = "0.6.4", features = ["cors"] } 34 | sysinfo = { version = "0.35.1", default-features = false, features = [ 35 | "component", 36 | "disk", 37 | "network", 38 | "system", 39 | "user", 40 | ] } 41 | uuid = { workspace = true } 42 | fastrace = { workspace = true } 43 | fastrace-futures = { workspace = true } 44 | pprof = { version = "0.14.0", features = ["flamegraph"] } 45 | anyhow = "1.0" 46 | 47 | [dev-dependencies] 48 | liquid-cache-client = { workspace = true } 49 | insta = { version = "1.43.1" } 50 | parquet = { workspace = true } 51 | -------------------------------------------------------------------------------- /src/server/src/admin_server/flamegraph.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Mutex; 2 | 3 | use pprof::ProfilerGuard; 4 | 5 | pub(super) struct FlameGraph { 6 | guard: Mutex<Option<ProfilerGuard<'static>>>, 7 | } 8 | 9 | impl FlameGraph { 10 | pub fn new() -> Self { 11 | Self { 12 | guard: Mutex::new(None), 13 | } 14 | } 15 | 16 | pub fn start(&self) { 17 | let mut guard = self.guard.lock().unwrap(); 18 | let old = guard.take(); 19 | assert!(old.is_none(), "FlameGraph is already started"); 20 | *guard = Some( 21 | pprof::ProfilerGuardBuilder::default() 22 | .frequency(500) 23 | .blocklist(&["libpthread.so.0", "libm.so.6", "libgcc_s.so.1"]) 24 | .build() 25 | .unwrap(), 26 | ); 27 | } 28 | 29 | pub fn stop_to_string(&self) -> anyhow::Result<String> { 30 | let mut guard = self.guard.lock().unwrap(); 31 | let old = guard.take(); 32 | if old.is_none() { 33 | return Err(anyhow::anyhow!("FlameGraph is not started")); 34 | } 35 | let profiler = old.unwrap(); 36 | drop(guard); 37 | 38 | let report = profiler.report().build()?; 39 | let mut svg_data = Vec::new(); 40 | report.flamegraph(&mut svg_data)?; 41 | Ok(String::from_utf8(svg_data)?) 42 | } 43 | }
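// Illustrative only: how an admin handler might drive the profiler
// (`workload` stands in for whatever the server is doing):
#[allow(dead_code)]
fn profile_to_svg<F: FnOnce()>(workload: F) -> anyhow::Result<String> {
    let flamegraph = FlameGraph::new();
    flamegraph.start(); // begins sampling at 500 Hz
    workload();
    flamegraph.stop_to_string() // renders the samples as a flamegraph SVG
}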
-------------------------------------------------------------------------------- /src/server/src/admin_server/mod.rs: -------------------------------------------------------------------------------- 1 | //! Admin server for the liquid cache server. 2 | //! 3 | //! This server is used to manage and inspect the liquid cache server at runtime. 4 | 5 | use axum::http::{HeaderValue, Method}; 6 | use axum::{ 7 | Router, 8 | routing::{get, post}, 9 | }; 10 | use flamegraph::FlameGraph; 11 | use std::sync::atomic::AtomicU32; 12 | use std::{net::SocketAddr, sync::Arc}; 13 | use tower_http::cors::CorsLayer; 14 | 15 | mod flamegraph; 16 | mod handlers; 17 | pub(crate) mod models; 18 | 19 | use crate::LiquidCacheService; 20 | 21 | pub(crate) struct AppState { 22 | liquid_cache: Arc<LiquidCacheService>, 23 | trace_id: AtomicU32, 24 | stats_id: AtomicU32, 25 | flamegraph: Arc<FlameGraph>, 26 | } 27 | 28 | /// Run the admin server 29 | pub async fn run_admin_server( 30 | addr: SocketAddr, 31 | liquid_cache: Arc<LiquidCacheService>, 32 | ) -> Result<(), Box<dyn std::error::Error>> { 33 | let state = Arc::new(AppState { 34 | liquid_cache, 35 | trace_id: AtomicU32::new(0), 36 | stats_id: AtomicU32::new(0), 37 | flamegraph: Arc::new(FlameGraph::new()), 38 | }); 39 | 40 | // Create a CORS layer for the local admin UI and the hosted admin page 41 | let cors = CorsLayer::new() 42 | // Allow the local dev origins (port 3000) and the hosted admin origin 43 | .allow_origin([ 44 | "http://localhost:3000".parse::<HeaderValue>().unwrap(), 45 | "http://127.0.0.1:3000".parse::<HeaderValue>().unwrap(), 46 | "https://liquid-cache-admin.xiangpeng.systems" 47 | .parse::<HeaderValue>() 48 | .unwrap(), 49 | ]) 50 | .allow_methods([Method::GET, Method::POST, Method::PUT, Method::DELETE]) 51 | .allow_headers([axum::http::header::CONTENT_TYPE]); 52 | 53 | let app = Router::new() 54 | .route("/shutdown", get(handlers::shutdown_handler)) 55 | .route("/reset_cache", get(handlers::reset_cache_handler)) 56 | .route( 57 | "/parquet_cache_usage", 58 | get(handlers::get_parquet_cache_usage_handler), 59 | ) 60 | .route("/cache_info", get(handlers::get_cache_info_handler)) 61 | .route("/system_info", get(handlers::get_system_info_handler)) 62 | .route("/start_trace", get(handlers::start_trace_handler)) 63 | .route("/stop_trace", get(handlers::stop_trace_handler)) 64 | .route( 65 | "/execution_metrics", 66 | get(handlers::get_execution_metrics_handler), 67 | ) 68 | .route("/execution_plans", get(handlers::get_execution_stats)) 69 | .route("/cache_stats", get(handlers::get_cache_stats_handler)) 70 | .route("/start_flamegraph", get(handlers::start_flamegraph_handler)) 71 | .route("/stop_flamegraph", get(handlers::stop_flamegraph_handler)) 72 | .route( 73 | "/set_execution_stats", 74 | post(handlers::add_execution_stats_handler), 75 | ) 76 | .with_state(state) 77 | .layer(cors); 78 | 79 | let listener = tokio::net::TcpListener::bind(addr).await?; 80 | axum::serve(listener, app).await?; 81 | 82 | Ok(()) 83 | } 84 |
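// Illustrative only: wiring the admin server into a server binary.
// The port is arbitrary and `liquid_cache` construction is elided:
#[allow(dead_code)]
async fn example_wireup(
    liquid_cache: Arc<LiquidCacheService>,
) -> Result<(), Box<dyn std::error::Error>> {
    let addr: SocketAddr = "127.0.0.1:53703".parse()?;
    // Serves until shut down; typically run alongside the main Flight service.
    run_admin_server(addr, liquid_cache).await
}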
-------------------------------------------------------------------------------- /src/server/src/admin_server/models.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | /// Parameters for the set_execution_stats endpoint 4 | #[derive(Deserialize, Serialize, Clone)] 5 | pub struct ExecutionStats { 6 | /// Plan IDs for the execution plan 7 | pub plan_ids: Vec<String>, 8 | /// Display name for the execution plan 9 | pub display_name: String, 10 | /// Flamegraph SVG for the execution plan 11 | pub flamegraph_svg: Option<String>, 12 | /// Network traffic bytes for the execution plan 13 | pub network_traffic_bytes: u64, 14 | /// Execution time in milliseconds 15 | pub execution_time_ms: u64, 16 | /// User input SQL 17 | pub user_sql: String, 18 | } 19 | 20 | /// Execution stats with plan 21 | #[derive(Serialize)] 22 | pub struct ExecutionStatsWithPlan { 23 | /// Execution stats 24 | pub execution_stats: ExecutionStats, 25 | /// Plan info 26 | pub plans: Vec<PlanInfo>, 27 | } 28 | 29 | /// Response for the admin server 30 | #[derive(Serialize, Deserialize)] 31 | pub struct ApiResponse { 32 | /// Message for the response 33 | pub message: String, 34 | /// Status for the response 35 | pub status: String, 36 | } 37 | 38 | /// Schema field 39 | #[derive(Serialize)] 40 | pub struct SchemaField { 41 | /// Field name 42 | pub name: String, 43 | /// Field data type 44 | pub data_type: String, 45 | } 46 | 47 | /// Column statistics 48 | #[derive(Serialize)] 49 | pub struct ColumnStatistics { 50 | /// Column name 51 | pub name: String, 52 | /// Null count 53 | pub null: Option<String>, 54 | /// Max value 55 | pub max: Option<String>, 56 | /// Min value 57 | pub min: Option<String>, 58 | /// Sum value 59 | pub sum: Option<String>, 60 | /// Distinct count 61 | pub distinct_count: Option<String>, 62 | } 63 | 64 | /// Statistics 65 | #[derive(Serialize)] 66 | pub struct Statistics { 67 | /// Number of rows 68 | pub num_rows: String, 69 | /// Total byte size 70 | pub total_byte_size: String, 71 | /// Column statistics 72 | pub column_statistics: Vec<ColumnStatistics>, 73 | } 74 | 75 | /// Metric 76 | #[derive(Serialize)] 77 | pub struct MetricValues { 78 | /// Metric name 79 | pub name: String, 80 | /// Metric value 81 | pub value: String, 82 | } 83 | 84 | /// Execution plan with stats 85 | #[derive(Serialize)] 86 | pub struct ExecutionPlanWithStats { 87 | /// Execution plan name 88 | pub name: String, 89 | /// Schema fields 90 | pub schema: Vec<SchemaField>, 91 | /// Statistics 92 | pub statistics: Statistics, 93 | /// Metrics 94 | pub metrics: Vec<MetricValues>, 95 | /// Children 96 | pub children: Vec<ExecutionPlanWithStats>, 97 | } 98 | 99 | /// Plan info 100 | #[derive(Serialize)] 101 | pub struct PlanInfo { 102 | /// Created at 103 | pub created_at: u64, 104 | /// Execution plan 105 | pub plan: ExecutionPlanWithStats, 106 | /// ID 107 | pub id: String, 108 | /// Predicate 109 | pub predicate: Option<String>, 110 | } 111 | -------------------------------------------------------------------------------- /src/server/src/errors.rs: -------------------------------------------------------------------------------- 1 | //! Error handling utilities for LiquidCache server. 2 | //! 3 | //! This module provides enhanced error handling with stack traces to help 4 | //! developers and users identify the exact location where errors occur.
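//!
//! # Example (illustrative)
//!
//! A sketch of the intended usage; `read_footer` is hypothetical:
//!
//! ```rust,ignore
//! use crate::errors::{LiquidCacheErrorExt, LiquidCacheResult};
//!
//! fn read_footer(path: &str) -> LiquidCacheResult<Vec<u8>> {
//!     std::fs::read(path).with_liquid_context("reading parquet footer")
//! }
//! ```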
5 | 6 | use anyhow::{Context, Result as AnyhowResult}; 7 | use tonic::Status; 8 | 9 | /// Result type alias for LiquidCache operations 10 | pub type LiquidCacheResult<T> = AnyhowResult<T>; 11 | 12 | /// Extension trait to add context to Results for better error reporting 13 | pub trait LiquidCacheErrorExt<T> { 14 | /// Add context to an error for better error reporting 15 | fn with_liquid_context(self, message: impl Into<String>) -> LiquidCacheResult<T>; 16 | } 17 | 18 | impl<T, E> LiquidCacheErrorExt<T> for Result<T, E> 19 | where 20 | E: std::error::Error + Send + Sync + 'static, 21 | { 22 | fn with_liquid_context(self, message: impl Into<String>) -> LiquidCacheResult<T> { 23 | self.map_err(anyhow::Error::from).context(message.into()) 24 | } 25 | } 26 | 27 | /// Convert anyhow::Error to tonic Status with detailed error information including stack trace 28 | pub fn anyhow_to_status(err: anyhow::Error) -> Status { 29 | // Format the error with full error chain and backtrace for debugging 30 | let error_with_context = format!("{err:?}"); 31 | 32 | // Determine the appropriate gRPC status code based on error type 33 | if let Some(datafusion_err) = err.downcast_ref::<datafusion::error::DataFusionError>() { 34 | match datafusion_err { 35 | datafusion::error::DataFusionError::Plan(_) => { 36 | Status::invalid_argument(error_with_context) 37 | } 38 | datafusion::error::DataFusionError::SchemaError(_, _) => { 39 | Status::invalid_argument(error_with_context) 40 | } 41 | _ => Status::internal(error_with_context), 42 | } 43 | } else if err.downcast_ref::<arrow_schema::ArrowError>().is_some() 44 | || err.downcast_ref::<prost::DecodeError>().is_some() 45 | { 46 | Status::invalid_argument(error_with_context) 47 | } else if err.downcast_ref::<std::io::Error>().is_some() { 48 | Status::internal(error_with_context) 49 | } else { 50 | // Default to internal error for unknown error types 51 | Status::internal(error_with_context) 52 | } 53 | } 54 | 55 | /// Legacy compatibility: convert DataFusionError to Status with stack trace 56 | pub fn df_error_to_status_with_trace(err: datafusion::error::DataFusionError) -> Status { 57 | anyhow_to_status(err.into()) 58 | } 59 |
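// Illustrative only: a hypothetical helper showing how handlers can surface
// anyhow errors over gRPC:
#[allow(dead_code)]
fn to_grpc_result<T>(result: anyhow::Result<T>) -> Result<T, Status> {
    // The status message carries the full error chain produced above.
    result.map_err(anyhow_to_status)
}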
-------------------------------------------------------------------------------- /src/server/src/tests/cases.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::path::{Path, PathBuf}; 3 | use std::sync::Arc; 4 | 5 | use liquid_cache_common::CacheMode; 6 | 7 | use crate::tests::run_sql; 8 | 9 | fn gen_parquet(dir: impl AsRef<Path>) -> PathBuf { 10 | use arrow::array::UInt32Array; 11 | use arrow::datatypes::{DataType, Field, Schema}; 12 | use arrow::record_batch::RecordBatch; 13 | use parquet::arrow::ArrowWriter; 14 | use parquet::file::properties::WriterProperties; 15 | let temp_path = dir.as_ref().join("parquet_page_index.parquet"); 16 | let file = File::create(&temp_path).unwrap(); 17 | let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt32, false)])); 18 | let id_array = UInt32Array::from_iter_values(0..200_000); 19 | let id_batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(id_array)]).unwrap(); 20 | let props = WriterProperties::builder() 21 | .set_offset_index_disabled(false) 22 | .build(); 23 | let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props)).unwrap(); 24 | writer.write(&id_batch).unwrap(); 25 | writer.into_inner().unwrap(); 26 | temp_path 27 | } 28 | 29 | #[tokio::test(flavor = "multi_thread")] 30 | async fn test_parquet_with_page_index() { 31 | let temp_dir = tempfile::tempdir().unwrap(); 32 | let file = gen_parquet(&temp_dir); 33 | let file_path = file.to_str().unwrap(); 34 | 35 | let result = run_sql( 36 | "SELECT * FROM hits WHERE id = 0", 37 | CacheMode::LiquidEagerTranscode, 38 | 1000, 39 | file_path, 40 | ) 41 | .await; 42 | insta::assert_snapshot!(result); 43 | } 44 | -------------------------------------------------------------------------------- /src/server/src/tests/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{path::PathBuf, sync::Arc}; 2 | 3 | use arrow::util::pretty::pretty_format_batches; 4 | use datafusion::{ 5 | physical_plan::{ExecutionPlan, collect}, 6 | prelude::SessionContext, 7 | }; 8 | use liquid_cache_common::CacheEvictionStrategy::Discard; 9 | use liquid_cache_common::CacheMode; 10 | use uuid::Uuid; 11 | 12 | mod cases; 13 | 14 | use crate::{LiquidCacheService, LiquidCacheServiceInner}; 15 | 16 | const TEST_FILE: &str = "../../examples/nano_hits.parquet"; 17 | 18 | async fn get_physical_plan(sql: &str, ctx: &SessionContext) -> Arc<dyn ExecutionPlan> { 19 | let df = ctx.sql(sql).await.unwrap(); 20 | let (state, plan) = df.into_parts(); 21 | state.create_physical_plan(&plan).await.unwrap() 22 | } 23 | 24 | async fn run_sql(sql: &str, mode: CacheMode, cache_size_bytes: usize, file_path: &str) -> String { 25 | let ctx = Arc::new(LiquidCacheService::context().unwrap()); 26 | ctx.register_parquet("hits", file_path, Default::default()) 27 | .await 28 | .unwrap(); 29 | let service = LiquidCacheServiceInner::new( 30 | ctx.clone(), 31 | Some(cache_size_bytes), 32 | PathBuf::from("test"), 33 | mode, 34 | Discard, 35 | ); 36 | async fn get_result(service: &LiquidCacheServiceInner, sql: &str) -> String { 37 | let handle = Uuid::new_v4(); 38 | let ctx = service.get_ctx(); 39 | let plan = get_physical_plan(sql, &ctx).await; 40 | service.register_plan(handle, plan); 41 | let plan = service.get_plan(&handle).unwrap(); 42 | let batches = collect(plan.plan, ctx.task_ctx()).await.unwrap(); 43 | pretty_format_batches(&batches).unwrap().to_string() 44 | } 45 | 46 | let first_iter = get_result(&service, sql).await; 47 | let second_iter = get_result(&service, sql).await; 48 | 49 | assert_eq!(first_iter, second_iter); 50 | 51 | first_iter 52 | } 53 | 54 | async fn test_runner(sql: &str, reference: &str) { 55 | let modes = [ 56 | CacheMode::LiquidEagerTranscode, 57 | CacheMode::Arrow, 58 | CacheMode::Liquid, 59 | ]; 60 | 61 | // 573960 is the row count of the first batch of the URL column 62 | let sizes = [10, 573960, usize::MAX]; 63 | 64 | for mode in modes { 65 | for size in sizes { 66 | let result = run_sql(sql, mode, size, TEST_FILE).await; 67 | assert_eq!(result, reference); 68 | } 69 | } 70 | } 71 | 72 | #[tokio::test(flavor = "multi_thread")] 73 | async fn test_url_prefix() { 74 | let sql = r#"select COUNT(*) from hits where "URL" like 'https://%'"#; 75 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 76 | insta::assert_snapshot!(reference); 77 | test_runner(sql, &reference).await; 78 | } 79 | 80 | #[tokio::test(flavor = "multi_thread")] 81 | async fn test_url() { 82 | let sql = r#"select "URL" from hits where "URL" like '%tours%' order by "URL" desc"#; 83 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 84 | insta::assert_snapshot!(reference); 85 | test_runner(sql, &reference).await; 86 | } 87 | 88 | #[tokio::test(flavor = "multi_thread")] 89 | async fn test_os() { 90 | let sql = r#"select "OS" from hits where "URL" like '%tours%' order by "OS" desc"#; 91 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, 
TEST_FILE).await; 92 | insta::assert_snapshot!(reference); 93 | test_runner(sql, &reference).await; 94 | } 95 | 96 | #[tokio::test(flavor = "multi_thread")] 97 | async fn test_referer() { 98 | let sql = r#"select "Referer" from hits where "Referer" <> '' AND "URL" like '%tours%' order by "Referer" desc"#; 99 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 100 | insta::assert_snapshot!(reference); 101 | test_runner(sql, &reference).await; 102 | } 103 | 104 | #[tokio::test(flavor = "multi_thread")] 105 | #[ignore = "Wait for https://github.com/apache/datafusion/pull/15827 to be merged"] 106 | async fn test_min_max() { 107 | let sql = r#"select min("Referer"), max("Referer") from hits where "Referer" <> '' AND "URL" like '%tours%'"#; 108 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 109 | insta::assert_snapshot!(reference); 110 | test_runner(sql, &reference).await; 111 | } 112 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__cases__parquet_with_page_index.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/cases.rs 3 | expression: result 4 | --- 5 | +----+ 6 | | id | 7 | +----+ 8 | | 0 | 9 | +----+ 10 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__min_max.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----------------------------------------------------------------+-----------------------------------------------------------+ 6 | | min(hits.Referer) | max(hits.Referer) | 7 | +----------------------------------------------------------------+-----------------------------------------------------------+ 8 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | https://go.mail/folder-1/online.ru/search?text=скачать из | 9 | +----------------------------------------------------------------+-----------------------------------------------------------+ 10 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__os.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----+ 6 | | OS | 7 | +----+ 8 | | 44 | 9 | | 44 | 10 | | 44 | 11 | | 44 | 12 | | 44 | 13 | | 2 | 14 | | 2 | 15 | | 2 | 16 | | 2 | 17 | | 2 | 18 | | 2 | 19 | +----+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__referer.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +---------------------------------------------------------------------------------------------------------+ 6 | | Referer | 7 | +---------------------------------------------------------------------------------------------------------+ 8 | | https://go.mail/folder-1/online.ru/search?text=скачать из | 9 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%2F&ei | 10 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%2F&ei | 11 | | 
http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_positions/2/transmittaD3xnA%26ad%3D1%26bid%3D400 | 12 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_pl%3Dh%26utm_source=view.php | 13 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_pl%3Dh%26utm_source=view.php | 14 | | http://tambov.irr.ru/filmId=BcVrXpM5UXI&where=any&numphoto | 15 | | http://tambov.irr.ru/filmId=BcVrXpM5UXI&where=any&numphoto | 16 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | 17 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | 18 | +---------------------------------------------------------------------------------------------------------+ 19 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__title.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----+ 6 | | OS | 7 | +----+ 8 | | 44 | 9 | | 44 | 10 | | 44 | 11 | | 44 | 12 | | 44 | 13 | | 2 | 14 | | 2 | 15 | | 2 | 16 | | 2 | 17 | | 2 | 18 | | 2 | 19 | +----+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__url.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +-------------------------------------------------------------------------------------------------------------------------------+ 6 | | URL | 7 | +-------------------------------------------------------------------------------------------------------------------------------+ 8 | | https://produkty%2Fpulove.ru/search_terms-vzyat_kobrye-russion/russia/piterators-tourse | 9 | | https://produkty%2Fpulove.ru/search_terms-vzyat_kobrye-russion/russia/piterators-tourse | 10 | | https://produkty%2Fpulove.ru/booklyattion-ware/tours.ru | 11 | | https://produkty%2Fpulove.ru/booklyattion-ware/tours.ru | 12 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 13 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 14 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 15 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 16 | | https://produkty%2Fplata.ru/filmId=e308f57e213e9eee96bcb752910-widget/tours.ru/product_priznaniya-1-metallic=0&engineVolumeTo | 17 | | https://produkty%2Fplata.ru/filmId=e308f57e213e9eee96bcb752910-widget/tours.ru/product_priznaniya-1-metallic=0&engineVolumeTo | 18 | | http://tours/Ekategoriya%2F&sr=http://slovareniye | 19 | +-------------------------------------------------------------------------------------------------------------------------------+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__url_prefix.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----------+ 6 | | count(*) | 7 | +----------+ 8 | | 23113 | 9 | +----------+ 10 | -------------------------------------------------------------------------------- 
/src/server/src/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow::{array::RecordBatch, compute::concat_batches}; 2 | use datafusion::error::Result; 3 | use futures::{Stream, ready}; 4 | use futures::{StreamExt, stream::BoxStream}; 5 | use std::{ 6 | pin::Pin, 7 | task::{Context, Poll}, 8 | }; 9 | 10 | /// A stream that finalizes the record batches. 11 | /// It currently does two things: 12 | /// 1. GCs the record batches, especially arrays that have been filtered. 13 | /// 2. Merges small batches into a large one. 14 | pub struct FinalStream { 15 | inner: BoxStream<'static, Result<RecordBatch>>, 16 | target_batch_size: usize, 17 | buffered_batches: Vec<RecordBatch>, 18 | current_buffered_rows: usize, 19 | span: fastrace::Span, 20 | } 21 | 22 | impl FinalStream { 23 | pub fn new<S: Stream<Item = Result<RecordBatch>> + Send + 'static>( 24 | inner: S, 25 | target_batch_size: usize, 26 | span: fastrace::Span, 27 | ) -> Self { 28 | Self { 29 | inner: inner.boxed(), 30 | target_batch_size, 31 | buffered_batches: Vec::new(), 32 | current_buffered_rows: 0, 33 | span, 34 | } 35 | } 36 | } 37 | 38 | impl Stream for FinalStream { 39 | type Item = Result<RecordBatch>; 40 | 41 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { 42 | let this = &mut *self; 43 | let _guard = this.span.set_local_parent(); 44 | loop { 45 | let threshold = (this.target_batch_size * 3) / 4; 46 | if this.current_buffered_rows > threshold { 47 | this.current_buffered_rows = 0; 48 | let batches = std::mem::take(&mut this.buffered_batches); 49 | let schema = batches[0].schema(); 50 | let result = concat_batches(&schema, batches.iter()); 51 | return Poll::Ready(Some(Ok(result?))); 52 | } 53 | 54 | match ready!(this.inner.poll_next_unpin(cx)).transpose()? { 55 | Some(batch) => { 56 | let num_rows = batch.num_rows(); 57 | this.current_buffered_rows += num_rows; 58 | this.buffered_batches.push(batch); 59 | } 60 | None => { 61 | if this.buffered_batches.is_empty() { 62 | return Poll::Ready(None); 63 | } 64 | this.current_buffered_rows = 0; 65 | let batches = std::mem::take(&mut this.buffered_batches); 66 | let schema = batches[0].schema(); 67 | let result = concat_batches(&schema, batches.iter()); 68 | return Poll::Ready(Some(Ok(result?))); 69 | } 70 | } 71 | } 72 | } 73 | } 74 |
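// Illustrative only: how a server stream might be wrapped before being sent
// to the client. The batch size is arbitrary, and a disabled span is assumed
// via `fastrace::Span::noop()`:
#[allow(dead_code)]
fn finalize_stream(
    upstream: BoxStream<'static, Result<RecordBatch>>,
) -> impl Stream<Item = Result<RecordBatch>> {
    // Merge post-filter fragments back into roughly 8192-row batches.
    FinalStream::new(upstream, 8192, fastrace::Span::noop())
}
--------------------------------------------------------------------------------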