├── .envrc ├── .github ├── release.yml └── workflows │ ├── ci.yml │ ├── docker-publish.yml │ └── publish.yml ├── .gitignore ├── .vscode └── launch.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benchmark ├── .gitignore ├── Cargo.toml ├── README.md ├── bench_server.rs ├── clickbench │ ├── answers │ │ └── clickbench_0 │ │ │ ├── Q0.parquet │ │ │ ├── Q1.parquet │ │ │ ├── Q10.parquet │ │ │ ├── Q11.parquet │ │ │ ├── Q12.parquet │ │ │ ├── Q13.parquet │ │ │ ├── Q14.parquet │ │ │ ├── Q15.parquet │ │ │ ├── Q16.parquet │ │ │ ├── Q17.parquet │ │ │ ├── Q18.parquet │ │ │ ├── Q2.parquet │ │ │ ├── Q20.parquet │ │ │ ├── Q21.parquet │ │ │ ├── Q22.parquet │ │ │ ├── Q23.parquet │ │ │ ├── Q24.parquet │ │ │ ├── Q25.parquet │ │ │ ├── Q26.parquet │ │ │ ├── Q27.parquet │ │ │ ├── Q28.parquet │ │ │ ├── Q29.parquet │ │ │ ├── Q3.parquet │ │ │ ├── Q30.parquet │ │ │ ├── Q31.parquet │ │ │ ├── Q32.parquet │ │ │ ├── Q33.parquet │ │ │ ├── Q34.parquet │ │ │ ├── Q35.parquet │ │ │ ├── Q36.parquet │ │ │ ├── Q37.parquet │ │ │ ├── Q38.parquet │ │ │ ├── Q39.parquet │ │ │ ├── Q4.parquet │ │ │ ├── Q40.parquet │ │ │ ├── Q42.parquet │ │ │ ├── Q5.parquet │ │ │ ├── Q6.parquet │ │ │ ├── Q7.parquet │ │ │ ├── Q8.parquet │ │ │ └── Q9.parquet │ ├── clickbench_client.rs │ ├── data │ │ └── .gitkeep │ └── queries │ │ ├── queries.sql │ │ └── query_select.sql ├── src │ ├── bin │ │ ├── create_nano_hits.rs │ │ └── encoding.rs │ ├── lib.rs │ ├── observability.rs │ ├── runner.rs │ └── utils.rs └── tpch │ ├── answers │ └── .gitkeep │ ├── data │ ├── .gitkeep │ └── sf0.001 │ │ ├── customer.parquet │ │ ├── lineitem.parquet │ │ ├── nation.parquet │ │ ├── orders.parquet │ │ ├── part.parquet │ │ ├── partsupp.parquet │ │ ├── region.parquet │ │ └── supplier.parquet │ ├── queries │ ├── q1.sql │ ├── q10.sql │ ├── q11.sql │ ├── q12.sql │ ├── q13.sql │ ├── q14.sql │ ├── q15.sql │ ├── q16.sql │ ├── q17.sql │ ├── q18.sql │ ├── q19.sql │ ├── q2.sql │ ├── q20.sql │ ├── q21.sql │ ├── q22.sql │ ├── q3.sql │ ├── q4.sql │ ├── q5.sql │ ├── q6.sql │ ├── q7.sql │ ├── q8.sql │ └── q9.sql │ ├── tpch_client.rs │ └── tpch_gen.py ├── dev ├── README.md ├── doc │ ├── arch.png │ ├── arch.svg │ ├── liquid-cache-vldb.pdf │ └── logo.png ├── git-hooks │ └── pre-push ├── install-git-hooks.sh ├── liquid_cache_server.dockerfile └── thoughts │ ├── architecture.md │ ├── artifact-eval.md │ ├── debugging-tips.md │ └── thoughts.md ├── examples ├── Cargo.toml ├── README.md ├── example_client.rs ├── example_server.rs └── nano_hits.parquet ├── flake.lock ├── flake.nix ├── rust-toolchain.toml └── src ├── client ├── Cargo.toml └── src │ ├── client_exec.rs │ ├── lib.rs │ ├── metrics.rs │ ├── optimizer.rs │ └── tests │ ├── mod.rs │ └── snapshots │ ├── liquid_cache_client__tests__tpch_q1.snap │ ├── liquid_cache_client__tests__tpch_q10.snap │ ├── liquid_cache_client__tests__tpch_q11.snap │ ├── liquid_cache_client__tests__tpch_q12.snap │ ├── liquid_cache_client__tests__tpch_q13.snap │ ├── liquid_cache_client__tests__tpch_q14.snap │ ├── liquid_cache_client__tests__tpch_q16.snap │ ├── liquid_cache_client__tests__tpch_q17.snap │ ├── liquid_cache_client__tests__tpch_q18.snap │ ├── liquid_cache_client__tests__tpch_q19.snap │ ├── liquid_cache_client__tests__tpch_q2.snap │ ├── liquid_cache_client__tests__tpch_q20.snap │ ├── liquid_cache_client__tests__tpch_q21.snap │ ├── liquid_cache_client__tests__tpch_q22.snap │ ├── liquid_cache_client__tests__tpch_q3.snap │ ├── liquid_cache_client__tests__tpch_q4.snap │ ├── liquid_cache_client__tests__tpch_q5.snap │ ├── 
liquid_cache_client__tests__tpch_q6.snap │ ├── liquid_cache_client__tests__tpch_q7.snap │ ├── liquid_cache_client__tests__tpch_q8.snap │ └── liquid_cache_client__tests__tpch_q9.snap ├── common ├── Cargo.toml └── src │ ├── lib.rs │ ├── rpc.rs │ └── utils.rs ├── liquid_parquet ├── Cargo.toml ├── bench │ ├── bench_eviction.rs │ ├── bitpacking.rs │ ├── boolean_and_then.rs │ ├── eviction_cache.rs │ ├── fsstarray.rs │ └── liquid_float_array.rs ├── clippy.toml └── src │ ├── cache │ ├── budget.rs │ ├── mod.rs │ ├── policies.rs │ ├── stats.rs │ ├── store.rs │ ├── tracer.rs │ ├── transcode.rs │ └── utils.rs │ ├── lib.rs │ ├── liquid_array │ ├── byte_array.rs │ ├── fix_len_byte_array.rs │ ├── float_array.rs │ ├── ipc.rs │ ├── mod.rs │ ├── primitive_array.rs │ ├── raw │ │ ├── bit_pack_array.rs │ │ ├── fsst_array.rs │ │ └── mod.rs │ └── utils.rs │ ├── reader │ ├── mod.rs │ ├── plantime │ │ ├── mod.rs │ │ ├── opener.rs │ │ ├── row_filter.rs │ │ ├── row_group_filter.rs │ │ └── source.rs │ ├── runtime │ │ ├── in_memory_rg.rs │ │ ├── liquid_stream.rs │ │ ├── mod.rs │ │ ├── parquet_bridge.rs │ │ ├── reader │ │ │ ├── cached_array_reader.rs │ │ │ ├── cached_page.rs │ │ │ ├── liquid_batch_reader.rs │ │ │ ├── mod.rs │ │ │ └── tests.rs │ │ └── utils.rs │ └── utils │ │ ├── boolean_selection.rs │ │ └── mod.rs │ ├── sync.rs │ └── utils.rs └── server ├── Cargo.toml └── src ├── admin_server ├── flamegraph.rs ├── handlers.rs ├── mod.rs └── models.rs ├── errors.rs ├── lib.rs ├── local_cache.rs ├── service.rs ├── tests ├── cases.rs ├── mod.rs └── snapshots │ ├── liquid_cache_server__tests__cases__parquet_with_page_index.snap │ ├── liquid_cache_server__tests__min_max.snap │ ├── liquid_cache_server__tests__os.snap │ ├── liquid_cache_server__tests__referer.snap │ ├── liquid_cache_server__tests__title.snap │ ├── liquid_cache_server__tests__url.snap │ └── liquid_cache_server__tests__url_prefix.snap └── utils.rs /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # Configuration for GitHub's automatic release notes 2 | # See: https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes 3 | 4 | changelog: 5 | exclude: 6 | labels: 7 | - ignore-for-release 8 | - dependencies 9 | categories: 10 | - title: 🚀 Features 11 | labels: 12 | - feature 13 | - enhancement 14 | - Semver-Minor 15 | - title: 🐛 Bug Fixes 16 | labels: 17 | - bug 18 | - bugfix 19 | - fix 20 | - title: ⚠️ Breaking Changes 21 | labels: 22 | - breaking-change 23 | - Semver-Major 24 | - title: 📚 Documentation 25 | labels: 26 | - documentation 27 | - docs 28 | - title: 🔧 Maintenance 29 | labels: 30 | - chore 31 | - refactor 32 | - test 33 | - title: 📦 Dependencies 34 | labels: 35 | - dependencies 36 | - title: 🌱 Other Changes 37 | labels: 38 | - "*" -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | packages: write 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: dtolnay/rust-toolchain@master 16 | with: 17 | toolchain: nightly-2025-03-03 18 | - uses: Swatinem/rust-cache@v2 19 | - 
name: Build binary 20 | run: cargo build --release --bin bench_server 21 | 22 | - name: Log in to the Container registry 23 | uses: docker/login-action@v3 24 | with: 25 | registry: ghcr.io 26 | username: ${{ github.actor }} 27 | password: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Set lower case owner name 30 | run: | 31 | echo "OWNER_LC=${OWNER,,}" >> ${GITHUB_ENV} 32 | env: 33 | OWNER: '${{ github.repository_owner }}' 34 | 35 | - name: Build and push Docker image 36 | uses: docker/build-push-action@v4 37 | with: 38 | context: . 39 | push: true 40 | tags: ghcr.io/${{ env.OWNER_LC }}/liquid-cache/liquid-cache-server:latest,ghcr.io/${{ env.OWNER_LC }}/liquid-cache/liquid-cache-server:${{ github.event.release.tag_name }} 41 | file: dev/liquid_cache_server.dockerfile 42 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version_bump: 7 | description: 'Version bump type' 8 | required: true 9 | default: 'patch' 10 | type: 'choice' 11 | options: 12 | - patch 13 | - minor 14 | - major 15 | custom_version: 16 | description: 'Custom version (optional, overrides version_bump if provided)' 17 | required: false 18 | type: 'string' 19 | 20 | jobs: 21 | release-and-publish: 22 | name: Release and Publish 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: write 26 | pull-requests: write 27 | steps: 28 | - name: Checkout code 29 | uses: actions/checkout@v4 30 | with: 31 | fetch-depth: 0 32 | token: ${{ secrets.GITHUB_TOKEN }} 33 | 34 | - name: Set up Rust 35 | uses: dtolnay/rust-toolchain@master 36 | with: 37 | toolchain: nightly-2025-04-29 38 | 39 | - name: Install cargo-release 40 | run: cargo install cargo-release 41 | 42 | - name: Configure Git 43 | run: | 44 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 45 | git config --local user.name "github-actions[bot]" 46 | 47 | - name: Generate branch name 48 | id: branch-name 49 | run: | 50 | echo "release_branch=release/$(date +'%Y%m%d')" >> $GITHUB_OUTPUT 51 | 52 | - name: Execute Release 53 | id: release 54 | run: | 55 | # Determine version bump level 56 | LEVEL="${{ github.event.inputs.version_bump }}" 57 | CUSTOM_VERSION="${{ github.event.inputs.custom_version }}" 58 | 59 | if [ -n "$CUSTOM_VERSION" ]; then 60 | # Use custom version 61 | cargo release "$CUSTOM_VERSION" --execute --no-confirm --no-push --no-tag --no-publish 62 | echo "new_version=$CUSTOM_VERSION" >> $GITHUB_OUTPUT 63 | else 64 | # Use version bump type 65 | cargo release $LEVEL --execute --no-confirm --no-push --no-tag --no-publish 66 | # Get the new version 67 | NEW_VERSION=$(grep -m 1 'version = ' Cargo.toml | sed 's/version = "\(.*\)"/\1/') 68 | echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT 69 | fi 70 | 71 | - name: Publish crates 72 | env: 73 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 74 | run: | 75 | # Publish in dependency order 76 | cargo publish -p liquid-cache-common 77 | sleep 30 # Wait for the registry to update 78 | 79 | cargo publish -p liquid-cache-parquet 80 | sleep 30 81 | 82 | cargo publish -p liquid-cache-client 83 | sleep 30 84 | 85 | cargo publish -p liquid-cache-server 86 | sleep 30 87 | 88 | - name: Create GitHub Release 89 | uses: ncipollo/release-action@v1 90 | with: 91 | tag: v${{ steps.release.outputs.new_version }} 92 | name: Release v${{ steps.release.outputs.new_version }} 93 
| generateReleaseNotes: true 94 | draft: false 95 | prerelease: false 96 | token: ${{ secrets.GITHUB_TOKEN }} 97 | 98 | - name: Create Pull Request 99 | id: create_pr 100 | uses: peter-evans/create-pull-request@v7 101 | with: 102 | token: ${{ secrets.GITHUB_TOKEN }} 103 | branch: ${{ steps.branch-name.outputs.release_branch }} 104 | title: "Release v${{ steps.release.outputs.new_version }}" 105 | body: | 106 | This PR prepares release v${{ steps.release.outputs.new_version }}. 107 | 108 | **IMPORTANT**: This PR MUST be merged as the crates have already been published to crates.io with this version. 109 | commit-message: "chore: prepare release v${{ steps.release.outputs.new_version }}" 110 | delete-branch: true 111 | 112 | - name: Output PR URL 113 | run: echo "Pull request created at ${{ steps.create_pr.outputs.pull-request-url }}" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .DS_Store 3 | .idea 4 | profile.json 5 | data/ 6 | .direnv 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "launch", 10 | "name": "bench_server", 11 | "cargo": { 12 | "args": [ 13 | "build", 14 | "--bin=bench_server", 15 | ], 16 | "filter": { 17 | "name": "bench_server", 18 | "kind": "bin" 19 | } 20 | }, 21 | "env": { 22 | "RUST_LOG": "info" 23 | }, 24 | "args": [ 25 | "--max-cache-mb", 26 | "10" 27 | ], 28 | "cwd": "${workspaceFolder}", 29 | }, 30 | { 31 | "type": "lldb", 32 | "request": "launch", 33 | "name": "clickbench_client", 34 | "cargo": { 35 | "args": [ 36 | "build", 37 | "--bin=clickbench_client", 38 | ], 39 | "filter": { 40 | "name": "clickbench_client", 41 | "kind": "bin" 42 | } 43 | }, 44 | "env": { 45 | "RUST_LOG": "info" 46 | }, 47 | "args": [ 48 | "--query-path", 49 | "benchmark/clickbench/queries/queries.sql", 50 | "--file", 51 | "benchmark/clickbench/data/hits_0.parquet", 52 | "--query", 53 | "24" 54 | ], 55 | "cwd": "${workspaceFolder}" 56 | }, 57 | { 58 | "type": "lldb", 59 | "request": "launch", 60 | "name": "tpch_client", 61 | "cargo": { 62 | "args": [ 63 | "build", 64 | "--bin=tpch_client", 65 | ], 66 | "filter": { 67 | "name": "tpch_client", 68 | "kind": "bin" 69 | } 70 | }, 71 | "env": { 72 | "RUST_LOG": "info" 73 | }, 74 | "args": [ 75 | "--query-dir", 76 | "benchmark/tpch/queries", 77 | "--data-dir", 78 | "benchmark/tpch/data/sf100.0", 79 | "--query", 80 | "6", 81 | ], 82 | "cwd": "${workspaceFolder}" 83 | } 84 | ] 85 | } -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace.package] 2 | version = "0.1.4" 3 | edition = "2024" 4 | repository = "https://github.com/XiangpengHao/liquid-cache" 5 | authors = ["XiangpengHao "] 6 | license = "Apache-2.0 OR MIT" 7 | readme = "README.md" 8 | description = "10x lower latency for cloud-native DataFusion" 9 | 10 | 11 | [workspace] 12 | members = [ 13 | "benchmark", 14 | "src/server", 15 | "src/client", 16 | "src/liquid_parquet", 17 | "src/common", 
18 | "examples", 19 | ] 20 | resolver = "3" 21 | 22 | [workspace.dependencies] 23 | liquid-cache-server = { path = "src/server", version = "0.1.4" } 24 | liquid-cache-client = { path = "src/client", version = "0.1.4" } 25 | liquid-cache-parquet = { path = "src/liquid_parquet", version = "0.1.4" } 26 | liquid-cache-common = { path = "src/common", version = "0.1.4" } 27 | arrow = { version = "55.1.0", default-features = false, features = [ 28 | "prettyprint", 29 | ] } 30 | arrow-flight = { version = "55.1.0", features = ["flight-sql-experimental"] } 31 | arrow-schema = { version = "55.1.0", features = ["serde"] } 32 | parquet = { version = "55.1.0", features = ["async", "experimental"] } 33 | datafusion = { version = "47.0.0" } 34 | datafusion-proto = { version = "47.0.0" } 35 | async-trait = "0.1.88" 36 | futures = { version = "0.3.31", default-features = false, features = ["std"] } 37 | tokio = { version = "1.45.0", features = ["rt-multi-thread"] } 38 | log = "0.4.27" 39 | tonic = { version = "0.12" } 40 | url = "2.5.4" 41 | itertools = "0.14.0" 42 | bytes = { version = "1.10.1", default-features = false } 43 | ahash = "0.8.12" 44 | prost = "0.13.5" 45 | object_store = { version = "0.12.1", default-features = false } 46 | serde = { version = "1.0", default-features = false, features = ["derive"] } 47 | serde_json = { version = "1.0", default-features = false, features = ["std"] } 48 | tempfile = "3.20.0" 49 | uuid = { version = "1.16.0", features = ["v4"] } 50 | fastrace = "0.7" 51 | fastrace-futures = "0.7" 52 | fastrace-tonic = "0.1" 53 | congee = "0.4.1" 54 | 55 | 56 | [profile.dev.package] 57 | insta.opt-level = 3 58 | 59 | [patch.crates-io] 60 | # datafusion = { path = "../datafusion/datafusion/core" } 61 | # datafusion-proto = { path = "../datafusion/datafusion/proto" } 62 | -------------------------------------------------------------------------------- /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | tpch/answers 3 | -------------------------------------------------------------------------------- /benchmark/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-benchmarks" 3 | description = "LiquidCache Benchmarks" 4 | edition = { workspace = true } 5 | publish = false 6 | 7 | [dependencies] 8 | liquid-cache-server = { workspace = true } 9 | liquid-cache-client = { workspace = true } 10 | liquid-cache-parquet = { workspace = true } 11 | liquid-cache-common = { workspace = true } 12 | async-trait = { workspace = true } 13 | futures = { workspace = true } 14 | datafusion = { workspace = true } 15 | tokio = { workspace = true } 16 | log = { workspace = true } 17 | arrow-flight = { workspace = true } 18 | tonic = { workspace = true } 19 | clap = { version = "4.5.38", features = ["derive"] } 20 | url = { workspace = true } 21 | mimalloc = "0.1.46" 22 | serde_json.workspace = true 23 | serde.workspace = true 24 | sysinfo = { version = "0.35.1", default-features = false, features = [ 25 | "network", 26 | ] } 27 | object_store = { workspace = true, features = ["http"] } 28 | bytes = { workspace = true } 29 | prost = { workspace = true } 30 | fsst-rs = "0.5.2" 31 | parquet = { workspace = true } 32 | fastrace = { version = "0.7.9", features = ["enable"] } 33 | fastrace-tonic = { workspace = true } 34 | fastrace-opentelemetry = "0.10" 35 | opentelemetry = "0.29.1" 36 | opentelemetry_sdk = "0.29.0" 37 | opentelemetry-otlp = { version = "0.29.0", 
features = ["trace", "grpc-tonic"] } 38 | tower = "0.5.2" 39 | logforth = { version = "0.25.0", features = ["opentelemetry"] } 40 | reqwest = { version = "0.12.15", default-features = false, features = ["json"] } 41 | uuid = { version = "1.13.0", features = ["v4"] } 42 | 43 | [[bin]] 44 | name = "clickbench_client" 45 | path = "clickbench/clickbench_client.rs" 46 | 47 | [[bin]] 48 | name = "tpch_client" 49 | path = "tpch/tpch_client.rs" 50 | 51 | [[bin]] 52 | name = "bench_server" 53 | path = "bench_server.rs" 54 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Guide 2 | 3 | ## [ClickBench](https://github.com/ClickHouse/ClickBench) 4 | 5 | ### Download dataset 6 | To download partitioned dataset (~100MB): 7 | ```bash 8 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet -O benchmark/data/hits_0.parquet 9 | ``` 10 | 11 | To download the entire dataset (~15GB): 12 | 13 | ```bash 14 | wget https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet -O benchmark/clickbench/data/hits.parquet 15 | ``` 16 | 17 | To download the partitioned dataset (100 files, ~150MB each): 18 | ```bash 19 | for i in (seq 0 99) 20 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_$i.parquet -O benchmark/clickbench/data/partitioned/hits_$i.parquet 21 | end 22 | ``` 23 | Or bash : 24 | ```bash 25 | for i in {0..99}; do 26 | wget https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_$i.parquet -O benchmark/clickbench/data/partitioned/hits_$i.parquet 27 | done 28 | ``` 29 | 30 | ### Run benchmarks 31 | 32 | #### Minimal 33 | 34 | ```bash 35 | cargo run --release --bin bench_server 36 | cargo run --release --bin clickbench_client -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet 37 | ``` 38 | 39 | #### Advanced 40 | 41 | ```bash 42 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin bench_server -- --cache-mode liquid_eager_transcode 43 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin clickbench_client -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet --query 42 44 | ``` 45 | 46 | ## TPCH 47 | 48 | ### Generate data 49 | 50 | (make sure you have [uv](https://docs.astral.sh/uv/getting-started/installation/) installed) 51 | 52 | ```bash 53 | cd benchmark/tpch 54 | uvx --from duckdb python tpch_gen.py --scale 0.01 55 | ``` 56 | 57 | In NixOS, you want to set `env LD_LIBRARY_PATH=$NIX_LD_LIBRARY_PATH` 58 | 59 | ### Run server (same as ClickBench) 60 | 61 | ```bash 62 | cargo run --release --bin bench_server -- --cache-mode liquid_eager_transcode 63 | ``` 64 | 65 | ### Run client 66 | 67 | ```bash 68 | env RUST_LOG=info,clickbench_client=debug RUSTFLAGS='-C target-cpu=native' cargo run --release --bin tpch_client -- --query-dir benchmark/tpch/queries/ --data-dir benchmark/tpch/data/sf0.1 --iteration 3 --answer-dir benchmark/tpch/answers/sf0.1 69 | ``` 70 | 71 | 72 | 73 | ## Profile 74 | 75 | ### Flamegraph 76 | 77 | To collect flamegraph from server side, simply add `--flamegraph-dir benchmark/data/flamegraph` to the server command, for example: 78 | ```bash 79 | cargo run --release --bin bench_server -- --flamegraph-dir benchmark/data/flamegraph 80 | ``` 81 | It will generate flamegraph 
for each query that the server executed. 82 | 83 | ### Cache stats 84 | 85 | To collect cache stats, add `--stats-dir benchmark/data/cache_stats` to the server command, for example: 86 | ```bash 87 | cargo run --release --bin bench_server -- --stats-dir benchmark/data/cache_stats 88 | ``` 89 | It will generate a parquet file that contains the cache stats for each query that the server executed. 90 | You can use [`parquet-viewer`](https://parquet-viewer.xiangpeng.systems) to view the stats in the browser. 91 | 92 | ### Collect cache trace 93 | 94 | To collect a cache trace, add `--cache-trace-dir benchmark/data/cache_trace` to the client command, for example: 95 | ```bash 96 | env RUST_LOG=info cargo run --bin clickbench_client --release -- --query-path benchmark/clickbench/queries/queries.sql --file benchmark/clickbench/data/hits.parquet --query 20 --iteration 2 --partitions 8 --cache-trace-dir benchmark/data/ 97 | ``` 98 | It will generate a parquet file that contains the cache trace for each query that the server executed. 99 | 100 | 101 | ### Run encoding benchmarks 102 | 103 | ```bash 104 | RUST_LOG=info RUSTFLAGS='-C target-cpu=native' cargo run --release --bin encoding -- --file benchmark/clickbench/data/hits.parquet --column 2 105 | ``` 106 | This will benchmark the encoding time of the `URL` column. 107 | -------------------------------------------------------------------------------- /benchmark/bench_server.rs: -------------------------------------------------------------------------------- 1 | use arrow_flight::flight_service_server::FlightServiceServer; 2 | use clap::Parser; 3 | use fastrace_tonic::FastraceServerLayer; 4 | use liquid_cache_benchmarks::setup_observability; 5 | use liquid_cache_common::{CacheEvictionStrategy, CacheMode}; 6 | use liquid_cache_server::{LiquidCacheService, run_admin_server}; 7 | use log::info; 8 | use mimalloc::MiMalloc; 9 | use std::{net::SocketAddr, path::PathBuf, sync::Arc}; 10 | use tonic::transport::Server; 11 | 12 | #[global_allocator] 13 | static GLOBAL: MiMalloc = MiMalloc; 14 | 15 | #[derive(Parser)] 16 | #[command(name = "ClickBench Benchmark Server")] 17 | struct CliArgs { 18 | /// Address to listen on 19 | #[arg(long, default_value = "127.0.0.1:15214")] 20 | address: SocketAddr, 21 | 22 | /// HTTP address for admin endpoint 23 | #[arg(long = "admin-address", default_value = "127.0.0.1:53703")] 24 | admin_address: SocketAddr, 25 | 26 | /// Abort the server if any thread panics 27 | #[arg(long = "abort-on-panic")] 28 | abort_on_panic: bool, 29 | 30 | /// Maximum cache size in MB 31 | #[arg(long = "max-cache-mb")] 32 | max_cache_mb: Option<usize>, 33 | 34 | /// Path to disk cache directory 35 | #[arg(long = "disk-cache-dir")] 36 | disk_cache_dir: Option<PathBuf>, 37 | 38 | /// Cache mode 39 | #[arg(long = "cache-mode", default_value = "liquid_eager_transcode")] 40 | cache_mode: CacheMode, 41 | 42 | /// OpenObserve auth token 43 | #[arg(long)] 44 | openobserve_auth: Option<String>, 45 | } 46 | 47 | #[tokio::main] 48 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 49 | let args = CliArgs::parse(); 50 | setup_observability( 51 | "liquid-cache-server", 52 | opentelemetry::trace::SpanKind::Server, 53 | args.openobserve_auth.as_deref(), 54 | ); 55 | 56 | let max_cache_bytes = args.max_cache_mb.map(|size| size * 1024 * 1024); 57 | 58 | if args.abort_on_panic { 59 | // Crash loudly if any thread panics. 60 | // This stops the whole server on the first panic, 61 | // but it also prevents a debugger from breaking on panics.
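// (By default a panic unwinds only its own thread, and Tokio catches task panics, so without this hook a panicked worker would leave the server running in a degraded state.)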
62 | std::panic::set_hook(Box::new(|info| { 63 | eprintln!("Some thread panicked: {info:?}"); 64 | std::process::exit(1); 65 | })); 66 | } 67 | 68 | let ctx = LiquidCacheService::context()?; 69 | let liquid_cache_server = LiquidCacheService::new( 70 | ctx, 71 | max_cache_bytes, 72 | args.disk_cache_dir.clone(), 73 | args.cache_mode, 74 | CacheEvictionStrategy::Discard, 75 | )?; 76 | 77 | let liquid_cache_server = Arc::new(liquid_cache_server); 78 | let flight = FlightServiceServer::from_arc(liquid_cache_server.clone()); 79 | 80 | info!("LiquidCache server listening on {}", args.address); 81 | info!("Admin server listening on {}", args.admin_address); 82 | info!( 83 | "Dashboard: https://liquid-cache-admin.xiangpeng.systems/?host=http://{}", 84 | args.admin_address 85 | ); 86 | 87 | // Run both servers concurrently 88 | tokio::select! { 89 | result = Server::builder().layer(FastraceServerLayer).add_service(flight).serve(args.address) => { 90 | result?; 91 | }, 92 | result = run_admin_server(args.admin_address, liquid_cache_server) => { 93 | result?; 94 | }, 95 | } 96 | 97 | fastrace::flush(); 98 | Ok(()) 99 | } 100 | -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q0.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q1.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q10.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q10.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q11.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q11.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q12.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q12.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q13.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q13.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q14.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q14.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q15.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q15.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q16.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q16.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q17.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q17.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q18.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q18.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q2.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q20.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q20.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q21.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q21.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q22.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q22.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q23.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q23.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q24.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q24.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q25.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q25.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q26.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q26.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q27.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q27.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q28.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q28.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q29.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q29.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q3.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q3.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q30.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q30.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q31.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q31.parquet 
-------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q32.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q32.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q33.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q33.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q34.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q34.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q35.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q35.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q36.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q36.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q37.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q37.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q38.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q38.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q39.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q39.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q4.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q40.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q40.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q42.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q42.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q5.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q5.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q6.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q6.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q7.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q8.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/answers/clickbench_0/Q9.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/answers/clickbench_0/Q9.parquet -------------------------------------------------------------------------------- /benchmark/clickbench/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/clickbench/data/.gitkeep -------------------------------------------------------------------------------- /benchmark/clickbench/queries/query_select.sql: -------------------------------------------------------------------------------- 1 | SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; 2 | SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; 3 | SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; 4 | SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; 5 | SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM 
hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; 6 | SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; 7 | SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; 8 | -------------------------------------------------------------------------------- /benchmark/src/bin/create_nano_hits.rs: -------------------------------------------------------------------------------- 1 | use parquet::arrow::ArrowWriter; 2 | use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; 3 | use parquet::file::properties::WriterProperties; 4 | use std::fs::File; 5 | 6 | fn main() { 7 | let file = File::open("benchmark/clickbench/data/hits.parquet").unwrap(); 8 | let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); 9 | 10 | let compression_alg = builder.metadata().row_groups()[0].columns()[0].compression(); 11 | 12 | let mut arrow_reader = builder.with_batch_size(8192 * 3).build().unwrap(); 13 | 14 | let batch_one = arrow_reader.next().unwrap().unwrap(); 15 | let batch_two = arrow_reader.next().unwrap().unwrap(); 16 | 17 | let props = WriterProperties::builder() 18 | .set_compression(compression_alg) 19 | .set_max_row_group_size(8192 * 3) 20 | .build(); 21 | 22 | let file = File::create("examples/nano_hits.parquet").unwrap(); 23 | 24 | let mut writer = ArrowWriter::try_new(file, batch_one.schema(), Some(props)).unwrap(); 25 | 26 | writer 27 | .write(&batch_one) 28 | .expect("Writing batch 1 (Full Batch)"); 29 | writer 30 | .write(&batch_two.slice(0, 10)) 31 | .expect("Writing batch 2 (Small Batch)"); 32 | 33 | writer.close().unwrap(); 34 | } 35 | -------------------------------------------------------------------------------- /benchmark/src/observability.rs: -------------------------------------------------------------------------------- 1 | use fastrace_opentelemetry::OpenTelemetryReporter; 2 | use logforth::append::opentelemetry::OpentelemetryLogBuilder; 3 | use logforth::filter::EnvFilter; 4 | use opentelemetry::InstrumentationScope; 5 | use opentelemetry::KeyValue; 6 | use opentelemetry::trace::SpanKind; 7 | use opentelemetry_otlp::LogExporter; 8 | use opentelemetry_otlp::WithExportConfig; 9 | use opentelemetry_otlp::{SpanExporter, WithTonicConfig}; 10 | use opentelemetry_sdk::Resource; 11 | use std::borrow::Cow; 12 | use tonic::metadata::MetadataMap; 13 | 14 | fn otl_metadata(auth: &str) -> MetadataMap { 15 | let mut map = MetadataMap::with_capacity(3); 16 | map.insert("authorization", format!("Basic {auth}").parse().unwrap()); 17 | map.insert("organization", "default".parse().unwrap()); 18 | map.insert("stream-name", "default".parse().unwrap()); 19 | map 20 | } 21 | 22 | pub fn setup_observability(service_name: &str, kind: SpanKind, auth: Option<&str>) { 23 | let Some(auth) = auth else { 24 | logforth::builder() 25 | .dispatch(|d| { 26 | d.filter(EnvFilter::from_default_env()) 27 | .append(logforth::append::Stdout::default()) 28 | }) 29 | .apply(); 30 | return; 31 | }; 32 | 33 | // Setup logging with logforth 34 | let log_exporter = LogExporter::builder() 35 | .with_tonic() 36 | .with_endpoint("http://localhost:5081/api/development".to_string()) 37 | .with_protocol(opentelemetry_otlp::Protocol::Grpc) 38 | .with_metadata(otl_metadata(auth)) 39 | .build() 40 | .unwrap(); 41 | logforth::builder() 42 | .dispatch(|d| { 43 | 
d.filter(EnvFilter::from_default_env()) 44 | .append(logforth::append::Stdout::default()) 45 | }) 46 | .dispatch(|d| { 47 | let otl_appender = OpentelemetryLogBuilder::new(service_name, log_exporter) 48 | .build() 49 | .unwrap(); 50 | d.filter(EnvFilter::from_default_env()).append(otl_appender) 51 | }) 52 | .apply(); 53 | 54 | let trace_exporter = OpenTelemetryReporter::new( 55 | SpanExporter::builder() 56 | .with_tonic() 57 | .with_endpoint("http://localhost:5081/api/development".to_string()) 58 | .with_metadata(otl_metadata(auth)) 59 | .with_protocol(opentelemetry_otlp::Protocol::Grpc) 60 | .build() 61 | .expect("initialize otlp exporter"), 62 | kind, 63 | Cow::Owned( 64 | Resource::builder() 65 | .with_attributes([KeyValue::new("service.name", service_name.to_string())]) 66 | .build(), 67 | ), 68 | InstrumentationScope::builder(env!("CARGO_PKG_NAME")) 69 | .with_version(env!("CARGO_PKG_VERSION")) 70 | .build(), 71 | ); 72 | fastrace::set_reporter(trace_exporter, fastrace::collector::Config::default()); 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/src/runner.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | BenchmarkResult, CommonBenchmarkArgs, IterationResult, Query, QueryResult, setup_observability, 3 | }; 4 | use datafusion::{ 5 | arrow::{array::RecordBatch, util::pretty}, 6 | error::Result, 7 | physical_plan::{ExecutionPlan, display::DisplayableExecutionPlan}, 8 | prelude::SessionContext, 9 | }; 10 | use fastrace::prelude::*; 11 | use log::{debug, info}; 12 | use serde::Serialize; 13 | use std::{fs::File, sync::Arc, time::Instant}; 14 | use sysinfo::Networks; 15 | use uuid::Uuid; 16 | 17 | /// Trait that benchmarks must implement 18 | #[allow(async_fn_in_trait)] 19 | pub trait Benchmark: Serialize + Clone { 20 | type Args: Serialize + Clone; 21 | 22 | /// Get the common benchmark arguments 23 | fn common_args(&self) -> &CommonBenchmarkArgs; 24 | 25 | /// Get the benchmark-specific arguments 26 | fn args(&self) -> &Self::Args; 27 | 28 | /// Setup the session context for this benchmark 29 | async fn setup_context(&self) -> Result<Arc<SessionContext>>; 30 | 31 | /// Get all queries to run for this benchmark 32 | async fn get_queries(&self) -> Result<Vec<Query>>; 33 | 34 | /// Validate query results against expected answers (optional) 35 | async fn validate_result(&self, query: &Query, results: &[RecordBatch]) -> Result<()>; 36 | 37 | /// Custom query execution logic (optional, for special cases like TPCH Q15) 38 | async fn execute_query( 39 | &self, 40 | ctx: &Arc<SessionContext>, 41 | query: &Query, 42 | ) -> Result<( 43 | Vec<RecordBatch>, 44 | Arc<dyn ExecutionPlan>, 45 | Vec<Uuid>, 46 | )>; 47 | 48 | /// Get the benchmark name for tracing 49 | fn benchmark_name(&self) -> &'static str; 50 | } 51 | 52 | /// Generic benchmark runner that handles the common execution logic 53 | pub struct BenchmarkRunner; 54 | 55 | impl BenchmarkRunner { 56 | /// Run a benchmark using the provided benchmark implementation 57 | pub async fn run<B: Benchmark>(benchmark: B) -> Result<BenchmarkResult<B::Args>> { 58 | let common = benchmark.common_args(); 59 | 60 | setup_observability( 61 | benchmark.benchmark_name(), 62 | opentelemetry::trace::SpanKind::Client, 63 | common.openobserve_auth.as_deref(), 64 | ); 65 | 66 | let ctx = benchmark.setup_context().await?; 67 | let queries = benchmark.get_queries().await?; 68 | let queries = if let Some(query) = common.query { 69 | vec![queries.into_iter().find(|q| q.id == query).unwrap()] 70 | } else { 71 | queries 72 | }; 73 | 74 | let mut benchmark_result = BenchmarkResult { 75 | args:
benchmark.args().clone(), 76 | results: Vec::new(), 77 | }; 78 | 79 | std::fs::create_dir_all("benchmark/data/results")?; 80 | 81 | let mut networks = Networks::new_with_refreshed_list(); 82 | let bench_start_time = Instant::now(); 83 | 84 | for query in queries { 85 | let mut query_result = QueryResult::new(query.id, query.sql.clone()); 86 | 87 | for it in 0..common.iteration { 88 | let iteration_result = Self::run_single_iteration( 89 | &benchmark, 90 | &ctx, 91 | &query, 92 | it, 93 | &mut networks, 94 | bench_start_time, 95 | ) 96 | .await?; 97 | 98 | query_result.add(iteration_result); 99 | } 100 | 101 | if common.reset_cache { 102 | common.reset_cache().await?; 103 | } 104 | 105 | benchmark_result.results.push(query_result); 106 | } 107 | 108 | if let Some(output_path) = &common.output { 109 | let output_file = File::create(output_path)?; 110 | serde_json::to_writer_pretty(output_file, &benchmark_result).unwrap(); 111 | } 112 | 113 | fastrace::flush(); 114 | Ok(benchmark_result) 115 | } 116 | 117 | async fn run_single_iteration<B: Benchmark>( 118 | benchmark: &B, 119 | ctx: &Arc<SessionContext>, 120 | query: &Query, 121 | iteration: u32, 122 | networks: &mut Networks, 123 | bench_start_time: Instant, 124 | ) -> Result<IterationResult> { 125 | let common = benchmark.common_args(); 126 | 127 | info!("Running query {}: \n{}", query.id, query.sql); 128 | 129 | common.start_trace().await; 130 | common.start_flamegraph().await; 131 | 132 | let root = Span::root( 133 | format!("{}-{}-{}", benchmark.benchmark_name(), query.id, iteration), 134 | SpanContext::random(), 135 | ); 136 | let _g = root.set_local_parent(); 137 | 138 | let now = Instant::now(); 139 | let starting_timestamp = bench_start_time.elapsed(); 140 | 141 | let (results, physical_plan, plan_uuid) = benchmark.execute_query(ctx, query).await?; 142 | let elapsed = now.elapsed(); 143 | 144 | networks.refresh(true); 145 | let network_info = networks 146 | .get("lo0") 147 | .or_else(|| networks.get("lo")) 148 | .expect("No loopback interface found in networks"); 149 | 150 | let flamegraph = if !plan_uuid.is_empty() { 151 | common.stop_flamegraph().await 152 | } else { 153 | None 154 | }; 155 | common.stop_trace().await; 156 | 157 | let physical_plan_with_metrics = 158 | DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()); 159 | debug!( 160 | "Physical plan: \n{}", 161 | physical_plan_with_metrics.indent(true) 162 | ); 163 | let result_str = pretty::pretty_format_batches(&results).unwrap(); 164 | debug!("Query result: \n{result_str}"); 165 | 166 | benchmark.validate_result(query, &results).await?; 167 | 168 | common.get_cache_stats().await; 169 | let network_traffic = network_info.received(); 170 | 171 | if !plan_uuid.is_empty() { 172 | common 173 | .set_execution_stats( 174 | plan_uuid, 175 | flamegraph, 176 | format!("{}-q{}-{}", benchmark.benchmark_name(), query.id, iteration), 177 | network_traffic, 178 | elapsed.as_millis() as u64, 179 | query.sql.clone(), 180 | ) 181 | .await; 182 | } 183 | 184 | let metrics_response = common.get_execution_metrics(&physical_plan).await; 185 | 186 | let result = IterationResult { 187 | network_traffic, 188 | time_millis: elapsed.as_millis() as u64, 189 | cache_cpu_time: metrics_response.pushdown_eval_time, 190 | cache_memory_usage: metrics_response.cache_memory_usage, 191 | liquid_cache_usage: metrics_response.liquid_cache_usage, 192 | starting_timestamp, 193 | }; 194 | 195 | result.log(); 196 | Ok(result) 197 | } 198 | } 199 | --------------------------------------------------------------------------------
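The `Benchmark` trait above is the extension point that `BenchmarkRunner::run` drives; `clickbench_client` and `tpch_client` are the two in-tree implementors. Below is a minimal sketch of what a third implementor could look like, assuming `Benchmark`, `CommonBenchmarkArgs`, and `Query` are importable from the crate root (as the `crate::{...}` import in runner.rs suggests) and that `Query` exposes public `id`/`sql` fields; `SimpleBench`, `SimpleArgs`, and the query literal are hypothetical:

```rust
use std::sync::Arc;

use datafusion::{
    arrow::array::RecordBatch,
    error::Result,
    physical_plan::ExecutionPlan,
    prelude::{ParquetReadOptions, SessionContext},
};
use liquid_cache_benchmarks::{Benchmark, CommonBenchmarkArgs, Query};
use serde::Serialize;
use uuid::Uuid;

#[derive(Serialize, Clone)]
struct SimpleBench {
    common: CommonBenchmarkArgs,
    args: SimpleArgs,
}

#[derive(Serialize, Clone)]
struct SimpleArgs {
    file: String,
}

impl Benchmark for SimpleBench {
    type Args = SimpleArgs;

    fn common_args(&self) -> &CommonBenchmarkArgs {
        &self.common
    }

    fn args(&self) -> &Self::Args {
        &self.args
    }

    fn benchmark_name(&self) -> &'static str {
        "simple"
    }

    async fn setup_context(&self) -> Result<Arc<SessionContext>> {
        // Plain DataFusion context; a real implementor would register the
        // LiquidCache client table provider instead.
        let ctx = SessionContext::new();
        ctx.register_parquet("hits", &self.args.file, ParquetReadOptions::default())
            .await?;
        Ok(Arc::new(ctx))
    }

    async fn get_queries(&self) -> Result<Vec<Query>> {
        // Assumes `Query` has public `id` and `sql` fields, as read in runner.rs.
        Ok(vec![Query {
            id: 0,
            sql: "SELECT COUNT(*) FROM hits".to_string(),
        }])
    }

    async fn validate_result(&self, _query: &Query, _results: &[RecordBatch]) -> Result<()> {
        Ok(()) // this sketch ships no answer files
    }

    async fn execute_query(
        &self,
        ctx: &Arc<SessionContext>,
        query: &Query,
    ) -> Result<(Vec<RecordBatch>, Arc<dyn ExecutionPlan>, Vec<Uuid>)> {
        let df = ctx.sql(&query.sql).await?;
        let (state, logical_plan) = df.into_parts();
        let physical_plan = state.create_physical_plan(&logical_plan).await?;
        let batches =
            datafusion::physical_plan::collect(physical_plan.clone(), state.task_ctx()).await?;
        // No LiquidCacheClientExec in this local plan, hence no plan UUIDs.
        Ok((batches, physical_plan, Vec::new()))
    }
}
```

`BenchmarkRunner::run(bench).await` then handles iteration, tracing, cache resets, and result serialization exactly as in the loop above.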
/benchmark/src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use datafusion::arrow; 4 | use datafusion::arrow::array::Array; 5 | use datafusion::arrow::compute::kernels::numeric::{div, sub_wrapping}; 6 | use datafusion::arrow::datatypes::Float64Type; 7 | use datafusion::arrow::{ 8 | array::{AsArray, RecordBatch}, 9 | datatypes::DataType, 10 | }; 11 | use datafusion::common::tree_node::TreeNode; 12 | use datafusion::physical_plan::ExecutionPlan; 13 | use liquid_cache_client::LiquidCacheClientExec; 14 | use log::warn; 15 | use uuid::Uuid; 16 | 17 | pub(crate) fn get_plan_uuids(plan: &Arc<dyn ExecutionPlan>) -> Vec<Uuid> { 18 | let mut uuids = Vec::new(); 19 | plan.apply(|plan| { 20 | if let Some(plan) = plan.as_any().downcast_ref::<LiquidCacheClientExec>() { 21 | uuids.push(plan.get_uuid()); 22 | } 23 | Ok(datafusion::common::tree_node::TreeNodeRecursion::Continue) 24 | }) 25 | .unwrap(); 26 | uuids 27 | } 28 | 29 | fn float_eq_helper(left: &dyn Array, right: &dyn Array, tol: f64) -> bool { 30 | let diff = sub_wrapping(&left, &right).unwrap(); 31 | let diff = arrow::compute::kernels::cast(&diff, &DataType::Float64).unwrap(); 32 | let diff = diff.as_primitive_opt::<Float64Type>().unwrap(); 33 | 34 | // Check if all differences are within tolerance 35 | if diff.iter().flatten().all(|v| v.abs() <= tol) { 36 | return true; 37 | } 38 | 39 | let scale = div(&diff, &left).unwrap(); 40 | let scale = arrow::compute::kernels::cast(&scale, &DataType::Float64).unwrap(); 41 | let scale = scale.as_primitive_opt::<Float64Type>().unwrap(); 42 | for d in scale.iter().flatten() { 43 | if d.abs() > tol { 44 | warn!("scale: {scale:?}"); 45 | return false; 46 | } 47 | } 48 | true 49 | } 50 | 51 | pub fn assert_batch_eq(expected: &RecordBatch, actual: &RecordBatch) { 52 | use datafusion::arrow::compute::*; 53 | 54 | if expected.num_rows() != actual.num_rows() { 55 | panic!( 56 | "Left (answer) had {} rows, but right (result) had {} rows", 57 | expected.num_rows(), 58 | actual.num_rows() 59 | ); 60 | } 61 | if expected.columns().len() != actual.columns().len() { 62 | panic!( 63 | "Left (answer) had {} cols, but right (result) had {} cols", 64 | expected.columns().len(), 65 | actual.columns().len() 66 | ); 67 | } 68 | for (i, (c_expected, c_actual)) in expected 69 | .columns() 70 | .iter() 71 | .zip(actual.columns().iter()) 72 | .enumerate() 73 | { 74 | let casted_expected = cast(c_expected, c_actual.data_type()).unwrap(); 75 | let sorted_expected = sort(&casted_expected, None).unwrap(); 76 | let sorted_actual = sort(c_actual, None).unwrap(); 77 | 78 | let data_type = c_expected.data_type(); 79 | let tol: f64 = 1e-4; 80 | let ok = match data_type { 81 | DataType::Float16 => { 82 | unreachable!() 83 | } 84 | DataType::Float32 | DataType::Float64 => { 85 | float_eq_helper(&sorted_expected, &sorted_actual, tol) 86 | } 87 | _ => { 88 | let eq = 89 | arrow::compute::kernels::cmp::eq(&sorted_expected, &sorted_actual).unwrap(); 90 | eq.false_count() == 0 91 | } 92 | }; 93 | assert!( 94 | ok, 95 | "Column {} answer does not match result\nExpected: {:?}\n Actual: {:?}", 96 | expected.schema().field(i).name(), 97 | sorted_expected, 98 | sorted_actual 99 | ); 100 | } 101 | } 102 | 103 | #[cfg(test)] 104 | mod tests { 105 | 106 | use super::*; 107 | use datafusion::arrow::array::Float64Array; 108 | 109 | #[test] 110 | fn test_float_eq() { 111 | let left = Float64Array::from(vec![ 112 | 1.9481948778949233e18, 113 | 1.9481948778941111e18, 114 | 1.9481948778949233e18, 115 | 0.00, 116 | ]); 117 | let right =
Float64Array::from(vec![ 118 | 1.948194877894922e18, 119 | 1.9481948778942222e18, 120 | 1.948194877894922e18, 121 | 0.00, 122 | ]); 123 | assert!(float_eq_helper(&left, &right, 1e-9)); 124 | 125 | let left = Float64Array::from(vec![0.00]); 126 | let right = Float64Array::from(vec![0.00]); 127 | assert!(float_eq_helper(&left, &right, 1e-9)); 128 | } 129 | 130 | #[should_panic] 131 | #[test] 132 | fn test_float_eq_helper() { 133 | let left = Float64Array::from(vec![0.00]); 134 | let right = Float64Array::from(vec![1.00]); 135 | assert!(float_eq_helper(&left, &right, 1e-9)); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /benchmark/tpch/answers/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/answers/.gitkeep -------------------------------------------------------------------------------- /benchmark/tpch/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/.gitkeep -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/customer.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/customer.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/lineitem.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/lineitem.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/nation.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/nation.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/orders.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/orders.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/part.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/part.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/partsupp.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/partsupp.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/region.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/region.parquet -------------------------------------------------------------------------------- /benchmark/tpch/data/sf0.001/supplier.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/benchmark/tpch/data/sf0.001/supplier.parquet -------------------------------------------------------------------------------- /benchmark/tpch/queries/q1.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_returnflag, 3 | l_linestatus, 4 | sum(l_quantity) as sum_qty, 5 | sum(l_extendedprice) as sum_base_price, 6 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 7 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 8 | avg(l_quantity) as avg_qty, 9 | avg(l_extendedprice) as avg_price, 10 | avg(l_discount) as avg_disc, 11 | count(*) as count_order 12 | from 13 | lineitem 14 | where 15 | l_shipdate <= date '1998-09-02' 16 | group by 17 | l_returnflag, 18 | l_linestatus 19 | order by 20 | l_returnflag, 21 | l_linestatus; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q10.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_custkey, 3 | c_name, 4 | sum(l_extendedprice * (1 - l_discount)) as revenue, 5 | c_acctbal, 6 | n_name, 7 | c_address, 8 | c_phone, 9 | c_comment 10 | from 11 | customer, 12 | orders, 13 | lineitem, 14 | nation 15 | where 16 | c_custkey = o_custkey 17 | and l_orderkey = o_orderkey 18 | and o_orderdate >= date '1993-10-01' 19 | and o_orderdate < date '1994-01-01' 20 | and l_returnflag = 'R' 21 | and c_nationkey = n_nationkey 22 | group by 23 | c_custkey, 24 | c_name, 25 | c_acctbal, 26 | c_phone, 27 | n_name, 28 | c_address, 29 | c_comment 30 | order by 31 | revenue desc 32 | LIMIT 20; 33 | -------------------------------------------------------------------------------- /benchmark/tpch/queries/q11.sql: -------------------------------------------------------------------------------- 1 | select 2 | ps_partkey, 3 | sum(ps_supplycost * ps_availqty) as value 4 | from 5 | partsupp, 6 | supplier, 7 | nation 8 | where 9 | ps_suppkey = s_suppkey 10 | and s_nationkey = n_nationkey 11 | and n_name = 'GERMANY' 12 | group by 13 | ps_partkey having 14 | sum(ps_supplycost * ps_availqty) > ( 15 | select 16 | sum(ps_supplycost * ps_availqty) * 0.0001 17 | from 18 | partsupp, 19 | supplier, 20 | nation 21 | where 22 | ps_suppkey = s_suppkey 23 | and s_nationkey = n_nationkey 24 | and n_name = 'GERMANY' 25 | ) 26 | order by 27 | value desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q12.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_shipmode, 3 | sum(case 4 | when o_orderpriority = '1-URGENT' 5 | or o_orderpriority = '2-HIGH' 6 | then 1 7 | else 0 8 | end) as high_line_count, 9 | sum(case 10 | when o_orderpriority <> '1-URGENT' 11 | and o_orderpriority <> '2-HIGH' 12 | then 1 13 | else 0 14 | end) as low_line_count 15 | from 16 | lineitem 17 | join 18 | orders 19 | on 20 | l_orderkey = o_orderkey 21 | where 22 | l_shipmode in ('MAIL', 'SHIP') 23 | and l_commitdate < 
l_receiptdate 24 | and l_shipdate < l_commitdate 25 | and l_receiptdate >= date '1994-01-01' 26 | and l_receiptdate < date '1995-01-01' 27 | group by 28 | l_shipmode 29 | order by 30 | l_shipmode; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q13.sql: -------------------------------------------------------------------------------- 1 | select 2 | c_count, 3 | count(*) as custdist 4 | from 5 | ( 6 | select 7 | c_custkey, 8 | count(o_orderkey) 9 | from 10 | customer left outer join orders on 11 | c_custkey = o_custkey 12 | and o_comment not like '%special%requests%' 13 | group by 14 | c_custkey 15 | ) as c_orders (c_custkey, c_count) 16 | group by 17 | c_count 18 | order by 19 | custdist desc, 20 | c_count desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q14.sql: -------------------------------------------------------------------------------- 1 | select 2 | 100.00 * sum(case 3 | when p_type like 'PROMO%' 4 | then l_extendedprice * (1 - l_discount) 5 | else 0 6 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 7 | from 8 | lineitem, 9 | part 10 | where 11 | l_partkey = p_partkey 12 | and l_shipdate >= date '1995-09-01' 13 | and l_shipdate < date '1995-10-01'; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q15.sql: -------------------------------------------------------------------------------- 1 | create view revenue0 (supplier_no, total_revenue) as 2 | select 3 | l_suppkey, 4 | sum(l_extendedprice * (1 - l_discount)) 5 | from 6 | lineitem 7 | where 8 | l_shipdate >= date '1996-01-01' 9 | and l_shipdate < date '1996-01-01' + interval '3' month 10 | group by 11 | l_suppkey; 12 | 13 | 14 | select 15 | s_suppkey, 16 | s_name, 17 | s_address, 18 | s_phone, 19 | total_revenue 20 | from 21 | supplier, 22 | revenue0 23 | where 24 | s_suppkey = supplier_no 25 | and total_revenue = ( 26 | select 27 | max(total_revenue) 28 | from 29 | revenue0 30 | ) 31 | order by 32 | s_suppkey; 33 | 34 | drop view revenue0; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q16.sql: -------------------------------------------------------------------------------- 1 | select 2 | p_brand, 3 | p_type, 4 | p_size, 5 | count(distinct ps_suppkey) as supplier_cnt 6 | from 7 | partsupp, 8 | part 9 | where 10 | p_partkey = ps_partkey 11 | and p_brand <> 'Brand#45' 12 | and p_type not like 'MEDIUM POLISHED%' 13 | and p_size in (49, 14, 23, 45, 19, 3, 36, 9) 14 | and ps_suppkey not in ( 15 | select 16 | s_suppkey 17 | from 18 | supplier 19 | where 20 | s_comment like '%Customer%Complaints%' 21 | ) 22 | group by 23 | p_brand, 24 | p_type, 25 | p_size 26 | order by 27 | supplier_cnt desc, 28 | p_brand, 29 | p_type, 30 | p_size; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q17.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice) / 7.0 as avg_yearly 3 | from 4 | lineitem, 5 | part 6 | where 7 | p_partkey = l_partkey 8 | and p_brand = 'Brand#23' 9 | and p_container = 'MED BOX' 10 | and l_quantity < ( 11 | select 12 | 0.2 * avg(l_quantity) 13 | from 14 | lineitem 15 | where 16 | l_partkey = p_partkey 17 | ); -------------------------------------------------------------------------------- /benchmark/tpch/queries/q18.sql: 
-------------------------------------------------------------------------------- 1 | select 2 | c_name, 3 | c_custkey, 4 | o_orderkey, 5 | o_orderdate, 6 | o_totalprice, 7 | sum(l_quantity) 8 | from 9 | customer, 10 | orders, 11 | lineitem 12 | where 13 | o_orderkey in ( 14 | select 15 | l_orderkey 16 | from 17 | lineitem 18 | group by 19 | l_orderkey having 20 | sum(l_quantity) > 300 21 | ) 22 | and c_custkey = o_custkey 23 | and o_orderkey = l_orderkey 24 | group by 25 | c_name, 26 | c_custkey, 27 | o_orderkey, 28 | o_orderdate, 29 | o_totalprice 30 | order by 31 | o_totalprice desc, 32 | o_orderdate; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q19.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice* (1 - l_discount)) as revenue 3 | from 4 | lineitem, 5 | part 6 | where 7 | ( 8 | p_partkey = l_partkey 9 | and p_brand = 'Brand#12' 10 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 11 | and l_quantity >= 1 and l_quantity <= 1 + 10 12 | and p_size between 1 and 5 13 | and l_shipmode in ('AIR', 'AIR REG') 14 | and l_shipinstruct = 'DELIVER IN PERSON' 15 | ) 16 | or 17 | ( 18 | p_partkey = l_partkey 19 | and p_brand = 'Brand#23' 20 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 21 | and l_quantity >= 10 and l_quantity <= 10 + 10 22 | and p_size between 1 and 10 23 | and l_shipmode in ('AIR', 'AIR REG') 24 | and l_shipinstruct = 'DELIVER IN PERSON' 25 | ) 26 | or 27 | ( 28 | p_partkey = l_partkey 29 | and p_brand = 'Brand#34' 30 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 31 | and l_quantity >= 20 and l_quantity <= 20 + 10 32 | and p_size between 1 and 15 33 | and l_shipmode in ('AIR', 'AIR REG') 34 | and l_shipinstruct = 'DELIVER IN PERSON' 35 | ); -------------------------------------------------------------------------------- /benchmark/tpch/queries/q2.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_acctbal, 3 | s_name, 4 | n_name, 5 | p_partkey, 6 | p_mfgr, 7 | s_address, 8 | s_phone, 9 | s_comment 10 | from 11 | part, 12 | supplier, 13 | partsupp, 14 | nation, 15 | region 16 | where 17 | p_partkey = ps_partkey 18 | and s_suppkey = ps_suppkey 19 | and p_size = 15 20 | and p_type like '%BRASS' 21 | and s_nationkey = n_nationkey 22 | and n_regionkey = r_regionkey 23 | and r_name = 'EUROPE' 24 | and ps_supplycost = ( 25 | select 26 | min(ps_supplycost) 27 | from 28 | partsupp, 29 | supplier, 30 | nation, 31 | region 32 | where 33 | p_partkey = ps_partkey 34 | and s_suppkey = ps_suppkey 35 | and s_nationkey = n_nationkey 36 | and n_regionkey = r_regionkey 37 | and r_name = 'EUROPE' 38 | ) 39 | order by 40 | s_acctbal desc, 41 | n_name, 42 | s_name, 43 | p_partkey; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q20.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | s_address 4 | from 5 | supplier, 6 | nation 7 | where 8 | s_suppkey in ( 9 | select 10 | ps_suppkey 11 | from 12 | partsupp 13 | where 14 | ps_partkey in ( 15 | select 16 | p_partkey 17 | from 18 | part 19 | where 20 | p_name like 'forest%' 21 | ) 22 | and ps_availqty > ( 23 | select 24 | 0.5 * sum(l_quantity) 25 | from 26 | lineitem 27 | where 28 | l_partkey = ps_partkey 29 | and l_suppkey = ps_suppkey 30 | and l_shipdate >= date '1994-01-01' 31 | and l_shipdate < date 
'1994-01-01' + interval '1' year 32 | ) 33 | ) 34 | and s_nationkey = n_nationkey 35 | and n_name = 'CANADA' 36 | order by 37 | s_name; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q21.sql: -------------------------------------------------------------------------------- 1 | select 2 | s_name, 3 | count(*) as numwait 4 | from 5 | supplier, 6 | lineitem l1, 7 | orders, 8 | nation 9 | where 10 | s_suppkey = l1.l_suppkey 11 | and o_orderkey = l1.l_orderkey 12 | and o_orderstatus = 'F' 13 | and l1.l_receiptdate > l1.l_commitdate 14 | and exists ( 15 | select 16 | * 17 | from 18 | lineitem l2 19 | where 20 | l2.l_orderkey = l1.l_orderkey 21 | and l2.l_suppkey <> l1.l_suppkey 22 | ) 23 | and not exists ( 24 | select 25 | * 26 | from 27 | lineitem l3 28 | where 29 | l3.l_orderkey = l1.l_orderkey 30 | and l3.l_suppkey <> l1.l_suppkey 31 | and l3.l_receiptdate > l3.l_commitdate 32 | ) 33 | and s_nationkey = n_nationkey 34 | and n_name = 'SAUDI ARABIA' 35 | group by 36 | s_name 37 | order by 38 | numwait desc, 39 | s_name; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q22.sql: -------------------------------------------------------------------------------- 1 | select 2 | cntrycode, 3 | count(*) as numcust, 4 | sum(c_acctbal) as totacctbal 5 | from 6 | ( 7 | select 8 | substring(c_phone from 1 for 2) as cntrycode, 9 | c_acctbal 10 | from 11 | customer 12 | where 13 | substring(c_phone from 1 for 2) in 14 | ('13', '31', '23', '29', '30', '18', '17') 15 | and c_acctbal > ( 16 | select 17 | avg(c_acctbal) 18 | from 19 | customer 20 | where 21 | c_acctbal > 0.00 22 | and substring(c_phone from 1 for 2) in 23 | ('13', '31', '23', '29', '30', '18', '17') 24 | ) 25 | and not exists ( 26 | select 27 | * 28 | from 29 | orders 30 | where 31 | o_custkey = c_custkey 32 | ) 33 | ) as custsale 34 | group by 35 | cntrycode 36 | order by 37 | cntrycode; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q3.sql: -------------------------------------------------------------------------------- 1 | select 2 | l_orderkey, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue, 4 | o_orderdate, 5 | o_shippriority 6 | from 7 | customer, 8 | orders, 9 | lineitem 10 | where 11 | c_mktsegment = 'BUILDING' 12 | and c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and o_orderdate < date '1995-03-15' 15 | and l_shipdate > date '1995-03-15' 16 | group by 17 | l_orderkey, 18 | o_orderdate, 19 | o_shippriority 20 | order by 21 | revenue desc, 22 | o_orderdate 23 | LIMIT 10; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q4.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_orderpriority, 3 | count(*) as order_count 4 | from 5 | orders 6 | where 7 | o_orderdate >= '1993-07-01' 8 | and o_orderdate < date '1993-07-01' + interval '3' month 9 | and exists ( 10 | select 11 | * 12 | from 13 | lineitem 14 | where 15 | l_orderkey = o_orderkey 16 | and l_commitdate < l_receiptdate 17 | ) 18 | group by 19 | o_orderpriority 20 | order by 21 | o_orderpriority; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q5.sql: -------------------------------------------------------------------------------- 1 | select 2 | n_name, 3 | sum(l_extendedprice * (1 - l_discount)) as revenue 4 | from 5 | customer, 6 | 
orders, 7 | lineitem, 8 | supplier, 9 | nation, 10 | region 11 | where 12 | c_custkey = o_custkey 13 | and l_orderkey = o_orderkey 14 | and l_suppkey = s_suppkey 15 | and c_nationkey = s_nationkey 16 | and s_nationkey = n_nationkey 17 | and n_regionkey = r_regionkey 18 | and r_name = 'ASIA' 19 | and o_orderdate >= date '1994-01-01' 20 | and o_orderdate < date '1995-01-01' 21 | group by 22 | n_name 23 | order by 24 | revenue desc; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q6.sql: -------------------------------------------------------------------------------- 1 | select 2 | sum(l_extendedprice * l_discount) as revenue 3 | from 4 | lineitem 5 | where 6 | l_shipdate >= date '1994-01-01' 7 | and l_shipdate < date '1995-01-01' 8 | and l_discount between 0.06 - 0.01 and 0.06 + 0.01 9 | and l_quantity < 24; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q7.sql: -------------------------------------------------------------------------------- 1 | select 2 | supp_nation, 3 | cust_nation, 4 | l_year, 5 | sum(volume) as revenue 6 | from 7 | ( 8 | select 9 | n1.n_name as supp_nation, 10 | n2.n_name as cust_nation, 11 | extract(year from l_shipdate) as l_year, 12 | l_extendedprice * (1 - l_discount) as volume 13 | from 14 | supplier, 15 | lineitem, 16 | orders, 17 | customer, 18 | nation n1, 19 | nation n2 20 | where 21 | s_suppkey = l_suppkey 22 | and o_orderkey = l_orderkey 23 | and c_custkey = o_custkey 24 | and s_nationkey = n1.n_nationkey 25 | and c_nationkey = n2.n_nationkey 26 | and ( 27 | (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') 28 | or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') 29 | ) 30 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 31 | ) as shipping 32 | group by 33 | supp_nation, 34 | cust_nation, 35 | l_year 36 | order by 37 | supp_nation, 38 | cust_nation, 39 | l_year; 40 | -------------------------------------------------------------------------------- /benchmark/tpch/queries/q8.sql: -------------------------------------------------------------------------------- 1 | select 2 | o_year, 3 | sum(case 4 | when nation = 'BRAZIL' then volume 5 | else 0 6 | end) / sum(volume) as mkt_share 7 | from 8 | ( 9 | select 10 | extract(year from o_orderdate) as o_year, 11 | l_extendedprice * (1 - l_discount) as volume, 12 | n2.n_name as nation 13 | from 14 | part, 15 | supplier, 16 | lineitem, 17 | orders, 18 | customer, 19 | nation n1, 20 | nation n2, 21 | region 22 | where 23 | p_partkey = l_partkey 24 | and s_suppkey = l_suppkey 25 | and l_orderkey = o_orderkey 26 | and o_custkey = c_custkey 27 | and c_nationkey = n1.n_nationkey 28 | and n1.n_regionkey = r_regionkey 29 | and r_name = 'AMERICA' 30 | and s_nationkey = n2.n_nationkey 31 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 32 | and p_type = 'ECONOMY ANODIZED STEEL' 33 | ) as all_nations 34 | group by 35 | o_year 36 | order by 37 | o_year; -------------------------------------------------------------------------------- /benchmark/tpch/queries/q9.sql: -------------------------------------------------------------------------------- 1 | select 2 | nation, 3 | o_year, 4 | sum(amount) as sum_profit 5 | from 6 | ( 7 | select 8 | n_name as nation, 9 | extract(year from o_orderdate) as o_year, 10 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 11 | from 12 | part, 13 | supplier, 14 | lineitem, 15 | partsupp, 16 | orders, 17 | nation 18 | where 19 | 
s_suppkey = l_suppkey 20 | and ps_suppkey = l_suppkey 21 | and ps_partkey = l_partkey 22 | and p_partkey = l_partkey 23 | and o_orderkey = l_orderkey 24 | and s_nationkey = n_nationkey 25 | and p_name like '%green%' 26 | ) as profit 27 | group by 28 | nation, 29 | o_year 30 | order by 31 | nation, 32 | o_year desc; -------------------------------------------------------------------------------- /benchmark/tpch/tpch_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import duckdb 7 | from pathlib import Path 8 | 9 | def ensure_dir(dir_path): 10 | """Ensure directory exists""" 11 | Path(dir_path).mkdir(parents=True, exist_ok=True) 12 | 13 | def generate_tpch_data(scale_factor, data_dir, answers_dir): 14 | """Generate TPCH data at specified scale factor and save to parquet files""" 15 | print(f"Generating TPCH data with scale factor {scale_factor}") 16 | 17 | ensure_dir(data_dir) 18 | ensure_dir(answers_dir) 19 | 20 | conn = duckdb.connect(database=':memory:') 21 | 22 | try: 23 | conn.execute("INSTALL tpch") 24 | conn.execute("LOAD tpch") 25 | 26 | conn.execute(f"CALL dbgen(sf={scale_factor})") 27 | 28 | tables = ['lineitem', 'orders', 'customer', 'part', 'partsupp', 'supplier', 'nation', 'region'] 29 | 30 | for table in tables: 31 | output_path = os.path.join(data_dir, f"{table}.parquet") 32 | print(f"Saving {table} to {output_path}") 33 | conn.execute(f"COPY {table} TO '{output_path}' (FORMAT 'PARQUET')") 34 | 35 | print(f"Retrieving answers for scale factor {scale_factor}") 36 | 37 | conn.execute(f"CREATE TEMPORARY TABLE tpch_answers_temp AS SELECT * FROM tpch_answers() WHERE scale_factor = {scale_factor}") 38 | 39 | result = conn.execute("SELECT query_nr FROM tpch_answers_temp ORDER BY query_nr").fetchall() 40 | 41 | for row in result: 42 | query_nr = row[0] 43 | 44 | answer_csv_result = conn.execute(f"SELECT answer FROM tpch_answers_temp WHERE query_nr = {query_nr}").fetchone() 45 | if not answer_csv_result: 46 | print(f"Warning: No answer found for query {query_nr}, skipping") 47 | continue 48 | 49 | answer_csv = answer_csv_result[0] 50 | if not answer_csv or answer_csv.isspace(): 51 | print(f"Warning: Empty answer for query {query_nr}, skipping") 52 | continue 53 | 54 | output_path = os.path.join(answers_dir, f"q{query_nr}.parquet") 55 | print(f"Processing answer for query {query_nr} and saving to {output_path}") 56 | 57 | temp_csv = os.path.join(answers_dir, f"q{query_nr}_temp.csv") 58 | with open(temp_csv, 'w') as f: 59 | f.write(answer_csv) 60 | 61 | conn.execute(f"CREATE OR REPLACE TABLE q{query_nr}_temp AS SELECT * FROM read_csv('{temp_csv}', delim='|', header=true)") 62 | conn.execute(f"COPY q{query_nr}_temp TO '{output_path}' (FORMAT 'PARQUET')") 63 | 64 | os.remove(temp_csv) 65 | 66 | except Exception as e: 67 | print(f"Error: {e}") 68 | sys.exit(1) 69 | finally: 70 | conn.close() 71 | 72 | print("TPCH data and query answers generation completed successfully") 73 | 74 | def main(): 75 | parser = argparse.ArgumentParser(description='Generate TPCH data and query answers using DuckDB') 76 | parser.add_argument('--scale', type=float, default=0.01, help='Scale factor (default: 0.01)') 77 | parser.add_argument('--data-dir', type=str, default='data', help='Directory to store data parquet files') 78 | parser.add_argument('--answers-dir', type=str, default='answers', help='Directory to store query answers parquet files') 79 | 80 | args = parser.parse_args() 81 | 
data_dir = os.path.join(args.data_dir, f"sf{args.scale}") 82 | answers_dir = os.path.join(args.answers_dir, f"sf{args.scale}") 83 | 84 | generate_tpch_data(args.scale, data_dir, answers_dir) 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /dev/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Development Setup 3 | 4 | **Engineering is art; it has to be beautiful.** 5 | 6 | ### Install Rust toolchain 7 | 8 | ```bash 9 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 10 | ``` 11 | 12 | Run tests: 13 | 14 | ```bash 15 | cargo test 16 | ``` 17 | 18 | ### Observability 19 | 20 | LiquidCache exports OpenTelemetry metrics. 21 | 22 | First, start an [OpenObserve](https://openobserve.ai/) instance: 23 | ```bash 24 | docker run -d \ 25 | --name openobserve \ 26 | -v $PWD/data:/data \ 27 | -p 5080:5080 \ 28 | -p 5081:5081 \ 29 | -e ZO_ROOT_USER_EMAIL="root@example.com" \ 30 | -e ZO_ROOT_USER_PASSWORD="Complexpass#123" \ 31 | public.ecr.aws/zinclabs/openobserve:latest 32 | ``` 33 | 34 | Then, get the auth token from the instance: http://localhost:5080/web/ingestion/recommended/traces 35 | 36 | You will see a token like this: 37 | ``` 38 | cm9vdEBleGFtcGxlLmNvbTpGT01qZ3NRUlNmelNoNzJQ 39 | ``` 40 | 41 | Then, run the server/client with the auth token: 42 | ```bash 43 | cargo run --release --bin bench_server -- --openobserve-auth cm9vdEBleGFtcGxlLmNvbTpGT01qZ3NRUlNmelNoNzJQ 44 | ``` 45 | 46 | Then open http://localhost:5080 to view the traces. 47 | 48 | 49 | ### Deploy a LiquidCache server with Docker 50 | 51 | ```bash 52 | docker run -p 15214:15214 -p 53793:53793 ghcr.io/xiangpenghao/liquid-cache/liquid-cache-server:latest 53 | ``` 54 | 55 | ### Git hooks 56 | 57 | After cloning the repository, run the following command to set up git hooks: 58 | 59 | ```bash 60 | ./dev/install-git-hooks.sh 61 | ``` 62 | 63 | This will set up a pre-push hook that checks formatting, runs clippy, and verifies documentation. 64 | -------------------------------------------------------------------------------- /dev/doc/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/arch.png -------------------------------------------------------------------------------- /dev/doc/liquid-cache-vldb.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/liquid-cache-vldb.pdf -------------------------------------------------------------------------------- /dev/doc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/dev/doc/logo.png -------------------------------------------------------------------------------- /dev/git-hooks/pre-push: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Running pre-push checks..." 4 | 5 | # Check formatting 6 | echo "Checking formatting..." 7 | if ! cargo fmt -- --check; then 8 | echo "❌ Formatting check failed. Please run 'cargo fmt' to fix formatting issues." 9 | exit 1 10 | fi 11 | 12 | # Run clippy 13 | echo "Running clippy..." 14 | if !
cargo clippy -- -D warnings; then 15 | echo "❌ Clippy check failed. Please fix the warnings before pushing." 16 | exit 1 17 | fi 18 | 19 | # Check documentation 20 | echo "Checking documentation..." 21 | if ! RUSTDOCFLAGS="-D warnings" cargo doc --no-deps --document-private-items --all-features; then 22 | echo "❌ Documentation check failed. Please fix documentation issues." 23 | exit 1 24 | fi 25 | 26 | echo "✅ All checks passed!" 27 | exit 0 28 | -------------------------------------------------------------------------------- /dev/install-git-hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create symbolic link for pre-push hook 4 | ln -sf "../../dev/git-hooks/pre-push" ".git/hooks/pre-push" 5 | chmod +x "dev/git-hooks/pre-push" 6 | 7 | echo "Git hooks installed successfully!" -------------------------------------------------------------------------------- /dev/liquid_cache_server.dockerfile: -------------------------------------------------------------------------------- 1 | # Use a minimal Ubuntu base image 2 | FROM ubuntu:24.04 3 | 4 | # Install minimal runtime dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | ca-certificates \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | ENV RUST_BACKTRACE=1 10 | ENV RUST_LOG=info 11 | 12 | WORKDIR /app 13 | 14 | COPY ./target/release/bench_server /app/bench_server 15 | 16 | EXPOSE 15214 17 | EXPOSE 53793 18 | 19 | # Run the server when the container starts 20 | CMD ["/app/bench_server", "--address", "0.0.0.0:15214", "--admin-address", "0.0.0.0:53793"] 21 | -------------------------------------------------------------------------------- /dev/thoughts/architecture.md: -------------------------------------------------------------------------------- 1 | # LiquidCache Architecture 2 | 3 | LiquidCache consists of three parts: 4 | - Cache: a server that caches data and evaluates the predicates. 5 | - Compute: the DataFusion instance that executes user queries. 6 | - LiquidParquet: the cache-specific file format used by the server. 7 | 8 | ## Cache 9 | 10 | ## Compute 11 | The compute node is stateless. 12 | 13 | ## LiquidParquet -------------------------------------------------------------------------------- /dev/thoughts/artifact-eval.md: -------------------------------------------------------------------------------- 1 | ### Configure network bandwidth 2 | 3 | Clean up the config: 4 | ```bash 5 | sudo tc qdisc del dev lo root 6 | ``` 7 | 8 | Set the bandwidth limit to 20Gbps: 9 | ```bash 10 | sudo tc qdisc add dev lo root tbf rate 20gbit burst 32mb limit 1000000 11 | ``` 12 | 13 | (Very rarely, the network will drop some connections, causing the client to panic; if that happens, either restart the benchmark or increase the limit a bit.) 14 | 15 | ### Ablation study 16 | 17 | 1. Change src/liquid_parquet/src/lib.rs: 18 | ```rust 19 | const ABLATION_STUDY_MODE: AblationStudyMode = AblationStudyMode::FullDecoding; 20 | ``` 21 | 22 | 2. Start server with: 23 | ```bash 24 | env RUST_LOG=info RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin bench_server -- --address 127.0.0.1:5001 --abort-on-panic 25 | ``` 26 | 27 | 3. 
Run client with: 28 | ```bash 29 | env RUST_LOG=info,clickbench_client=debug RUST_BACKTRACE=1 RUSTFLAGS='-C target-cpu=native' cargo run --release --bin clickbench_client -- --query-path benchmark/query_select.sql --file benchmark/data/hits.parquet --bench-mode liquid-eager-transcode --server http://127.0.0.1:5001 --iteration 5 --output benchmark/data/liquid_eager_transcode.json --reset-cache 30 | ``` 31 | 32 | 33 | ### Start server with limited memory 34 | ```bash 35 | systemd-run --scope -p MemoryMax=16G ./target/release/bench_server --address 127.0.0.1:5001 --max-cache-mb 12288 36 | ``` 37 | -------------------------------------------------------------------------------- /dev/thoughts/debugging-tips.md: -------------------------------------------------------------------------------- 1 | ### Use memory sanitizer to find memory issues 2 | 3 | We have to use unsafe code. 4 | Sometimes it creates invalid memory accesses, which are very hard to debug. 5 | 6 | Be sure to disable `mimalloc` in the benchmark. 7 | 8 | ```bash 9 | env RUSTFLAGS="-Z sanitizer=address" RUST_LOG=info cargo run -Zbuild-std --target x86_64-unknown-linux-gnu --bin bench_server 10 | ``` 11 | 12 | ### Use RUST_LOG filtering 13 | 14 | ```bash 15 | env RUST_LOG=clickbench_client=debug,info 16 | ``` 17 | 18 | This will show debug logs for `clickbench_client` but only info logs for other modules. 19 | 20 | -------------------------------------------------------------------------------- /dev/thoughts/thoughts.md: -------------------------------------------------------------------------------- 1 | ### Inventing a new file format is not a good idea. 2 | One of the reasons is that a file format needs to be well supported by the query engine. 3 | The Parquet format itself is only x lines of code, but DataFusion has xxx lines of code to support Parquet. The effort to support Parquet is x% of the effort to implement a new file format, where x is a very large number. 4 | If we simply invent a new file format, we are likely to have worse performance than Parquet. 5 | 6 | ### Predicate pushdown can be slower 7 | This is because the output of predicate pushdown is in CSV/JSON format, meaning that the data is not compressed. 8 | It can result in much larger network traffic if the filter is not selective enough. 9 | 10 | Why can't we compress the output or re-encode the data in Parquet? 11 | That's a lot of CPU cost. (Is that true?) 12 | 13 | 14 | ### TableProvider vs LiquidParquetExec 15 | We currently pack our system as a `TableProvider`. 16 | This does not work for people who already have their own `TableProvider`. 17 | In that case, they might want to re-implement many parts of our system. 18 | 19 | ### In-process mode 20 | While LiquidCache is a one-stop comprehensive solution for disaggregated caching, many people might want to use only part of our components. 21 | For example, some people might want the disaggregated cache, while others might want only `LiquidParquetExec`. 22 | 23 | ### Rigorous testing 24 | We need a lot of systematic testing to ensure our system is correct. 25 | Especially the `LiquidArray` part. Fuzzing is planned.
26 | -------------------------------------------------------------------------------- /examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "examples" 3 | edition = { workspace = true } 4 | publish = false 5 | 6 | [[bin]] 7 | name = "example_server" 8 | path = "example_server.rs" 9 | 10 | [[bin]] 11 | name = "example_client" 12 | path = "example_client.rs" 13 | 14 | [dependencies] 15 | datafusion = { workspace = true } 16 | liquid-cache-server = { workspace = true } 17 | liquid-cache-client = { workspace = true } 18 | liquid-cache-common = { workspace = true } 19 | async-trait = { workspace = true } 20 | futures = { workspace = true } 21 | tokio = { workspace = true } 22 | log = { workspace = true } 23 | arrow-flight = { workspace = true } 24 | tonic = { workspace = true } 25 | env_logger = "0.11.8" 26 | url = { workspace = true } 27 | tempfile = "3.20.0" 28 | clap = { version = "4.5.38", features = ["derive"] } 29 | object_store = { workspace = true, features = ["http"] } 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | DataFusion Cache examples 2 | 3 | ## Server 4 | 5 | ```bash 6 | cargo run --bin example_server 7 | ``` 8 | 9 | ## Client 10 | 11 | ```bash 12 | cargo run --bin example_client 13 | ``` 14 | -------------------------------------------------------------------------------- /examples/example_client.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use clap::{Parser, command}; 19 | use datafusion::{error::Result, execution::object_store::ObjectStoreUrl, prelude::*}; 20 | use liquid_cache_client::LiquidCacheBuilder; 21 | use liquid_cache_common::CacheMode; 22 | use std::path::Path; 23 | use std::sync::Arc; 24 | use url::Url; 25 | 26 | #[derive(Parser, Clone)] 27 | #[command(name = "Example Client")] 28 | struct CliArgs { 29 | /// SQL query to execute 30 | #[arg( 31 | long, 32 | default_value = "SELECT COUNT(*) FROM \"aws-edge-locations\" WHERE \"countryCode\" = 'US';" 33 | )] 34 | query: String, 35 | 36 | /// URL of the table to query 37 | #[arg( 38 | long, 39 | default_value = "https://raw.githubusercontent.com/tobilg/aws-edge-locations/main/data/aws-edge-locations.parquet" 40 | )] 41 | file: String, 42 | 43 | /// Server URL 44 | #[arg(long, default_value = "http://localhost:15214")] 45 | cache_server: String, 46 | } 47 | 48 | #[tokio::main] 49 | pub async fn main() -> Result<()> { 50 | let args = CliArgs::parse(); 51 | let url = Url::parse(&args.file).unwrap(); 52 | let object_store_url = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); 53 | 54 | let ctx = LiquidCacheBuilder::new(args.cache_server.clone()) 55 | .with_object_store(ObjectStoreUrl::parse(object_store_url.as_str())?, None) 56 | .with_cache_mode(CacheMode::Liquid) 57 | .build(SessionConfig::from_env()?)?; 58 | let ctx = Arc::new(ctx); 59 | 60 | let table_name = Path::new(url.path()) 61 | .file_stem() 62 | .unwrap_or_default() 63 | .to_str() 64 | .unwrap_or("default"); 65 | let sql = args.query; 66 | let object_store = object_store::http::HttpBuilder::new() 67 | .with_url(object_store_url.as_str()) 68 | .build() 69 | .unwrap(); 70 | let object_store_url = ObjectStoreUrl::parse(object_store_url.as_str()).unwrap(); 71 | ctx.register_object_store(object_store_url.as_ref(), Arc::new(object_store)); 72 | ctx.register_parquet(table_name, url.as_ref(), Default::default()) 73 | .await?; 74 | 75 | ctx.sql(&sql).await?.show().await?; 76 | 77 | Ok(()) 78 | } 79 | -------------------------------------------------------------------------------- /examples/example_server.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use arrow_flight::flight_service_server::FlightServiceServer; 19 | use datafusion::prelude::SessionContext; 20 | use liquid_cache_common::{CacheEvictionStrategy, CacheMode}; 21 | use liquid_cache_server::LiquidCacheService; 22 | use tonic::transport::Server; 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 26 | let liquid_cache = LiquidCacheService::new( 27 | SessionContext::new(), 28 | Some(1024 * 1024 * 1024), // max memory cache size 1GB 29 | Some(tempfile::tempdir()?.keep()), // disk cache dir 30 | CacheMode::LiquidEagerTranscode, 31 | CacheEvictionStrategy::Discard, 32 | )?; 33 | 34 | let flight = FlightServiceServer::new(liquid_cache); 35 | 36 | Server::builder() 37 | .add_service(flight) 38 | .serve("0.0.0.0:15214".parse()?) 39 | .await?; 40 | 41 | Ok(()) 42 | } 43 | -------------------------------------------------------------------------------- /examples/nano_hits.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XiangpengHao/liquid-cache/7537ba0dda3753c227803e75e992ef90a1bce084/examples/nano_hits.parquet --------------------------------------------------------------------------------
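The example server blocks in `serve` until the process is killed. A possible refinement, sketched under the assumption that everything above the `Server::builder()` call stays as in `example_server.rs` and that tokio's `signal` feature is enabled: use tonic's `serve_with_shutdown` so the Flight endpoint drains cleanly on Ctrl-C.

```rust
// Hypothetical drop-in replacement for the final block of example_server.rs;
// not part of the repository.
Server::builder()
    .add_service(flight)
    .serve_with_shutdown("0.0.0.0:15214".parse()?, async {
        // Resolves when the process receives SIGINT (Ctrl-C),
        // letting in-flight requests finish before the server exits.
        tokio::signal::ctrl_c().await.ok();
    })
    .await?;
```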
/flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flake-utils": { 4 | "inputs": { 5 | "systems": "systems" 6 | }, 7 | "locked": { 8 | "lastModified": 1731533236, 9 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 10 | "owner": "numtide", 11 | "repo": "flake-utils", 12 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 13 | "type": "github" 14 | }, 15 | "original": { 16 | "owner": "numtide", 17 | "repo": "flake-utils", 18 | "type": "github" 19 | } 20 | }, 21 | "nixpkgs": { 22 | "locked": { 23 | "lastModified": 1747744144, 24 | "narHash": "sha256-W7lqHp0qZiENCDwUZ5EX/lNhxjMdNapFnbErcbnP11Q=", 25 | "owner": "NixOS", 26 | "repo": "nixpkgs", 27 | "rev": "2795c506fe8fb7b03c36ccb51f75b6df0ab2553f", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "NixOS", 32 | "ref": "nixos-unstable", 33 | "repo": "nixpkgs", 34 | "type": "github" 35 | } 36 | }, 37 | "nixpkgs_2": { 38 | "locked": { 39 | "lastModified": 1744536153, 40 | "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", 41 | "owner": "NixOS", 42 | "repo": "nixpkgs", 43 | "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", 44 | "type": "github" 45 | }, 46 | "original": { 47 | "owner": "NixOS", 48 | "ref": "nixpkgs-unstable", 49 | "repo": "nixpkgs", 50 | "type": "github" 51 | } 52 | }, 53 | "root": { 54 | "inputs": { 55 | "flake-utils": "flake-utils", 56 | "nixpkgs": "nixpkgs", 57 | "rust-overlay": "rust-overlay" 58 | } 59 | }, 60 | "rust-overlay": { 61 | "inputs": { 62 | "nixpkgs": "nixpkgs_2" 63 | }, 64 | "locked": { 65 | "lastModified": 1747881408, 66 | "narHash": "sha256-LmpQ28JNi5OPqRamih6+QvVQE1DurLOgKUlyM4fRiRU=", 67 | "owner": "oxalica", 68 | "repo": "rust-overlay", 69 | "rev": "6e322a70e8a6c15bab8a5e3cf690fd65414b9d81", 70 | "type": "github" 71 | }, 72 | "original": { 73 | "owner": "oxalica", 74 | "repo": "rust-overlay", 75 | "type": "github" 76 | } 77 | }, 78 | "systems": { 79 | "locked": { 80 | "lastModified": 1681028828, 81 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 82 | "owner": "nix-systems", 83 | "repo": "default", 84 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 85 | "type": "github" 86 | }, 87 | "original": { 88 | "owner": "nix-systems", 89 | "repo": "default", 90 | "type": "github" 91 | } 92 | } 93 | }, 94 | "root": "root", 95 | "version": 7 96 | } 97 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "Liquid Cache Flake Configuration"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; 6 | rust-overlay.url = "github:oxalica/rust-overlay"; 7 | flake-utils.url = "github:numtide/flake-utils"; 8 | }; 9 | 10 | outputs = 11 | { nixpkgs 12 | , rust-overlay 13 | , flake-utils 14 | , ... 15 | }: 16 | flake-utils.lib.eachDefaultSystem ( 17 | system: 18 | let 19 | overlays = [ (import rust-overlay) ]; 20 | pkgs = import nixpkgs { 21 | inherit system overlays; 22 | }; 23 | in 24 | { 25 | devShells.default = with pkgs; 26 | mkShell { 27 | buildInputs = [ 28 | openssl 29 | pkg-config 30 | eza 31 | fd 32 | llvmPackages.bintools 33 | (rust-bin.fromRustupToolchainFile (./rust-toolchain.toml)) 34 | ]; 35 | }; 36 | } 37 | ); 38 | } 39 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | # I really don't want to use nightly, but we have to, until: 3 | # https://github.com/spiraldb/fastlanes/issues/45 is fixed. 4 | channel = "nightly-2025-05-20" 5 | components = ["rustfmt", "clippy", "rust-src", "miri"] 6 | -------------------------------------------------------------------------------- /src/client/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-client" 3 | authors = { workspace = true } 4 | edition = { workspace = true } 5 | version = { workspace = true } 6 | license = { workspace = true } 7 | readme = { workspace = true } 8 | description = { workspace = true } 9 | repository = { workspace = true } 10 | 11 | [dependencies] 12 | arrow = { workspace = true } 13 | arrow-flight = { workspace = true } 14 | datafusion = { workspace = true } 15 | datafusion-proto = { workspace = true } 16 | arrow-schema = { workspace = true } 17 | futures = { workspace = true } 18 | serde = { workspace = true } 19 | tonic = { workspace = true } 20 | async-trait = { workspace = true } 21 | log = { workspace = true } 22 | prost = { workspace = true } 23 | uuid = { workspace = true } 24 | liquid-cache-common = { workspace = true } 25 | fastrace = { workspace = true } 26 | fastrace-tonic = { workspace = true } 27 | fastrace-futures = { workspace = true } 28 | tower = "0.5.2" 29 | tokio = { workspace = true } 30 | 31 | [dev-dependencies] 32 | insta = { version = "1.43.1" } 33 | -------------------------------------------------------------------------------- /src/client/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![cfg_attr(not(doctest), doc = include_str!(concat!("../", std::env!("CARGO_PKG_README"))))] 3 | use std::collections::HashMap; 4 | use std::error::Error; 5 | use std::sync::Arc; 6 | use std::time::Duration; 7 | mod client_exec; 8 | mod metrics; 9 | mod optimizer; 10 | pub use client_exec::LiquidCacheClientExec; 11 | use datafusion::{ 12 | error::{DataFusionError, Result}, 13 | execution::{SessionStateBuilder, object_store::ObjectStoreUrl, runtime_env::RuntimeEnv}, 14 | prelude::*, 15 | }; 16 | use fastrace_tonic::FastraceClientService; 17 | use liquid_cache_common::CacheMode; 18 | pub use optimizer::PushdownOptimizer; 19 | use 
tonic::transport::Channel; 20 | 21 | #[cfg(test)] 22 | mod tests; 23 | 24 | /// The builder for LiquidCache client state. 25 | /// 26 | /// # Example 27 | /// 28 | /// ```ignore 29 | /// use liquid_cache_client::LiquidCacheBuilder; 30 | /// let ctx = LiquidCacheBuilder::new("localhost:15214") 31 | ///     .with_object_store("s3://my_bucket", None) 32 | ///     .with_cache_mode(CacheMode::Liquid) 33 | ///     .build(SessionConfig::from_env().unwrap()) 34 | ///     .unwrap(); 35 | /// 36 | /// ctx.register_parquet("my_table", "s3://my_bucket/my_table.parquet", Default::default()) 37 | ///     .await?; 38 | /// let df = ctx.sql("SELECT * FROM my_table").await?.show().await?; 39 | /// println!("{:?}", df); 40 | /// ``` 41 | pub struct LiquidCacheBuilder { 42 | object_stores: Vec<(ObjectStoreUrl, HashMap<String, String>)>, 43 | cache_mode: CacheMode, 44 | cache_server: String, 45 | } 46 | 47 | impl LiquidCacheBuilder { 48 | /// Create a new builder for LiquidCache client state. 49 | pub fn new(cache_server: impl AsRef<str>) -> Self { 50 | Self { 51 | object_stores: vec![], 52 | cache_mode: CacheMode::Liquid, 53 | cache_server: cache_server.as_ref().to_string(), 54 | } 55 | } 56 | 57 | /// Add an object store to the builder. 58 | pub fn with_object_store( 59 | mut self, 60 | url: ObjectStoreUrl, 61 | object_store_options: Option<HashMap<String, String>>, 62 | ) -> Self { 63 | self.object_stores 64 | .push((url, object_store_options.unwrap_or_default())); 65 | self 66 | } 67 | 68 | /// Set the cache mode for the builder. 69 | pub fn with_cache_mode(mut self, cache_mode: CacheMode) -> Self { 70 | self.cache_mode = cache_mode; 71 | self 72 | } 73 | 74 | /// Build the [SessionContext]. 75 | pub fn build(self, config: SessionConfig) -> Result<SessionContext> { 76 | let mut session_config = config; 77 | session_config 78 | .options_mut() 79 | .execution 80 | .parquet 81 | .pushdown_filters = true; 82 | session_config 83 | .options_mut() 84 | .execution 85 | .parquet 86 | .schema_force_view_types = false; 87 | session_config 88 | .options_mut() 89 | .execution 90 | .parquet 91 | .binary_as_string = true; 92 | session_config.options_mut().execution.batch_size = 8192 * 2; 93 | let session_state = SessionStateBuilder::new() 94 | .with_config(session_config) 95 | .with_runtime_env(Arc::new(RuntimeEnv::default())) 96 | .with_default_features() 97 | .with_physical_optimizer_rule(Arc::new(PushdownOptimizer::new( 98 | self.cache_server.clone(), 99 | self.cache_mode, 100 | self.object_stores.clone(), 101 | ))) 102 | .build(); 103 | Ok(SessionContext::new_with_state(session_state)) 104 | } 105 | } 106 | 107 | pub(crate) fn to_df_err<E: Error + Send + Sync + 'static>(err: E) -> DataFusionError { 108 | DataFusionError::External(Box::new(err)) 109 | } 110 | 111 | pub(crate) async fn flight_channel( 112 | source: impl Into<String>, 113 | ) -> Result<FastraceClientService<Channel>> { 114 | use fastrace_tonic::FastraceClientLayer; 115 | use tower::ServiceBuilder; 116 | 117 | // No TLS here, to avoid the overhead of TLS: 118 | // we assume both server and client run on a trusted network. 119 | let endpoint = Channel::from_shared(source.into()) 120 | .map_err(to_df_err)? 121 | .tcp_keepalive(Some(Duration::from_secs(10))); 122 | 123 | let channel = endpoint.connect().await.map_err(to_df_err)?; 124 | let channel = ServiceBuilder::new() 125 | .layer(FastraceClientLayer) 126 | .service(channel); 127 | Ok(channel) 128 | } 129 | --------------------------------------------------------------------------------
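Because `build` only installs the `PushdownOptimizer` rule and adjusts the session config, constructing a context does not contact the server. A minimal sketch (the address is a placeholder and this snippet is not part of the crate's tests) showing one of the settings `build` applies:

```rust
use datafusion::prelude::SessionConfig;
use liquid_cache_client::LiquidCacheBuilder;

fn main() -> datafusion::error::Result<()> {
    // No connection is opened here; the server address is only recorded
    // for use by the PushdownOptimizer at planning time.
    let ctx = LiquidCacheBuilder::new("http://localhost:15214")
        .build(SessionConfig::new())?;

    // `build` force-enables parquet filter pushdown on the session config.
    assert!(
        ctx.state()
            .config()
            .options()
            .execution
            .parquet
            .pushdown_filters
    );
    Ok(())
}
```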
/src/client/src/metrics.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements.  See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership.  The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License.  You may obtain a copy of the License at 8 | // 9 | //   http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied.  See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use datafusion::{ 19 | common::instant::Instant, 20 | physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder, Time}, 21 | }; 22 | 23 | /// A timer that can be started and stopped. 24 | pub struct StartableTime { 25 | pub(crate) metrics: Time, 26 | // Records the start of each timed part; the elapsed time is eventually added into `metrics`. 27 | pub(crate) start: Option<Instant>, 28 | } 29 | 30 | impl StartableTime { 31 | pub(crate) fn start(&mut self) { 32 | assert!(self.start.is_none()); 33 | self.start = Some(Instant::now()); 34 | } 35 | 36 | pub(crate) fn stop(&mut self) { 37 | if let Some(start) = self.start.take() { 38 | self.metrics.add_elapsed(start); 39 | } 40 | } 41 | } 42 | 43 | pub(crate) struct FlightStreamMetrics { 44 | pub time_processing: StartableTime, 45 | pub time_reading_total: StartableTime, 46 | pub poll_count: Count, 47 | pub output_rows: Count, 48 | pub bytes_decoded: Count, 49 | } 50 | 51 | impl FlightStreamMetrics { 52 | pub(crate) fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { 53 | Self { 54 | time_processing: StartableTime { 55 | metrics: MetricBuilder::new(metrics).subset_time("time_processing", partition), 56 | start: None, 57 | }, 58 | time_reading_total: StartableTime { 59 | metrics: MetricBuilder::new(metrics).subset_time("time_reading_total", partition), 60 | start: None, 61 | }, 62 | output_rows: MetricBuilder::new(metrics).output_rows(partition), 63 | poll_count: MetricBuilder::new(metrics).counter("poll_count", partition), 64 | bytes_decoded: MetricBuilder::new(metrics).counter("bytes_decoded", partition), 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q1.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ l_returnflag ASC NULLS │ 9 | │ LAST, l_linestatus │ 10 | │ ASC NULLS LAST │ 11 | └─────────────┬─────────────┘ 12 | ┌─────────────┴─────────────┐ 13 | │ SortExec │ 14 | │ 
-------------------- │ 15 | │ l_returnflag@0 ASC NULLS │ 16 | │ LAST, l_linestatus@1 │ 17 | │ ASC NULLS LAST │ 18 | └─────────────┬─────────────┘ 19 | ┌─────────────┴─────────────┐ 20 | │ ProjectionExec │ 21 | │ -------------------- │ 22 | │ avg_disc: │ 23 | │ avg(lineitem.l_discount) │ 24 | │ │ 25 | │ avg_price: │ 26 | │ avg(lineitem │ 27 | │ .l_extendedp │ 28 | │ rice) │ 29 | │ │ 30 | │ avg_qty: │ 31 | │ avg(lineitem.l_quantity) │ 32 | │ │ 33 | │ count_order: │ 34 | │ count(Int64(1)) │ 35 | │ │ 36 | │ l_linestatus: │ 37 | │ l_linestatus │ 38 | │ │ 39 | │ l_returnflag: │ 40 | │ l_returnflag │ 41 | │ │ 42 | │ sum_base_price: │ 43 | │ sum(lineitem │ 44 | │ .l_extendedp │ 45 | │ rice) │ 46 | │ │ 47 | │ sum_charge: │ 48 | │ sum(lineitem │ 49 | │ .l_extendedp │ 50 | │ rice * Int64(1) - lineitem│ 51 | │ ... │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: │ 57 | │ sum(lineitem.l_quantity), │ 58 | │ sum(lineitem │ 59 | │ .l_extendedpric │ 60 | │ e), , , avg(lineitem │ 61 | │ .l_quantity), avg │ 62 | │ (lineitem │ 63 | │ .l_extendedp │ 64 | │ rice), avg(lineitem │ 65 | │ .l_discount), │ 66 | │ count(1) │ 67 | │ │ 68 | │ group_by: │ 69 | │ l_returnflag, l_linestatus│ 70 | │ │ 71 | │ mode: │ 72 | │ FinalPartitioned │ 73 | └─────────────┬─────────────┘ 74 | ┌─────────────┴─────────────┐ 75 | │ CoalesceBatchesExec │ 76 | │ -------------------- │ 77 | │ target_batch_size: │ 78 | │ 16384 │ 79 | └─────────────┬─────────────┘ 80 | ┌─────────────┴─────────────┐ 81 | │ RepartitionExec │ 82 | │ -------------------- │ 83 | │ partition_count(in->out): │ 84 | │ 8 -> 8 │ 85 | │ │ 86 | │ partitioning_scheme: │ 87 | │ Hash([l_returnflag@0, │ 88 | │ l_linestatus@1], 8) │ 89 | └─────────────┬─────────────┘ 90 | ┌─────────────┴─────────────┐ 91 | │ AggregateExec │ 92 | │ -------------------- │ 93 | │ aggr: │ 94 | │ sum(lineitem.l_quantity), │ 95 | │ sum(lineitem │ 96 | │ .l_extendedpric │ 97 | │ e), , , avg(lineitem │ 98 | │ .l_quantity), avg │ 99 | │ (lineitem │ 100 | │ .l_extendedp │ 101 | │ rice), avg(lineitem │ 102 | │ .l_discount), │ 103 | │ count(1) │ 104 | │ │ 105 | │ group_by: │ 106 | │ l_returnflag, l_linestatus│ 107 | │ │ 108 | │ mode: Partial │ 109 | └─────────────┬─────────────┘ 110 | ┌─────────────┴─────────────┐ 111 | │ ProjectionExec │ 112 | │ -------------------- │ 113 | │ __common_expr_1: │ 114 | │ l_extendedprice * (Some(1)│ 115 | │ ,20,0 - l_discount) │ 116 | │ │ 117 | │ l_discount: │ 118 | │ l_discount │ 119 | │ │ 120 | │ l_extendedprice: │ 121 | │ l_extendedprice │ 122 | │ │ 123 | │ l_linestatus: │ 124 | │ l_linestatus │ 125 | │ │ 126 | │ l_quantity: │ 127 | │ l_quantity │ 128 | │ │ 129 | │ l_returnflag: │ 130 | │ l_returnflag │ 131 | │ │ 132 | │ l_tax: l_tax │ 133 | └─────────────┬─────────────┘ 134 | ┌─────────────┴─────────────┐ 135 | │ LiquidCacheClientExec │ 136 | │ -------------------- │ 137 | │ server: │ 138 | │ http://localhost:50051, │ 139 | │ mode=liquid, │ 140 | │ object_stores=[] │ 141 | └─────────────┬─────────────┘ 142 | ┌─────────────┴─────────────┐ 143 | │ RepartitionExec │ 144 | │ -------------------- │ 145 | │ partition_count(in->out): │ 146 | │ 1 -> 8 │ 147 | │ │ 148 | │ partitioning_scheme: │ 149 | │ RoundRobinBatch(8) │ 150 | └─────────────┬─────────────┘ 151 | ┌─────────────┴─────────────┐ 152 | │ DataSourceExec │ 153 | │ -------------------- │ 154 | │ files: 1 │ 155 | │ format: parquet │ 156 | │ │ 157 | │ predicate: │ 158 | │ l_shipdate <= 1998-09-02 │ 159 | 
└───────────────────────────┘ 160 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q12.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ l_shipmode ASC NULLS LAST │ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │l_shipmode@0 ASC NULLS LAST│ 14 | └─────────────┬─────────────┘ 15 | ┌─────────────┴─────────────┐ 16 | │ ProjectionExec │ 17 | │ -------------------- │ 18 | │ high_line_count: │ 19 | │ sum(CASE WHEN orders │ 20 | │ .o_orderpriority = │ 21 | │ Utf8("1-URGENT") OR │ 22 | │ orders.o_orderpriorit │ 23 | │ y = Utf8("2-HIGH") THEN │ 24 | │ Int64(1) ELSE Int64(0 │ 25 | │ ) END) │ 26 | │ │ 27 | │ l_shipmode: │ 28 | │ l_shipmode │ 29 | │ │ 30 | │ low_line_count: │ 31 | │ sum(CASE WHEN orders │ 32 | │ .o_orderpriority != │ 33 | │ Utf8("1-URGENT") AND │ 34 | │ orders.o_orderpriorit │ 35 | │ y != Utf8("2-HIGH") THEN │ 36 | │ Int64(1) ELSE Int64(0) │ 37 | │ END) │ 38 | └─────────────┬─────────────┘ 39 | ┌─────────────┴─────────────┐ 40 | │ AggregateExec │ 41 | │ -------------------- │ 42 | │ aggr: │ 43 | │ sum(CASE WHEN orders │ 44 | │ .o_orderpriority = │ 45 | │ 1-URGENT OR orders │ 46 | │ .o_orderpriority = 2 │ 47 | │ -HIGH THEN 1 ELSE 0 END), │ 48 | │ sum(CASE WHEN orders │ 49 | │ .o_orderpriority != 1 │ 50 | │ -URGENT AND orders │ 51 | │ .o_orderpriority ! │ 52 | │ = 2-HIGH THEN 1 ELSE 0 │ 53 | │ END) │ 54 | │ │ 55 | │ group_by: l_shipmode │ 56 | │ │ 57 | │ mode: │ 58 | │ FinalPartitioned │ 59 | └─────────────┬─────────────┘ 60 | ┌─────────────┴─────────────┐ 61 | │ CoalesceBatchesExec │ 62 | │ -------------------- │ 63 | │ target_batch_size: │ 64 | │ 16384 │ 65 | └─────────────┬─────────────┘ 66 | ┌─────────────┴─────────────┐ 67 | │ RepartitionExec │ 68 | │ -------------------- │ 69 | │ partition_count(in->out): │ 70 | │ 8 -> 8 │ 71 | │ │ 72 | │ partitioning_scheme: │ 73 | │ Hash([l_shipmode@0], 8) │ 74 | └─────────────┬─────────────┘ 75 | ┌─────────────┴─────────────┐ 76 | │ AggregateExec │ 77 | │ -------------------- │ 78 | │ aggr: │ 79 | │ sum(CASE WHEN orders │ 80 | │ .o_orderpriority = │ 81 | │ 1-URGENT OR orders │ 82 | │ .o_orderpriority = 2 │ 83 | │ -HIGH THEN 1 ELSE 0 END), │ 84 | │ sum(CASE WHEN orders │ 85 | │ .o_orderpriority != 1 │ 86 | │ -URGENT AND orders │ 87 | │ .o_orderpriority ! 
│ 88 | │ = 2-HIGH THEN 1 ELSE 0 │ 89 | │ END) │ 90 | │ │ 91 | │ group_by: l_shipmode │ 92 | │ mode: Partial │ 93 | └─────────────┬─────────────┘ 94 | ┌─────────────┴─────────────┐ 95 | │ ProjectionExec │ 96 | │ -------------------- │ 97 | │ l_shipmode: │ 98 | │ l_shipmode │ 99 | │ │ 100 | │ o_orderpriority: │ 101 | │ o_orderpriority │ 102 | └─────────────┬─────────────┘ 103 | ┌─────────────┴─────────────┐ 104 | │ CoalesceBatchesExec │ 105 | │ -------------------- │ 106 | │ target_batch_size: │ 107 | │ 16384 │ 108 | └─────────────┬─────────────┘ 109 | ┌─────────────┴─────────────┐ 110 | │ HashJoinExec │ 111 | │ -------------------- │ 112 | │ on: ├──────────────┐ 113 | │ (o_orderkey = l_orderkey) │ │ 114 | └─────────────┬─────────────┘ │ 115 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 116 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 117 | │ -------------------- ││ -------------------- │ 118 | │ server: ││ server: │ 119 | │ http://localhost:50051, ││ http://localhost:50051, │ 120 | │ mode=liquid, ││ mode=liquid, │ 121 | │ object_stores=[] ││ object_stores=[] │ 122 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 123 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 124 | │ DataSourceExec ││ RepartitionExec │ 125 | │ -------------------- ││ -------------------- │ 126 | │ files: 1 ││ partition_count(in->out): │ 127 | │ format: parquet ││ 1 -> 8 │ 128 | │ ││ │ 129 | │ ││ partitioning_scheme: │ 130 | │ ││ RoundRobinBatch(8) │ 131 | └───────────────────────────┘└─────────────┬─────────────┘ 132 | ┌─────────────┴─────────────┐ 133 | │ DataSourceExec │ 134 | │ -------------------- │ 135 | │ files: 1 │ 136 | │ format: parquet │ 137 | │ │ 138 | │ predicate: │ 139 | │ (l_shipmode = MAIL OR │ 140 | │ l_shipmode = SHIP) │ 141 | │ AND l_receiptdate > │ 142 | │ l_commitdate AND │ 143 | │ l_shipdate < │ 144 | │ l_commitdate AND │ 145 | │ l_receiptdate >= 1994-01 │ 146 | │ -01 AND l_receiptdate < │ 147 | │ 1995-01-01 │ 148 | └───────────────────────────┘ 149 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q13.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │custdist DESC, c_count DESC│ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │ custdist@1 DESC, c_count@0│ 14 | │ DESC │ 15 | └─────────────┬─────────────┘ 16 | ┌─────────────┴─────────────┐ 17 | │ ProjectionExec │ 18 | │ -------------------- │ 19 | │ c_count: c_count │ 20 | │ │ 21 | │ custdist: │ 22 | │ count(Int64(1)) │ 23 | └─────────────┬─────────────┘ 24 | ┌─────────────┴─────────────┐ 25 | │ AggregateExec │ 26 | │ -------------------- │ 27 | │ aggr: count(1) │ 28 | │ group_by: c_count │ 29 | │ │ 30 | │ mode: │ 31 | │ FinalPartitioned │ 32 | └─────────────┬─────────────┘ 33 | ┌─────────────┴─────────────┐ 34 | │ CoalesceBatchesExec │ 35 | │ -------------------- │ 36 | │ target_batch_size: │ 37 | │ 16384 │ 38 | └─────────────┬─────────────┘ 39 | ┌─────────────┴─────────────┐ 40 | │ RepartitionExec │ 41 | │ -------------------- │ 42 | │ partition_count(in->out): │ 43 | │ 8 -> 8 │ 44 | │ │ 45 | │ partitioning_scheme: │ 46 | │ Hash([c_count@0], 8) │ 47 | └─────────────┬─────────────┘ 48 | 
┌─────────────┴─────────────┐ 49 | │ AggregateExec │ 50 | │ -------------------- │ 51 | │ aggr: count(1) │ 52 | │ group_by: c_count │ 53 | │ mode: Partial │ 54 | └─────────────┬─────────────┘ 55 | ┌─────────────┴─────────────┐ 56 | │ ProjectionExec │ 57 | │ -------------------- │ 58 | │ c_count: │ 59 | │ count(orders.o_orderkey) │ 60 | └─────────────┬─────────────┘ 61 | ┌─────────────┴─────────────┐ 62 | │ AggregateExec │ 63 | │ -------------------- │ 64 | │ aggr: │ 65 | │ count(orders.o_orderkey) │ 66 | │ │ 67 | │ group_by: c_custkey │ 68 | │ │ 69 | │ mode: │ 70 | │ FinalPartitioned │ 71 | └─────────────┬─────────────┘ 72 | ┌─────────────┴─────────────┐ 73 | │ CoalesceBatchesExec │ 74 | │ -------------------- │ 75 | │ target_batch_size: │ 76 | │ 16384 │ 77 | └─────────────┬─────────────┘ 78 | ┌─────────────┴─────────────┐ 79 | │ RepartitionExec │ 80 | │ -------------------- │ 81 | │ partition_count(in->out): │ 82 | │ 8 -> 8 │ 83 | │ │ 84 | │ partitioning_scheme: │ 85 | │ Hash([c_custkey@0], 8) │ 86 | └─────────────┬─────────────┘ 87 | ┌─────────────┴─────────────┐ 88 | │ AggregateExec │ 89 | │ -------------------- │ 90 | │ aggr: │ 91 | │ count(orders.o_orderkey) │ 92 | │ │ 93 | │ group_by: c_custkey │ 94 | │ mode: Partial │ 95 | └─────────────┬─────────────┘ 96 | ┌─────────────┴─────────────┐ 97 | │ CoalesceBatchesExec │ 98 | │ -------------------- │ 99 | │ target_batch_size: │ 100 | │ 16384 │ 101 | └─────────────┬─────────────┘ 102 | ┌─────────────┴─────────────┐ 103 | │ HashJoinExec │ 104 | │ -------------------- │ 105 | │ join_type: Left │ 106 | │ ├──────────────┐ 107 | │ on: │ │ 108 | │ (c_custkey = o_custkey) │ │ 109 | └─────────────┬─────────────┘ │ 110 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 111 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 112 | │ -------------------- ││ -------------------- │ 113 | │ server: ││ server: │ 114 | │ http://localhost:50051, ││ http://localhost:50051, │ 115 | │ mode=liquid, ││ mode=liquid, │ 116 | │ object_stores=[] ││ object_stores=[] │ 117 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 118 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 119 | │ DataSourceExec ││ RepartitionExec │ 120 | │ -------------------- ││ -------------------- │ 121 | │ files: 1 ││ partition_count(in->out): │ 122 | │ format: parquet ││ 1 -> 8 │ 123 | │ ││ │ 124 | │ ││ partitioning_scheme: │ 125 | │ ││ RoundRobinBatch(8) │ 126 | └───────────────────────────┘└─────────────┬─────────────┘ 127 | ┌─────────────┴─────────────┐ 128 | │ DataSourceExec │ 129 | │ -------------------- │ 130 | │ files: 1 │ 131 | │ format: parquet │ 132 | │ │ 133 | │ predicate: │ 134 | │ o_comment NOT LIKE │ 135 | │ %special%requests% │ 136 | └───────────────────────────┘ 137 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q14.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ promo_revenue: │ 9 | │ 100 * CAST(sum(CASE WHEN │ 10 | │ part.p_type LIKE Utf8( │ 11 | │ "PROMO%") THEN lineitem │ 12 | │ .l_extendedprice * │ 13 | │ Int64(1) - lineitem │ 14 | │ .l_discount ELSE Int64(0) │ 15 | │ END) AS Float64) / CAST │ 16 | │ (sum(lineitem │ 17 | │ .l_extendedpric │ 18 | │ e * Int64(1) - lineitem │ 19 | │ .l_discount) AS │ 20 | │ 
Float64) │ 21 | └─────────────┬─────────────┘ 22 | ┌─────────────┴─────────────┐ 23 | │ AggregateExec │ 24 | │ -------------------- │ 25 | │ aggr: , │ 26 | │ mode: Final │ 27 | └─────────────┬─────────────┘ 28 | ┌─────────────┴─────────────┐ 29 | │ CoalescePartitionsExec │ 30 | └─────────────┬─────────────┘ 31 | ┌─────────────┴─────────────┐ 32 | │ AggregateExec │ 33 | │ -------------------- │ 34 | │ aggr: , │ 35 | │ mode: Partial │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ ProjectionExec │ 39 | │ -------------------- │ 40 | │ __common_expr_1: │ 41 | │ l_extendedprice * (Some(1)│ 42 | │ ,20,0 - l_discount) │ 43 | │ │ 44 | │ p_type: p_type │ 45 | └─────────────┬─────────────┘ 46 | ┌─────────────┴─────────────┐ 47 | │ CoalesceBatchesExec │ 48 | │ -------------------- │ 49 | │ target_batch_size: │ 50 | │ 16384 │ 51 | └─────────────┬─────────────┘ 52 | ┌─────────────┴─────────────┐ 53 | │ HashJoinExec │ 54 | │ -------------------- │ 55 | │ on: ├──────────────┐ 56 | │ (p_partkey = l_partkey) │ │ 57 | └─────────────┬─────────────┘ │ 58 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 59 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 60 | │ -------------------- ││ -------------------- │ 61 | │ server: ││ server: │ 62 | │ http://localhost:50051, ││ http://localhost:50051, │ 63 | │ mode=liquid, ││ mode=liquid, │ 64 | │ object_stores=[] ││ object_stores=[] │ 65 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 66 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 67 | │ DataSourceExec ││ RepartitionExec │ 68 | │ -------------------- ││ -------------------- │ 69 | │ files: 1 ││ partition_count(in->out): │ 70 | │ format: parquet ││ 1 -> 8 │ 71 | │ ││ │ 72 | │ ││ partitioning_scheme: │ 73 | │ ││ RoundRobinBatch(8) │ 74 | └───────────────────────────┘└─────────────┬─────────────┘ 75 | ┌─────────────┴─────────────┐ 76 | │ DataSourceExec │ 77 | │ -------------------- │ 78 | │ files: 1 │ 79 | │ format: parquet │ 80 | │ │ 81 | │ predicate: │ 82 | │ l_shipdate >= 1995-09-01 │ 83 | │ AND l_shipdate < 1995 │ 84 | │ -10-01 │ 85 | └───────────────────────────┘ 86 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q17.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ avg_yearly: │ 9 | │ CAST(sum(lineitem │ 10 | │ .l_extendedprice │ 11 | │ ) AS Float64) / 7 │ 12 | └─────────────┬─────────────┘ 13 | ┌─────────────┴─────────────┐ 14 | │ AggregateExec │ 15 | │ -------------------- │ 16 | │ aggr: │ 17 | │ sum(lineitem │ 18 | │ .l_extendedp │ 19 | │ rice) │ 20 | │ │ 21 | │ mode: Final │ 22 | └─────────────┬─────────────┘ 23 | ┌─────────────┴─────────────┐ 24 | │ CoalescePartitionsExec │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: │ 30 | │ sum(lineitem │ 31 | │ .l_extendedp │ 32 | │ rice) │ 33 | │ │ 34 | │ mode: Partial │ 35 | └─────────────┬─────────────┘ 36 | ┌─────────────┴─────────────┐ 37 | │ CoalesceBatchesExec │ 38 | │ -------------------- │ 39 | │ target_batch_size: │ 40 | │ 16384 │ 41 | └─────────────┬─────────────┘ 42 | ┌─────────────┴─────────────┐ 43 | │ HashJoinExec │ 44 | │ -------------------- │ 45 | │ on: 
├──────────────┐ 46 | │ (l_partkey = p_partkey) │ │ 47 | └─────────────┬─────────────┘ │ 48 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 49 | │ CoalescePartitionsExec ││ RepartitionExec │ 50 | │ ││ -------------------- │ 51 | │ ││ partition_count(in->out): │ 52 | │ ││ 1 -> 8 │ 53 | │ ││ │ 54 | │ ││ partitioning_scheme: │ 55 | │ ││ RoundRobinBatch(8) │ 56 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 57 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 58 | │ ProjectionExec ││ ProjectionExec │ 59 | │ -------------------- ││ -------------------- │ 60 | │ Float64(0.2) * avg ││ l_extendedprice: │ 61 | │ (lineitem ││ l_extendedprice │ 62 | │ .l_quantity): ││ │ 63 | │ CAST(0.2 * CAST(avg ││ l_quantity: │ 64 | │ (lineitem ││ l_quantity │ 65 | │ .l_quantity) AS ││ │ 66 | │ Float64) AS Decimal128 ││ p_partkey: p_partkey │ 67 | │ (30, 15)) ││ │ 68 | │ ││ │ 69 | │ l_partkey: l_partkey ││ │ 70 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 71 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 72 | │ AggregateExec ││ CoalesceBatchesExec │ 73 | │ -------------------- ││ -------------------- │ 74 | │ aggr: ││ target_batch_size: │ 75 | │ avg(lineitem.l_quantity) ││ 16384 │ 76 | │ ││ │ 77 | │ group_by: l_partkey ││ │ 78 | │ ││ │ 79 | │ mode: ││ │ 80 | │ FinalPartitioned ││ │ 81 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 82 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 83 | │ CoalesceBatchesExec ││ HashJoinExec │ 84 | │ -------------------- ││ -------------------- │ 85 | │ target_batch_size: ││ on: ├──────────────┐ 86 | │ 16384 ││ (p_partkey = l_partkey) │ │ 87 | └─────────────┬─────────────┘└─────────────┬─────────────┘ │ 88 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 89 | │ RepartitionExec ││ LiquidCacheClientExec ││ LiquidCacheClientExec │ 90 | │ -------------------- ││ -------------------- ││ -------------------- │ 91 | │ partition_count(in->out): ││ server: ││ server: │ 92 | │ 8 -> 8 ││ http://localhost:50051, ││ http://localhost:50051, │ 93 | │ ││ mode=liquid, ││ mode=liquid, │ 94 | │ partitioning_scheme: ││ object_stores=[] ││ object_stores=[] │ 95 | │ Hash([l_partkey@0], 8) ││ ││ │ 96 | └─────────────┬─────────────┘└─────────────┬─────────────┘└─────────────┬─────────────┘ 97 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 98 | │ RepartitionExec ││ DataSourceExec ││ DataSourceExec │ 99 | │ -------------------- ││ -------------------- ││ -------------------- │ 100 | │ partition_count(in->out): ││ files: 1 ││ files: 1 │ 101 | │ 1 -> 8 ││ format: parquet ││ format: parquet │ 102 | │ ││ ││ │ 103 | │ partitioning_scheme: ││ predicate: ││ │ 104 | │ RoundRobinBatch(8) ││ p_brand = Brand#23 AND ││ │ 105 | │ ││ p_container = MED BOX ││ │ 106 | └─────────────┬─────────────┘└───────────────────────────┘└───────────────────────────┘ 107 | ┌─────────────┴─────────────┐ 108 | │ AggregateExec │ 109 | │ -------------------- │ 110 | │ aggr: │ 111 | │ avg(lineitem.l_quantity) │ 112 | │ │ 113 | │ group_by: l_partkey │ 114 | │ mode: Partial │ 115 | └─────────────┬─────────────┘ 116 | ┌─────────────┴─────────────┐ 117 | │ LiquidCacheClientExec │ 118 | │ -------------------- │ 119 | │ server: │ 120 | │ http://localhost:50051, │ 121 | │ mode=liquid, │ 122 | │ object_stores=[] │ 123 | └─────────────┬─────────────┘ 124 | ┌─────────────┴─────────────┐ 125 | │ DataSourceExec │ 126 | │ -------------------- │ 127 | │ files: 1 │ 128 | │ format: 
parquet │ 129 | └───────────────────────────┘ 130 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q19.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ revenue: │ 9 | │ sum(lineitem │ 10 | │ .l_extendedp │ 11 | │ rice * Int64(1) - lineitem│ 12 | │ .l_discount) │ 13 | └─────────────┬─────────────┘ 14 | ┌─────────────┴─────────────┐ 15 | │ AggregateExec │ 16 | │ -------------------- │ 17 | │ aggr │ 18 | │ │ 19 | │ mode: Final │ 20 | └─────────────┬─────────────┘ 21 | ┌─────────────┴─────────────┐ 22 | │ CoalescePartitionsExec │ 23 | └─────────────┬─────────────┘ 24 | ┌─────────────┴─────────────┐ 25 | │ AggregateExec │ 26 | │ -------------------- │ 27 | │ aggr │ 28 | │ │ 29 | │ mode: Partial │ 30 | └─────────────┬─────────────┘ 31 | ┌─────────────┴─────────────┐ 32 | │ CoalesceBatchesExec │ 33 | │ -------------------- │ 34 | │ target_batch_size: │ 35 | │ 16384 │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ HashJoinExec │ 39 | │ -------------------- │ 40 | │ on: ├──────────────┐ 41 | │ (p_partkey = l_partkey) │ │ 42 | └─────────────┬─────────────┘ │ 43 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 44 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 45 | │ -------------------- ││ -------------------- │ 46 | │ server: ││ server: │ 47 | │ http://localhost:50051, ││ http://localhost:50051, │ 48 | │ mode=liquid, ││ mode=liquid, │ 49 | │ object_stores=[] ││ object_stores=[] │ 50 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 51 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 52 | │ DataSourceExec ││ RepartitionExec │ 53 | │ -------------------- ││ -------------------- │ 54 | │ files: 1 ││ partition_count(in->out): │ 55 | │ format: parquet ││ 1 -> 8 │ 56 | │ ││ │ 57 | │ predicate: ││ partitioning_scheme: │ 58 | │ p_size >= 1 AND (p_brand =││ RoundRobinBatch(8) │ 59 | │ Brand#12 AND p_container ││ │ 60 | │ IN (SM CASE, SM BOX, SM ││ │ 61 | │ PACK, SM PKG) AND ││ │ 62 | │ p_size <= 5 OR ││ │ 63 | │ p_brand = Brand#23 ││ │ 64 | │ AND p_container IN ││ │ 65 | │ (MED BAG, MED BOX, MED ││ │ 66 | │ PKG, MED PACK) AND ││ │ 67 | │ p_size <= 10 OR ││ │ 68 | │ p_brand = Brand#34 ││ │ 69 | │ AND p_container IN ││ │ 70 | │ (LG CASE, LG BOX, LG ││ │ 71 | │ PACK, LG PKG) AND ││ │ 72 | │ p_size <= 15) ││ │ 73 | └───────────────────────────┘└─────────────┬─────────────┘ 74 | ┌─────────────┴─────────────┐ 75 | │ DataSourceExec │ 76 | │ -------------------- │ 77 | │ files: 1 │ 78 | │ format: parquet │ 79 | │ │ 80 | │ predicate: │ 81 | │ (l_shipmode = AIR OR │ 82 | │ l_shipmode = AIR │ 83 | │ REG) AND │ 84 | │ l_shipinstr │ 85 | │ uct = DELIVER IN PERSON │ 86 | │ AND (l_quantity >= │ 87 | │ Some(100),15,2 AND │ 88 | │ l_quantity <= Some │ 89 | │ (1100),15,2 OR │ 90 | │ l_quantity >= │ 91 | │ Some(1000),15,2 AND │ 92 | │ l_quantity <= Some │ 93 | │ (2000),15,2 OR │ 94 | │ l_quantity >= │ 95 | │ Some(2000),15,2 AND │ 96 | │ l_quantity <= Some(3000 │ 97 | │ ),15,2) │ 98 | └───────────────────────────┘ 99 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q22.snap: 
-------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ cntrycode ASC NULLS LAST │ 9 | └─────────────┬─────────────┘ 10 | ┌─────────────┴─────────────┐ 11 | │ SortExec │ 12 | │ -------------------- │ 13 | │ cntrycode@0 ASC NULLS LAST│ 14 | └─────────────┬─────────────┘ 15 | ┌─────────────┴─────────────┐ 16 | │ ProjectionExec │ 17 | │ -------------------- │ 18 | │ cntrycode: cntrycode │ 19 | │ │ 20 | │ numcust: │ 21 | │ count(Int64(1)) │ 22 | │ │ 23 | │ totacctbal: │ 24 | │ sum(custsale.c_acctbal) │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: │ 30 | │ count(1), sum(custsale │ 31 | │ .c_acctbal) │ 32 | │ │ 33 | │ group_by: cntrycode │ 34 | │ │ 35 | │ mode: │ 36 | │ FinalPartitioned │ 37 | └─────────────┬─────────────┘ 38 | ┌─────────────┴─────────────┐ 39 | │ CoalesceBatchesExec │ 40 | │ -------------------- │ 41 | │ target_batch_size: │ 42 | │ 16384 │ 43 | └─────────────┬─────────────┘ 44 | ┌─────────────┴─────────────┐ 45 | │ RepartitionExec │ 46 | │ -------------------- │ 47 | │ partition_count(in->out): │ 48 | │ 8 -> 8 │ 49 | │ │ 50 | │ partitioning_scheme: │ 51 | │ Hash([cntrycode@0], 8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: │ 57 | │ count(1), sum(custsale │ 58 | │ .c_acctbal) │ 59 | │ │ 60 | │ group_by: cntrycode │ 61 | │ mode: Partial │ 62 | └─────────────┬─────────────┘ 63 | ┌─────────────┴─────────────┐ 64 | │ ProjectionExec │ 65 | │ -------------------- │ 66 | │ c_acctbal: c_acctbal │ 67 | │ │ 68 | │ cntrycode: │ 69 | │ substr(c_phone, 1, 2) │ 70 | └─────────────┬─────────────┘ 71 | ┌─────────────┴─────────────┐ 72 | │ NestedLoopJoinExec ├──────────────┐ 73 | └─────────────┬─────────────┘ │ 74 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 75 | │ AggregateExec ││ RepartitionExec │ 76 | │ -------------------- ││ -------------------- │ 77 | │ aggr: ││ partition_count(in->out): │ 78 | │ avg(customer.c_acctbal) ││ 1 -> 8 │ 79 | │ ││ │ 80 | │ mode: Final ││ partitioning_scheme: │ 81 | │ ││ RoundRobinBatch(8) │ 82 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 83 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 84 | │ CoalescePartitionsExec ││ CoalesceBatchesExec │ 85 | │ ││ -------------------- │ 86 | │ ││ target_batch_size: │ 87 | │ ││ 16384 │ 88 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 89 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 90 | │ LiquidCacheClientExec ││ HashJoinExec │ 91 | │ -------------------- ││ -------------------- │ 92 | │ server: ││ join_type: LeftAnti │ 93 | │ http://localhost:50051, ││ ├──────────────┐ 94 | │ mode=liquid, ││ on: │ │ 95 | │ object_stores=[] ││ (c_custkey = o_custkey) │ │ 96 | └─────────────┬─────────────┘└─────────────┬─────────────┘ │ 97 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 98 | │ AggregateExec ││ LiquidCacheClientExec ││ LiquidCacheClientExec │ 99 | │ -------------------- ││ -------------------- ││ -------------------- │ 100 | │ aggr: ││ server: ││ server: │ 101 | │ avg(customer.c_acctbal) ││ http://localhost:50051, ││ http://localhost:50051, │ 102 | │ ││ mode=liquid, ││ mode=liquid, │ 103 | │ mode: 
Partial ││ object_stores=[] ││ object_stores=[] │ 104 | └─────────────┬─────────────┘└─────────────┬─────────────┘└─────────────┬─────────────┘ 105 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 106 | │ RepartitionExec ││ DataSourceExec ││ DataSourceExec │ 107 | │ -------------------- ││ -------------------- ││ -------------------- │ 108 | │ partition_count(in->out): ││ files: 1 ││ files: 1 │ 109 | │ 1 -> 8 ││ format: parquet ││ format: parquet │ 110 | │ ││ ││ │ 111 | │ partitioning_scheme: ││ predicate: ││ │ 112 | │ RoundRobinBatch(8) ││ substr(c_phone, 1, 2) IN ││ │ 113 | │ ││ (13, 31, 23, 29, 30, 18, ││ │ 114 | │ ││ 17) AND true ││ │ 115 | └─────────────┬─────────────┘└───────────────────────────┘└───────────────────────────┘ 116 | ┌─────────────┴─────────────┐ 117 | │ DataSourceExec │ 118 | │ -------------------- │ 119 | │ files: 1 │ 120 | │ format: parquet │ 121 | │ │ 122 | │ predicate: │ 123 | │ c_acctbal > Some(0),15,2 │ 124 | │ AND substr(c_phone, 1, │ 125 | │ 2) IN (13, 31, 23, 29, │ 126 | │ 30, 18, 17) │ 127 | └───────────────────────────┘ 128 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q4.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ SortPreservingMergeExec │ 7 | │ -------------------- │ 8 | │ o_orderpriority ASC NULLS │ 9 | │ LAST │ 10 | └─────────────┬─────────────┘ 11 | ┌─────────────┴─────────────┐ 12 | │ SortExec │ 13 | │ -------------------- │ 14 | │ o_orderpriority@0 ASC │ 15 | │ NULLS LAST │ 16 | └─────────────┬─────────────┘ 17 | ┌─────────────┴─────────────┐ 18 | │ ProjectionExec │ 19 | │ -------------------- │ 20 | │ o_orderpriority: │ 21 | │ o_orderpriority │ 22 | │ │ 23 | │ order_count: │ 24 | │ count(Int64(1)) │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ AggregateExec │ 28 | │ -------------------- │ 29 | │ aggr: count(1) │ 30 | │ │ 31 | │ group_by: │ 32 | │ o_orderpriority │ 33 | │ │ 34 | │ mode: │ 35 | │ FinalPartitioned │ 36 | └─────────────┬─────────────┘ 37 | ┌─────────────┴─────────────┐ 38 | │ CoalesceBatchesExec │ 39 | │ -------------------- │ 40 | │ target_batch_size: │ 41 | │ 16384 │ 42 | └─────────────┬─────────────┘ 43 | ┌─────────────┴─────────────┐ 44 | │ RepartitionExec │ 45 | │ -------------------- │ 46 | │ partition_count(in->out): │ 47 | │ 8 -> 8 │ 48 | │ │ 49 | │ partitioning_scheme: │ 50 | │ Hash([o_orderpriority@0], │ 51 | │ 8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ AggregateExec │ 55 | │ -------------------- │ 56 | │ aggr: count(1) │ 57 | │ │ 58 | │ group_by: │ 59 | │ o_orderpriority │ 60 | │ │ 61 | │ mode: Partial │ 62 | └─────────────┬─────────────┘ 63 | ┌─────────────┴─────────────┐ 64 | │ CoalesceBatchesExec │ 65 | │ -------------------- │ 66 | │ target_batch_size: │ 67 | │ 16384 │ 68 | └─────────────┬─────────────┘ 69 | ┌─────────────┴─────────────┐ 70 | │ HashJoinExec │ 71 | │ -------------------- │ 72 | │ join_type: LeftSemi │ 73 | │ ├──────────────┐ 74 | │ on: │ │ 75 | │ (o_orderkey = l_orderkey) │ │ 76 | └─────────────┬─────────────┘ │ 77 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 78 | │ LiquidCacheClientExec ││ LiquidCacheClientExec │ 79 | │ -------------------- ││ -------------------- │ 80 | │ server: ││ server: 
│ 81 | │ http://localhost:50051, ││ http://localhost:50051, │ 82 | │ mode=liquid, ││ mode=liquid, │ 83 | │ object_stores=[] ││ object_stores=[] │ 84 | └─────────────┬─────────────┘└─────────────┬─────────────┘ 85 | ┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 86 | │ DataSourceExec ││ RepartitionExec │ 87 | │ -------------------- ││ -------------------- │ 88 | │ files: 1 ││ partition_count(in->out): │ 89 | │ format: parquet ││ 1 -> 8 │ 90 | │ ││ │ 91 | │ predicate: ││ partitioning_scheme: │ 92 | │ o_orderdate >= 1993-07-01 ││ RoundRobinBatch(8) │ 93 | │ AND o_orderdate < 1993 ││ │ 94 | │ -10-01 ││ │ 95 | └───────────────────────────┘└─────────────┬─────────────┘ 96 | ┌─────────────┴─────────────┐ 97 | │ DataSourceExec │ 98 | │ -------------------- │ 99 | │ files: 1 │ 100 | │ format: parquet │ 101 | │ │ 102 | │ predicate: │ 103 | │ l_receiptdate > │ 104 | │ l_commitdate │ 105 | └───────────────────────────┘ 106 | -------------------------------------------------------------------------------- /src/client/src/tests/snapshots/liquid_cache_client__tests__tpch_q6.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/client/src/tests/mod.rs 3 | expression: displayable.tree_render().to_string() 4 | --- 5 | ┌───────────────────────────┐ 6 | │ ProjectionExec │ 7 | │ -------------------- │ 8 | │ revenue: │ 9 | │ sum(lineitem │ 10 | │ .l_extendedp │ 11 | │rice * lineitem.l_discount)│ 12 | └─────────────┬─────────────┘ 13 | ┌─────────────┴─────────────┐ 14 | │ AggregateExec │ 15 | │ -------------------- │ 16 | │ aggr: │ 17 | │ sum(lineitem │ 18 | │ .l_extendedp │ 19 | │rice * lineitem.l_discount)│ 20 | │ │ 21 | │ mode: Final │ 22 | └─────────────┬─────────────┘ 23 | ┌─────────────┴─────────────┐ 24 | │ CoalescePartitionsExec │ 25 | └─────────────┬─────────────┘ 26 | ┌─────────────┴─────────────┐ 27 | │ LiquidCacheClientExec │ 28 | │ -------------------- │ 29 | │ server: │ 30 | │ http://localhost:50051, │ 31 | │ mode=liquid, │ 32 | │ object_stores=[] │ 33 | └─────────────┬─────────────┘ 34 | ┌─────────────┴─────────────┐ 35 | │ AggregateExec │ 36 | │ -------------------- │ 37 | │ aggr: │ 38 | │ sum(lineitem │ 39 | │ .l_extendedp │ 40 | │rice * lineitem.l_discount)│ 41 | │ │ 42 | │ mode: Partial │ 43 | └─────────────┬─────────────┘ 44 | ┌─────────────┴─────────────┐ 45 | │ RepartitionExec │ 46 | │ -------------------- │ 47 | │ partition_count(in->out): │ 48 | │ 1 -> 8 │ 49 | │ │ 50 | │ partitioning_scheme: │ 51 | │ RoundRobinBatch(8) │ 52 | └─────────────┬─────────────┘ 53 | ┌─────────────┴─────────────┐ 54 | │ DataSourceExec │ 55 | │ -------------------- │ 56 | │ files: 1 │ 57 | │ format: parquet │ 58 | │ │ 59 | │ predicate: │ 60 | │ l_shipdate >= 1994-01-01 │ 61 | │ AND l_shipdate < 1995 │ 62 | │ -01-01 AND l_discount >= │ 63 | │ Some(5),15,2 AND │ 64 | │ l_discount <= │ 65 | │ Some(7),15,2 AND │ 66 | │ l_quantity < Some │ 67 | │ (2400),15,2 │ 68 | └───────────────────────────┘ 69 | -------------------------------------------------------------------------------- /src/common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-common" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | [dependencies] 11 | arrow-flight = { workspace = true } 12 | arrow-schema = { workspace = true } 13 | 
arrow = { workspace = true } 14 | prost = { workspace = true } 15 | bytes = { workspace = true } 16 | url = { workspace = true } 17 | serde = { workspace = true } 18 | 19 | [dev-dependencies] 20 | tempfile = { workspace = true } 21 | -------------------------------------------------------------------------------- /src/common/src/utils.rs: -------------------------------------------------------------------------------- 1 | use url::Url; 2 | 3 | /// Sanitize an object store URL for use as a directory name. 4 | pub fn sanitize_object_store_url_for_dirname(url: &Url) -> String { 5 | let mut parts = vec![url.scheme()]; 6 | 7 | if let Some(host) = url.host_str() { 8 | parts.push(host); 9 | } 10 | 11 | let dirname = parts.join("_"); 12 | 13 | dirname.replace(['/', ':', '?', '&', '=', '\\'], "_") 14 | } 15 | 16 | /// Sanitize a path for use as a directory name. 17 | pub fn sanitize_path_for_dirname(path: &str) -> String { 18 | path.replace(['/', ':', '?', '&', '=', '\\'], "_") 19 | } 20 | 21 | #[cfg(test)] 22 | mod tests { 23 | use super::*; 24 | use std::fs; 25 | use tempfile::TempDir; 26 | use url::Url; 27 | 28 | #[test] 29 | fn test_can_create_directories_with_sanitized_names() { 30 | // Create a temporary directory for testing 31 | let temp_dir = TempDir::new().expect("Failed to create temp directory"); 32 | 33 | // Array of problematic URLs to test 34 | let test_urls = [ 35 | "http://example.com/path/to/resource", 36 | "https://example.com?param1=value1&param2=value2", 37 | "s3://bucket-name/object/key", 38 | "https://user:password@example.com:8080/path?query=value#fragment", 39 | "file:///C:/Windows/System32/", 40 | "https://example.com/path/with/special?chars=%20%26%3F", 41 | "http://192.168.1.1:8080/admin?debug=true", 42 | "ftp://files.example.com/pub/file.txt", 43 | // Unicode characters in URL 44 | "https://例子.测试", 45 | // Very long URL 46 | &format!("https://example.com/{}", "a".repeat(200)), 47 | ]; 48 | 49 | // Test each URL 50 | for url_str in test_urls { 51 | let url = Url::parse(url_str).expect("Failed to parse URL"); 52 | let dirname = sanitize_object_store_url_for_dirname(&url); 53 | 54 | // Create a directory using the sanitized name 55 | let dir_path = temp_dir.path().join(dirname); 56 | fs::create_dir(&dir_path).expect("Failed to create directory"); 57 | 58 | // Verify the directory exists 59 | assert!(dir_path.exists()); 60 | assert!(dir_path.is_dir()); 61 | 62 | // Clean up 63 | fs::remove_dir(&dir_path).expect("Failed to remove test directory"); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/liquid_parquet/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-parquet" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | [dependencies] 11 | arrow = { workspace = true } 12 | arrow-schema = { workspace = true } 13 | parquet = { workspace = true } 14 | datafusion = { workspace = true } 15 | async-trait = { workspace = true } 16 | futures = { workspace = true } 17 | tokio = { workspace = true } 18 | url = { workspace = true } 19 | ahash = { workspace = true } 20 | fsst-rs = "0.5.2" 21 | serde = { workspace = true } 22 | bytes = { workspace = true } 23 | log = { workspace = true } 24 | object_store = { workspace = true } 25 | itertools = { workspace = true } 26 | fastlanes = 
"0.1.8" 27 | num-traits = "0.2.19" 28 | zerocopy = { version = "0.8.25", features = ["derive"] } 29 | liquid-cache-common = { workspace = true } 30 | fastrace = { workspace = true } 31 | fastrace-futures = { workspace = true } 32 | congee = { workspace = true } 33 | 34 | [dev-dependencies] 35 | tempfile = "3.20.0" 36 | criterion = "0.6.0" 37 | rand = "0.9.1" 38 | shuttle = "0.8.0" 39 | tracing-subscriber = "0.3.19" 40 | paste = "1.0.15" 41 | 42 | [features] 43 | shuttle = [] 44 | 45 | 46 | [[bench]] 47 | name = "bitpacking" 48 | path = "bench/bitpacking.rs" 49 | harness = false 50 | 51 | [[bench]] 52 | name = "liquid_float_array" 53 | harness = false 54 | path = "bench/liquid_float_array.rs" 55 | 56 | [[bench]] 57 | name = "boolean_and_then" 58 | harness = false 59 | path = "bench/boolean_and_then.rs" 60 | 61 | [[bench]] 62 | name = "bench_eviction" 63 | path = "bench/bench_eviction.rs" 64 | 65 | [[bench]] 66 | name = "fsstarray" 67 | path = "bench/fsstarray.rs" 68 | harness = false 69 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/bench_eviction.rs: -------------------------------------------------------------------------------- 1 | mod eviction_cache; 2 | use eviction_cache::{Cache, ClockCache, FifoCache, LfuCache, LruCache}; 3 | use std::collections::HashSet; 4 | use std::fs::File; 5 | use std::io::{BufRead, BufReader}; 6 | 7 | fn pack_u16s(a: u16, b: u16, c: u16) -> u64 { 8 | ((a as u64) << 32) | ((b as u64) << 16) | (c as u64) 9 | } 10 | 11 | fn bench(total_size: u64, create: impl Fn(u64) -> C, name: String) { 12 | let mut cache_size = total_size; 13 | 14 | while cache_size > 0 { 15 | let mut cache = create(cache_size); 16 | let file = File::open("./cache_trace.csv").expect("Failed to reopen cache_trace.csv"); 17 | let reader = BufReader::new(file); 18 | 19 | for line in reader.lines().skip(1) { 20 | let line = line.expect("Failed to read line"); 21 | let fields: Vec<&str> = line.split(',').collect(); 22 | if fields.len() == 6 { 23 | let file_id: u16 = fields[0].parse().expect("Failed to parse file_id"); 24 | let row_group: u16 = fields[1].parse().expect("Failed to parse row_group"); 25 | let col: u16 = fields[2].parse().expect("Failed to parse col"); 26 | let size: u64 = fields[4].parse().expect("Failed to parse size"); 27 | 28 | let key = pack_u16s(file_id, row_group, col); 29 | 30 | cache.get(key, size); 31 | } 32 | } 33 | 34 | let (hits, total) = cache.result(); 35 | println!("{},{},{},{}", name, cache_size, hits, total); 36 | 37 | cache_size /= 10; 38 | } 39 | } 40 | 41 | fn main() { 42 | // Read and parse the cache trace file 43 | let file = File::open("./cache_trace.csv").expect("Failed to open cache_trace.csv"); 44 | let reader = BufReader::new(file); 45 | 46 | let mut total_size: u64 = 0; 47 | let mut count = 0; 48 | let mut cols = HashSet::new(); 49 | 50 | for line in reader.lines().skip(1) { 51 | let line = line.expect("Failed to read line"); 52 | let fields: Vec<&str> = line.split(',').collect(); 53 | if fields.len() == 6 { 54 | let file_id: u16 = fields[0].parse().expect("Failed to parse file_id"); 55 | let row_group: u16 = fields[1].parse().expect("Failed to parse row_group"); 56 | let col: u16 = fields[2].parse().expect("Failed to parse col"); 57 | let size: u64 = fields[4].parse().expect("Failed to parse size"); 58 | 59 | let new = cols.insert(pack_u16s(file_id, row_group, col)); 60 | if new { 61 | total_size += size; 62 | } 63 | count += 1; 64 | } 65 | } 66 | 67 | println!("Read {} inserts, total size: {}", 
count, total_size); 68 | 69 | bench(total_size, LruCache::new, "LRU".to_string()); 70 | bench(total_size, ClockCache::new, "CLOCK".to_string()); 71 | bench(total_size, LfuCache::new, "LFU".to_string()); 72 | bench(total_size, FifoCache::new, "FIFO".to_string()); 73 | } 74 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/bitpacking.rs: -------------------------------------------------------------------------------- 1 | use criterion::Throughput; 2 | use criterion::*; 3 | 4 | use std::num::NonZero; 5 | 6 | use arrow::array::PrimitiveArray; 7 | use liquid_cache_parquet::liquid_array::raw::BitPackedArray; 8 | use rand::Rng; 9 | 10 | const MAX_BIT_WIDTH: u8 = 32; 11 | const MAX_ARRAY_MULTIPLIER: usize = 8; 12 | const BASE_ARRAY_SIZE: usize = 8192; 13 | 14 | // Function to create a random vector of u32 values with a given size and bit width 15 | fn create_random_vec(array_size: usize, bit_width: u8) -> Vec<u32> { 16 | let max_value = if bit_width >= 32 { u32::MAX } else { (1u32 << bit_width) - 1 }; // guard: `1u32 << 32` would overflow the shift 17 | let mut rng = rand::rng(); 18 | let values: Vec<u32> = (0..array_size) 19 | .map(|_| rng.random_range(0..=max_value)) 20 | .collect(); 21 | values 22 | } 23 | 24 | // Benchmark function to measure the performance of from_primitive 25 | fn from_primitive_benchmark(c: &mut Criterion) { 26 | use arrow::datatypes::UInt32Type; 27 | 28 | // `bit_widths` represents the range of bit widths to test (1 through MAX_BIT_WIDTH). 29 | // Each bit width determines the maximum value that can be represented in the random vector. 30 | // For example, a bit width of 8 allows values in the range [0, 255]. 31 | let bit_widths: Vec<u8> = (1..=MAX_BIT_WIDTH).step_by(4).collect(); 32 | for bit_width in bit_widths { 33 | // `array_sizes` represents the range of array sizes to test. 34 | // Each size is a multiple of BASE_ARRAY_SIZE (e.g., 8192, 16384, etc.)
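// With BASE_ARRAY_SIZE = 8192 and MAX_ARRAY_MULTIPLIER = 8, that is 8192, 16384, ..., 65536 elements.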
35 | let array_sizes: Vec<usize> = (1..=MAX_ARRAY_MULTIPLIER) 36 | .map(|i| BASE_ARRAY_SIZE * i) 37 | .collect(); 38 | for array_size in array_sizes { 39 | let values: Vec<u32> = create_random_vec(array_size, bit_width); 40 | 41 | // Convert the random vector into a PrimitiveArray 42 | let array = PrimitiveArray::<UInt32Type>::from(values); 43 | let bit_width = NonZero::new(bit_width).unwrap(); 44 | 45 | // Benchmark from_primitive() - the conversion from PrimitiveArray to BitPackedArray 46 | let mut group = c.benchmark_group(format!("from_primitive_bw_{}", bit_width)); 47 | group.throughput(Throughput::Bytes( 48 | (array_size * std::mem::size_of::<u32>()) as u64, 49 | )); 50 | group.bench_function(format!("size_{}", array_size), |b| { 51 | b.iter(|| { 52 | std::hint::black_box(BitPackedArray::from_primitive(array.clone(), bit_width)) 53 | }) 54 | }); 55 | group.finish(); 56 | } 57 | } 58 | } 59 | 60 | // Benchmark function to measure the performance of to_primitive 61 | fn to_primitive_benchmark(c: &mut Criterion) { 62 | use arrow::datatypes::UInt32Type; 63 | 64 | let bit_widths: Vec<u8> = (1..=MAX_BIT_WIDTH).collect(); 65 | for bit_width in bit_widths { 66 | let array_sizes: Vec<usize> = (1..=MAX_ARRAY_MULTIPLIER) 67 | .map(|i| BASE_ARRAY_SIZE * i) 68 | .collect(); 69 | for array_size in array_sizes { 70 | let values: Vec<u32> = create_random_vec(array_size, bit_width); 71 | 72 | // Convert the random vector into a PrimitiveArray 73 | let array = PrimitiveArray::<UInt32Type>::from(values); 74 | let bit_width = NonZero::new(bit_width).unwrap(); 75 | let bit_packed = BitPackedArray::from_primitive(array, bit_width); 76 | 77 | // Benchmark to_primitive() - the conversion from a BitPackedArray to PrimitiveArray 78 | let mut group = c.benchmark_group(format!("to_primitive_bw_{}", bit_width)); 79 | group.throughput(Throughput::Bytes( 80 | (array_size * std::mem::size_of::<u32>()) as u64, 81 | )); 82 | group.bench_function(format!("size_{}", array_size), |b| { 83 | b.iter(|| std::hint::black_box(bit_packed.to_primitive())) 84 | }); 85 | group.finish(); 86 | } 87 | } 88 | } 89 | 90 | criterion_group!(benches, from_primitive_benchmark, to_primitive_benchmark); 91 | 92 | // Entry point for Criterion benchmarking 93 | criterion_main!(benches); 94 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/boolean_and_then.rs: -------------------------------------------------------------------------------- 1 | use arrow::{array::BooleanBufferBuilder, buffer::BooleanBuffer}; 2 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 3 | use liquid_cache_parquet::boolean_buffer_and_then; 4 | 5 | use rand::Rng; 6 | 7 | const BUFFER_SIZE: usize = 8192 * 2; // 16384 8 | 9 | /// Generate a BooleanBuffer with specified selectivity (percentage of true bits) 10 | fn generate_boolean_buffer(size: usize, selectivity: f64) -> BooleanBuffer { 11 | let mut rng = rand::rng(); 12 | let mut builder = BooleanBufferBuilder::new(size); 13 | 14 | for _ in 0..size { 15 | let should_set = rng.random_bool(selectivity); 16 | builder.append(should_set); 17 | } 18 | 19 | builder.finish() 20 | } 21 | 22 | /// Generate a right BooleanBuffer that has exactly `count_set_bits` bits 23 | fn generate_right_boolean_buffer(count_set_bits: usize, selectivity: f64) -> BooleanBuffer { 24 | let mut rng = rand::rng(); 25 | let mut builder = BooleanBufferBuilder::new(count_set_bits); 26 | 27 | for _ in 0..count_set_bits { 28 | let should_set = rng.random_bool(selectivity); 29 | builder.append(should_set); 30 | } 31 | 32 | 
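// `boolean_buffer_and_then` applies the right-hand mask only to positions already selected by the left-hand one, so the right buffer needs exactly one bit per set bit of the left (an inference from how the benchmark below wires the two buffers together).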
builder.finish() 33 | } 34 | 35 | fn benchmark_boolean_and_then(c: &mut Criterion) { 36 | // Three selectivity levels: low (10%), medium (50%), high (90%) 37 | let selectivities = [0.1, 0.5, 0.9]; 38 | 39 | for left_selectivity in selectivities { 40 | for right_selectivity in selectivities { 41 | let group_name = format!( 42 | "boolean_and_then_left_{:.0}%_right_{:.0}%", 43 | left_selectivity * 100.0, 44 | right_selectivity * 100.0 45 | ); 46 | 47 | let mut group = c.benchmark_group(&group_name); 48 | 49 | // Set throughput based on the buffer size in bytes 50 | // Each boolean buffer uses approximately size/8 bytes 51 | group.throughput(Throughput::Bytes((BUFFER_SIZE / 8) as u64)); 52 | 53 | group.bench_function("size_16384", |b| { 54 | // Pre-generate test data 55 | let left = generate_boolean_buffer(BUFFER_SIZE, left_selectivity); 56 | let count_set_bits = left.count_set_bits(); 57 | let right = generate_right_boolean_buffer(count_set_bits, right_selectivity); 58 | 59 | b.iter(|| std::hint::black_box(boolean_buffer_and_then(&left, &right))) 60 | }); 61 | 62 | group.finish(); 63 | } 64 | } 65 | } 66 | 67 | criterion_group!(benches, benchmark_boolean_and_then); 68 | criterion_main!(benches); 69 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/fsstarray.rs: -------------------------------------------------------------------------------- 1 | use criterion::*; 2 | use std::sync::Arc; 3 | use std::time::Duration; 4 | 5 | extern crate arrow; 6 | 7 | use arrow::{ 8 | array::{Array, StringArray, StringBuilder}, 9 | datatypes::Utf8Type, 10 | }; 11 | use liquid_cache_parquet::liquid_array::raw::FsstArray; 12 | use std::fs; 13 | 14 | const CHUNK_SIZE: [usize; 5] = [12, 32, 64, 128, 256]; 15 | 16 | fn create_string_arrays_from_file() -> Vec<(usize, StringArray)> { 17 | const TEST_FILE_PATH: &str = "../../README.md"; 18 | const LICENSE_FILE_PATH: &str = "../../LICENSE"; 19 | 20 | let readme = fs::read_to_string(TEST_FILE_PATH).expect("Failed to read file"); 21 | let license = fs::read_to_string(LICENSE_FILE_PATH).expect("Failed to read file"); 22 | let content = format!("{}\n\n{}", readme, license); 23 | 24 | let mut result = Vec::new(); 25 | 26 | let chars: Vec<char> = content.chars().collect(); 27 | 28 | for &chunk_size in &CHUNK_SIZE { 29 | let mut builder = StringBuilder::new(); 30 | for chunk in chars.chunks(chunk_size) { 31 | let chunk_str: String = chunk.iter().collect(); 32 | builder.append_value(chunk_str); 33 | } 34 | result.push((chunk_size, builder.finish())); 35 | } 36 | 37 | result 38 | } 39 | 40 | // Benchmark for training the FSST compressor 41 | fn compressor_benchmark(c: &mut Criterion) { 42 | let string_arrays = create_string_arrays_from_file(); 43 | 44 | let mut group = c.benchmark_group("fsst"); 45 | for (chunk_size, string_array) in string_arrays { 46 | let total_size = chunk_size * string_array.len(); 47 | // Set the measurement time for the benchmark 48 | group.measurement_time(Duration::new(10, 0)); 49 | 50 | // Set the throughput for the benchmark 51 | group.throughput(Throughput::Bytes(total_size as u64)); 52 | 53 | // Benchmark the FSST compressor training 54 | group.bench_function( 55 | format!("train_compressor - chunk_size: {}", chunk_size), 56 | |b| { 57 | b.iter(|| { 58 | let input = std::hint::black_box( 59 | string_array.iter().flat_map(|s| s.map(|a| a.as_bytes())), 60 | ); 61 | FsstArray::train_compressor(input) 62 | }); 63 | }, 64 | ); 65 | } 66 | group.finish(); 67 | } 68 | 69 | // Benchmark for creating an 
FSST array from a byte array using a pre-trained compressor 70 | fn from_byte_array_with_compressor_benchmark(c: &mut Criterion) { 71 | let string_arrays = create_string_arrays_from_file(); 72 | 73 | let mut group = c.benchmark_group("fsst"); 74 | for (chunk_size, string_array) in string_arrays { 75 | // Train the FSST compressor 76 | let compressor = 77 | FsstArray::train_compressor(string_array.iter().flat_map(|s| s.map(|s| s.as_bytes()))); 78 | 79 | let compressed = 80 | FsstArray::from_byte_array_with_compressor(&string_array, Arc::new(compressor.clone())); 81 | let compressed_size = compressed.get_array_memory_size(); 82 | let uncompressed_size = chunk_size * string_array.len(); 83 | println!( 84 | "compressed_size: {}, uncompressed_size: {}, compression_ratio: {}", 85 | compressed_size, 86 | uncompressed_size, 87 | compressed_size as f64 / uncompressed_size as f64 88 | ); 89 | 90 | // Set the throughput for the benchmark 91 | group.throughput(Throughput::Bytes(uncompressed_size as u64)); 92 | 93 | // Benchmark the creation of an FSST array from a byte array 94 | group.bench_function(format!("compress - chunk_size: {}", chunk_size), |b| { 95 | b.iter(|| { 96 | std::hint::black_box(FsstArray::from_byte_array_with_compressor( 97 | &string_array, 98 | Arc::new(compressor.clone()), 99 | )) 100 | }); 101 | }); 102 | } 103 | group.finish(); 104 | } 105 | 106 | // Benchmark for converting an FSST array to an Arrow byte array 107 | fn to_arrow_byte_array_benchmark(c: &mut Criterion) { 108 | let string_arrays = create_string_arrays_from_file(); 109 | 110 | let mut group = c.benchmark_group("fsst"); 111 | for (chunk_size, string_array) in string_arrays { 112 | // Train the FSST compressor 113 | let compressor = 114 | FsstArray::train_compressor(string_array.iter().flat_map(|s| s.map(|s| s.as_bytes()))); 115 | 116 | // Create an FSST array using the trained compressor 117 | let fsst_values = 118 | FsstArray::from_byte_array_with_compressor(&string_array, Arc::new(compressor)); 119 | 120 | let total_size = chunk_size * string_array.len(); 121 | 122 | // Set the throughput for the benchmark 123 | group.throughput(Throughput::Bytes(total_size as u64)); 124 | 125 | // Benchmark the conversion of FSST array to Arrow byte array 126 | group.bench_function(format!("decompress - chunk_size: {}", chunk_size), |b| { 127 | b.iter(|| std::hint::black_box(fsst_values.to_arrow_byte_array::<Utf8Type>())); 128 | }); 129 | } 130 | group.finish(); 131 | } 132 | 133 | // Define the benchmark group 134 | criterion_group!( 135 | benches, 136 | compressor_benchmark, 137 | from_byte_array_with_compressor_benchmark, 138 | to_arrow_byte_array_benchmark 139 | ); 140 | 141 | // Entry point for Criterion benchmarking 142 | criterion_main!(benches); 143 | -------------------------------------------------------------------------------- /src/liquid_parquet/bench/liquid_float_array.rs: -------------------------------------------------------------------------------- 1 | use criterion::{Criterion, Throughput, criterion_group, criterion_main}; 2 | use datafusion::arrow::{ 3 | array::PrimitiveArray, 4 | buffer::ScalarBuffer, 5 | datatypes::{Float32Type, Float64Type}, 6 | }; 7 | use liquid_cache_parquet::liquid_array::{LiquidArray, LiquidFloatArray}; 8 | use rand::Rng; 9 | 10 | fn criterion_benchmark(c: &mut Criterion) { 11 | // Encoding benchmarks for float32 12 | let bench_sizes = [8192, 16384, 24576]; 13 | for size in bench_sizes { 14 | let mut group = c.benchmark_group("float32_liquid_encode"); 15 | 
group.throughput(Throughput::Bytes( 16 | (size * std::mem::size_of::<f32>()) as u64, 17 | )); 18 | group.bench_function(format!("size_{}", size), |b| { 19 | let mut rng = rand::rng(); 20 | let mut array: Vec<f32> = vec![]; 21 | for _ in 0..size { 22 | array.push(rng.random_range(-1.3e3..1.3e3)); 23 | } 24 | let arrow_array = PrimitiveArray::new(ScalarBuffer::from(array), None); 25 | b.iter(|| { 26 | let _x = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array.clone()); 27 | }) 28 | }); 29 | group.finish(); 30 | } 31 | 32 | for size in bench_sizes { 33 | let mut group = c.benchmark_group("float64_liquid_encode"); 34 | group.throughput(Throughput::Bytes( 35 | (size * std::mem::size_of::<f64>()) as u64, 36 | )); 37 | group.bench_function(format!("size_{}", size), |b| { 38 | let mut rng = rand::rng(); 39 | let mut array: Vec<f64> = vec![]; 40 | for _ in 0..size { 41 | array.push(rng.random_range(-1.3e3..1.3e3)); 42 | } 43 | let arrow_array = PrimitiveArray::new(ScalarBuffer::from(array), None); 44 | b.iter(|| { 45 | let _x = LiquidFloatArray::<Float64Type>::from_arrow_array(arrow_array.clone()); 46 | }) 47 | }); 48 | group.finish(); 49 | } 50 | 51 | // Decoding benchmarks for float32 52 | for size in bench_sizes { 53 | let mut rng = rand::rng(); 54 | let mut array: Vec<f32> = vec![]; 55 | for _ in 0..size { 56 | array.push(rng.random_range(-1.3e3..1.3e3)); 57 | } 58 | let arrow_array = PrimitiveArray::<Float32Type>::new(ScalarBuffer::from(array), None); 59 | let liquid_array = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array); 60 | 61 | let mut group = c.benchmark_group("float32_liquid_decode"); 62 | group.throughput(Throughput::Bytes( 63 | (size * std::mem::size_of::<f32>()) as u64, 64 | )); 65 | group.bench_function(format!("size_{}", size), |b| { 66 | b.iter(|| { 67 | let _x = liquid_array.to_arrow_array(); 68 | }) 69 | }); 70 | group.finish(); 71 | } 72 | 73 | // Decoding benchmarks for float64 74 | for size in bench_sizes { 75 | let mut rng = rand::rng(); 76 | let mut array: Vec<f64> = vec![]; 77 | for _ in 0..size { 78 | array.push(rng.random_range(-1.3e3..1.3e3)); 79 | } 80 | let arrow_array = PrimitiveArray::<Float64Type>::new(ScalarBuffer::from(array), None); 81 | let liquid_array = LiquidFloatArray::<Float64Type>::from_arrow_array(arrow_array); 82 | 83 | let mut group = c.benchmark_group("float64_liquid_decode"); 84 | group.throughput(Throughput::Bytes( 85 | (size * std::mem::size_of::<f64>()) as u64, 86 | )); 87 | group.bench_function(format!("size_{}", size), |b| { 88 | b.iter(|| { 89 | let _x = liquid_array.to_arrow_array(); 90 | }) 91 | }); 92 | group.finish(); 93 | } 94 | } 95 | 96 | criterion_group!(benches, criterion_benchmark); 97 | criterion_main!(benches); 98 | -------------------------------------------------------------------------------- /src/liquid_parquet/clippy.toml: -------------------------------------------------------------------------------- 1 | disallowed-methods = [] 2 | 3 | disallowed-types = [ 4 | { path = "dashmap::DashMap", reason = "DashMap can easily lead to deadlocks, use RwLock with shuttle tests instead" }, 5 | ] 6 | 7 | # Lowering the threshold to help prevent stack overflows (default is 16384) 8 | # See: https://rust-lang.github.io/rust-clippy/master/index.html#/large_futures 9 | future-size-threshold = 10000 10 | too-many-lines-threshold = 50 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/cache/budget.rs: -------------------------------------------------------------------------------- 1 | use crate::sync::atomic::{AtomicUsize, Ordering}; 2 | 3 | use 
log::warn; 4 | 5 | #[derive(Debug)] 6 | pub(super) struct BudgetAccounting { 7 | max_memory_bytes: usize, 8 | used_memory_bytes: AtomicUsize, 9 | used_disk_bytes: AtomicUsize, 10 | } 11 | 12 | impl BudgetAccounting { 13 | pub(super) fn new(max_memory_bytes: usize) -> Self { 14 | Self { 15 | max_memory_bytes, 16 | used_memory_bytes: AtomicUsize::new(0), 17 | used_disk_bytes: AtomicUsize::new(0), 18 | } 19 | } 20 | 21 | pub(super) fn reset_usage(&self) { 22 | self.used_memory_bytes.store(0, Ordering::Relaxed); 23 | self.used_disk_bytes.store(0, Ordering::Relaxed); 24 | } 25 | 26 | /// Try to reserve space in the cache. 27 | /// Returns `Ok(())` if the space was reserved, `Err(())` if the request would exceed the memory budget. 28 | pub(super) fn try_reserve_memory(&self, request_bytes: usize) -> Result<(), ()> { 29 | let used = self.used_memory_bytes.load(Ordering::Relaxed); 30 | if used + request_bytes > self.max_memory_bytes { 31 | return Err(()); 32 | } 33 | 34 | match self.used_memory_bytes.compare_exchange( 35 | used, 36 | used + request_bytes, 37 | Ordering::Relaxed, 38 | Ordering::Relaxed, 39 | ) { 40 | Ok(_) => Ok(()), 41 | Err(_) => self.try_reserve_memory(request_bytes), // lost the CAS race: retry with a fresh load 42 | } 43 | } 44 | 45 | /// Adjust the cache size after transcoding. 46 | /// Returns `Ok(())` if the usage was updated, `Err(())` if growing from `old_size` to `new_size` would exceed the memory budget. 47 | pub(super) fn try_update_memory_usage( 48 | &self, 49 | old_size: usize, 50 | new_size: usize, 51 | ) -> Result<(), ()> { 52 | if old_size < new_size { 53 | let diff = new_size - old_size; 54 | if diff > 1024 * 1024 { 55 | warn!( 56 | "Transcoding increased the size of the array by at least 1MB, previous size: {old_size}, new size: {new_size}, double check this is correct" 57 | ); 58 | } 59 | 60 | self.try_reserve_memory(diff)?; 61 | Ok(()) 62 | } else { 63 | self.used_memory_bytes 64 | .fetch_sub(old_size - new_size, Ordering::Relaxed); 65 | Ok(()) 66 | } 67 | } 68 | 69 | pub fn memory_usage_bytes(&self) -> usize { 70 | self.used_memory_bytes.load(Ordering::Relaxed) 71 | } 72 | 73 | pub fn disk_usage_bytes(&self) -> usize { 74 | self.used_disk_bytes.load(Ordering::Relaxed) 75 | } 76 | 77 | pub fn add_used_disk_bytes(&self, bytes: usize) { 78 | self.used_disk_bytes.fetch_add(bytes, Ordering::Relaxed); 79 | } 80 | } 81 | 82 | #[cfg(test)] 83 | mod tests { 84 | use super::*; 85 | use crate::sync::{Arc, Barrier, thread}; 86 | 87 | #[test] 88 | fn test_memory_reservation_and_accounting() { 89 | let config = BudgetAccounting::new(1000); 90 | 91 | assert_eq!(config.memory_usage_bytes(), 0); 92 | 93 | assert!(config.try_reserve_memory(500).is_ok()); 94 | assert_eq!(config.memory_usage_bytes(), 500); 95 | 96 | assert!(config.try_reserve_memory(300).is_ok()); 97 | assert_eq!(config.memory_usage_bytes(), 800); 98 | 99 | assert!(config.try_reserve_memory(300).is_err()); 100 | assert_eq!(config.memory_usage_bytes(), 800); 101 | 102 | config.reset_usage(); 103 | assert_eq!(config.memory_usage_bytes(), 0); 104 | } 105 | 106 | #[test] 107 | fn test_concurrent_memory_operations() { 108 | test_concurrent_memory_budget(); 109 | } 110 | 111 | #[cfg(feature = "shuttle")] 112 | #[test] 113 | fn shuttle_memory_budget_operations() { 114 | crate::utils::shuttle_test(test_concurrent_memory_budget); 115 | } 116 | 117 | fn test_concurrent_memory_budget() { 118 | let num_threads = 3; 119 | let max_memory = 10000; 120 | let operations_per_thread = 100; 121 | 122 | let budget = Arc::new(BudgetAccounting::new(max_memory)); 123 | let barrier = Arc::new(Barrier::new(num_threads)); 124 | 125 | 
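// Workers race reservations and size adjustments against the shared budget; each thread returns its surviving reservations so the main thread can recompute the expected final usage.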
let mut thread_handles = vec![]; 126 | 127 | for _ in 0..num_threads { 128 | let budget_clone = budget.clone(); 129 | let barrier_clone = barrier.clone(); 130 | 131 | let handle = thread::spawn(move || { 132 | let mut successful_reservations = Vec::new(); 133 | 134 | barrier_clone.wait(); 135 | 136 | for i in 0..operations_per_thread { 137 | let reserve_size = 10 + (i % 20) * 5; // 10 to 105 bytes 138 | if budget_clone.try_reserve_memory(reserve_size).is_ok() { 139 | successful_reservations.push(reserve_size); 140 | } 141 | 142 | if i % 5 == 0 && !successful_reservations.is_empty() { 143 | let idx = i % successful_reservations.len(); 144 | let old_size = successful_reservations[idx]; 145 | let new_size = if i % 2 == 0 { 146 | old_size + 5 // Grow 147 | } else { 148 | old_size.saturating_sub(5) // Shrink 149 | }; 150 | 151 | if budget_clone 152 | .try_update_memory_usage(old_size, new_size) 153 | .is_ok() 154 | { 155 | successful_reservations[idx] = new_size; 156 | } 157 | } 158 | } 159 | successful_reservations 160 | }); 161 | 162 | thread_handles.push(handle); 163 | } 164 | 165 | let mut expected_memory_usage = 0; 166 | for handle in thread_handles { 167 | let reservations = handle.join().unwrap(); 168 | for size in reservations { 169 | expected_memory_usage += size; 170 | } 171 | } 172 | 173 | assert_eq!(budget.memory_usage_bytes(), expected_memory_usage); 174 | assert!(budget.memory_usage_bytes() <= max_memory); 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(missing_docs)] 2 | #![cfg_attr(not(doctest), doc = include_str!(concat!("../", std::env!("CARGO_PKG_README"))))] 3 | 4 | mod cache; 5 | pub mod liquid_array; 6 | mod reader; 7 | mod sync; 8 | pub use cache::policies; 9 | pub use cache::{LiquidCache, LiquidCacheRef, LiquidCachedFileRef}; 10 | pub use reader::LiquidParquetSource; 11 | pub use reader::LiquidPredicate; 12 | pub(crate) mod utils; 13 | pub use utils::boolean_buffer_and_then; 14 | 15 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)] 16 | #[allow(unused)] 17 | enum AblationStudyMode { 18 | FullDecoding = 0, 19 | SelectiveDecoding = 1, 20 | SelectiveWithLateMaterialization = 2, 21 | EvaluateOnEncodedData = 3, 22 | EvaluateOnPartialEncodedData = 4, 23 | } 24 | 25 | // This is deliberately made const to avoid the overhead of runtime branching. 26 | const ABLATION_STUDY_MODE: AblationStudyMode = AblationStudyMode::EvaluateOnPartialEncodedData; 27 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/mod.rs: -------------------------------------------------------------------------------- 1 | //! LiquidArray is the core data structure of LiquidCache. 2 | //! You should not use this module directly. 3 | //! Instead, use `liquid_cache_server` or `liquid_cache_client` to interact with LiquidCache. 
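//!
//! A minimal encode/decode sketch, patterned on this repo's float benchmark (illustrative only):
//!
//! ```ignore
//! use arrow::{array::{Array, PrimitiveArray}, buffer::ScalarBuffer, datatypes::Float32Type};
//! use liquid_cache_parquet::liquid_array::{LiquidArray, LiquidFloatArray};
//!
//! // Encode an Arrow float array into its Liquid representation...
//! let arrow_array = PrimitiveArray::<Float32Type>::new(ScalarBuffer::from(vec![1.0f32, 2.0, 3.0]), None);
//! let liquid = LiquidFloatArray::<Float32Type>::from_arrow_array(arrow_array);
//! // ...and decode it back to an Arrow ArrayRef.
//! let roundtrip = liquid.to_arrow_array();
//! assert_eq!(roundtrip.len(), 3);
//! ```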
4 | mod byte_array; 5 | mod fix_len_byte_array; 6 | mod float_array; 7 | pub(crate) mod ipc; 8 | mod primitive_array; 9 | pub mod raw; 10 | pub(crate) mod utils; 11 | 12 | use std::{any::Any, num::NonZero, sync::Arc}; 13 | 14 | use arrow::array::{ArrayRef, BooleanArray}; 15 | pub use byte_array::LiquidByteArray; 16 | pub use fix_len_byte_array::LiquidFixedLenByteArray; 17 | use float_array::LiquidFloatType; 18 | pub use float_array::{LiquidFloat32Array, LiquidFloat64Array, LiquidFloatArray}; 19 | pub use primitive_array::{ 20 | LiquidDate32Array, LiquidDate64Array, LiquidI8Array, LiquidI16Array, LiquidI32Array, 21 | LiquidI64Array, LiquidPrimitiveArray, LiquidPrimitiveType, LiquidU8Array, LiquidU16Array, 22 | LiquidU32Array, LiquidU64Array, 23 | }; 24 | 25 | /// Liquid data type is only logical type 26 | #[derive(Debug, Clone, Copy)] 27 | #[repr(u16)] 28 | pub enum LiquidDataType { 29 | /// A byte array. 30 | ByteArray = 0, 31 | /// An integer. 32 | Integer = 1, 33 | /// A float. 34 | Float = 2, 35 | /// A fixed length byte array. 36 | FixedLenByteArray = 3, 37 | } 38 | 39 | impl From<u16> for LiquidDataType { 40 | fn from(value: u16) -> Self { 41 | match value { 42 | 0 => LiquidDataType::ByteArray, 43 | 1 => LiquidDataType::Integer, 44 | 2 => LiquidDataType::Float, 45 | 3 => LiquidDataType::FixedLenByteArray, 46 | _ => panic!("Invalid liquid data type: {value}"), 47 | } 48 | } 49 | } 50 | 51 | /// A trait to access the underlying Liquid array. 52 | pub trait AsLiquidArray { 53 | /// Get the underlying string array. 54 | fn as_string_array_opt(&self) -> Option<&LiquidByteArray>; 55 | 56 | /// Get the underlying string array. 57 | fn as_string(&self) -> &LiquidByteArray { 58 | self.as_string_array_opt().expect("liquid string array") 59 | } 60 | 61 | /// Get the underlying binary array. 62 | fn as_binary_array_opt(&self) -> Option<&LiquidByteArray>; 63 | 64 | /// Get the underlying binary array. 65 | fn as_binary(&self) -> &LiquidByteArray { 66 | self.as_binary_array_opt().expect("liquid binary array") 67 | } 68 | 69 | /// Get the underlying primitive array. 70 | fn as_primitive_array_opt<T: LiquidPrimitiveType>(&self) -> Option<&LiquidPrimitiveArray<T>>; 71 | 72 | /// Get the underlying primitive array. 73 | fn as_primitive<T: LiquidPrimitiveType>(&self) -> &LiquidPrimitiveArray<T> { 74 | self.as_primitive_array_opt() 75 | .expect("liquid primitive array") 76 | } 77 | 78 | /// Get the underlying float array. 79 | fn as_float_array_opt<T: LiquidFloatType>(&self) -> Option<&LiquidFloatArray<T>>; 80 | 81 | /// Get the underlying float array. 82 | fn as_float<T: LiquidFloatType>(&self) -> &LiquidFloatArray<T> { 83 | self.as_float_array_opt().expect("liquid float array") 84 | } 85 | } 86 | 87 | impl AsLiquidArray for dyn LiquidArray + '_ { 88 | fn as_string_array_opt(&self) -> Option<&LiquidByteArray> { 89 | self.as_any().downcast_ref() 90 | } 91 | 92 | fn as_primitive_array_opt<T: LiquidPrimitiveType>(&self) -> Option<&LiquidPrimitiveArray<T>> { 93 | self.as_any().downcast_ref() 94 | } 95 | 96 | fn as_binary_array_opt(&self) -> Option<&LiquidByteArray> { 97 | self.as_any().downcast_ref() 98 | } 99 | 100 | fn as_float_array_opt<T: LiquidFloatType>(&self) -> Option<&LiquidFloatArray<T>> { 101 | self.as_any().downcast_ref() 102 | } 103 | } 104 | 105 | /// A Liquid array. 106 | pub trait LiquidArray: std::fmt::Debug + Send + Sync { 107 | /// Get the underlying any type. 108 | fn as_any(&self) -> &dyn Any; 109 | 110 | /// Get the memory size of the Liquid array. 111 | fn get_array_memory_size(&self) -> usize; 112 | 113 | /// Get the length of the Liquid array. 114 | fn len(&self) -> usize; 115 | 116 | /// Check if the Liquid array is empty. 
105 | /// A Liquid array. 106 | pub trait LiquidArray: std::fmt::Debug + Send + Sync { 107 | /// Get the underlying any type. 108 | fn as_any(&self) -> &dyn Any; 109 | 110 | /// Get the memory size of the Liquid array. 111 | fn get_array_memory_size(&self) -> usize; 112 | 113 | /// Get the length of the Liquid array. 114 | fn len(&self) -> usize; 115 | 116 | /// Check if the Liquid array is empty. 117 | fn is_empty(&self) -> bool { 118 | self.len() == 0 119 | } 120 | 121 | /// Convert the Liquid array to an Arrow array. 122 | fn to_arrow_array(&self) -> ArrayRef; 123 | 124 | /// Convert the Liquid array to an Arrow array, 125 | /// except that it picks the best encoding for the resulting Arrow array, 126 | /// so the result may not obey the data type of the original Arrow array. 127 | fn to_best_arrow_array(&self) -> ArrayRef { 128 | self.to_arrow_array() 129 | } 130 | 131 | /// Get the logical data type of the Liquid array. 132 | fn data_type(&self) -> LiquidDataType; 133 | 134 | /// Serialize the Liquid array to a byte array. 135 | fn to_bytes(&self) -> Vec<u8>; 136 | 137 | /// Filter the Liquid array with a boolean array. 138 | fn filter(&self, selection: &BooleanArray) -> LiquidArrayRef; 139 | } 140 | 141 | /// A reference to a Liquid array. 142 | pub type LiquidArrayRef = Arc<dyn LiquidArray>; 143 | 144 | pub(crate) fn get_bit_width(max_value: u64) -> NonZero<u8> { 145 | if max_value <= 1 { 146 | // todo: here we actually should return 0, as we should just use constant encoding. 147 | // but that's not implemented yet. 148 | NonZero::new(1).unwrap() 149 | } else { 150 | NonZero::new(64 - max_value.leading_zeros() as u8).unwrap() 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/raw/mod.rs: -------------------------------------------------------------------------------- 1 | //! Low-level array primitives. 2 | //! You should not use this module directly. 3 | //! Instead, use `liquid_cache_server` or `liquid_cache_client` to interact with LiquidCache. 4 | pub(super) mod bit_pack_array; 5 | pub(super) mod fsst_array; 6 | pub use bit_pack_array::BitPackedArray; 7 | pub use fsst_array::FsstArray; 8 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/liquid_array/utils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub(crate) fn gen_test_decimal_array<T: arrow::datatypes::DecimalType>( 3 | data_type: arrow_schema::DataType, 4 | ) -> arrow::array::PrimitiveArray<T> { 5 | use arrow::{ 6 | array::{AsArray, Int64Builder}, 7 | compute::kernels::cast, 8 | }; 9 | 10 | let mut builder = Int64Builder::new(); 11 | for i in 0..4096i64 { 12 | if i % 97 == 0 { 13 | builder.append_null(); 14 | } else { 15 | let value = if i % 5 == 0 { 16 | i * 1000 + 123 17 | } else if i % 3 == 0 { 18 | 42 19 | } else if i % 7 == 0 { 20 | i * 1_000_000 + 456789 21 | } else { 22 | i * 100 + 42 23 | }; 24 | builder.append_value(value); 25 | } 26 | } 27 | let array = builder.finish(); 28 | cast(&array, &data_type) 29 | .unwrap() 30 | .as_primitive::<T>() 31 | .clone() 32 | } 33 |
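// Illustrative usage of the helper above (not part of the original test
// suite); `Decimal128(18, 2)` comfortably holds the largest generated value:
#[cfg(test)]
mod gen_test_decimal_array_example {
    #[test]
    fn builds_a_decimal128_array() {
        use arrow::array::Array;
        use arrow::datatypes::{DataType, Decimal128Type};
        let array =
            super::gen_test_decimal_array::<Decimal128Type>(DataType::Decimal128(18, 2));
        assert_eq!(array.len(), 4096);
    }
}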
-------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/mod.rs: -------------------------------------------------------------------------------- 1 | /// Everything that happens during query planning time. 2 | mod plantime; 3 | 4 | /// Everything that happens during query execution time. 5 | mod runtime; 6 | 7 | mod utils; 8 | 9 | pub use plantime::LiquidParquetSource; 10 | pub use runtime::LiquidPredicate; 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/plantime/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub(crate) use source::CachedMetaReaderFactory; 3 | pub use source::LiquidParquetSource; 4 | pub(crate) use source::ParquetMetadataCacheReader; 5 | 6 | // This is copied verbatim from DataFusion. 7 | // We should ask DataFusion to make this public. 8 | mod row_filter; 9 | 10 | // This is copied verbatim from DataFusion. 11 | // We should ask DataFusion to make this public. 12 | mod opener; 13 | mod row_group_filter; 14 | mod source; 15 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/runtime/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::liquid_array::LiquidArrayRef; 2 | use arrow::array::{BooleanArray, RecordBatch}; 3 | use arrow_schema::ArrowError; 4 | use in_memory_rg::InMemoryRowGroup; 5 | use parquet::arrow::arrow_reader::ArrowPredicate; 6 | pub(crate) use parquet_bridge::ArrowReaderBuilderBridge; 7 | use parquet_bridge::get_predicate_column_id; 8 | 9 | mod in_memory_rg; 10 | mod liquid_stream; 11 | mod parquet_bridge; 12 | mod reader; 13 | mod utils; 14 | 15 | /// A predicate that can be evaluated on a liquid array. 16 | pub trait LiquidPredicate: ArrowPredicate { 17 | /// Evaluates the predicate on a liquid array. 18 | /// Returns `None` if the predicate is not applicable to the array. 19 | fn evaluate_liquid( 20 | &mut self, 21 | array: &LiquidArrayRef, 22 | ) -> Result<Option<BooleanArray>, ArrowError>; 23 | 24 | /// Evaluates the predicate on an arrow record batch. 25 | fn evaluate_arrow(&mut self, array: RecordBatch) -> Result<BooleanArray, ArrowError> { 26 | self.evaluate(array) 27 | } 28 | 29 | /// Returns the column ids of the predicate. 30 | fn predicate_column_ids(&self) -> Vec<usize> { 31 | let projection = self.projection(); 32 | get_predicate_column_id(projection) 33 | } 34 | } 35 | 36 | pub struct LiquidRowFilter { 37 | pub(crate) predicates: Vec<Box<dyn LiquidPredicate>>, 38 | } 39 | 40 | impl LiquidRowFilter { 41 | pub fn new(predicates: Vec<Box<dyn LiquidPredicate>>) -> Self { 42 | Self { predicates } 43 | } 44 | } 45 |
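// Illustrative only: a minimal `LiquidPredicate` implementation. `AlwaysTrue`
// is hypothetical and passes every row; the real predicates are built at plan
// time and evaluate on the encoded data instead:
#[allow(dead_code)]
struct AlwaysTrue {
    projection: parquet::arrow::ProjectionMask,
}

impl ArrowPredicate for AlwaysTrue {
    fn projection(&self) -> &parquet::arrow::ProjectionMask {
        &self.projection
    }

    fn evaluate(&mut self, batch: RecordBatch) -> Result<BooleanArray, ArrowError> {
        Ok(BooleanArray::from(vec![true; batch.num_rows()]))
    }
}

impl LiquidPredicate for AlwaysTrue {
    fn evaluate_liquid(
        &mut self,
        array: &LiquidArrayRef,
    ) -> Result<Option<BooleanArray>, ArrowError> {
        // `Some(..)` signals that the predicate could be evaluated on the liquid array.
        Ok(Some(BooleanArray::from(vec![true; array.len()])))
    }
}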
-------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/runtime/reader/mod.rs: -------------------------------------------------------------------------------- 1 | use super::LiquidRowFilter; 2 | 3 | mod cached_array_reader; 4 | mod liquid_batch_reader; 5 | pub(crate) use cached_array_reader::build_cached_array_reader; 6 | pub(crate) use liquid_batch_reader::LiquidBatchReader; 7 | pub(super) mod cached_page; 8 | 9 | #[cfg(test)] 10 | mod tests; 11 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/reader/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod boolean_selection; 2 | -------------------------------------------------------------------------------- /src/liquid_parquet/src/sync.rs: -------------------------------------------------------------------------------- 1 | #[cfg(all(feature = "shuttle", test))] 2 | #[allow(unused_imports)] 3 | pub use shuttle::{sync::*, thread}; 4 | #[cfg(not(all(feature = "shuttle", test)))] 5 | #[allow(unused_imports)] 6 | pub use std::{sync::*, thread}; 7 | -------------------------------------------------------------------------------- /src/server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "liquid-cache-server" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | license = { workspace = true } 6 | readme = { workspace = true } 7 | description = { workspace = true } 8 | repository = { workspace = true } 9 | 10 | 11 | [dependencies] 12 | datafusion = { workspace = true } 13 | datafusion-proto = { workspace = true } 14 | futures = { workspace = true } 15 | arrow = { workspace = true } 16 | arrow-flight = { workspace = true } 17 | arrow-schema = { workspace = true } 18 | log = { workspace = true } 19 | prost = { workspace = true } 20 | tonic = { workspace = true } 21 | tokio = { workspace = true } 22 | url = { workspace = true } 23 | liquid-cache-parquet = { workspace = true } 24 | object_store = { workspace = true, features = ["aws", "http"] } 25 | liquid-cache-common = { workspace = true } 26 | async-trait = { workspace = true } 27 | bytes = { workspace = true } 28 | async-stream = "0.3" 29 | tempfile = { workspace = true } 30 | axum = "0.8.4" 31 | serde = { workspace = true } 32 | serde_json = { workspace = true } 33 | tower-http = { version = "0.6.4", features = ["cors"] } 34 | sysinfo = { version = "0.35.1", default-features = false, features = [ 35 | "component", 36 | "disk", 37 | "network", 38 | "system", 39 | "user", 40 | ] } 41 | uuid = { workspace = true } 42 | fastrace = { workspace = true } 43 | fastrace-futures = { workspace = true } 44 | pprof = { version = "0.14.0", features = ["flamegraph"] } 45 | anyhow = "1.0" 46 | 47 | [dev-dependencies] 48 | liquid-cache-client = { workspace = true } 49 | insta = { version = "1.43.1" } 50 | parquet = { workspace = true } 51 | -------------------------------------------------------------------------------- /src/server/src/admin_server/flamegraph.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Mutex; 2 | 3 | use pprof::ProfilerGuard; 4 | 5 | pub(super) struct FlameGraph { 6 | guard: Mutex<Option<ProfilerGuard<'static>>>, 7 | } 8 | 9 | impl FlameGraph { 10 | pub fn new() -> Self { 11 | Self { 12 | guard: Mutex::new(None), 13 | } 14 | } 15 | 16 | pub fn start(&self) { 17 | let mut guard = self.guard.lock().unwrap(); 18 | let old = guard.take(); 19 | assert!(old.is_none(), "FlameGraph is already started"); 20 | *guard = Some( 21 | pprof::ProfilerGuardBuilder::default() 22 | .frequency(500) 23 | .blocklist(&["libpthread.so.0", "libm.so.6", "libgcc_s.so.1"]) 24 | .build() 25 | .unwrap(), 26 | ); 27 | } 28 | 29 | pub fn stop_to_string(&self) -> anyhow::Result<String> { 30 | let mut guard = self.guard.lock().unwrap(); 31 | let old = guard.take(); 32 | if old.is_none() { 33 | return Err(anyhow::anyhow!("FlameGraph is not started")); 34 | } 35 | let profiler = old.unwrap(); 36 | drop(guard); 37 | 38 | let report = profiler.report().build()?; 39 | let mut svg_data = Vec::new(); 40 | report.flamegraph(&mut svg_data)?; 41 | Ok(String::from_utf8(svg_data)?) 42 | } 43 | }
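// Illustrative only: how an admin handler might drive the profiler
// (`workload` stands in for whatever the server is doing):
#[allow(dead_code)]
fn profile_to_svg<F: FnOnce()>(workload: F) -> anyhow::Result<String> {
    let flamegraph = FlameGraph::new();
    flamegraph.start(); // begins sampling at 500 Hz
    workload();
    flamegraph.stop_to_string() // renders the samples as a flamegraph SVG
}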
-------------------------------------------------------------------------------- /src/server/src/admin_server/mod.rs: -------------------------------------------------------------------------------- 1 | //! Admin server for the liquid cache server. 2 | //! 3 | //! This server is used to manage and inspect the liquid cache server at runtime. 4 | 5 | use axum::http::{HeaderValue, Method}; 6 | use axum::{ 7 | Router, 8 | routing::{get, post}, 9 | }; 10 | use flamegraph::FlameGraph; 11 | use std::sync::atomic::AtomicU32; 12 | use std::{net::SocketAddr, sync::Arc}; 13 | use tower_http::cors::CorsLayer; 14 | 15 | mod flamegraph; 16 | mod handlers; 17 | pub(crate) mod models; 18 | 19 | use crate::LiquidCacheService; 20 | 21 | pub(crate) struct AppState { 22 | liquid_cache: Arc<LiquidCacheService>, 23 | trace_id: AtomicU32, 24 | stats_id: AtomicU32, 25 | flamegraph: Arc<FlameGraph>, 26 | } 27 | 28 | /// Run the admin server 29 | pub async fn run_admin_server( 30 | addr: SocketAddr, 31 | liquid_cache: Arc<LiquidCacheService>, 32 | ) -> Result<(), Box<dyn std::error::Error>> { 33 | let state = Arc::new(AppState { 34 | liquid_cache, 35 | trace_id: AtomicU32::new(0), 36 | stats_id: AtomicU32::new(0), 37 | flamegraph: Arc::new(FlameGraph::new()), 38 | }); 39 | 40 | // Create a CORS layer for the local admin UI and the hosted admin page 41 | let cors = CorsLayer::new() 42 | // Allow the local dev origins (port 3000) and the hosted admin origin 43 | .allow_origin([ 44 | "http://localhost:3000".parse::<HeaderValue>().unwrap(), 45 | "http://127.0.0.1:3000".parse::<HeaderValue>().unwrap(), 46 | "https://liquid-cache-admin.xiangpeng.systems" 47 | .parse::<HeaderValue>() 48 | .unwrap(), 49 | ]) 50 | .allow_methods([Method::GET, Method::POST, Method::PUT, Method::DELETE]) 51 | .allow_headers([axum::http::header::CONTENT_TYPE]); 52 | 53 | let app = Router::new() 54 | .route("/shutdown", get(handlers::shutdown_handler)) 55 | .route("/reset_cache", get(handlers::reset_cache_handler)) 56 | .route( 57 | "/parquet_cache_usage", 58 | get(handlers::get_parquet_cache_usage_handler), 59 | ) 60 | .route("/cache_info", get(handlers::get_cache_info_handler)) 61 | .route("/system_info", get(handlers::get_system_info_handler)) 62 | .route("/start_trace", get(handlers::start_trace_handler)) 63 | .route("/stop_trace", get(handlers::stop_trace_handler)) 64 | .route( 65 | "/execution_metrics", 66 | get(handlers::get_execution_metrics_handler), 67 | ) 68 | .route("/execution_plans", get(handlers::get_execution_stats)) 69 | .route("/cache_stats", get(handlers::get_cache_stats_handler)) 70 | .route("/start_flamegraph", get(handlers::start_flamegraph_handler)) 71 | .route("/stop_flamegraph", get(handlers::stop_flamegraph_handler)) 72 | .route( 73 | "/set_execution_stats", 74 | post(handlers::add_execution_stats_handler), 75 | ) 76 | .with_state(state) 77 | .layer(cors); 78 | 79 | let listener = tokio::net::TcpListener::bind(addr).await?; 80 | axum::serve(listener, app).await?; 81 | 82 | Ok(()) 83 | } 84 |
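// Illustrative only: wiring the admin server into a server binary.
// The port is arbitrary and `liquid_cache` construction is elided:
#[allow(dead_code)]
async fn example_wireup(
    liquid_cache: Arc<LiquidCacheService>,
) -> Result<(), Box<dyn std::error::Error>> {
    let addr: SocketAddr = "127.0.0.1:53703".parse()?;
    // Serves until shut down; typically run alongside the main Flight service.
    run_admin_server(addr, liquid_cache).await
}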
-------------------------------------------------------------------------------- /src/server/src/admin_server/models.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | /// Parameters for the set_execution_stats endpoint 4 | #[derive(Deserialize, Serialize, Clone)] 5 | pub struct ExecutionStats { 6 | /// Plan IDs for the execution plan 7 | pub plan_ids: Vec<String>, 8 | /// Display name for the execution plan 9 | pub display_name: String, 10 | /// Flamegraph SVG for the execution plan 11 | pub flamegraph_svg: Option<String>, 12 | /// Network traffic bytes for the execution plan 13 | pub network_traffic_bytes: u64, 14 | /// Execution time in milliseconds 15 | pub execution_time_ms: u64, 16 | /// User input SQL 17 | pub user_sql: String, 18 | } 19 | 20 | /// Execution stats with plan 21 | #[derive(Serialize)] 22 | pub struct ExecutionStatsWithPlan { 23 | /// Execution stats 24 | pub execution_stats: ExecutionStats, 25 | /// Plan info 26 | pub plans: Vec<PlanInfo>, 27 | } 28 | 29 | /// Response for the admin server 30 | #[derive(Serialize, Deserialize)] 31 | pub struct ApiResponse { 32 | /// Message for the response 33 | pub message: String, 34 | /// Status for the response 35 | pub status: String, 36 | } 37 | 38 | /// Schema field 39 | #[derive(Serialize)] 40 | pub struct SchemaField { 41 | /// Field name 42 | pub name: String, 43 | /// Field data type 44 | pub data_type: String, 45 | } 46 | 47 | /// Column statistics 48 | #[derive(Serialize)] 49 | pub struct ColumnStatistics { 50 | /// Column name 51 | pub name: String, 52 | /// Null count 53 | pub null: Option<String>, 54 | /// Max value 55 | pub max: Option<String>, 56 | /// Min value 57 | pub min: Option<String>, 58 | /// Sum value 59 | pub sum: Option<String>, 60 | /// Distinct count 61 | pub distinct_count: Option<String>, 62 | } 63 | 64 | /// Statistics 65 | #[derive(Serialize)] 66 | pub struct Statistics { 67 | /// Number of rows 68 | pub num_rows: String, 69 | /// Total byte size 70 | pub total_byte_size: String, 71 | /// Column statistics 72 | pub column_statistics: Vec<ColumnStatistics>, 73 | } 74 | 75 | /// Metric 76 | #[derive(Serialize)] 77 | pub struct MetricValues { 78 | /// Metric name 79 | pub name: String, 80 | /// Metric value 81 | pub value: String, 82 | } 83 | 84 | /// Execution plan with stats 85 | #[derive(Serialize)] 86 | pub struct ExecutionPlanWithStats { 87 | /// Execution plan name 88 | pub name: String, 89 | /// Schema fields 90 | pub schema: Vec<SchemaField>, 91 | /// Statistics 92 | pub statistics: Statistics, 93 | /// Metrics 94 | pub metrics: Vec<MetricValues>, 95 | /// Children 96 | pub children: Vec<ExecutionPlanWithStats>, 97 | } 98 | 99 | /// Plan info 100 | #[derive(Serialize)] 101 | pub struct PlanInfo { 102 | /// Created at 103 | pub created_at: u64, 104 | /// Execution plan 105 | pub plan: ExecutionPlanWithStats, 106 | /// ID 107 | pub id: String, 108 | /// Predicate 109 | pub predicate: Option<String>, 110 | } 111 | -------------------------------------------------------------------------------- /src/server/src/errors.rs: -------------------------------------------------------------------------------- 1 | //! Error handling utilities for LiquidCache server. 2 | //! 3 | //! This module provides enhanced error handling with stack traces to help 4 | //! developers and users identify the exact location where errors occur.
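//!
//! # Example (illustrative)
//!
//! A sketch of the intended usage; `read_footer` is hypothetical:
//!
//! ```rust,ignore
//! use crate::errors::{LiquidCacheErrorExt, LiquidCacheResult};
//!
//! fn read_footer(path: &str) -> LiquidCacheResult<Vec<u8>> {
//!     std::fs::read(path).with_liquid_context("reading parquet footer")
//! }
//! ```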
5 | 6 | use anyhow::{Context, Result as AnyhowResult}; 7 | use tonic::Status; 8 | 9 | /// Result type alias for LiquidCache operations 10 | pub type LiquidCacheResult<T> = AnyhowResult<T>; 11 | 12 | /// Extension trait to add context to Results for better error reporting 13 | pub trait LiquidCacheErrorExt<T> { 14 | /// Add context to an error for better error reporting 15 | fn with_liquid_context(self, message: impl Into<String>) -> LiquidCacheResult<T>; 16 | } 17 | 18 | impl<T, E> LiquidCacheErrorExt<T> for Result<T, E> 19 | where 20 | E: std::error::Error + Send + Sync + 'static, 21 | { 22 | fn with_liquid_context(self, message: impl Into<String>) -> LiquidCacheResult<T> { 23 | self.map_err(anyhow::Error::from).context(message.into()) 24 | } 25 | } 26 | 27 | /// Convert anyhow::Error to tonic Status with detailed error information including stack trace 28 | pub fn anyhow_to_status(err: anyhow::Error) -> Status { 29 | // Format the error with full error chain and backtrace for debugging 30 | let error_with_context = format!("{err:?}"); 31 | 32 | // Determine the appropriate gRPC status code based on error type 33 | if let Some(datafusion_err) = err.downcast_ref::<datafusion::error::DataFusionError>() { 34 | match datafusion_err { 35 | datafusion::error::DataFusionError::Plan(_) => { 36 | Status::invalid_argument(error_with_context) 37 | } 38 | datafusion::error::DataFusionError::SchemaError(_, _) => { 39 | Status::invalid_argument(error_with_context) 40 | } 41 | _ => Status::internal(error_with_context), 42 | } 43 | } else if err.downcast_ref::<arrow_schema::ArrowError>().is_some() 44 | || err.downcast_ref::<prost::DecodeError>().is_some() 45 | { 46 | Status::invalid_argument(error_with_context) 47 | } else if err.downcast_ref::<std::io::Error>().is_some() { 48 | Status::internal(error_with_context) 49 | } else { 50 | // Default to internal error for unknown error types 51 | Status::internal(error_with_context) 52 | } 53 | } 54 | 55 | /// Legacy compatibility: convert DataFusionError to Status with stack trace 56 | pub fn df_error_to_status_with_trace(err: datafusion::error::DataFusionError) -> Status { 57 | anyhow_to_status(err.into()) 58 | } 59 |
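// Illustrative only: a hypothetical helper showing how handlers can surface
// anyhow errors over gRPC:
#[allow(dead_code)]
fn to_grpc_result<T>(result: anyhow::Result<T>) -> Result<T, Status> {
    // The status message carries the full error chain produced above.
    result.map_err(anyhow_to_status)
}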
-------------------------------------------------------------------------------- /src/server/src/tests/cases.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::path::{Path, PathBuf}; 3 | use std::sync::Arc; 4 | 5 | use liquid_cache_common::CacheMode; 6 | 7 | use crate::tests::run_sql; 8 | 9 | fn gen_parquet(dir: impl AsRef<Path>) -> PathBuf { 10 | use arrow::array::UInt32Array; 11 | use arrow::datatypes::{DataType, Field, Schema}; 12 | use arrow::record_batch::RecordBatch; 13 | use parquet::arrow::ArrowWriter; 14 | use parquet::file::properties::WriterProperties; 15 | let temp_path = dir.as_ref().join("parquet_page_index.parquet"); 16 | let file = File::create(&temp_path).unwrap(); 17 | let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt32, false)])); 18 | let id_array = UInt32Array::from_iter_values(0..200_000); 19 | let id_batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(id_array)]).unwrap(); 20 | let props = WriterProperties::builder() 21 | .set_offset_index_disabled(false) 22 | .build(); 23 | let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props)).unwrap(); 24 | writer.write(&id_batch).unwrap(); 25 | writer.into_inner().unwrap(); 26 | temp_path 27 | } 28 | 29 | #[tokio::test(flavor = "multi_thread")] 30 | async fn test_parquet_with_page_index() { 31 | let temp_dir = tempfile::tempdir().unwrap(); 32 | let file = gen_parquet(&temp_dir); 33 | let file_path = file.to_str().unwrap(); 34 | 35 | let result = run_sql( 36 | "SELECT * FROM hits WHERE id = 0", 37 | CacheMode::LiquidEagerTranscode, 38 | 1000, 39 | file_path, 40 | ) 41 | .await; 42 | insta::assert_snapshot!(result); 43 | } 44 | -------------------------------------------------------------------------------- /src/server/src/tests/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{path::PathBuf, sync::Arc}; 2 | 3 | use arrow::util::pretty::pretty_format_batches; 4 | use datafusion::{ 5 | physical_plan::{ExecutionPlan, collect}, 6 | prelude::SessionContext, 7 | }; 8 | use liquid_cache_common::CacheEvictionStrategy::Discard; 9 | use liquid_cache_common::CacheMode; 10 | use uuid::Uuid; 11 | 12 | mod cases; 13 | 14 | use crate::{LiquidCacheService, LiquidCacheServiceInner}; 15 | 16 | const TEST_FILE: &str = "../../examples/nano_hits.parquet"; 17 | 18 | async fn get_physical_plan(sql: &str, ctx: &SessionContext) -> Arc<dyn ExecutionPlan> { 19 | let df = ctx.sql(sql).await.unwrap(); 20 | let (state, plan) = df.into_parts(); 21 | state.create_physical_plan(&plan).await.unwrap() 22 | } 23 | 24 | async fn run_sql(sql: &str, mode: CacheMode, cache_size_bytes: usize, file_path: &str) -> String { 25 | let ctx = Arc::new(LiquidCacheService::context().unwrap()); 26 | ctx.register_parquet("hits", file_path, Default::default()) 27 | .await 28 | .unwrap(); 29 | let service = LiquidCacheServiceInner::new( 30 | ctx.clone(), 31 | Some(cache_size_bytes), 32 | PathBuf::from("test"), 33 | mode, 34 | Discard, 35 | ); 36 | async fn get_result(service: &LiquidCacheServiceInner, sql: &str) -> String { 37 | let handle = Uuid::new_v4(); 38 | let ctx = service.get_ctx(); 39 | let plan = get_physical_plan(sql, &ctx).await; 40 | service.register_plan(handle, plan); 41 | let plan = service.get_plan(&handle).unwrap(); 42 | let batches = collect(plan.plan, ctx.task_ctx()).await.unwrap(); 43 | pretty_format_batches(&batches).unwrap().to_string() 44 | } 45 | 46 | let first_iter = get_result(&service, sql).await; 47 | let second_iter = get_result(&service, sql).await; 48 | 49 | assert_eq!(first_iter, second_iter); 50 | 51 | first_iter 52 | } 53 | 54 | async fn test_runner(sql: &str, reference: &str) { 55 | let modes = [ 56 | CacheMode::LiquidEagerTranscode, 57 | CacheMode::Arrow, 58 | CacheMode::Liquid, 59 | ]; 60 | 61 | // 573960 is the row count of the first batch of the URL column 62 | let sizes = [10, 573960, usize::MAX]; 63 | 64 | for mode in modes { 65 | for size in sizes { 66 | let result = run_sql(sql, mode, size, TEST_FILE).await; 67 | assert_eq!(result, reference); 68 | } 69 | } 70 | } 71 | 72 | #[tokio::test(flavor = "multi_thread")] 73 | async fn test_url_prefix() { 74 | let sql = r#"select COUNT(*) from hits where "URL" like 'https://%'"#; 75 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 76 | insta::assert_snapshot!(reference); 77 | test_runner(sql, &reference).await; 78 | } 79 | 80 | #[tokio::test(flavor = "multi_thread")] 81 | async fn test_url() { 82 | let sql = r#"select "URL" from hits where "URL" like '%tours%' order by "URL" desc"#; 83 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 84 | insta::assert_snapshot!(reference); 85 | test_runner(sql, &reference).await; 86 | } 87 | 88 | #[tokio::test(flavor = "multi_thread")] 89 | async fn test_os() { 90 | let sql = r#"select "OS" from hits where "URL" like '%tours%' order by "OS" desc"#; 91 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, 
TEST_FILE).await; 92 | insta::assert_snapshot!(reference); 93 | test_runner(sql, &reference).await; 94 | } 95 | 96 | #[tokio::test(flavor = "multi_thread")] 97 | async fn test_referer() { 98 | let sql = r#"select "Referer" from hits where "Referer" <> '' AND "URL" like '%tours%' order by "Referer" desc"#; 99 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 100 | insta::assert_snapshot!(reference); 101 | test_runner(sql, &reference).await; 102 | } 103 | 104 | #[tokio::test(flavor = "multi_thread")] 105 | #[ignore = "Wait for https://github.com/apache/datafusion/pull/15827 to be merged"] 106 | async fn test_min_max() { 107 | let sql = r#"select min("Referer"), max("Referer") from hits where "Referer" <> '' AND "URL" like '%tours%'"#; 108 | let reference = run_sql(sql, CacheMode::LiquidEagerTranscode, 573960, TEST_FILE).await; 109 | insta::assert_snapshot!(reference); 110 | test_runner(sql, &reference).await; 111 | } 112 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__cases__parquet_with_page_index.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/cases.rs 3 | expression: result 4 | --- 5 | +----+ 6 | | id | 7 | +----+ 8 | | 0 | 9 | +----+ 10 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__min_max.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----------------------------------------------------------------+-----------------------------------------------------------+ 6 | | min(hits.Referer) | max(hits.Referer) | 7 | +----------------------------------------------------------------+-----------------------------------------------------------+ 8 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | https://go.mail/folder-1/online.ru/search?text=скачать из | 9 | +----------------------------------------------------------------+-----------------------------------------------------------+ 10 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__os.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----+ 6 | | OS | 7 | +----+ 8 | | 44 | 9 | | 44 | 10 | | 44 | 11 | | 44 | 12 | | 44 | 13 | | 2 | 14 | | 2 | 15 | | 2 | 16 | | 2 | 17 | | 2 | 18 | | 2 | 19 | +----+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__referer.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +---------------------------------------------------------------------------------------------------------+ 6 | | Referer | 7 | +---------------------------------------------------------------------------------------------------------+ 8 | | https://go.mail/folder-1/online.ru/search?text=скачать из | 9 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%2F&ei | 10 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%2F&ei | 11 | | 
http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_positions/2/transmittaD3xnA%26ad%3D1%26bid%3D400 | 12 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_pl%3Dh%26utm_source=view.php | 13 | | http://tambov.irr.ru/registrict=2660628&cbv=r2013%26ev_pl%3Dh%26utm_source=view.php | 14 | | http://tambov.irr.ru/filmId=BcVrXpM5UXI&where=any&numphoto | 15 | | http://tambov.irr.ru/filmId=BcVrXpM5UXI&where=any&numphoto | 16 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | 17 | | http://tambov.irr.ru/avtoma-gorod55.ru/cars/micros/out-of-town | 18 | +---------------------------------------------------------------------------------------------------------+ 19 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__title.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----+ 6 | | OS | 7 | +----+ 8 | | 44 | 9 | | 44 | 10 | | 44 | 11 | | 44 | 12 | | 44 | 13 | | 2 | 14 | | 2 | 15 | | 2 | 16 | | 2 | 17 | | 2 | 18 | | 2 | 19 | +----+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__url.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +-------------------------------------------------------------------------------------------------------------------------------+ 6 | | URL | 7 | +-------------------------------------------------------------------------------------------------------------------------------+ 8 | | https://produkty%2Fpulove.ru/search_terms-vzyat_kobrye-russion/russia/piterators-tourse | 9 | | https://produkty%2Fpulove.ru/search_terms-vzyat_kobrye-russion/russia/piterators-tourse | 10 | | https://produkty%2Fpulove.ru/booklyattion-ware/tours.ru | 11 | | https://produkty%2Fpulove.ru/booklyattion-ware/tours.ru | 12 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 13 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 14 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 15 | | https://produkty%2Fpulove.ru/booklyattion-ware/activaro.ru/cars.ru/footours-index.ru/files/item=11497449%26pz%3D0 | 16 | | https://produkty%2Fplata.ru/filmId=e308f57e213e9eee96bcb752910-widget/tours.ru/product_priznaniya-1-metallic=0&engineVolumeTo | 17 | | https://produkty%2Fplata.ru/filmId=e308f57e213e9eee96bcb752910-widget/tours.ru/product_priznaniya-1-metallic=0&engineVolumeTo | 18 | | http://tours/Ekategoriya%2F&sr=http://slovareniye | 19 | +-------------------------------------------------------------------------------------------------------------------------------+ 20 | -------------------------------------------------------------------------------- /src/server/src/tests/snapshots/liquid_cache_server__tests__url_prefix.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/server/src/tests/mod.rs 3 | expression: eager 4 | --- 5 | +----------+ 6 | | count(*) | 7 | +----------+ 8 | | 23113 | 9 | +----------+ 10 | -------------------------------------------------------------------------------- 
/src/server/src/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow::{array::RecordBatch, compute::concat_batches}; 2 | use datafusion::error::Result; 3 | use futures::{Stream, ready}; 4 | use futures::{StreamExt, stream::BoxStream}; 5 | use std::{ 6 | pin::Pin, 7 | task::{Context, Poll}, 8 | }; 9 | 10 | /// A stream that finalizes the record batches. 11 | /// It currently does two things: 12 | /// 1. GCs the record batches, especially arrays that have been filtered. 13 | /// 2. Merges small batches into a large one. 14 | pub struct FinalStream { 15 | inner: BoxStream<'static, Result<RecordBatch>>, 16 | target_batch_size: usize, 17 | buffered_batches: Vec<RecordBatch>, 18 | current_buffered_rows: usize, 19 | span: fastrace::Span, 20 | } 21 | 22 | impl FinalStream { 23 | pub fn new<S: Stream<Item = Result<RecordBatch>> + Send + 'static>( 24 | inner: S, 25 | target_batch_size: usize, 26 | span: fastrace::Span, 27 | ) -> Self { 28 | Self { 29 | inner: inner.boxed(), 30 | target_batch_size, 31 | buffered_batches: Vec::new(), 32 | current_buffered_rows: 0, 33 | span, 34 | } 35 | } 36 | } 37 | 38 | impl Stream for FinalStream { 39 | type Item = Result<RecordBatch>; 40 | 41 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { 42 | let this = &mut *self; 43 | let _guard = this.span.set_local_parent(); 44 | loop { 45 | let threshold = (this.target_batch_size * 3) / 4; 46 | if this.current_buffered_rows > threshold { 47 | this.current_buffered_rows = 0; 48 | let batches = std::mem::take(&mut this.buffered_batches); 49 | let schema = batches[0].schema(); 50 | let result = concat_batches(&schema, batches.iter()); 51 | return Poll::Ready(Some(Ok(result?))); 52 | } 53 | 54 | match ready!(this.inner.poll_next_unpin(cx)).transpose()? { 55 | Some(batch) => { 56 | let num_rows = batch.num_rows(); 57 | this.current_buffered_rows += num_rows; 58 | this.buffered_batches.push(batch); 59 | } 60 | None => { 61 | if this.buffered_batches.is_empty() { 62 | return Poll::Ready(None); 63 | } 64 | this.current_buffered_rows = 0; 65 | let batches = std::mem::take(&mut this.buffered_batches); 66 | let schema = batches[0].schema(); 67 | let result = concat_batches(&schema, batches.iter()); 68 | return Poll::Ready(Some(Ok(result?))); 69 | } 70 | } 71 | } 72 | } 73 | } 74 |
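// Illustrative only: how a server stream might be wrapped before being sent
// to the client. The batch size is arbitrary, and a disabled span is assumed
// via `fastrace::Span::noop()`:
#[allow(dead_code)]
fn finalize_stream(
    upstream: BoxStream<'static, Result<RecordBatch>>,
) -> impl Stream<Item = Result<RecordBatch>> {
    // Merge post-filter fragments back into roughly 8192-row batches.
    FinalStream::new(upstream, 8192, fastrace::Span::noop())
}
--------------------------------------------------------------------------------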