├── .editorconfig ├── docs └── duck-sketches-2.jpg ├── .gitignore ├── vcpkg-overlay └── apache-datasketches │ ├── usage │ ├── vcpkg.json │ └── portfile.cmake ├── vcpkg.json ├── .gitmodules ├── Makefile ├── src ├── include │ ├── datasketches_extension.hpp │ └── query_farm_telemetry.hpp ├── generated.h ├── datasketches_extension.cpp ├── query_farm_telemetry.cpp └── theta_sketch.cpp ├── extension_config.cmake ├── test ├── README.md └── sql │ ├── datasketch_tdigest.test │ ├── datasketch_cpc.test │ ├── datasketch_kll.test │ ├── datasketch_req.test │ ├── datasketch_quantiles.test │ ├── datasketch_hll.test │ ├── datasketch_theta.test │ └── datasketch_frequent.test ├── scripts ├── setup-custom-toolchain.sh ├── extension-upload.sh └── bootstrap-template.py ├── .github └── workflows │ ├── scheduled-1.4.yml │ └── MainDistributionPipeline.yml ├── .vscode ├── c_cpp_properties.json └── settings.json ├── LICENSE ├── CMakeLists.txt └── codegen ├── generator.py └── generated.cpp.j2 /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /docs/duck-sketches-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Query-farm/datasketches/HEAD/docs/duck-sketches-2.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/usage: -------------------------------------------------------------------------------- 1 | apache-datasketches provides CMake targets: 2 | 3 | find_package(DataSketches CONFIG 
REQUIRED) 4 | target_link_libraries(main PRIVATE datasketches) 5 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "apache-datasketches" 4 | ], 5 | "vcpkg-configuration": { 6 | "overlay-ports": [ 7 | "./vcpkg-overlay/apache-datasketches" 8 | ] 9 | } 10 | } -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=datasketches 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile -------------------------------------------------------------------------------- /src/include/datasketches_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb 6 | { 7 | 8 | class DatasketchesExtension : public Extension 9 | { 10 | public: 11 | void Load(ExtensionLoader &loader) override; 12 | std::string Name() override; 13 | }; 14 | 15 | } // namespace duckdb 16 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is 
included by DuckDB's build system. It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(datasketches 5 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 6 | LOAD_TESTS 7 | ) 8 | 9 | # Any extra extensions that should be built 10 | # e.g.: duckdb_extension_load(json) -------------------------------------------------------------------------------- /src/generated.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "datasketches_extension.hpp" 4 | 5 | using namespace duckdb; 6 | namespace duckdb_datasketches 7 | { 8 | void LoadQuantilesSketch(ExtensionLoader &loader); 9 | void LoadKLLSketch(ExtensionLoader &loader); 10 | void LoadREQSketch(ExtensionLoader &loader); 11 | void LoadTDigestSketch(ExtensionLoader &loader); 12 | void LoadHLLSketch(ExtensionLoader &loader); 13 | void LoadCPCSketch(ExtensionLoader &loader); 14 | } -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "apache-datasketches", 3 | "version": "5.1.0", 4 | "port-version": 1, 5 | "description": "Apache DataSketches Core C++ Library Component.", 6 | "homepage": "https://datasketches.apache.org/", 7 | "license": "Apache-2.0", 8 | "dependencies": [ 9 | { 10 | "name": "vcpkg-cmake", 11 | "host": true 12 | }, 13 | { 14 | "name": "vcpkg-cmake-config", 15 | "host": true 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). 
DuckDB aims to have most its tests in this format as SQL statements, so for the datasketches extension, this should probably be the goal too. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | or 9 | ```bash 10 | make test_debug 11 | ``` -------------------------------------------------------------------------------- /src/include/query_farm_telemetry.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "duckdb.hpp" 4 | 5 | #if defined(_WIN32) || defined(_WIN64) 6 | // Windows: functions are hidden by default unless exported 7 | #define INTERNAL_FUNC 8 | #elif defined(__GNUC__) || defined(__clang__) 9 | // Linux / macOS: hide symbol using visibility attribute 10 | #define INTERNAL_FUNC __attribute__((visibility("hidden"))) 11 | #else 12 | #define INTERNAL_FUNC 13 | #endif 14 | 15 | namespace duckdb { 16 | void QueryFarmSendTelemetry(ExtensionLoader &loader, const string &extension_name, const string &extension_version); 17 | } -------------------------------------------------------------------------------- /scripts/setup-custom-toolchain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script 4 | # if no additional toolchains are required 5 | 6 | # To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow 7 | # `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools` 8 | 9 | # note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms 10 | echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the datasketches extension." 
11 | 12 | -------------------------------------------------------------------------------- /.github/workflows/scheduled-1.4.yml: -------------------------------------------------------------------------------- 1 | name: Scheduled Trigger for 1.4 2 | 3 | on: 4 | schedule: 5 | - cron: '0 12 * * *' # Runs at 12:00 UTC every day 6 | workflow_dispatch: # Allows manual trigger 7 | 8 | jobs: 9 | trigger: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | actions: write # Allow triggering workflows 13 | steps: 14 | - name: Checkout repository # Required for gh to work 15 | uses: actions/checkout@v4 16 | 17 | - name: Install GitHub CLI 18 | run: | 19 | sudo apt update && sudo apt install gh -y 20 | 21 | - name: Authenticate GH CLI 22 | run: | 23 | echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token 24 | 25 | - name: Trigger Workflow on my-branch 26 | run: | 27 | gh workflow run MainDistributionPipeline.yml --ref v1.4 28 | -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO apache/datasketches-cpp 4 | REF dddc4a668cdc47ad8a221cf7d4cb5054e53a40ee 5 | SHA512 2fff76d978acecccadcf712389bc1b724569ab7f70c256473f59680f04473a7ffd6b1ed94125f6665bfad31a22874a18ea73d455eda49ff3e9f8a994548c52d5 6 | HEAD_REF master 7 | ) 8 | 9 | set(VCPKG_BUILD_TYPE release) # header-only port 10 | 11 | vcpkg_cmake_configure( 12 | SOURCE_PATH "${SOURCE_PATH}" 13 | OPTIONS 14 | -DBUILD_TESTS=OFF 15 | ) 16 | 17 | vcpkg_cmake_install() 18 | vcpkg_cmake_config_fixup(PACKAGE_NAME DataSketches CONFIG_PATH lib/DataSketches/cmake) 19 | 20 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") 21 | 22 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") 23 | file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 24 | 
-------------------------------------------------------------------------------- /.vscode/c_cpp_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Mac", 5 | "includePath": [ 6 | "${workspaceFolder}/**", 7 | "${workspaceFolder}/src/include/", 8 | "${workspaceFolder}/duckdb/src/include/", 9 | "${workspaceFolder}/duckdb/third_party/**/include/", 10 | "${workspaceFolder}/vcpkg_installed/arm64-osx/include" 11 | ], 12 | "defines": [], 13 | "macFrameworkPath": [ 14 | "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" 15 | ], 16 | "compilerPath": "/usr/bin/clang", 17 | "cStandard": "c17", 18 | "cppStandard": "c++17", 19 | "intelliSenseMode": "macos-clang-arm64", 20 | "configurationProvider": "ms-vscode.makefile-tools" 21 | } 22 | ], 23 | "version": 4 24 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Rusty Conover (rusty@conover.me) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | name: Main Extension Distribution Pipeline 2 | on: 3 | push: null 4 | pull_request: null 5 | workflow_dispatch: null 6 | schedule: 7 | - cron: 0 2 * * * 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ 10 | github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 11 | cancel-in-progress: true 12 | jobs: 13 | duckdb-stable-build: 14 | name: Build extension binaries 15 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 16 | with: 17 | duckdb_version: v1.4-andium 18 | ci_tools_version: main 19 | extension_name: datasketches 20 | exclude_archs: windows_amd64_rtools 21 | vcpkg_binary_sources: ${{ vars.VCPKG_BINARY_SOURCES }} 22 | secrets: 23 | VCPKG_CACHING_AWS_ACCESS_KEY_ID: ${{ secrets.VCPKG_CACHING_AWS_ACCESS_KEY_ID 24 | }} 25 | VCPKG_CACHING_AWS_SECRET_ACCESS_KEY: ${{ secrets.VCPKG_CACHING_AWS_SECRET_ACCESS_KEY 26 | }} 27 | VCPKG_CACHING_AWS_ENDPOINT_URL: ${{ secrets.VCPKG_CACHING_AWS_ENDPOINT_URL }} 28 | VCPKG_CACHING_AWS_DEFAULT_REGION: ${{ secrets.VCPKG_CACHING_AWS_DEFAULT_REGION}} 29 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | # Set extension name here 4 | set(TARGET_NAME datasketches) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | 9 | if (WIN32 AND MSVC) 10 | add_compile_options(/bigobj) 11 | endif() 12 | 13 | # 
DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then 14 | # used in cmake with find_package. Feel free to remove or replace with other dependencies. 15 | # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 16 | find_package(DataSketches CONFIG REQUIRED) 17 | 18 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 19 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 20 | 21 | project(${TARGET_NAME}) 22 | include_directories(src/include) 23 | 24 | 25 | set(EXTENSION_SOURCES src/datasketches_extension.cpp src/generated.cpp 26 | src/query_farm_telemetry.cpp 27 | src/theta_sketch.cpp 28 | src/frequent_items_sketch.cpp) 29 | 30 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 31 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 32 | 33 | # Link OpenSSL in both the static library as the loadable extension 34 | target_link_libraries(${EXTENSION_NAME} datasketches) 35 | target_link_libraries(${LOADABLE_EXTENSION_NAME} datasketches) 36 | 37 | install( 38 | TARGETS ${EXTENSION_NAME} 39 | EXPORT "${DUCKDB_EXPORT_SET}" 40 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 41 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 42 | -------------------------------------------------------------------------------- /src/datasketches_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "datasketches_extension.hpp" 4 | #include "duckdb.hpp" 5 | #include "duckdb/common/exception.hpp" 6 | #include "duckdb/common/string_util.hpp" 7 | #include "duckdb/common/extra_type_info.hpp" 8 | #include "duckdb/function/scalar_function.hpp" 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include "generated.h" 15 | 16 | #include "query_farm_telemetry.hpp" 17 | 18 | namespace duckdb 19 | { 20 | 21 | void LoadQuantilesSketch(ExtensionLoader &loader); 22 | void 
LoadKLLSketch(ExtensionLoader &loader); 23 | void LoadREQSketch(ExtensionLoader &loader); 24 | void LoadTDigestSketch(ExtensionLoader &loader); 25 | void LoadHLLSketch(ExtensionLoader &loader); 26 | void LoadCPCSketch(ExtensionLoader &loader); 27 | void LoadThetaSketch(ExtensionLoader &loader); 28 | void LoadFrequentItemsSketch(ExtensionLoader &loader); 29 | 30 | static void LoadInternal(ExtensionLoader &loader) 31 | { 32 | LoadQuantilesSketch(loader); 33 | LoadKLLSketch(loader); 34 | LoadREQSketch(loader); 35 | LoadTDigestSketch(loader); 36 | LoadHLLSketch(loader); 37 | LoadCPCSketch(loader); 38 | LoadThetaSketch(loader); 39 | LoadFrequentItemsSketch(loader); 40 | QueryFarmSendTelemetry(loader, "datasketches", "2025121201"); 41 | } 42 | 43 | void DatasketchesExtension::Load(ExtensionLoader &loader) 44 | { 45 | LoadInternal(loader); 46 | } 47 | std::string DatasketchesExtension::Name() 48 | { 49 | return "datasketches"; 50 | } 51 | 52 | } // namespace duckdb 53 | 54 | extern "C" 55 | { 56 | DUCKDB_CPP_EXTENSION_ENTRY(datasketches, loader) 57 | { 58 | duckdb::LoadInternal(loader); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink 
-f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.." 
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /scripts/bootstrap-template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys, os, shutil, re 4 | from pathlib import Path 5 | 6 | 7 | def is_snake_case(s): 8 | # Define the regex pattern for snake case with numbers 9 | pattern = r"^[a-z0-9]+(_[a-z0-9]+)*$" 10 | 11 | # Use re.match to check if the string matches the pattern 12 | if re.match(pattern, s): 13 | return True 14 | else: 15 | return False 16 | 17 | 18 | def to_camel_case(snake_str): 19 | return "".join(x.capitalize() for x in snake_str.lower().split("_")) 20 | 21 | 22 | def replace(file_name, to_find, to_replace): 23 | with open(file_name, "r", encoding="utf8") as file: 24 | filedata = file.read() 25 | filedata = filedata.replace(to_find, to_replace) 26 | with open(file_name, "w", encoding="utf8") as file: 27 | file.write(filedata) 28 | 29 | 30 | def replace_everywhere(to_find, to_replace): 31 | for path in files_to_search: 32 | replace(path, to_find, to_replace) 33 | replace(path, to_find.capitalize(), to_camel_case(to_replace)) 34 | replace(path, to_find.upper(), to_replace.upper()) 35 | 36 | 
replace("./CMakeLists.txt", to_find, to_replace) 37 | replace("./Makefile", to_find, to_replace) 38 | replace("./Makefile", to_find.capitalize(), to_camel_case(to_replace)) 39 | replace("./Makefile", to_find.upper(), to_replace.upper()) 40 | replace("./README.md", to_find, to_replace) 41 | replace("./extension_config.cmake", to_find, to_replace) 42 | replace("./scripts/setup-custom-toolchain.sh", to_find, to_replace) 43 | 44 | 45 | if __name__ == "__main__": 46 | if len(sys.argv) != 2: 47 | raise Exception( 48 | "usage: python3 bootstrap-template.py " 49 | ) 50 | 51 | name_extension = sys.argv[1] 52 | 53 | if name_extension[0].isdigit(): 54 | raise Exception("Please dont start your extension name with a number.") 55 | 56 | if not is_snake_case(name_extension): 57 | raise Exception( 58 | "Please enter the name of your extension in valid snake_case containing only lower case letters and numbers" 59 | ) 60 | 61 | shutil.copyfile("docs/NEXT_README.md", "README.md") 62 | os.remove("docs/NEXT_README.md") 63 | os.remove("docs/README.md") 64 | 65 | files_to_search = [] 66 | files_to_search.extend(Path("./.github").rglob("./**/*.yml")) 67 | files_to_search.extend(Path("./test").rglob("./**/*.test")) 68 | files_to_search.extend(Path("./src").rglob("./**/*.hpp")) 69 | files_to_search.extend(Path("./src").rglob("./**/*.cpp")) 70 | files_to_search.extend(Path("./src").rglob("./**/*.txt")) 71 | files_to_search.extend(Path("./src").rglob("./*.md")) 72 | 73 | replace_everywhere("datasketches", name_extension) 74 | replace_everywhere("datasketches", name_extension.capitalize()) 75 | replace_everywhere("", name_extension) 76 | 77 | string_to_replace = name_extension 78 | string_to_find = "datasketches" 79 | 80 | # rename files 81 | os.rename(f"test/sql/{string_to_find}.test", f"test/sql/{string_to_replace}.test") 82 | os.rename( 83 | f"src/{string_to_find}_extension.cpp", f"src/{string_to_replace}_extension.cpp" 84 | ) 85 | os.rename( 86 | 
f"src/include/{string_to_find}_extension.hpp", 87 | f"src/include/{string_to_replace}_extension.hpp", 88 | ) 89 | 90 | # remove template-specific files 91 | os.remove(".github/workflows/ExtensionTemplate.yml") 92 | 93 | # finally, remove this bootstrap file 94 | os.remove(__file__) 95 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | "vcpkg/downloads": true, 4 | "vcpkg/packages": true, 5 | "vcpkg/ports": true, 6 | "vcpkg/scripts": true, 7 | "vcpkg/toolsrc": true, 8 | "vcpkg/versions": true, 9 | "vcpkg/triplets": true, 10 | "vcpkg_installed/**": true, 11 | "duckdb_unittest_tempdir": true, 12 | "build/**": true, 13 | ".tmp/**": true, 14 | "**/.git": true, 15 | "**/.svn": true, 16 | "**/.hg": true, 17 | "**/.trunk": true, 18 | "**/CVS": true, 19 | "**/.DS_Store": true, 20 | "**/Thumbs.db": true 21 | }, 22 | "cmake.environment": { 23 | "GEN": "ninja", 24 | "VCPKG_TOOLCHAIN_PATH": "${workspaceFolder}/vcpkg/scripts/buildsystems/vcpkg.cmake", 25 | }, 26 | "workbench.colorCustomizations": { 27 | "activityBar.activeBackground": "#b5933c", 28 | "activityBar.background": "#b5933c", 29 | "activityBar.foreground": "#15202b", 30 | "activityBar.inactiveForeground": "#15202b99", 31 | "activityBarBadge.background": "#b7e7da", 32 | "activityBarBadge.foreground": "#15202b", 33 | "commandCenter.border": "#e7e7e799", 34 | "sash.hoverBorder": "#b5933c", 35 | "statusBar.background": "#8f742f", 36 | "statusBar.foreground": "#e7e7e7", 37 | "statusBarItem.hoverBackground": "#b5933c", 38 | "statusBarItem.remoteBackground": "#8f742f", 39 | "statusBarItem.remoteForeground": "#e7e7e7", 40 | "titleBar.activeBackground": "#8f742f", 41 | "titleBar.activeForeground": "#e7e7e7", 42 | "titleBar.inactiveBackground": "#8f742f99", 43 | "titleBar.inactiveForeground": "#e7e7e799" 44 | }, 45 | "peacock.color": "#8f742f", 46 | 
"files.associations": { 47 | "optional": "cpp", 48 | "*.inc": "cpp", 49 | "__hash_table": "cpp", 50 | "__split_buffer": "cpp", 51 | "__tree": "cpp", 52 | "array": "cpp", 53 | "bitset": "cpp", 54 | "deque": "cpp", 55 | "hash_map": "cpp", 56 | "initializer_list": "cpp", 57 | "list": "cpp", 58 | "map": "cpp", 59 | "queue": "cpp", 60 | "regex": "cpp", 61 | "set": "cpp", 62 | "span": "cpp", 63 | "stack": "cpp", 64 | "string": "cpp", 65 | "string_view": "cpp", 66 | "unordered_map": "cpp", 67 | "unordered_set": "cpp", 68 | "valarray": "cpp", 69 | "vector": "cpp", 70 | "*.ipp": "cpp", 71 | "stdexcept": "cpp", 72 | "iomanip": "cpp", 73 | "__bit_reference": "cpp", 74 | "__locale": "cpp", 75 | "__node_handle": "cpp", 76 | "__verbose_abort": "cpp", 77 | "cctype": "cpp", 78 | "charconv": "cpp", 79 | "cmath": "cpp", 80 | "condition_variable": "cpp", 81 | "cstddef": "cpp", 82 | "cstdint": "cpp", 83 | "cstdio": "cpp", 84 | "cstdlib": "cpp", 85 | "cstring": "cpp", 86 | "ctime": "cpp", 87 | "cwchar": "cpp", 88 | "forward_list": "cpp", 89 | "fstream": "cpp", 90 | "future": "cpp", 91 | "ios": "cpp", 92 | "iosfwd": "cpp", 93 | "iostream": "cpp", 94 | "istream": "cpp", 95 | "limits": "cpp", 96 | "locale": "cpp", 97 | "mutex": "cpp", 98 | "new": "cpp", 99 | "ostream": "cpp", 100 | "print": "cpp", 101 | "ratio": "cpp", 102 | "sstream": "cpp", 103 | "streambuf": "cpp", 104 | "tuple": "cpp", 105 | "typeinfo": "cpp", 106 | "variant": "cpp", 107 | "algorithm": "cpp", 108 | "__threading_support": "cpp", 109 | "any": "cpp", 110 | "cfenv": "cpp", 111 | "cinttypes": "cpp", 112 | "clocale": "cpp", 113 | "codecvt": "cpp", 114 | "complex": "cpp", 115 | "csignal": "cpp", 116 | "cstdarg": "cpp", 117 | "cwctype": "cpp", 118 | "execution": "cpp", 119 | "memory": "cpp", 120 | "shared_mutex": "cpp", 121 | "source_location": "cpp", 122 | "strstream": "cpp", 123 | "typeindex": "cpp" 124 | } 125 | } -------------------------------------------------------------------------------- 
/src/query_farm_telemetry.cpp: -------------------------------------------------------------------------------- 1 | #include "query_farm_telemetry.hpp" 2 | #include 3 | #include "duckdb.hpp" 4 | #include "duckdb/common/http_util.hpp" 5 | #include "yyjson.hpp" 6 | #include "duckdb/main/extension_helper.hpp" 7 | #include "duckdb/main/config.hpp" 8 | #include 9 | #include 10 | using namespace duckdb_yyjson; // NOLINT 11 | 12 | namespace duckdb 13 | { 14 | 15 | namespace 16 | { 17 | 18 | // Function to send the actual HTTP request 19 | void sendHTTPRequest(shared_ptr db, char *json_body, size_t json_body_size) 20 | { 21 | const string TARGET_URL("https://duckdb-in.query-farm.services/"); 22 | 23 | HTTPHeaders headers; 24 | headers.Insert("Content-Type", "application/json"); 25 | 26 | auto &http_util = HTTPUtil::Get(*db); 27 | unique_ptr params = http_util.InitializeParameters(*db, TARGET_URL); 28 | 29 | PostRequestInfo post_request(TARGET_URL, headers, *params, reinterpret_cast(json_body), 30 | json_body_size); 31 | try 32 | { 33 | auto response = http_util.Request(post_request); 34 | } 35 | catch (const std::exception &e) 36 | { 37 | // ignore all errors. 38 | } 39 | 40 | free(json_body); 41 | return; 42 | } 43 | 44 | } // namespace 45 | 46 | INTERNAL_FUNC void QueryFarmSendTelemetry(ExtensionLoader &loader, const string &extension_name, 47 | const string &extension_version) 48 | { 49 | const char *opt_out = std::getenv("QUERY_FARM_TELEMETRY_OPT_OUT"); 50 | if (opt_out != nullptr) 51 | { 52 | return; 53 | } 54 | 55 | auto &dbconfig = DBConfig::GetConfig(loader.GetDatabaseInstance()); 56 | auto old_value = dbconfig.options.autoinstall_known_extensions; 57 | dbconfig.options.autoinstall_known_extensions = false; 58 | try 59 | { 60 | ExtensionHelper::AutoLoadExtension(loader.GetDatabaseInstance(), "httpfs"); 61 | } 62 | catch (...) 
63 | { 64 | dbconfig.options.autoinstall_known_extensions = old_value; 65 | return; 66 | } 67 | 68 | dbconfig.options.autoinstall_known_extensions = old_value; 69 | if (!loader.GetDatabaseInstance().ExtensionIsLoaded("httpfs")) 70 | { 71 | return; 72 | } 73 | 74 | // Initialize the telemetry sender 75 | auto doc = yyjson_mut_doc_new(nullptr); 76 | 77 | auto result_obj = yyjson_mut_obj(doc); 78 | yyjson_mut_doc_set_root(doc, result_obj); 79 | 80 | auto platform = DuckDB::Platform(); 81 | 82 | yyjson_mut_obj_add_str(doc, result_obj, "extension_name", extension_name.c_str()); 83 | yyjson_mut_obj_add_str(doc, result_obj, "extension_version", extension_version.c_str()); 84 | yyjson_mut_obj_add_str(doc, result_obj, "user_agent", "query-farm/20251011"); 85 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_platform", platform.c_str()); 86 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_library_version", DuckDB::LibraryVersion()); 87 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_release_codename", DuckDB::ReleaseCodename()); 88 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_source_id", DuckDB::SourceID()); 89 | 90 | size_t telemetry_len; 91 | auto telemetry_data = 92 | yyjson_mut_val_write_opts(result_obj, YYJSON_WRITE_ALLOW_INF_AND_NAN, NULL, &telemetry_len, nullptr); 93 | 94 | if (telemetry_data == nullptr) 95 | { 96 | throw SerializationException("Failed to serialize telemetry data."); 97 | } 98 | 99 | yyjson_mut_doc_free(doc); 100 | 101 | #ifndef __EMSCRIPTEN__ 102 | [[maybe_unused]] auto _ = std::async( 103 | std::launch::async, [db_ptr = loader.GetDatabaseInstance().shared_from_this(), json = telemetry_data, 104 | len = telemetry_len]() mutable 105 | { sendHTTPRequest(std::move(db_ptr), json, len); }); 106 | #else 107 | sendHTTPRequest(loader.GetDatabaseInstance().shared_from_this(), telemetry_data, telemetry_len); 108 | #endif 109 | } 110 | 111 | } // namespace duckdb -------------------------------------------------------------------------------- 
/test/sql/datasketch_tdigest.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_tdigest.test 2 | # description: test datasketch TDigest sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_tdigest_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_tdigest_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_tdigest(10, 5); 16 | ---- 17 | \x01\x01\x14\x0A\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x14@ 18 | 19 | query I 20 | SELECT datasketch_tdigest_is_empty('\x01\x01\x14\x0A\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x14@'::sketch_tdigest_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | # Rank of 500 should be close to 0.5 (value is in middle of 1-1000 range) 33 | query I 34 | SELECT datasketch_tdigest_rank(datasketch_tdigest(10, temp), 500.0) between 0.4 and 0.6 from readings 35 | ---- 36 | true 37 | 38 | # Median (0.5 quantile) should be close to 500 39 | query I 40 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(10, temp), 0.5) between 350 and 650 from readings 41 | ---- 42 | true 43 | 44 | # CDF and PMF outputs vary, just verify they execute successfully 45 | statement ok 46 | SELECT datasketch_tdigest_cdf(datasketch_tdigest(10, temp), [100, 200, 500]) from readings 47 | 48 | statement ok 49 | SELECT datasketch_tdigest_pmf(datasketch_tdigest(10, temp), [100, 200, 500]) from readings 50 | 51 | query I 52 | SELECT datasketch_tdigest_k(datasketch_tdigest(10, temp)) from readings 53 | ---- 54 | 10 55 | 56 | statement ok 57 | CREATE TABLE sketches (sketch 
sketch_tdigest_double) 58 | 59 | statement ok 60 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 0 61 | 62 | statement ok 63 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 1 64 | 65 | statement ok 66 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 2 67 | 68 | query I 69 | select datasketch_tdigest_is_empty(datasketch_tdigest(12, sketch)) from sketches 70 | ---- 71 | False 72 | 73 | # Merged sketch median should be close to 500 74 | query I 75 | select datasketch_tdigest_quantile(datasketch_tdigest(12, sketch), 0.5)::int between 300 and 700 from sketches 76 | ---- 77 | true 78 | 79 | # Test error handling for invalid/corrupted sketch data 80 | statement error 81 | SELECT datasketch_tdigest_is_empty('\x00\x01\x02'::sketch_tdigest_float); 82 | ---- 83 | Invalid Input Error: Failed to deserialize TDigest sketch 84 | 85 | statement error 86 | SELECT datasketch_tdigest_k('\xDE\xAD\xBE\xEF'::sketch_tdigest_double); 87 | ---- 88 | Invalid Input Error: Failed to deserialize TDigest sketch 89 | 90 | # Test with empty blob 91 | statement error 92 | SELECT datasketch_tdigest_is_empty(''::sketch_tdigest_double); 93 | ---- 94 | Invalid Input Error: Failed to deserialize TDigest sketch 95 | 96 | # ============================================================================= 97 | # COMPREHENSIVE UNION/MERGE TESTS 98 | # ============================================================================= 99 | 100 | # Test merging multiple sketches from partitioned data 101 | statement ok 102 | CREATE TABLE merge_data(value double, partition_id int) 103 | 104 | statement ok 105 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 106 | 107 | statement ok 108 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 109 | 110 | statement ok 111 | INSERT INTO merge_data 
SELECT unnest(generate_series(601, 900))::double, 3 112 | 113 | # Create sketches per partition 114 | statement ok 115 | CREATE TABLE partition_sketches AS 116 | SELECT partition_id, datasketch_tdigest(100, value) as sketch 117 | FROM merge_data 118 | GROUP BY partition_id 119 | 120 | # Verify we have 3 partition sketches 121 | query I 122 | SELECT count(*) FROM partition_sketches 123 | ---- 124 | 3 125 | 126 | # Merge all partition sketches and verify total weight 127 | query I 128 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM partition_sketches 129 | ---- 130 | 900 131 | 132 | # Verify merged sketch is not empty 133 | query I 134 | SELECT datasketch_tdigest_is_empty(datasketch_tdigest(100, sketch)) FROM partition_sketches 135 | ---- 136 | False 137 | 138 | # Verify merged sketch median is approximately in the middle 139 | query I 140 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(100, sketch), 0.5) between 400 and 500 FROM partition_sketches 141 | ---- 142 | true 143 | 144 | # Test merging sketches with overlapping data ranges 145 | statement ok 146 | CREATE TABLE overlap_data(value double, group_id int) 147 | 148 | statement ok 149 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 150 | 151 | statement ok 152 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 153 | 154 | statement ok 155 | CREATE TABLE overlap_sketches AS 156 | SELECT group_id, datasketch_tdigest(100, value) as sketch 157 | FROM overlap_data 158 | GROUP BY group_id 159 | 160 | # Merged sketch should have correct total weight (500 + 501 = 1001) 161 | query I 162 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM overlap_sketches 163 | ---- 164 | 1001 165 | 166 | # Verify merged sketch is not empty 167 | query I 168 | SELECT datasketch_tdigest_is_empty(datasketch_tdigest(100, sketch)) FROM overlap_sketches 169 | ---- 170 | False 171 | 172 | # Test merge with different K 
values - merged sketch should use the K from bind 173 | statement ok 174 | CREATE TABLE k_test_sketches AS 175 | SELECT datasketch_tdigest(50, value) as sketch FROM merge_data WHERE partition_id = 1 176 | UNION ALL 177 | SELECT datasketch_tdigest(200, value) as sketch FROM merge_data WHERE partition_id = 2 178 | 179 | query I 180 | SELECT datasketch_tdigest_k(datasketch_tdigest(100, sketch)) FROM k_test_sketches 181 | ---- 182 | 100 183 | 184 | # Test merging a single sketch (edge case) 185 | query I 186 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(50, sketch)) 187 | FROM (SELECT datasketch_tdigest(50, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 188 | ---- 189 | 300 190 | 191 | # Test CDF on merged sketch 192 | statement ok 193 | SELECT datasketch_tdigest_cdf(datasketch_tdigest(100, sketch), [300, 600]) FROM partition_sketches 194 | 195 | # Test PMF on merged sketch 196 | statement ok 197 | SELECT datasketch_tdigest_pmf(datasketch_tdigest(100, sketch), [300, 600]) FROM partition_sketches 198 | 199 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 200 | query I 201 | SELECT datasketch_tdigest_rank(datasketch_tdigest(100, sketch), 450.0) between 0.45 and 0.55 FROM partition_sketches 202 | ---- 203 | true 204 | 205 | # Test merging sketches created with GROUP BY 206 | statement ok 207 | CREATE TABLE grouped_data(category varchar, value double) 208 | 209 | statement ok 210 | INSERT INTO grouped_data 211 | SELECT 'A', unnest(generate_series(1, 100))::double 212 | UNION ALL 213 | SELECT 'B', unnest(generate_series(101, 200))::double 214 | UNION ALL 215 | SELECT 'C', unnest(generate_series(201, 300))::double 216 | 217 | statement ok 218 | CREATE TABLE category_sketches AS 219 | SELECT category, datasketch_tdigest(50, value) as sketch 220 | FROM grouped_data 221 | GROUP BY category 222 | 223 | # Merge all category sketches 224 | query I 225 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(50, 
sketch)) FROM category_sketches 226 | ---- 227 | 300 228 | 229 | # Verify merged quantiles span all categories 230 | query I 231 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(50, sketch), 0.0) between 1 and 5 FROM category_sketches 232 | ---- 233 | true 234 | 235 | query I 236 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(50, sketch), 1.0) between 295 and 300 FROM category_sketches 237 | ---- 238 | true 239 | 240 | # Test total_weight matches n for merged sketch 241 | query I 242 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM partition_sketches 243 | ---- 244 | 900.0 245 | 246 | -------------------------------------------------------------------------------- /test/sql/datasketch_cpc.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_cpc.test 2 | # description: test datasketch CPC sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_cpc_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_cpc_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_cpc(8, 5); 16 | ---- 17 | \x08\x01\x10\x08\x00\x0E\xCC\x93\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\xF8o@\x00\x00\x00\x00\x00\x00\xF0?\xDD\x03\x00\x00 18 | 19 | query I 20 | SELECT datasketch_cpc_is_empty('\x08\x01\x10\x08\x00\x0E\xCC\x93\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\xF8o@\x00\x00\x00\x00\x00\x00\xF0?\xDD\x03\x00\x00'); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE items(id integer) 28 | 29 | statement ok 30 | INSERT INTO items(id) select unnest(generate_series(1, 100000)) order by random() 31 | 32 | # Duplicate items shouldn't affect the count. 
33 | 34 | statement ok 35 | INSERT INTO items(id) select unnest(generate_series(1, 100000)) order by random() 36 | 37 | query I 38 | SELECT datasketch_cpc_is_empty(datasketch_cpc(12, id)) from items 39 | ---- 40 | False 41 | 42 | 43 | query I 44 | SELECT datasketch_cpc_describe(datasketch_cpc(4, id)) like '%CPC sketch summary%' from items 45 | ---- 46 | True 47 | 48 | # Test with strings 49 | 50 | statement ok 51 | CREATE TABLE employees(name string) 52 | 53 | statement ok 54 | INSERT INTO employees(name) VALUES 55 | ('John Doe'), ('Jane Smith'), ('Michael Johnson'), ('Emily Davis'), ('Chris Brown'), ('Sarah Wilson'), ('David Martinez'),('Sophia Anderson'), ('Daniel Lee'),('Olivia Taylor'); 56 | 57 | # 10 distinct names, estimate should be close 58 | query I 59 | SELECT datasketch_cpc_estimate(datasketch_cpc(4, name))::int between 8 and 15 from employees 60 | ---- 61 | true 62 | 63 | statement ok 64 | CREATE TABLE sketches (sketch sketch_cpc) 65 | 66 | statement ok 67 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 0 68 | 69 | statement ok 70 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 1 71 | 72 | statement ok 73 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 2 74 | 75 | query I 76 | select datasketch_cpc_is_empty(datasketch_cpc_union(12, sketch)) from sketches 77 | ---- 78 | False 79 | 80 | # Test error handling for invalid/corrupted sketch data 81 | statement error 82 | SELECT datasketch_cpc_estimate('\x00\x01\x02\x03'::blob); 83 | ---- 84 | Invalid Input Error: Failed to deserialize CPC sketch 85 | 86 | statement error 87 | SELECT datasketch_cpc_is_empty('\xDE\xAD\xBE\xEF'::blob); 88 | ---- 89 | Invalid Input Error: Failed to deserialize CPC sketch 90 | 91 | # Test with empty blob 92 | statement error 93 | SELECT datasketch_cpc_estimate(''::blob); 94 | ---- 95 | Invalid Input Error: Failed to deserialize CPC sketch 96 | 97 | 
# ============================================================================= 98 | # COMPREHENSIVE UNION TESTS 99 | # ============================================================================= 100 | 101 | # Test union of multiple sketches from partitioned data with non-overlapping values 102 | statement ok 103 | CREATE TABLE union_data(value int, partition_id int) 104 | 105 | statement ok 106 | INSERT INTO union_data SELECT unnest(generate_series(1, 10000)), 1 107 | 108 | statement ok 109 | INSERT INTO union_data SELECT unnest(generate_series(10001, 20000)), 2 110 | 111 | statement ok 112 | INSERT INTO union_data SELECT unnest(generate_series(20001, 30000)), 3 113 | 114 | # Create sketches per partition 115 | statement ok 116 | CREATE TABLE partition_sketches AS 117 | SELECT partition_id, datasketch_cpc(12, value) as sketch 118 | FROM union_data 119 | GROUP BY partition_id 120 | 121 | # Verify we have 3 partition sketches 122 | query I 123 | SELECT count(*) FROM partition_sketches 124 | ---- 125 | 3 126 | 127 | # Each partition has 10000 distinct values 128 | query I 129 | SELECT datasketch_cpc_estimate(sketch)::int between 9500 and 10500 FROM partition_sketches ORDER BY partition_id LIMIT 1 130 | ---- 131 | true 132 | 133 | # Union all partition sketches - should have ~30000 distinct values 134 | query I 135 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 28500 and 31500 FROM partition_sketches 136 | ---- 137 | true 138 | 139 | # Verify union is not empty 140 | query I 141 | SELECT datasketch_cpc_is_empty(datasketch_cpc_union(12, sketch)) FROM partition_sketches 142 | ---- 143 | False 144 | 145 | # Verify the union estimate is positive (consistency with the emptiness check above) 146 | query I 147 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int > 0 FROM partition_sketches 148 | ---- 149 | true 150 | 151 | # Test union with overlapping data 152 | statement ok 153 | CREATE TABLE overlap_union_data(value int, 
group_id int) 154 | 155 | statement ok 156 | INSERT INTO overlap_union_data SELECT unnest(generate_series(1, 50000)), 1 157 | 158 | statement ok 159 | INSERT INTO overlap_union_data SELECT unnest(generate_series(25000, 75000)), 2 160 | 161 | statement ok 162 | CREATE TABLE overlap_union_sketches AS 163 | SELECT group_id, datasketch_cpc(14, value) as sketch 164 | FROM overlap_union_data 165 | GROUP BY group_id 166 | 167 | # Group 1 has 50000 distinct, Group 2 has 50001 distinct 168 | # Union should have 75000 distinct (1-75000) 169 | query I 170 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(14, sketch))::int between 72000 and 78000 FROM overlap_union_sketches 171 | ---- 172 | true 173 | 174 | # Test union with different K values 175 | statement ok 176 | CREATE TABLE k_union_sketches AS 177 | SELECT datasketch_cpc(8, value) as sketch FROM union_data WHERE partition_id = 1 178 | UNION ALL 179 | SELECT datasketch_cpc(14, value) as sketch FROM union_data WHERE partition_id = 2 180 | 181 | # Verify union works with different K values 182 | query I 183 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(10, sketch))::int between 18000 and 22000 FROM k_union_sketches 184 | ---- 185 | true 186 | 187 | # Test union of single sketch (edge case) 188 | query I 189 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 9500 and 10500 190 | FROM (SELECT datasketch_cpc(12, value) as sketch FROM union_data WHERE partition_id = 1) single_sketch 191 | ---- 192 | true 193 | 194 | # Test union preserves accuracy - lower_bound/upper_bound on union 195 | query I 196 | SELECT datasketch_cpc_lower_bound(datasketch_cpc_union(12, sketch), 1)::int between 27000 and 31000 FROM partition_sketches 197 | ---- 198 | true 199 | 200 | query I 201 | SELECT datasketch_cpc_upper_bound(datasketch_cpc_union(12, sketch), 1)::int between 29000 and 33000 FROM partition_sketches 202 | ---- 203 | true 204 | 205 | # Test union with string values 206 | statement ok 207 | 
CREATE TABLE string_union_data(name varchar, source_id int) 208 | 209 | statement ok 210 | INSERT INTO string_union_data 211 | SELECT 'user_' || x, 1 FROM generate_series(1, 1000) t(x) 212 | UNION ALL 213 | SELECT 'user_' || x, 2 FROM generate_series(500, 1500) t(x) 214 | 215 | statement ok 216 | CREATE TABLE string_union_sketches AS 217 | SELECT source_id, datasketch_cpc(10, name) as sketch 218 | FROM string_union_data 219 | GROUP BY source_id 220 | 221 | # Union should have ~1500 distinct strings (user_1 to user_1500) 222 | query I 223 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(10, sketch))::int between 1400 and 1600 FROM string_union_sketches 224 | ---- 225 | true 226 | 227 | # Test union with GROUP BY categories 228 | statement ok 229 | CREATE TABLE category_union_data(category varchar, user_id int) 230 | 231 | statement ok 232 | INSERT INTO category_union_data 233 | SELECT 'electronics', unnest(generate_series(1, 5000)) 234 | UNION ALL 235 | SELECT 'clothing', unnest(generate_series(2500, 7500)) 236 | UNION ALL 237 | SELECT 'food', unnest(generate_series(5000, 10000)) 238 | 239 | statement ok 240 | CREATE TABLE category_union_sketches AS 241 | SELECT category, datasketch_cpc(12, user_id) as sketch 242 | FROM category_union_data 243 | GROUP BY category 244 | 245 | # Each category has ~5000 distinct users 246 | # Union should have ~10000 distinct users (1-10000) 247 | query I 248 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 9500 and 10500 FROM category_union_sketches 249 | ---- 250 | true 251 | 252 | # Test describe on union result 253 | query I 254 | SELECT datasketch_cpc_describe(datasketch_cpc_union(12, sketch)) like '%CPC sketch summary%' FROM partition_sketches 255 | ---- 256 | True 257 | 258 | -------------------------------------------------------------------------------- /test/sql/datasketch_kll.test: -------------------------------------------------------------------------------- 1 | # name: 
test/sql/datasketch_kll.test 2 | # description: test datasketch KLL sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_kll_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_kll_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_kll(16, 5.0::float); 16 | ---- 17 | \x02\x02\x0F\x04\x10\x00\x08\x00\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_kll_is_empty('\x02\x02\x0F\x04\x10\x00\x08\x00\x00\x00\xA0@'::sketch_kll_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_kll_rank(datasketch_kll(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_kll_quantile(datasketch_kll(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_kll_cdf(datasketch_kll(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_kll_pmf(datasketch_kll(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_kll_k(datasketch_kll(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_kll_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) 
select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_kll_is_empty(datasketch_kll(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_kll_quantile(datasketch_kll(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_kll_n(datasketch_kll(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_kll_is_estimation_mode(datasketch_kll(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and internal compaction 86 | statement ok 87 | select datasketch_kll_num_retained(datasketch_kll(16, sketch)) from sketches 88 | 89 | query I 90 | select datasketch_kll_min_item(datasketch_kll(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_kll_max_item(datasketch_kll(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_kll_is_empty('\x00\x01\x02'::sketch_kll_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize KLL sketch 104 | 105 | statement error 106 | SELECT datasketch_kll_k('\xDE\xAD\xBE\xEF'::sketch_kll_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize KLL sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_kll_is_empty(''::sketch_kll_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize KLL sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 
300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_kll(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT datasketch_kll_n(datasketch_kll(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_kll_min_item(datasketch_kll(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_kll_max_item(datasketch_kll(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_kll_quantile(datasketch_kll(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_kll(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001) 187 | query I 188 | SELECT 
datasketch_kll_n(datasketch_kll(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_kll_min_item(datasketch_kll(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_kll_max_item(datasketch_kll(64, sketch)) FROM overlap_sketches 200 | ---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_kll(16, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_kll(64, value) as sketch FROM merge_data WHERE partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_kll_n(datasketch_kll(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_kll_n(datasketch_kll(16, sketch)) 219 | FROM (SELECT datasketch_kll(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_kll_is_estimation_mode(datasketch_kll(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_kll_cdf(datasketch_kll(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_kll_pmf(datasketch_kll(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_kll_rank(datasketch_kll(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE 
TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_kll(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_kll_n(datasketch_kll(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch min and max span all categories 268 | query I 269 | SELECT datasketch_kll_min_item(datasketch_kll(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_kll_max_item(datasketch_kll(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_req.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_req.test 2 | # description: test datasketch REQ sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_req_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_req_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_req(16, 5.0::float); 16 | ---- 17 | \x02\x01\x118\x10\x00\x01\x01\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_req_is_empty('\x02\x01\x118\x10\x00\x01\x01\x00\x00\xA0@'::sketch_req_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 
25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_req_rank(datasketch_req(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_req_quantile(datasketch_req(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_req_cdf(datasketch_req(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_req_pmf(datasketch_req(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_req_k(datasketch_req(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_req_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_req_is_empty(datasketch_req(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_req_quantile(datasketch_req(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_req_n(datasketch_req(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_req_is_estimation_mode(datasketch_req(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and internal compaction 86 | statement ok 87 | select datasketch_req_num_retained(datasketch_req(16, sketch)) from sketches 88 | 89 | 
query I 90 | select datasketch_req_min_item(datasketch_req(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_req_max_item(datasketch_req(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_req_is_empty('\x00\x01\x02'::sketch_req_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize REQ sketch 104 | 105 | statement error 106 | SELECT datasketch_req_k('\xDE\xAD\xBE\xEF'::sketch_req_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize REQ sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_req_is_empty(''::sketch_req_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize REQ sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition (REQ requires k between 4 and 1024) 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_req(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT 
datasketch_req_n(datasketch_req(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_req_min_item(datasketch_req(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_req_max_item(datasketch_req(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_req_quantile(datasketch_req(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_req(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001) 187 | query I 188 | SELECT datasketch_req_n(datasketch_req(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_req_min_item(datasketch_req(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_req_max_item(datasketch_req(64, sketch)) FROM overlap_sketches 200 | ---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_req(8, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_req(64, value) as sketch FROM merge_data WHERE 
partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_req_n(datasketch_req(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_req_n(datasketch_req(16, sketch)) 219 | FROM (SELECT datasketch_req(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_req_is_estimation_mode(datasketch_req(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_req_cdf(datasketch_req(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_req_pmf(datasketch_req(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_req_rank(datasketch_req(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_req(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_req_n(datasketch_req(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch 
min and max span all categories 268 | query I 269 | SELECT datasketch_req_min_item(datasketch_req(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_req_max_item(datasketch_req(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_quantiles.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_quantiles.test 2 | # description: test datasketch Quantiles sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_quantiles_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_quantiles_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_quantiles(16, 5.0::float); 16 | ---- 17 | \x02\x03\x08\x18\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xA0@\x00\x00\xA0@\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_quantiles_is_empty('\x02\x03\x08\x18\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xA0@\x00\x00\xA0@\x00\x00\xA0@'::sketch_quantiles_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 
25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_quantiles_rank(datasketch_quantiles(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_quantiles_quantile(datasketch_quantiles(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_quantiles_cdf(datasketch_quantiles(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_quantiles_pmf(datasketch_quantiles(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_quantiles_k(datasketch_quantiles(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_quantiles_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_quantiles_is_empty(datasketch_quantiles(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_quantiles_quantile(datasketch_quantiles(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_quantiles_n(datasketch_quantiles(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_quantiles_is_estimation_mode(datasketch_quantiles(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and 
internal compaction 86 | statement ok 87 | select datasketch_quantiles_num_retained(datasketch_quantiles(16, sketch)) from sketches 88 | 89 | query I 90 | select datasketch_quantiles_min_item(datasketch_quantiles(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_quantiles_max_item(datasketch_quantiles(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_quantiles_is_empty('\x00\x01\x02'::sketch_quantiles_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize Quantiles sketch 104 | 105 | statement error 106 | SELECT datasketch_quantiles_k('\xDE\xAD\xBE\xEF'::sketch_quantiles_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize Quantiles sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_quantiles_is_empty(''::sketch_quantiles_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize Quantiles sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_quantiles(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query 
I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT datasketch_quantiles_n(datasketch_quantiles(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_quantiles_quantile(datasketch_quantiles(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_quantiles(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001, even with overlap since we count all items) 187 | query I 188 | SELECT datasketch_quantiles_n(datasketch_quantiles(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(64, sketch)) FROM overlap_sketches 200 | 
---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_quantiles(16, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_quantiles(64, value) as sketch FROM merge_data WHERE partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_quantiles_n(datasketch_quantiles(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_quantiles_n(datasketch_quantiles(16, sketch)) 219 | FROM (SELECT datasketch_quantiles(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_quantiles_is_estimation_mode(datasketch_quantiles(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_quantiles_cdf(datasketch_quantiles(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_quantiles_pmf(datasketch_quantiles(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_quantiles_rank(datasketch_quantiles(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', 
unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_quantiles(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_quantiles_n(datasketch_quantiles(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch min and max span all categories 268 | query I 269 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_hll.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_hll.test 2 | # description: test datasketch HLL sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_hll_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_hll_is_empty does not exist! 
10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_hll(8, 5); 16 | ---- 17 | \x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 18 | 19 | query I 20 | SELECT datasketch_hll_is_empty('\x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'); 21 | ---- 22 | false 23 | 24 | query I 25 | SELECT datasketch_hll_estimate('\x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'); 26 | ---- 27 | 1 28 | 29 | # Do some tests with integers. 30 | 31 | statement ok 32 | CREATE TABLE items(id integer) 33 | 34 | statement ok 35 | INSERT INTO items(id) select unnest(generate_series(1, 100000)); 36 | 37 | # Duplicate items shouldn't affect the count. 
38 | 39 | statement ok 40 | INSERT INTO items(id) select unnest(generate_series(1, 100000)); 41 | 42 | # HLL estimates should be close to the true count (100000 distinct values) 43 | # With k=12, error should be within ~2-3% 44 | query I 45 | SELECT datasketch_hll_estimate(datasketch_hll(12, id))::int between 95000 and 105000 from items 46 | ---- 47 | true 48 | 49 | # With k=4, error can be much larger (~25%) 50 | query I 51 | SELECT datasketch_hll_estimate(datasketch_hll(4, id))::int between 75000 and 175000 from items 52 | ---- 53 | true 54 | 55 | query I 56 | SELECT datasketch_hll_is_empty(datasketch_hll(12, id)) from items 57 | ---- 58 | False 59 | 60 | # Lower bound should be less than true count but reasonable 61 | query I 62 | SELECT datasketch_hll_lower_bound(datasketch_hll(12, id), 1)::int between 90000 and 105000 from items 63 | ---- 64 | true 65 | 66 | # Upper bound should be greater than true count but reasonable 67 | query I 68 | SELECT datasketch_hll_upper_bound(datasketch_hll(12, id), 1)::int between 95000 and 115000 from items 69 | ---- 70 | true 71 | 72 | query I 73 | SELECT datasketch_hll_lg_config_k(datasketch_hll(12, id)) from items 74 | ---- 75 | 12 76 | 77 | query I 78 | SELECT datasketch_hll_is_compact(datasketch_hll(12, id)) from items 79 | ---- 80 | False 81 | 82 | 83 | query I 84 | SELECT datasketch_hll_describe(datasketch_hll(4, id), true, false) like '%HLL sketch summary%' from items 85 | ---- 86 | True 87 | 88 | # Test with strings 89 | 90 | statement ok 91 | CREATE TABLE employees(name string) 92 | 93 | statement ok 94 | INSERT INTO employees(name) VALUES 95 | ('John Doe'), ('Jane Smith'), ('Michael Johnson'), ('Emily Davis'), ('Chris Brown'), ('Sarah Wilson'), ('David Martinez'),('Sophia Anderson'), ('Daniel Lee'),('Olivia Taylor'); 96 | 97 | # 10 distinct names, estimate should be close 98 | query I 99 | SELECT datasketch_hll_estimate(datasketch_hll(4, name))::int between 8 and 15 from employees 100 | ---- 101 | true 102 | 103 | # 
Grouped query - each group has 50 distinct values, estimates should be close 104 | query I 105 | select datasketch_hll_estimate(datasketch_hll(14, x))::int between 45 and 55 from unnest(range(100)) t(x) group by x % 2 106 | ---- 107 | true 108 | true 109 | 110 | statement ok 111 | CREATE TABLE sketches (sketch sketch_hll) 112 | 113 | statement ok 114 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 0 115 | 116 | statement ok 117 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 1 118 | 119 | statement ok 120 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 2 121 | 122 | query I 123 | select datasketch_hll_is_empty(datasketch_hll_union(12, sketch)) from sketches 124 | ---- 125 | False 126 | 127 | statement ok 128 | select datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int from sketches 129 | 130 | # Test error handling for invalid/corrupted sketch data 131 | statement error 132 | SELECT datasketch_hll_estimate('\x00\x01\x02\x03'::blob); 133 | ---- 134 | Invalid Input Error: Failed to deserialize HLL sketch 135 | 136 | statement error 137 | SELECT datasketch_hll_is_empty('\xDE\xAD\xBE\xEF'::blob); 138 | ---- 139 | Invalid Input Error: Failed to deserialize HLL sketch 140 | 141 | # Test with empty blob 142 | statement error 143 | SELECT datasketch_hll_estimate(''::blob); 144 | ---- 145 | Invalid Input Error: Failed to deserialize HLL sketch 146 | 147 | # ============================================================================= 148 | # COMPREHENSIVE UNION TESTS 149 | # ============================================================================= 150 | 151 | # Test union of multiple sketches from partitioned data with non-overlapping values 152 | statement ok 153 | CREATE TABLE union_data(value int, partition_id int) 154 | 155 | statement ok 156 | INSERT INTO union_data SELECT unnest(generate_series(1, 10000)), 1 157 | 158 | 
statement ok 159 | INSERT INTO union_data SELECT unnest(generate_series(10001, 20000)), 2 160 | 161 | statement ok 162 | INSERT INTO union_data SELECT unnest(generate_series(20001, 30000)), 3 163 | 164 | # Create sketches per partition 165 | statement ok 166 | CREATE TABLE partition_sketches AS 167 | SELECT partition_id, datasketch_hll(12, value) as sketch 168 | FROM union_data 169 | GROUP BY partition_id 170 | 171 | # Verify we have 3 partition sketches 172 | query I 173 | SELECT count(*) FROM partition_sketches 174 | ---- 175 | 3 176 | 177 | # Each partition has 10000 distinct values 178 | query I 179 | SELECT datasketch_hll_estimate(sketch)::int between 9500 and 10500 FROM partition_sketches ORDER BY partition_id LIMIT 1 180 | ---- 181 | true 182 | 183 | # Union all partition sketches - should have ~30000 distinct values 184 | query I 185 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 28500 and 31500 FROM partition_sketches 186 | ---- 187 | true 188 | 189 | # Verify union is not empty 190 | query I 191 | SELECT datasketch_hll_is_empty(datasketch_hll_union(12, sketch)) FROM partition_sketches 192 | ---- 193 | False 194 | 195 | # Union should have lg_config_k from the bind parameter 196 | query I 197 | SELECT datasketch_hll_lg_config_k(datasketch_hll_union(12, sketch)) FROM partition_sketches 198 | ---- 199 | 12 200 | 201 | # Test union with overlapping data 202 | statement ok 203 | CREATE TABLE overlap_union_data(value int, group_id int) 204 | 205 | statement ok 206 | INSERT INTO overlap_union_data SELECT unnest(generate_series(1, 50000)), 1 207 | 208 | statement ok 209 | INSERT INTO overlap_union_data SELECT unnest(generate_series(25000, 75000)), 2 210 | 211 | statement ok 212 | CREATE TABLE overlap_union_sketches AS 213 | SELECT group_id, datasketch_hll(14, value) as sketch 214 | FROM overlap_union_data 215 | GROUP BY group_id 216 | 217 | # Group 1 has 50000 distinct, Group 2 has 50001 distinct 218 | # Union should have 75000 
distinct (1-75000) 219 | query I 220 | SELECT datasketch_hll_estimate(datasketch_hll_union(14, sketch))::int between 72000 and 78000 FROM overlap_union_sketches 221 | ---- 222 | true 223 | 224 | # Test union with different K values 225 | statement ok 226 | CREATE TABLE k_union_sketches AS 227 | SELECT datasketch_hll(8, value) as sketch FROM union_data WHERE partition_id = 1 228 | UNION ALL 229 | SELECT datasketch_hll(14, value) as sketch FROM union_data WHERE partition_id = 2 230 | 231 | # Verify union works with different K values and produces reasonable estimate 232 | query I 233 | SELECT datasketch_hll_estimate(datasketch_hll_union(10, sketch))::int between 18000 and 22000 FROM k_union_sketches 234 | ---- 235 | true 236 | 237 | # Test union of single sketch (edge case) 238 | query I 239 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 9500 and 10500 240 | FROM (SELECT datasketch_hll(12, value) as sketch FROM union_data WHERE partition_id = 1) single_sketch 241 | ---- 242 | true 243 | 244 | # Test union preserves accuracy - lower_bound/upper_bound on union 245 | query I 246 | SELECT datasketch_hll_lower_bound(datasketch_hll_union(12, sketch), 1)::int between 27000 and 31000 FROM partition_sketches 247 | ---- 248 | true 249 | 250 | query I 251 | SELECT datasketch_hll_upper_bound(datasketch_hll_union(12, sketch), 1)::int between 29000 and 33000 FROM partition_sketches 252 | ---- 253 | true 254 | 255 | # Test union with string values 256 | statement ok 257 | CREATE TABLE string_union_data(name varchar, source_id int) 258 | 259 | statement ok 260 | INSERT INTO string_union_data 261 | SELECT 'user_' || x, 1 FROM generate_series(1, 1000) t(x) 262 | UNION ALL 263 | SELECT 'user_' || x, 2 FROM generate_series(500, 1500) t(x) 264 | 265 | statement ok 266 | CREATE TABLE string_union_sketches AS 267 | SELECT source_id, datasketch_hll(10, name) as sketch 268 | FROM string_union_data 269 | GROUP BY source_id 270 | 271 | # Union should have ~1500 
distinct strings (user_1 to user_1500) 272 | query I 273 | SELECT datasketch_hll_estimate(datasketch_hll_union(10, sketch))::int between 1400 and 1600 FROM string_union_sketches 274 | ---- 275 | true 276 | 277 | # Test union with GROUP BY categories 278 | statement ok 279 | CREATE TABLE category_union_data(category varchar, user_id int) 280 | 281 | statement ok 282 | INSERT INTO category_union_data 283 | SELECT 'electronics', unnest(generate_series(1, 5000)) 284 | UNION ALL 285 | SELECT 'clothing', unnest(generate_series(2500, 7500)) 286 | UNION ALL 287 | SELECT 'food', unnest(generate_series(5000, 10000)) 288 | 289 | statement ok 290 | CREATE TABLE category_union_sketches AS 291 | SELECT category, datasketch_hll(12, user_id) as sketch 292 | FROM category_union_data 293 | GROUP BY category 294 | 295 | # Each category has ~5000 distinct users 296 | # Union should have ~10000 distinct users (1-10000) 297 | query I 298 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 9500 and 10500 FROM category_union_sketches 299 | ---- 300 | true 301 | 302 | # Test describe on union result 303 | query I 304 | SELECT datasketch_hll_describe(datasketch_hll_union(12, sketch), true, false) like '%HLL sketch summary%' FROM partition_sketches 305 | ---- 306 | True 307 | 308 | # Test is_compact on union result 309 | query I 310 | SELECT datasketch_hll_is_compact(datasketch_hll_union(12, sketch)) FROM partition_sketches 311 | ---- 312 | False 313 | 314 | -------------------------------------------------------------------------------- /test/sql/datasketch_theta.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_theta.test 2 | # description: test datasketch Theta sketches 3 | # group: [datasketches] 4 | 5 | # Ensure the extension is loaded 6 | require datasketches 7 | 8 | # ------------------------------------------------------------------- 9 | # 1. 
Basic Build and Estimate 10 | # ------------------------------------------------------------------- 11 | 12 | # Test constant inputs 13 | query I 14 | SELECT datasketch_theta_estimate(datasketch_theta(1)); 15 | ---- 16 | 1 17 | 18 | # Test basic distinct count with small data 19 | statement ok 20 | CREATE TABLE simple_items(id INTEGER); 21 | 22 | statement ok 23 | INSERT INTO simple_items VALUES (1), (2), (3), (3), (4), (5); 24 | 25 | # Should count 5 distinct items 26 | query I 27 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM simple_items; 28 | ---- 29 | 5 30 | 31 | # ------------------------------------------------------------------- 32 | # 2. Large Data Accuracy (Standard Error) 33 | # ------------------------------------------------------------------- 34 | 35 | statement ok 36 | CREATE TABLE large_data AS SELECT * FROM range(0, 100000) t(i); 37 | 38 | # Duplicate the data to ensure distinct counting works 39 | statement ok 40 | INSERT INTO large_data SELECT * FROM range(0, 100000) t(i); 41 | 42 | # Check estimate (Standard Theta K=4096 has error ~1.5-2%) 43 | query I 44 | SELECT datasketch_theta_estimate(datasketch_theta(i))::int BETWEEN 98000 AND 102000 FROM large_data; 45 | ---- 46 | true 47 | 48 | # Check Lower/Upper bounds (Standard Deviation) 49 | # Note: We didn't strictly implement lower/upper bound scalar functions in the C++ code 50 | # provided in previous steps, but if you did, add tests here. 51 | # If only 'estimate' was implemented, skip this. 52 | 53 | # ------------------------------------------------------------------- 54 | # 3. 
Set Operations: Intersection & Difference 55 | # ------------------------------------------------------------------- 56 | 57 | statement ok 58 | CREATE TABLE set_a AS SELECT * FROM range(1, 6) t(i); -- {1, 2, 3, 4, 5} 59 | 60 | statement ok 61 | CREATE TABLE set_b AS SELECT * FROM range(4, 9) t(i); -- {4, 5, 6, 7, 8} 62 | 63 | # Create a table to hold the sketches 64 | statement ok 65 | CREATE TABLE sketches (name VARCHAR, data sketch_theta); 66 | 67 | # Build sketches for A and B 68 | statement ok 69 | INSERT INTO sketches VALUES 70 | ('A', (SELECT datasketch_theta(i) FROM set_a)), 71 | ('B', (SELECT datasketch_theta(i) FROM set_b)); 72 | 73 | # --- INTERSECTION Test --- 74 | # Intersection of {1,2,3,4,5} and {4,5,6,7,8} is {4,5} -> Count: 2 75 | query I 76 | SELECT datasketch_theta_estimate( 77 | datasketch_theta_intersect(s1.data, s2.data) 78 | )::int 79 | FROM sketches s1, sketches s2 80 | WHERE s1.name = 'A' AND s2.name = 'B'; 81 | ---- 82 | 2 83 | 84 | # --- A NOT B Test --- 85 | # {1,2,3,4,5} NOT {4,5,6,7,8} is {1,2,3} -> Count: 3 86 | query I 87 | SELECT datasketch_theta_estimate( 88 | datasketch_theta_a_not_b(s1.data, s2.data) 89 | )::int 90 | FROM sketches s1, sketches s2 91 | WHERE s1.name = 'A' AND s2.name = 'B'; 92 | ---- 93 | 3 94 | 95 | # --- B NOT A Test --- 96 | # {4,5,6,7,8} NOT {1,2,3,4,5} is {6,7,8} -> Count: 3 97 | query I 98 | SELECT datasketch_theta_estimate( 99 | datasketch_theta_a_not_b(s2.data, s1.data) 100 | )::int 101 | FROM sketches s1, sketches s2 102 | WHERE s1.name = 'A' AND s2.name = 'B'; 103 | ---- 104 | 3 105 | 106 | # ------------------------------------------------------------------- 107 | # 4. 
String Types 108 | # ------------------------------------------------------------------- 109 | 110 | statement ok 111 | CREATE TABLE strings(s VARCHAR); 112 | 113 | statement ok 114 | INSERT INTO strings VALUES ('apple'), ('banana'), ('apple'), ('cherry'); 115 | 116 | query I 117 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM strings; 118 | ---- 119 | 3 120 | 121 | # ------------------------------------------------------------------- 122 | # 5. Configuration (Log K) 123 | # ------------------------------------------------------------------- 124 | 125 | # Test creating a sketch with a smaller K (Low accuracy) vs Large K 126 | # We verify the blobs are different sizes using octet_length. 127 | # NOTE: Minimum lg_k allowed by DataSketches is 5. 128 | query I 129 | SELECT octet_length(datasketch_theta(5, i)::BLOB) < octet_length(datasketch_theta(12, i)::BLOB) 130 | FROM range(0, 1000) t(i); 131 | ---- 132 | true 133 | 134 | # ------------------------------------------------------------------- 135 | # 6. Bounds and Describe 136 | # ------------------------------------------------------------------- 137 | 138 | query I 139 | SELECT datasketch_theta_describe(datasketch_theta(1)) LIKE '%Theta sketch summary%'; 140 | ---- 141 | true 142 | 143 | # Check lower bound for 2 standard deviations (approx 95% confidence) 144 | query I 145 | SELECT datasketch_theta_lower_bound(datasketch_theta(i), 2) <= 100000 FROM range(0, 100000) t(i); 146 | ---- 147 | true 148 | 149 | # Check upper bound 150 | query I 151 | SELECT datasketch_theta_upper_bound(datasketch_theta(i), 2) >= 100000 FROM range(0, 100000) t(i); 152 | ---- 153 | true 154 | 155 | # ------------------------------------------------------------------- 156 | # 7. 
Edge Cases - Empty and NULL Values 157 | # ------------------------------------------------------------------- 158 | 159 | statement ok 160 | CREATE TABLE empty_table(id INTEGER); 161 | 162 | # Empty sketch should estimate 0 163 | query I 164 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM empty_table; 165 | ---- 166 | 0 167 | 168 | # Test NULL handling 169 | statement ok 170 | CREATE TABLE with_nulls(id INTEGER); 171 | 172 | statement ok 173 | INSERT INTO with_nulls VALUES (1), (NULL), (2), (NULL), (3); 174 | 175 | # NULLs should be skipped, count only 3 distinct 176 | query I 177 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM with_nulls; 178 | ---- 179 | 3 180 | 181 | # All NULLs table 182 | statement ok 183 | CREATE TABLE all_nulls(id INTEGER); 184 | 185 | statement ok 186 | INSERT INTO all_nulls VALUES (NULL), (NULL), (NULL); 187 | 188 | query I 189 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM all_nulls; 190 | ---- 191 | 0 192 | 193 | # ------------------------------------------------------------------- 194 | # 8. Single Item Edge Cases 195 | # ------------------------------------------------------------------- 196 | 197 | statement ok 198 | CREATE TABLE single_item(id INTEGER); 199 | 200 | statement ok 201 | INSERT INTO single_item VALUES (42); 202 | 203 | query I 204 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM single_item; 205 | ---- 206 | 1 207 | 208 | # Many duplicates of single item 209 | statement ok 210 | CREATE TABLE many_dupes AS SELECT 42 as id FROM range(0, 10000); 211 | 212 | query I 213 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM many_dupes; 214 | ---- 215 | 1 216 | 217 | # ------------------------------------------------------------------- 218 | # 9. 
Union Operations 219 | # ------------------------------------------------------------------- 220 | 221 | # Union of {1,2,3,4,5} and {4,5,6,7,8} is {1,2,3,4,5,6,7,8} -> Count: 8 222 | query I 223 | SELECT datasketch_theta_estimate( 224 | datasketch_theta_union(s1.data, s2.data) 225 | )::int 226 | FROM sketches s1, sketches s2 227 | WHERE s1.name = 'A' AND s2.name = 'B'; 228 | ---- 229 | 8 230 | 231 | # Union of a sketch with itself should equal the original 232 | query I 233 | SELECT datasketch_theta_estimate( 234 | datasketch_theta_union(s1.data, s1.data) 235 | )::int 236 | FROM sketches s1 237 | WHERE s1.name = 'A'; 238 | ---- 239 | 5 240 | 241 | # Union with empty sketch should equal the non-empty sketch 242 | statement ok 243 | INSERT INTO sketches VALUES ('EMPTY', (SELECT datasketch_theta(id) FROM empty_table)); 244 | 245 | query I 246 | SELECT datasketch_theta_estimate( 247 | datasketch_theta_union(s1.data, s2.data) 248 | )::int 249 | FROM sketches s1, sketches s2 250 | WHERE s1.name = 'A' AND s2.name = 'EMPTY'; 251 | ---- 252 | 5 253 | 254 | # ------------------------------------------------------------------- 255 | # 10. 
Multiple Set Operations (Chaining) 256 | # ------------------------------------------------------------------- 257 | 258 | statement ok 259 | CREATE TABLE set_c AS SELECT * FROM range(1, 4) t(i); -- {1, 2, 3} 260 | 261 | statement ok 262 | INSERT INTO sketches VALUES 263 | ('C', (SELECT datasketch_theta(i) FROM set_c)); 264 | 265 | # (A UNION B) INTERSECT C 266 | # A = {1,2,3,4,5}, B = {4,5,6,7,8}, C = {1,2,3} 267 | # A UNION B = {1,2,3,4,5,6,7,8} 268 | # (A UNION B) INTERSECT C = {1,2,3} -> Count: 3 269 | query I 270 | SELECT datasketch_theta_estimate( 271 | datasketch_theta_intersect( 272 | datasketch_theta_union( 273 | (SELECT data FROM sketches WHERE name = 'A'), 274 | (SELECT data FROM sketches WHERE name = 'B') 275 | ), 276 | (SELECT data FROM sketches WHERE name = 'C') 277 | ) 278 | )::int; 279 | ---- 280 | 3 281 | 282 | # A INTERSECT B INTERSECT C 283 | # A ∩ B = {4,5}, {4,5} ∩ {1,2,3} = {} -> Count: 0 284 | query I 285 | SELECT datasketch_theta_estimate( 286 | datasketch_theta_intersect( 287 | datasketch_theta_intersect( 288 | (SELECT data FROM sketches WHERE name = 'A'), 289 | (SELECT data FROM sketches WHERE name = 'B') 290 | ), 291 | (SELECT data FROM sketches WHERE name = 'C') 292 | ) 293 | )::int; 294 | ---- 295 | 0 296 | 297 | # ------------------------------------------------------------------- 298 | # 11. 
Symmetric Difference (A NOT B + B NOT A) 299 | # ------------------------------------------------------------------- 300 | 301 | # A XOR B = (A - B) ∪ (B - A) 302 | # A = {1,2,3,4,5}, B = {4,5,6,7,8} 303 | # A - B = {1,2,3}, B - A = {6,7,8} 304 | # XOR = {1,2,3,6,7,8} -> Count: 6 305 | query I 306 | SELECT datasketch_theta_estimate( 307 | datasketch_theta_union( 308 | datasketch_theta_a_not_b(s1.data, s2.data), 309 | datasketch_theta_a_not_b(s2.data, s1.data) 310 | ) 311 | )::int 312 | FROM sketches s1, sketches s2 313 | WHERE s1.name = 'A' AND s2.name = 'B'; 314 | ---- 315 | 6 316 | 317 | # ------------------------------------------------------------------- 318 | # 12. Merging Sketches via Aggregate 319 | # ------------------------------------------------------------------- 320 | 321 | statement ok 322 | CREATE TABLE partitions(partition_id INTEGER, value INTEGER); 323 | 324 | statement ok 325 | INSERT INTO partitions VALUES 326 | (1, 1), (1, 2), (1, 3), 327 | (2, 3), (2, 4), (2, 5), 328 | (3, 5), (3, 6), (3, 7); 329 | 330 | # Build sketches per partition, then merge them 331 | # Total distinct values: {1,2,3,4,5,6,7} -> 7 332 | statement ok 333 | CREATE TABLE partition_sketches AS 334 | SELECT partition_id, datasketch_theta(value) as sketch 335 | FROM partitions 336 | GROUP BY partition_id; 337 | 338 | query I 339 | SELECT datasketch_theta_estimate( 340 | datasketch_theta(sketch) 341 | )::int 342 | FROM partition_sketches; 343 | ---- 344 | 7 345 | 346 | # ------------------------------------------------------------------- 347 | # 13. 
Different Data Types 348 | # ------------------------------------------------------------------- 349 | 350 | # BIGINT 351 | statement ok 352 | CREATE TABLE bigints(val BIGINT); 353 | 354 | statement ok 355 | INSERT INTO bigints VALUES 356 | (9223372036854775807), 357 | (9223372036854775806), 358 | (-9223372036854775808), 359 | (9223372036854775807); -- duplicate 360 | 361 | query I 362 | SELECT datasketch_theta_estimate(datasketch_theta(val))::int FROM bigints; 363 | ---- 364 | 3 365 | 366 | # VARCHAR with special characters 367 | statement ok 368 | CREATE TABLE special_strings(s VARCHAR); 369 | 370 | statement ok 371 | INSERT INTO special_strings VALUES 372 | ('hello'), ('world'), ('hello world'), 373 | ('emoji 🎉'), (''), ('hello'); 374 | 375 | query I 376 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM special_strings; 377 | ---- 378 | 5 379 | 380 | # Empty string should count as distinct 381 | query I 382 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int 383 | FROM (VALUES (''), ('')) t(s); 384 | ---- 385 | 1 386 | 387 | # ------------------------------------------------------------------- 388 | # 14. 
Metadata and Diagnostic Functions 389 | # ------------------------------------------------------------------- 390 | 391 | statement ok 392 | CREATE TABLE meta_test AS SELECT * FROM range(0, 1000) t(i); 393 | 394 | statement ok 395 | CREATE TABLE meta_sketch AS SELECT datasketch_theta(i) as sketch FROM meta_test; 396 | 397 | # is_empty should be false for non-empty sketch 398 | query I 399 | SELECT datasketch_theta_is_empty(sketch) FROM meta_sketch; 400 | ---- 401 | false 402 | 403 | query I 404 | SELECT datasketch_theta_is_empty(datasketch_theta(id)) FROM empty_table; 405 | ---- 406 | true 407 | 408 | # num_retained should be positive for data within sketch capacity 409 | query I 410 | SELECT datasketch_theta_num_retained(sketch) > 0 FROM meta_sketch; 411 | ---- 412 | true 413 | 414 | # theta value should be in (0, 1] range 415 | query I 416 | SELECT datasketch_theta_get_theta(sketch) > 0 AND 417 | datasketch_theta_get_theta(sketch) <= 1 418 | FROM meta_sketch; 419 | ---- 420 | true 421 | 422 | # seed should match default or custom seed 423 | query I 424 | SELECT datasketch_theta_get_seed(sketch) = datasketch_theta_get_seed(sketch) 425 | FROM meta_sketch; 426 | ---- 427 | true 428 | 429 | # is_estimation_mode - small data should be exact 430 | query I 431 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 432 | FROM range(0, 10) t(i); 433 | ---- 434 | false 435 | 436 | # is_estimation_mode - large data should be in estimation mode 437 | query I 438 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 439 | FROM range(0, 100000) t(i); 440 | ---- 441 | true 442 | 443 | # ------------------------------------------------------------------- 444 | # 15. 
GROUP BY with Multiple Groups 445 | # ------------------------------------------------------------------- 446 | 447 | statement ok 448 | CREATE TABLE events(user_id INTEGER, event_type VARCHAR, item_id INTEGER); 449 | 450 | statement ok 451 | INSERT INTO events VALUES 452 | (1, 'view', 100), (1, 'view', 101), (1, 'click', 100), 453 | (2, 'view', 102), (2, 'click', 102), (2, 'view', 103), 454 | (3, 'view', 100), (3, 'view', 100), (3, 'click', 104); 455 | 456 | # Distinct items viewed per user 457 | statement ok 458 | CREATE TABLE user_sketches AS 459 | SELECT user_id, datasketch_theta(item_id) as sketch 460 | FROM events 461 | WHERE event_type = 'view' 462 | GROUP BY user_id; 463 | 464 | query II 465 | SELECT user_id, datasketch_theta_estimate(sketch)::int 466 | FROM user_sketches 467 | ORDER BY user_id; 468 | ---- 469 | 1 2 470 | 2 2 471 | 3 1 472 | 473 | # ------------------------------------------------------------------- 474 | # 16. Serialization and Persistence 475 | # ------------------------------------------------------------------- 476 | 477 | statement ok 478 | CREATE TABLE sketch_storage(id INTEGER, sketch_data sketch_theta); 479 | 480 | statement ok 481 | INSERT INTO sketch_storage 482 | SELECT 1, datasketch_theta(i) FROM range(0, 100) t(i); 483 | 484 | # Retrieve and use stored sketch 485 | query I 486 | SELECT datasketch_theta_estimate(sketch_data)::int 487 | FROM sketch_storage 488 | WHERE id = 1; 489 | ---- 490 | 100 491 | 492 | # Store result of set operation 493 | statement ok 494 | INSERT INTO sketch_storage 495 | SELECT 2, datasketch_theta_union(s1.data, s2.data) 496 | FROM sketches s1, sketches s2 497 | WHERE s1.name = 'A' AND s2.name = 'B'; 498 | 499 | query I 500 | SELECT datasketch_theta_estimate(sketch_data)::int 501 | FROM sketch_storage 502 | WHERE id = 2; 503 | ---- 504 | 8 505 | 506 | # ------------------------------------------------------------------- 507 | # 17. 
Bounds with Different Standard Deviations 508 | # ------------------------------------------------------------------- 509 | 510 | statement ok 511 | CREATE TABLE bounds_test AS 512 | SELECT datasketch_theta(i) as sketch FROM range(0, 50000) t(i); 513 | 514 | # 1 SD (~68% confidence) 515 | query I 516 | SELECT 517 | datasketch_theta_lower_bound(sketch, 1) <= 50000 AND 518 | datasketch_theta_upper_bound(sketch, 1) >= 50000 519 | FROM bounds_test; 520 | ---- 521 | true 522 | 523 | # 2 SD (~95% confidence) - wider interval 524 | query I 525 | SELECT 526 | datasketch_theta_lower_bound(sketch, 2) <= datasketch_theta_lower_bound(sketch, 1) AND 527 | datasketch_theta_upper_bound(sketch, 2) >= datasketch_theta_upper_bound(sketch, 1) 528 | FROM bounds_test; 529 | ---- 530 | true 531 | 532 | # 3 SD (~99.7% confidence) - widest interval 533 | query I 534 | SELECT 535 | datasketch_theta_lower_bound(sketch, 3) <= datasketch_theta_lower_bound(sketch, 2) AND 536 | datasketch_theta_upper_bound(sketch, 3) >= datasketch_theta_upper_bound(sketch, 2) 537 | FROM bounds_test; 538 | ---- 539 | true 540 | 541 | # Estimate should be within bounds 542 | query I 543 | SELECT 544 | datasketch_theta_estimate(sketch) >= datasketch_theta_lower_bound(sketch, 2) AND 545 | datasketch_theta_estimate(sketch) <= datasketch_theta_upper_bound(sketch, 2) 546 | FROM bounds_test; 547 | ---- 548 | true 549 | 550 | # ------------------------------------------------------------------- 551 | # 18. 
Set Operations Commutativity and Identity 552 | # ------------------------------------------------------------------- 553 | 554 | # Union is commutative: A ∪ B = B ∪ A 555 | query I 556 | SELECT 557 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data))::int = 558 | datasketch_theta_estimate(datasketch_theta_union(s2.data, s1.data))::int 559 | FROM sketches s1, sketches s2 560 | WHERE s1.name = 'A' AND s2.name = 'B'; 561 | ---- 562 | true 563 | 564 | # Intersection is commutative: A ∩ B = B ∩ A 565 | query I 566 | SELECT 567 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s2.data))::int = 568 | datasketch_theta_estimate(datasketch_theta_intersect(s2.data, s1.data))::int 569 | FROM sketches s1, sketches s2 570 | WHERE s1.name = 'A' AND s2.name = 'B'; 571 | ---- 572 | true 573 | 574 | # A ∪ ∅ = A (identity) 575 | query I 576 | SELECT 577 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data))::int = 578 | datasketch_theta_estimate(s1.data)::int 579 | FROM sketches s1, sketches s2 580 | WHERE s1.name = 'A' AND s2.name = 'EMPTY'; 581 | ---- 582 | true 583 | 584 | # A ∩ A = A (idempotence) 585 | query I 586 | SELECT 587 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s1.data))::int = 588 | datasketch_theta_estimate(s1.data)::int 589 | FROM sketches s1 590 | WHERE s1.name = 'A'; 591 | ---- 592 | true 593 | 594 | # A - A = ∅ 595 | query I 596 | SELECT datasketch_theta_estimate(datasketch_theta_a_not_b(s1.data, s1.data))::int 597 | FROM sketches s1 598 | WHERE s1.name = 'A'; 599 | ---- 600 | 0 601 | 602 | # A NOT B ≠ B NOT A (not commutative) 603 | # Create separate tables for this test only 604 | statement ok 605 | CREATE TABLE noncomm_set_x AS SELECT * FROM range(1, 11) t(i); -- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 606 | 607 | statement ok 608 | CREATE TABLE noncomm_set_y AS SELECT * FROM range(8, 12) t(i); -- {8, 9, 10, 11} 609 | 610 | # Create separate sketch table for this test 611 | statement ok 612 | CREATE 
TABLE noncomm_sketches(name VARCHAR, sketch sketch_theta); 613 | 614 | statement ok 615 | INSERT INTO noncomm_sketches VALUES 616 | ('X', (SELECT datasketch_theta(i) FROM noncomm_set_x)), 617 | ('Y', (SELECT datasketch_theta(i) FROM noncomm_set_y)); 618 | 619 | # X NOT Y = {1, 2, 3, 4, 5, 6, 7} -> Count: 7 620 | query I 621 | SELECT datasketch_theta_estimate( 622 | datasketch_theta_a_not_b(sx.sketch, sy.sketch) 623 | )::int 624 | FROM noncomm_sketches sx, noncomm_sketches sy 625 | WHERE sx.name = 'X' AND sy.name = 'Y'; 626 | ---- 627 | 7 628 | 629 | # Y NOT X = {11} -> Count: 1 630 | query I 631 | SELECT datasketch_theta_estimate( 632 | datasketch_theta_a_not_b(sy.sketch, sx.sketch) 633 | )::int 634 | FROM noncomm_sketches sx, noncomm_sketches sy 635 | WHERE sx.name = 'X' AND sy.name = 'Y'; 636 | ---- 637 | 1 638 | 639 | # Verify they are NOT equal (7 != 1 proves non-commutativity) 640 | query I 641 | SELECT 642 | datasketch_theta_estimate(datasketch_theta_a_not_b(sx.sketch, sy.sketch))::int != 643 | datasketch_theta_estimate(datasketch_theta_a_not_b(sy.sketch, sx.sketch))::int 644 | FROM noncomm_sketches sx, noncomm_sketches sy 645 | WHERE sx.name = 'X' AND sy.name = 'Y'; 646 | ---- 647 | true 648 | 649 | # ------------------------------------------------------------------- 650 | # 19. 
Jaccard Similarity (if implemented) 651 | # ------------------------------------------------------------------- 652 | # Jaccard = |A ∩ B| / |A ∪ B| 653 | # A = {1,2,3,4,5}, B = {4,5,6,7,8} 654 | # |A ∩ B| = 2, |A ∪ B| = 8 655 | # Jaccard = 2/8 = 0.25 656 | 657 | # Manual calculation if no direct Jaccard function 658 | query I 659 | SELECT ( 660 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s2.data)) / 661 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data)) 662 | )::decimal(4,2) 663 | FROM sketches s1, sketches s2 664 | WHERE s1.name = 'A' AND s2.name = 'B'; 665 | ---- 666 | 0.25 667 | 668 | # ------------------------------------------------------------------- 669 | # 20. Very Small K Parameter 670 | # ------------------------------------------------------------------- 671 | 672 | # Test minimum lg_k = 5 (K = 32) 673 | statement ok 674 | CREATE TABLE small_k_test AS SELECT * FROM range(0, 1000) t(i); 675 | 676 | statement ok 677 | CREATE TABLE small_k_sketch AS 678 | SELECT datasketch_theta(5, i) as sketch FROM small_k_test; 679 | 680 | # Should still give reasonable estimate (with high error) 681 | query I 682 | SELECT datasketch_theta_estimate(sketch)::int BETWEEN 500 AND 1500 683 | FROM small_k_sketch; 684 | ---- 685 | true 686 | 687 | # Should be in estimation mode 688 | query I 689 | SELECT datasketch_theta_is_estimation_mode(sketch) FROM small_k_sketch; 690 | ---- 691 | true 692 | 693 | # Theta should be less than 1.0 (sampling occurred) 694 | query I 695 | SELECT datasketch_theta_get_theta(sketch) < 1.0 FROM small_k_sketch; 696 | ---- 697 | true 698 | 699 | # ------------------------------------------------------------------- 700 | # 21. 
De-duplication Use Case 701 | # ------------------------------------------------------------------- 702 | 703 | statement ok 704 | CREATE TABLE raw_events(session_id VARCHAR, user_id INTEGER); 705 | 706 | statement ok 707 | INSERT INTO raw_events VALUES 708 | ('s1', 1), ('s1', 1), ('s1', 1), -- Same session/user repeated 709 | ('s2', 2), ('s2', 2), 710 | ('s3', 3), ('s4', 1); -- User 1 appears again in different session 711 | 712 | # Distinct sessions 713 | query I 714 | SELECT datasketch_theta_estimate(datasketch_theta(session_id))::int 715 | FROM raw_events; 716 | ---- 717 | 4 718 | 719 | # Distinct users 720 | query I 721 | SELECT datasketch_theta_estimate(datasketch_theta(user_id))::int 722 | FROM raw_events; 723 | ---- 724 | 3 725 | 726 | # ------------------------------------------------------------------- 727 | # 22. Extremely Large Cardinality 728 | # ------------------------------------------------------------------- 729 | 730 | statement ok 731 | CREATE TABLE million_items AS SELECT * FROM range(0, 1000000) t(i); 732 | 733 | # Should estimate close to 1M (within ~1-2% with default K) 734 | query I 735 | SELECT datasketch_theta_estimate(datasketch_theta(i))::int 736 | BETWEEN 980000 AND 1020000 737 | FROM million_items; 738 | ---- 739 | true 740 | 741 | # Should be in estimation mode 742 | query I 743 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 744 | FROM million_items; 745 | ---- 746 | true 747 | 748 | # ------------------------------------------------------------------- 749 | # 23. 
Describe Function Detail 750 | # ------------------------------------------------------------------- 751 | 752 | # Verify describe contains key information 753 | query I 754 | SELECT datasketch_theta_describe(sketch) LIKE '%Empty%' OR 755 | datasketch_theta_describe(sketch) LIKE '%estimate%' 756 | FROM meta_sketch; 757 | ---- 758 | true 759 | 760 | # Describe empty sketch 761 | # query I 762 | # SELECT datasketch_theta_describe(datasketch_theta(id)) LIKE '%Empty%' 763 | # FROM empty_table; 764 | # ---- 765 | # true 766 | 767 | # ------------------------------------------------------------------- 768 | # 24. CTE and Subquery Integration 769 | # ------------------------------------------------------------------- 770 | 771 | 772 | query I 773 | WITH daily_users AS ( 774 | SELECT 1 as day, unnest([1, 2, 3, 3, 4]) as user_id 775 | UNION ALL 776 | SELECT 2 as day, unnest([3, 4, 5, 6]) as user_id 777 | UNION ALL 778 | SELECT 3 as day, unnest([1, 5, 7, 8, 9]) as user_id 779 | ), 780 | daily_sketches AS ( 781 | SELECT day, datasketch_theta(user_id) as sketch 782 | FROM daily_users 783 | GROUP BY day 784 | ) 785 | SELECT datasketch_theta_estimate( 786 | datasketch_theta(sketch) 787 | )::int as total_unique_users 788 | FROM daily_sketches; 789 | ---- 790 | 9 791 | 792 | # ------------------------------------------------------------------- 793 | # 25. Case Sensitivity (Strings) 794 | # ------------------------------------------------------------------- 795 | 796 | statement ok 797 | CREATE TABLE case_test(s VARCHAR); 798 | 799 | statement ok 800 | INSERT INTO case_test VALUES ('Apple'), ('apple'), ('APPLE'); 801 | 802 | # Should count as 3 distinct (case-sensitive) 803 | query I 804 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM case_test; 805 | ---- 806 | 3 807 | 808 | # ------------------------------------------------------------------- 809 | # 26. 
Window Functions (if supported) 810 | # ------------------------------------------------------------------- 811 | 812 | statement ok 813 | CREATE TABLE time_series(ts INTEGER, user_id INTEGER); 814 | 815 | statement ok 816 | INSERT INTO time_series VALUES 817 | (1, 10), (2, 11), (3, 12), (4, 10), (5, 13), 818 | (6, 14), (7, 15), (8, 10), (9, 16), (10, 17); 819 | 820 | # Running distinct count (if window aggregate is supported) 821 | # This may not work depending on implementation - include if supported 822 | query II 823 | SELECT ts, datasketch_theta_estimate( 824 | datasketch_theta(user_id) OVER (ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) 825 | )::int 826 | FROM time_series 827 | ORDER BY ts; 828 | ---- 829 | 1 1 830 | 2 2 831 | 3 3 832 | 4 3 833 | 5 4 834 | 6 5 835 | 7 6 836 | 8 6 837 | 9 7 838 | 10 8 839 | -------------------------------------------------------------------------------- /test/sql/datasketch_frequent.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_frequent.test 2 | # description: Test DataSketches Frequent Items Sketch 3 | # group: [datasketches] 4 | 5 | # Ensure the extension is loaded 6 | require datasketches 7 | 8 | # ------------------------------------------------------------------- 9 | # 1. Basic Estimates (Exact Mode) 10 | # ------------------------------------------------------------------- 11 | # For small datasets that fit entirely in the map, counts should be exact. 
12 | 13 | statement ok 14 | CREATE TABLE fruits(name VARCHAR); 15 | 16 | statement ok 17 | INSERT INTO fruits VALUES 18 | ('apple'), ('apple'), ('apple'), 19 | ('banana'), ('banana'), 20 | ('cherry'); 21 | 22 | # Build the sketch 23 | statement ok 24 | CREATE TABLE fruit_sketch AS SELECT datasketch_frequent_items(name) as sketch FROM fruits; 25 | 26 | # Check specific estimates 27 | query I 28 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM fruit_sketch; 29 | ---- 30 | 3 31 | 32 | query I 33 | SELECT datasketch_frequent_items_estimate(sketch, 'banana') FROM fruit_sketch; 34 | ---- 35 | 2 36 | 37 | query I 38 | SELECT datasketch_frequent_items_estimate(sketch, 'cherry') FROM fruit_sketch; 39 | ---- 40 | 1 41 | 42 | query I 43 | SELECT datasketch_frequent_items_estimate(sketch, 'dragonfruit') FROM fruit_sketch; 44 | ---- 45 | 0 46 | 47 | # ------------------------------------------------------------------- 48 | # 2. Get Frequent Items (Complex Return Type) 49 | # ------------------------------------------------------------------- 50 | # Returns: LIST(STRUCT(item VARCHAR, estimate BIGINT, lower_bound BIGINT, upper_bound BIGINT)) 51 | 52 | # We unnest the list to verify the contents easily 53 | query ITII 54 | SELECT 55 | f.item, 56 | f.estimate, 57 | f.lower_bound, 58 | f.upper_bound 59 | FROM fruit_sketch, 60 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 61 | ORDER BY f.estimate DESC; 62 | ---- 63 | apple 3 3 3 64 | banana 2 2 2 65 | cherry 1 1 1 66 | 67 | # ------------------------------------------------------------------- 68 | # 3. 
Custom K Parameter 69 | # ------------------------------------------------------------------- 70 | # Verify we can pass the lg_max_k parameter (log2 of map size) 71 | 72 | statement ok 73 | DROP TABLE fruit_sketch; 74 | 75 | # Create with lg_k = 4 (small map) 76 | statement ok 77 | CREATE TABLE fruit_sketch AS SELECT datasketch_frequent_items(4, name) as sketch FROM fruits; 78 | 79 | query I 80 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM fruit_sketch; 81 | ---- 82 | 3 83 | 84 | # ------------------------------------------------------------------- 85 | # 4. Merging Sketches 86 | # ------------------------------------------------------------------- 87 | 88 | statement ok 89 | CREATE TABLE logs_part1(ip VARCHAR); 90 | 91 | statement ok 92 | CREATE TABLE logs_part2(ip VARCHAR); 93 | 94 | statement ok 95 | INSERT INTO logs_part1 VALUES ('192.168.1.1'), ('192.168.1.1'), ('10.0.0.1'); 96 | 97 | statement ok 98 | INSERT INTO logs_part2 VALUES ('192.168.1.1'), ('10.0.0.5'); 99 | 100 | # Create a table of partial sketches 101 | statement ok 102 | CREATE TABLE partial_sketches(grp INT, sketch sketch_frequent_items); 103 | 104 | statement ok 105 | INSERT INTO partial_sketches VALUES 106 | (1, (SELECT datasketch_frequent_items(ip) FROM logs_part1)), 107 | (2, (SELECT datasketch_frequent_items(ip) FROM logs_part2)); 108 | 109 | # Merge them using the aggregate function 110 | # Total '192.168.1.1' count should be 2 + 1 = 3 111 | query I 112 | SELECT datasketch_frequent_items_estimate( 113 | datasketch_frequent_items(sketch), 114 | '192.168.1.1' 115 | ) 116 | FROM partial_sketches; 117 | ---- 118 | 3 119 | 120 | # ------------------------------------------------------------------- 121 | # 5. Heavy Hitters (Approximate Mode) 122 | # ------------------------------------------------------------------- 123 | # We generate a dataset where 'heavy_hitter' appears 100 times, 124 | # and 2000 random distinct items appear once. 
125 | # With a small K, the sketch should drop the singletons but keep the heavy hitter. 126 | 127 | statement ok 128 | CREATE TABLE stream(item VARCHAR); 129 | 130 | # Insert heavy hitter 100 times 131 | statement ok 132 | INSERT INTO stream SELECT 'heavy_hitter' FROM range(0, 100); 133 | 134 | # Insert 2000 noise items 135 | statement ok 136 | INSERT INTO stream SELECT 'noise_' || i::VARCHAR FROM range(0, 2000) t(i); 137 | 138 | # Use lg_k = 6 (Map size ~64). 139 | # This is too small to hold 2100 items, so it will purge the noise. 140 | statement ok 141 | CREATE TABLE stream_sketch AS SELECT datasketch_frequent_items(6, item) as sketch FROM stream; 142 | 143 | # 1. Check Heavy Hitter is found 144 | query I 145 | SELECT datasketch_frequent_items_estimate(sketch, 'heavy_hitter') >= 100 FROM stream_sketch; 146 | ---- 147 | true 148 | 149 | # 2. Check Noise is likely dropped (estimate 0 or very close to 0) 150 | # Note: exact behavior depends on the purge algorithm, but for lg_k=6, noise should be evicted. 151 | query I 152 | SELECT datasketch_frequent_items_estimate(sketch, 'noise_1') < 5 FROM stream_sketch; 153 | ---- 154 | true 155 | 156 | # 3. Get Frequent List - Should definitely contain heavy_hitter 157 | query T 158 | SELECT f.item 159 | FROM stream_sketch, 160 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 161 | WHERE f.item = 'heavy_hitter'; 162 | ---- 163 | heavy_hitter 164 | 165 | # ------------------------------------------------------------------- 166 | # 6. 
Error Type Enum Check 167 | # ------------------------------------------------------------------- 168 | 169 | # NO_FALSE_NEGATIVES usually returns more items (potentially including noise) 170 | # NO_FALSE_POSITIVES usually returns fewer items (stricter) 171 | 172 | query I 173 | SELECT ( 174 | (SELECT count(*) FROM stream_sketch, UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_NEGATIVES')) t(f)) 175 | >= 176 | (SELECT count(*) FROM stream_sketch, UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) t(f)) 177 | ); 178 | ---- 179 | true 180 | 181 | # ------------------------------------------------------------------- 182 | # 7. Edge Cases - Empty Tables and NULL Values 183 | # ------------------------------------------------------------------- 184 | 185 | statement ok 186 | CREATE TABLE empty_table(item VARCHAR); 187 | 188 | statement ok 189 | CREATE TABLE empty_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM empty_table; 190 | 191 | # Empty sketch should return 0 for any item 192 | query I 193 | SELECT datasketch_frequent_items_estimate(sketch, 'anything') FROM empty_sketch; 194 | ---- 195 | 0 196 | 197 | # Test NULL handling - NULLs should be skipped 198 | statement ok 199 | CREATE TABLE with_nulls(item VARCHAR); 200 | 201 | statement ok 202 | INSERT INTO with_nulls VALUES ('apple'), (NULL), ('apple'), (NULL), ('banana'); 203 | 204 | statement ok 205 | CREATE TABLE nulls_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM with_nulls; 206 | 207 | # Should only count non-NULL values 208 | query I 209 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM nulls_sketch; 210 | ---- 211 | 2 212 | 213 | query I 214 | SELECT datasketch_frequent_items_estimate(sketch, 'banana') FROM nulls_sketch; 215 | ---- 216 | 1 217 | 218 | # ------------------------------------------------------------------- 219 | # 8. 
Metadata Functions 220 | # ------------------------------------------------------------------- 221 | 222 | # Test epsilon (error bound) - should be related to map size 223 | query I 224 | SELECT datasketch_frequent_items_epsilon(sketch) > 0 FROM fruit_sketch; 225 | ---- 226 | true 227 | 228 | # Test total weight - should match total items 229 | query I 230 | SELECT datasketch_frequent_items_total_weight(sketch) FROM fruit_sketch; 231 | ---- 232 | 6 233 | 234 | # Test is_empty on non-empty sketch 235 | query I 236 | SELECT datasketch_frequent_items_is_empty(sketch) FROM fruit_sketch; 237 | ---- 238 | false 239 | 240 | # Test is_empty on empty sketch 241 | query I 242 | SELECT datasketch_frequent_items_is_empty(sketch) FROM empty_sketch; 243 | ---- 244 | true 245 | 246 | # Test num_active_items 247 | query I 248 | SELECT datasketch_frequent_items_num_active(sketch) FROM fruit_sketch; 249 | ---- 250 | 3 251 | 252 | # ------------------------------------------------------------------- 253 | # 9. 
Upper and Lower Bounds 254 | # ------------------------------------------------------------------- 255 | 256 | # In exact mode, bounds should equal the estimate 257 | query III 258 | SELECT 259 | datasketch_frequent_items_estimate(sketch, 'apple'), 260 | datasketch_frequent_items_lower_bound(sketch, 'apple'), 261 | datasketch_frequent_items_upper_bound(sketch, 'apple') 262 | FROM fruit_sketch; 263 | ---- 264 | 3 3 3 265 | 266 | # In approximate mode with heavy_hitter 267 | query I 268 | SELECT 269 | datasketch_frequent_items_upper_bound(sketch, 'heavy_hitter') >= 270 | datasketch_frequent_items_lower_bound(sketch, 'heavy_hitter') 271 | FROM stream_sketch; 272 | ---- 273 | true 274 | 275 | # Bounds should satisfy: lower <= estimate <= upper 276 | query I 277 | SELECT 278 | datasketch_frequent_items_lower_bound(sketch, 'heavy_hitter') <= 279 | datasketch_frequent_items_estimate(sketch, 'heavy_hitter') AND 280 | datasketch_frequent_items_estimate(sketch, 'heavy_hitter') <= 281 | datasketch_frequent_items_upper_bound(sketch, 'heavy_hitter') 282 | FROM stream_sketch; 283 | ---- 284 | true 285 | 286 | # ------------------------------------------------------------------- 287 | # 11. 
All Items Same Frequency 288 | # ------------------------------------------------------------------- 289 | 290 | statement ok 291 | CREATE TABLE uniform(item VARCHAR); 292 | 293 | statement ok 294 | INSERT INTO uniform VALUES ('a'), ('b'), ('c'), ('d'), ('e'); 295 | 296 | statement ok 297 | CREATE TABLE uniform_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM uniform; 298 | 299 | # All should have estimate = 1 300 | query I 301 | SELECT datasketch_frequent_items_estimate(sketch, 'a') = 302 | datasketch_frequent_items_estimate(sketch, 'b') AND 303 | datasketch_frequent_items_estimate(sketch, 'b') = 304 | datasketch_frequent_items_estimate(sketch, 'c') 305 | FROM uniform_sketch; 306 | ---- 307 | true 308 | 309 | # ------------------------------------------------------------------- 310 | # 12. Single Item Dataset 311 | # ------------------------------------------------------------------- 312 | 313 | statement ok 314 | CREATE TABLE singleton(item VARCHAR); 315 | 316 | statement ok 317 | INSERT INTO singleton VALUES ('only_one'); 318 | 319 | statement ok 320 | CREATE TABLE singleton_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM singleton; 321 | 322 | query I 323 | SELECT datasketch_frequent_items_estimate(sketch, 'only_one') FROM singleton_sketch; 324 | ---- 325 | 1 326 | 327 | query I 328 | SELECT datasketch_frequent_items_total_weight(sketch) FROM singleton_sketch; 329 | ---- 330 | 1 331 | 332 | query I 333 | SELECT datasketch_frequent_items_num_active(sketch) FROM singleton_sketch; 334 | ---- 335 | 1 336 | 337 | # ------------------------------------------------------------------- 338 | # 13. 
GROUP BY with Multiple Sketches 339 | # ------------------------------------------------------------------- 340 | 341 | statement ok 342 | CREATE TABLE events(category VARCHAR, event_type VARCHAR); 343 | 344 | statement ok 345 | INSERT INTO events VALUES 346 | ('web', 'click'), ('web', 'click'), ('web', 'view'), 347 | ('mobile', 'click'), ('mobile', 'swipe'), ('mobile', 'swipe'), ('mobile', 'swipe'); 348 | 349 | # Create sketches per category 350 | statement ok 351 | CREATE TABLE category_sketches AS 352 | SELECT 353 | category, 354 | datasketch_frequent_items(event_type) as sketch 355 | FROM events 356 | GROUP BY category; 357 | 358 | # Verify web category 359 | query I 360 | SELECT datasketch_frequent_items_estimate(sketch, 'click') 361 | FROM category_sketches 362 | WHERE category = 'web'; 363 | ---- 364 | 2 365 | 366 | # Verify mobile category 367 | query I 368 | SELECT datasketch_frequent_items_estimate(sketch, 'swipe') 369 | FROM category_sketches 370 | WHERE category = 'mobile'; 371 | ---- 372 | 3 373 | 374 | # ------------------------------------------------------------------- 375 | # 14. Serialization Persistence 376 | # ------------------------------------------------------------------- 377 | 378 | # Create a sketch, insert it, retrieve it, and verify it works 379 | statement ok 380 | CREATE TABLE sketch_storage(id INTEGER, sketch_data sketch_frequent_items); 381 | 382 | statement ok 383 | INSERT INTO sketch_storage 384 | SELECT 1, datasketch_frequent_items(name) FROM fruits; 385 | 386 | # Query from stored sketch 387 | query I 388 | SELECT datasketch_frequent_items_estimate(sketch_data, 'apple') 389 | FROM sketch_storage 390 | WHERE id = 1; 391 | ---- 392 | 3 393 | 394 | # ------------------------------------------------------------------- 395 | # 15. 
Very Large K Parameter 396 | # ------------------------------------------------------------------- 397 | 398 | statement ok 399 | CREATE TABLE large_k_data(item VARCHAR); 400 | 401 | statement ok 402 | INSERT INTO large_k_data VALUES ('a'), ('b'), ('c'); 403 | 404 | # Use lg_k = 20 (very large map: 2^20 = ~1M entries) 405 | statement ok 406 | CREATE TABLE large_k_sketch AS SELECT datasketch_frequent_items(20, item) as sketch FROM large_k_data; 407 | 408 | query I 409 | SELECT datasketch_frequent_items_estimate(sketch, 'a') FROM large_k_sketch; 410 | ---- 411 | 1 412 | 413 | # Epsilon should be very small for large K 414 | query I 415 | SELECT datasketch_frequent_items_epsilon(sketch) < 0.001 FROM large_k_sketch; 416 | ---- 417 | true 418 | 419 | # ------------------------------------------------------------------- 420 | # 16. Duplicate Item Streams 421 | # ------------------------------------------------------------------- 422 | 423 | statement ok 424 | CREATE TABLE duplicates(item VARCHAR); 425 | 426 | statement ok 427 | INSERT INTO duplicates SELECT 'repeated' FROM range(0, 1000); 428 | 429 | statement ok 430 | CREATE TABLE dup_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM duplicates; 431 | 432 | query I 433 | SELECT datasketch_frequent_items_estimate(sketch, 'repeated') FROM dup_sketch; 434 | ---- 435 | 1000 436 | 437 | query I 438 | SELECT datasketch_frequent_items_total_weight(sketch) FROM dup_sketch; 439 | ---- 440 | 1000 441 | 442 | # Only 1 unique item 443 | query I 444 | SELECT datasketch_frequent_items_num_active(sketch) FROM dup_sketch; 445 | ---- 446 | 1 447 | 448 | # ------------------------------------------------------------------- 449 | # 17. 
Zipfian Distribution (Realistic Workload) 450 | # ------------------------------------------------------------------- 451 | # Create a Zipf-like distribution where a few items are very frequent 452 | 453 | statement ok 454 | CREATE TABLE zipf_data(item VARCHAR); 455 | 456 | statement ok 457 | INSERT INTO zipf_data 458 | SELECT 'rank_' || (i % 10)::VARCHAR 459 | FROM range(0, 1000) t(i) 460 | WHERE i % 2 = 0 -- 'rank_0' appears most 461 | UNION ALL 462 | SELECT 'rank_' || ((i % 100) / 10)::VARCHAR 463 | FROM range(0, 1000) t(i) 464 | WHERE i % 3 = 0; -- Add more skew 465 | 466 | statement ok 467 | CREATE TABLE zipf_sketch AS SELECT datasketch_frequent_items(8, item) as sketch FROM zipf_data; 468 | 469 | # Verify top items are captured 470 | query I 471 | SELECT count(*) > 0 472 | FROM zipf_sketch, 473 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 474 | WHERE f.item = 'rank_0'; 475 | ---- 476 | true 477 | 478 | # ------------------------------------------------------------------- 479 | # 18. 
Error Type Difference Verification 480 | # ------------------------------------------------------------------- 481 | 482 | statement ok 483 | CREATE TABLE error_test(item VARCHAR); 484 | 485 | statement ok 486 | INSERT INTO error_test 487 | SELECT 'freq_' || (i % 5)::VARCHAR FROM range(0, 100) t(i) 488 | UNION ALL 489 | SELECT 'rare_' || i::VARCHAR FROM range(0, 50) t(i); 490 | 491 | statement ok 492 | CREATE TABLE error_sketch AS SELECT datasketch_frequent_items(6, item) as sketch FROM error_test; 493 | 494 | # NO_FALSE_NEGATIVES should include items that might not be heavy hitters 495 | # NO_FALSE_POSITIVES should only include confirmed heavy hitters 496 | 497 | # Get counts for each error type 498 | query I 499 | SELECT count(*) > 0 500 | FROM error_sketch, 501 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_NEGATIVES')) as t(f); 502 | ---- 503 | true 504 | 505 | query I 506 | SELECT count(*) > 0 507 | FROM error_sketch, 508 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f); 509 | ---- 510 | true 511 | 512 | # ------------------------------------------------------------------- 513 | # 19. Non-Existent Item Queries 514 | # ------------------------------------------------------------------- 515 | 516 | # Querying items that were never added should return 0 517 | query I 518 | SELECT datasketch_frequent_items_estimate(sketch, 'never_existed') FROM fruit_sketch; 519 | ---- 520 | 0 521 | 522 | query II 523 | SELECT 524 | datasketch_frequent_items_lower_bound(sketch, 'never_existed'), 525 | datasketch_frequent_items_upper_bound(sketch, 'never_existed') 526 | FROM fruit_sketch; 527 | ---- 528 | 0 0 529 | 530 | 531 | # ------------------------------------------------------------------- 532 | # 10. 
Integer Type Support 533 | # ------------------------------------------------------------------- 534 | 535 | statement ok 536 | CREATE TABLE user_ids(id INTEGER); 537 | 538 | statement ok 539 | INSERT INTO user_ids VALUES (101), (101), (101), (202), (202), (303); 540 | 541 | statement ok 542 | CREATE TABLE int_sketch AS SELECT datasketch_frequent_items(id) as sketch FROM user_ids; 543 | 544 | query I 545 | SELECT datasketch_frequent_items_estimate(sketch, 101) FROM int_sketch; 546 | ---- 547 | 3 548 | 549 | query I 550 | SELECT datasketch_frequent_items_estimate(sketch, 202) FROM int_sketch; 551 | ---- 552 | 2 553 | 554 | query I 555 | SELECT datasketch_frequent_items_estimate(sketch, 303) FROM int_sketch; 556 | ---- 557 | 1 558 | 559 | # Get frequent integers 560 | query ITII 561 | SELECT 562 | f.item, 563 | f.estimate, 564 | f.lower_bound, 565 | f.upper_bound 566 | FROM int_sketch, 567 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 568 | ORDER BY f.estimate DESC; 569 | ---- 570 | 101 3 3 3 571 | 202 2 2 2 572 | 303 1 1 1 573 | 574 | # ------------------------------------------------------------------- 575 | # 20. BIGINT Support 576 | # ------------------------------------------------------------------- 577 | 578 | statement ok 579 | CREATE TABLE big_numbers(val BIGINT); 580 | 581 | statement ok 582 | INSERT INTO big_numbers VALUES 583 | (9223372036854775807), 584 | (9223372036854775807), 585 | (-9223372036854775808); 586 | 587 | statement ok 588 | CREATE TABLE bigint_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM big_numbers; 589 | 590 | query I 591 | SELECT datasketch_frequent_items_estimate(sketch, 9223372036854775807) FROM bigint_sketch; 592 | ---- 593 | 2 594 | 595 | query I 596 | SELECT datasketch_frequent_items_estimate(sketch, -9223372036854775808) FROM bigint_sketch; 597 | ---- 598 | 1 599 | 600 | # ------------------------------------------------------------------- 601 | # 21. 
TINYINT Support 602 | # ------------------------------------------------------------------- 603 | 604 | statement ok 605 | CREATE TABLE tiny_vals(val TINYINT); 606 | 607 | statement ok 608 | INSERT INTO tiny_vals VALUES (1), (1), (1), (2), (2), (-128), (127); 609 | 610 | statement ok 611 | CREATE TABLE tiny_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM tiny_vals; 612 | 613 | query I 614 | SELECT datasketch_frequent_items_estimate(sketch, 1::TINYINT) FROM tiny_sketch; 615 | ---- 616 | 3 617 | 618 | query I 619 | SELECT datasketch_frequent_items_estimate(sketch, (-128)::TINYINT) FROM tiny_sketch; 620 | ---- 621 | 1 622 | 623 | query I 624 | SELECT datasketch_frequent_items_estimate(sketch, 127::TINYINT) FROM tiny_sketch; 625 | ---- 626 | 1 627 | 628 | # ------------------------------------------------------------------- 629 | # 22. SMALLINT Support 630 | # ------------------------------------------------------------------- 631 | 632 | statement ok 633 | CREATE TABLE small_vals(val SMALLINT); 634 | 635 | statement ok 636 | INSERT INTO small_vals VALUES (1000), (1000), (-32768), (32767); 637 | 638 | statement ok 639 | CREATE TABLE small_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM small_vals; 640 | 641 | query I 642 | SELECT datasketch_frequent_items_estimate(sketch, 1000::SMALLINT) FROM small_sketch; 643 | ---- 644 | 2 645 | 646 | query I 647 | SELECT datasketch_frequent_items_estimate(sketch, (-32768)::SMALLINT) FROM small_sketch; 648 | ---- 649 | 1 650 | 651 | # ------------------------------------------------------------------- 652 | # 23. 
Unsigned Integer Types (UTINYINT, USMALLINT, UINTEGER, UBIGINT) 653 | # ------------------------------------------------------------------- 654 | 655 | statement ok 656 | CREATE TABLE unsigned_vals( 657 | ut UTINYINT, 658 | us USMALLINT, 659 | ui UINTEGER, 660 | ub UBIGINT 661 | ); 662 | 663 | statement ok 664 | INSERT INTO unsigned_vals VALUES 665 | (255, 65535, 4294967295, 18446744073709551615), 666 | (255, 65535, 4294967295, 18446744073709551615), 667 | (0, 0, 0, 0); 668 | 669 | # UTINYINT 670 | statement ok 671 | CREATE TABLE ut_sketch AS SELECT datasketch_frequent_items(ut) as sketch FROM unsigned_vals; 672 | 673 | query I 674 | SELECT datasketch_frequent_items_estimate(sketch, 255::UTINYINT) FROM ut_sketch; 675 | ---- 676 | 2 677 | 678 | query I 679 | SELECT datasketch_frequent_items_estimate(sketch, 0::UTINYINT) FROM ut_sketch; 680 | ---- 681 | 1 682 | 683 | # USMALLINT 684 | statement ok 685 | CREATE TABLE us_sketch AS SELECT datasketch_frequent_items(us) as sketch FROM unsigned_vals; 686 | 687 | query I 688 | SELECT datasketch_frequent_items_estimate(sketch, 65535::USMALLINT) FROM us_sketch; 689 | ---- 690 | 2 691 | 692 | # UINTEGER 693 | statement ok 694 | CREATE TABLE ui_sketch AS SELECT datasketch_frequent_items(ui) as sketch FROM unsigned_vals; 695 | 696 | query I 697 | SELECT datasketch_frequent_items_estimate(sketch, 4294967295::UINTEGER) FROM ui_sketch; 698 | ---- 699 | 2 700 | 701 | # UBIGINT 702 | statement ok 703 | CREATE TABLE ub_sketch AS SELECT datasketch_frequent_items(ub) as sketch FROM unsigned_vals; 704 | 705 | query I 706 | SELECT datasketch_frequent_items_estimate(sketch, 18446744073709551615::UBIGINT) FROM ub_sketch; 707 | ---- 708 | 2 709 | 710 | # ------------------------------------------------------------------- 711 | # 24. 
FLOAT Support 712 | # ------------------------------------------------------------------- 713 | 714 | statement ok 715 | CREATE TABLE float_vals(val FLOAT); 716 | 717 | statement ok 718 | INSERT INTO float_vals VALUES (3.14), (3.14), (3.14), (2.71), (-0.5); 719 | 720 | statement ok 721 | CREATE TABLE float_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM float_vals; 722 | 723 | query I 724 | SELECT datasketch_frequent_items_estimate(sketch, 3.14::FLOAT) FROM float_sketch; 725 | ---- 726 | 3 727 | 728 | query I 729 | SELECT datasketch_frequent_items_estimate(sketch, 2.71::FLOAT) FROM float_sketch; 730 | ---- 731 | 1 732 | 733 | query I 734 | SELECT datasketch_frequent_items_estimate(sketch, (-0.5)::FLOAT) FROM float_sketch; 735 | ---- 736 | 1 737 | 738 | # ------------------------------------------------------------------- 739 | # 25. DOUBLE Support 740 | # ------------------------------------------------------------------- 741 | 742 | statement ok 743 | CREATE TABLE double_vals(val DOUBLE); 744 | 745 | statement ok 746 | INSERT INTO double_vals VALUES 747 | (3.141592653589793), 748 | (3.141592653589793), 749 | (2.718281828459045), 750 | (1.7976931348623157e+308); 751 | 752 | statement ok 753 | CREATE TABLE double_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM double_vals; 754 | 755 | query I 756 | SELECT datasketch_frequent_items_estimate(sketch, 3.141592653589793::DOUBLE) FROM double_sketch; 757 | ---- 758 | 2 759 | 760 | query I 761 | SELECT datasketch_frequent_items_estimate(sketch, 1.7976931348623157e+308::DOUBLE) FROM double_sketch; 762 | ---- 763 | 1 764 | 765 | # ------------------------------------------------------------------- 766 | # 26. 
Mixed Types in GROUP BY 767 | # ------------------------------------------------------------------- 768 | 769 | statement ok 770 | CREATE TABLE mixed_events(category VARCHAR, count_val INTEGER, amount DOUBLE); 771 | 772 | statement ok 773 | INSERT INTO mixed_events VALUES 774 | ('A', 100, 9.99), ('A', 100, 9.99), ('A', 200, 19.99), 775 | ('B', 100, 9.99), ('B', 300, 29.99), ('B', 300, 29.99); 776 | 777 | # Integer sketch per category 778 | statement ok 779 | CREATE TABLE mixed_int_sketch AS 780 | SELECT category, datasketch_frequent_items(count_val) as sketch 781 | FROM mixed_events GROUP BY category; 782 | 783 | query I 784 | SELECT datasketch_frequent_items_estimate(sketch, 100) 785 | FROM mixed_int_sketch WHERE category = 'A'; 786 | ---- 787 | 2 788 | 789 | query I 790 | SELECT datasketch_frequent_items_estimate(sketch, 300) 791 | FROM mixed_int_sketch WHERE category = 'B'; 792 | ---- 793 | 2 794 | 795 | # Double sketch per category 796 | statement ok 797 | CREATE TABLE mixed_dbl_sketch AS 798 | SELECT category, datasketch_frequent_items(amount) as sketch 799 | FROM mixed_events GROUP BY category; 800 | 801 | query I 802 | SELECT datasketch_frequent_items_estimate(sketch, 9.99::DOUBLE) 803 | FROM mixed_dbl_sketch WHERE category = 'A'; 804 | ---- 805 | 2 806 | 807 | # ------------------------------------------------------------------- 808 | # 27. 
Bounds Work for All Types 809 | # ------------------------------------------------------------------- 810 | 811 | query III 812 | SELECT 813 | datasketch_frequent_items_estimate(sketch, 255::UTINYINT), 814 | datasketch_frequent_items_lower_bound(sketch, 255::UTINYINT), 815 | datasketch_frequent_items_upper_bound(sketch, 255::UTINYINT) 816 | FROM ut_sketch; 817 | ---- 818 | 2 2 2 819 | 820 | query III 821 | SELECT 822 | datasketch_frequent_items_estimate(sketch, 3.14::FLOAT), 823 | datasketch_frequent_items_lower_bound(sketch, 3.14::FLOAT), 824 | datasketch_frequent_items_upper_bound(sketch, 3.14::FLOAT) 825 | FROM float_sketch; 826 | ---- 827 | 3 3 3 828 | 829 | -------------------------------------------------------------------------------- /codegen/generator.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, FileSystemLoader 2 | from typing import Any 3 | 4 | # Set up the Jinja2 environment 5 | env = Environment(loader=FileSystemLoader(searchpath="./codegen/"), autoescape=False) 6 | template = env.get_template("generated.cpp.j2") 7 | 8 | 9 | counting_sketch_names = ["CPC", "HLL"] 10 | 11 | logical_type_mapping = { 12 | "LogicalType::BOOLEAN": "bool", 13 | "LogicalType::TINYINT": "int8_t", 14 | "LogicalType::SMALLINT": "int16_t", 15 | "LogicalType::INTEGER": "int32_t", 16 | "LogicalType::BIGINT": "int64_t", 17 | "LogicalType::FLOAT": "float", 18 | "LogicalType::DOUBLE": "double", 19 | "LogicalType::UTINYINT": "uint8_t", 20 | "LogicalType::USMALLINT": "uint16_t", 21 | "LogicalType::UINTEGER": "uint32_t", 22 | "LogicalType::UBIGINT": "uint64_t", 23 | "LogicalType::VARCHAR": "string_t", 24 | } 25 | 26 | cpp_type_mapping = {value: key for key, value in logical_type_mapping.items()} 27 | 28 | 29 | def sketch_type_to_allowed_logical_types(sketch_type): 30 | if sketch_type in counting_sketch_names: 31 | return { 32 | "LogicalType::TINYINT": "int8_t", 33 | "LogicalType::SMALLINT": "int16_t", 34 | 
"LogicalType::INTEGER": "int32_t", 35 | "LogicalType::BIGINT": "int64_t", 36 | "LogicalType::FLOAT": "float", 37 | "LogicalType::DOUBLE": "double", 38 | "LogicalType::UTINYINT": "uint8_t", 39 | "LogicalType::USMALLINT": "uint16_t", 40 | "LogicalType::UINTEGER": "uint32_t", 41 | "LogicalType::UBIGINT": "uint64_t", 42 | "LogicalType::VARCHAR": "string_t", 43 | "LogicalType::BLOB": "string_t", 44 | } 45 | 46 | if sketch_type == "TDigest": 47 | return {"LogicalType::FLOAT": "float", "LogicalType::DOUBLE": "double"} 48 | 49 | return { 50 | "LogicalType::TINYINT": "int8_t", 51 | "LogicalType::SMALLINT": "int16_t", 52 | "LogicalType::INTEGER": "int32_t", 53 | "LogicalType::BIGINT": "int64_t", 54 | "LogicalType::FLOAT": "float", 55 | "LogicalType::DOUBLE": "double", 56 | "LogicalType::UTINYINT": "uint8_t", 57 | "LogicalType::USMALLINT": "uint16_t", 58 | "LogicalType::UINTEGER": "uint32_t", 59 | "LogicalType::UBIGINT": "uint64_t", 60 | } 61 | 62 | 63 | def get_sketch_class_name(sketch_type: str): 64 | if sketch_type == "TDigest": 65 | return "datasketches::tdigest" 66 | return f"datasketches::{sketch_type.lower()}_sketch" 67 | 68 | 69 | def unary_functions_per_sketch_type(sketch_type: str): 70 | if sketch_type not in counting_sketch_names: 71 | deserialize_sketch = f""" 72 | auto sketch = [&]() {{ 73 | try {{ 74 | return {get_sketch_class_name(sketch_type)}::deserialize(sketch_data.GetDataUnsafe(), sketch_data.GetSize()); 75 | }} catch (const std::exception &e) {{ 76 | throw InvalidInputException("Failed to deserialize {sketch_type} sketch: %s", e.what()); 77 | }} 78 | }}();""" 79 | else: 80 | deserialize_sketch = f""" 81 | auto sketch = [&]() {{ 82 | try {{ 83 | return {get_sketch_class_name(sketch_type)}::deserialize(sketch_data.GetDataUnsafe(), sketch_data.GetSize()); 84 | }} catch (const std::exception &e) {{ 85 | throw InvalidInputException("Failed to deserialize {sketch_type} sketch: %s", e.what()); 86 | }} 87 | }}();""" 88 | 89 | if sketch_type in 
counting_sketch_names: 90 | sketch_argument = { 91 | "cpp_type": "string_t", 92 | "duckdb_type": lambda contained_type: "sketch_type", 93 | "name": "sketch", 94 | "process": deserialize_sketch, 95 | } 96 | else: 97 | sketch_argument = { 98 | "cpp_type": "string_t", 99 | "duckdb_type": lambda contained_type: f"sketch_map_types[{contained_type.replace('LogicalType', 'LogicalTypeId')}]", 100 | "name": "sketch", 101 | "process": deserialize_sketch, 102 | } 103 | 104 | cdf_points_argument = { 105 | "cpp_type": "list_entry_t", 106 | "duckdb_type": lambda contained_type: f"LogicalType::LIST({contained_type})", 107 | "name": "split_points", 108 | "pre_executor": """ 109 | UnifiedVectorFormat unified_split_points; 110 | split_points_vector.ToUnifiedFormat(args.size(), unified_split_points); 111 | 112 | // auto split_points_list_entries = UnifiedVectorFormat::GetData(unified_split_points); 113 | // auto split_points_validitiy = FlatVector::Validity(split_points_vector); 114 | 115 | auto &split_points_list_children = ListVector::GetEntry(split_points_vector); 116 | 117 | UnifiedVectorFormat split_points_children_unified; 118 | split_points_list_children.ToUnifiedFormat(args.size(), split_points_children_unified); 119 | 120 | const T *split_points_list_children_data = UnifiedVectorFormat::GetData(split_points_children_unified); 121 | """, 122 | "process": """ 123 | std::vector passing_points(split_points_data.length); 124 | for (idx_t i = 0; i < split_points_data.length; i++) 125 | { 126 | passing_points[i] = split_points_list_children_data[i + split_points_data.offset]; 127 | } 128 | """, 129 | } 130 | pmf_points_argument = cdf_points_argument 131 | 132 | result = [ 133 | { 134 | "method": "return sketch.is_empty();", 135 | "name": "is_empty", 136 | "description": "Return a boolean indicating if the sketch is empty", 137 | "example": f"datasketch_{sketch_type.lower()}_is_empty(sketch)", 138 | "arguments": [ 139 | sketch_argument, 140 | ], 141 | "return_type": 
"LogicalType::BOOLEAN", 142 | }, 143 | ] 144 | 145 | if sketch_type not in counting_sketch_names: 146 | result.extend( 147 | [ 148 | { 149 | "method": "return sketch.get_k();", 150 | "arguments": [sketch_argument], 151 | "name": "k", 152 | "description": "Return the value of K for this sketch", 153 | "example": f"datasketch_{sketch_type.lower()}_k(sketch)", 154 | "return_type": "LogicalType::USMALLINT", 155 | }, 156 | { 157 | "name": "cdf", 158 | "description": "Return the Cumulative Distribution Function (CDF) of the sketch for a series of points", 159 | "example": f"datasketch_{sketch_type.lower()}_cdf(sketch, points, inclusive)" 160 | if sketch_type != "TDigest" 161 | else f"datasketch_{sketch_type.lower()}_cdf(sketch, points)", 162 | "method": ( 163 | "auto cdf_result = sketch.get_CDF(passing_points.data(), split_points_data.length, inclusive_data);" 164 | if sketch_type != "TDigest" 165 | else "auto cdf_result = sketch.get_CDF(passing_points.data(), split_points_data.length);" 166 | ) 167 | + """ 168 | auto current_size = ListVector::GetListSize(result); 169 | auto new_size = current_size + cdf_result.size(); 170 | if (ListVector::GetListCapacity(result) < new_size) 171 | { 172 | ListVector::Reserve(result, new_size); 173 | } 174 | 175 | auto &child_entry = ListVector::GetEntry(result); 176 | auto child_vals = FlatVector::GetData(child_entry); 177 | //auto &child_validity = FlatVector::Validity(child_entry); 178 | for (idx_t i = 0; i < cdf_result.size(); i++) 179 | { 180 | child_vals[current_size + i] = cdf_result[i]; 181 | } 182 | ListVector::SetListSize(result, new_size); 183 | return list_entry_t{current_size, cdf_result.size()}; 184 | """, 185 | "arguments": [ 186 | sketch_argument, 187 | cdf_points_argument, 188 | { 189 | "cpp_type": "bool", 190 | "name": "inclusive", 191 | }, 192 | ] 193 | if sketch_type != "TDigest" 194 | else [sketch_argument, cdf_points_argument], 195 | "return_type_dynamic_list": True, 196 | }, 197 | { 198 | "name": "pmf", 199 | 
"description": "Return the Probability Mass Function (PMF) of the sketch for a series of points", 200 | "example": f"datasketch_{sketch_type.lower()}_pmf(sketch, points, inclusive)" 201 | if sketch_type != "TDigest" 202 | else f"datasketch_{sketch_type.lower()}_pmf(sketch, points)", 203 | "method": ( 204 | "auto pmf_result = sketch.get_PMF(passing_points.data(), split_points_data.length, inclusive_data);" 205 | if sketch_type != "TDigest" 206 | else "auto pmf_result = sketch.get_PMF(passing_points.data(), split_points_data.length);" 207 | ) 208 | + """ 209 | auto current_size = ListVector::GetListSize(result); 210 | auto new_size = current_size + pmf_result.size(); 211 | if (ListVector::GetListCapacity(result) < new_size) 212 | { 213 | ListVector::Reserve(result, new_size); 214 | } 215 | 216 | auto &child_entry = ListVector::GetEntry(result); 217 | auto child_vals = FlatVector::GetData(child_entry); 218 | //auto &child_validity = FlatVector::Validity(child_entry); 219 | for (idx_t i = 0; i < pmf_result.size(); i++) 220 | { 221 | child_vals[current_size + i] = pmf_result[i]; 222 | } 223 | ListVector::SetListSize(result, new_size); 224 | return list_entry_t{current_size, pmf_result.size()}; 225 | """, 226 | "arguments": [ 227 | sketch_argument, 228 | pmf_points_argument, 229 | { 230 | "cpp_type": "bool", 231 | "name": "inclusive", 232 | }, 233 | ] 234 | if sketch_type != "TDigest" 235 | else [sketch_argument, pmf_points_argument], 236 | "return_type_dynamic_list": True, 237 | }, 238 | ] 239 | ) 240 | 241 | if sketch_type == "HLL": 242 | result.extend( 243 | [ 244 | { 245 | "method": "return StringVector::AddString(result, sketch.to_string(summary_data, detail_data, false, false));", 246 | "description": "Return a string representation of the sketch", 247 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_summary, include_detail)", 248 | "arguments": [ 249 | sketch_argument, 250 | { 251 | "cpp_type": "bool", 252 | "name": "summary", 253 | }, 254 
| { 255 | "cpp_type": "bool", 256 | "name": "detail", 257 | }, 258 | ], 259 | "name": "describe", 260 | "return_type": "LogicalType::VARCHAR", 261 | }, 262 | { 263 | "method": "return sketch.get_lg_config_k();", 264 | "description": "Return the value of log base 2 K for this sketch", 265 | "example": f"datasketch_{sketch_type.lower()}_lg_config_k(sketch)", 266 | "arguments": [ 267 | sketch_argument, 268 | ], 269 | "name": "lg_config_k", 270 | "return_type": "LogicalType::UTINYINT", 271 | }, 272 | { 273 | "method": "return sketch.is_compact();", 274 | "description": "Return whether the sketch is in compact form", 275 | "example": f"datasketch_{sketch_type.lower()}_is_compact(sketch)", 276 | "arguments": [ 277 | sketch_argument, 278 | ], 279 | "name": "is_compact", 280 | "return_type": "LogicalType::BOOLEAN", 281 | }, 282 | ] 283 | ) 284 | 285 | if sketch_type == "CPC": 286 | result.append( 287 | { 288 | "method": "return StringVector::AddString(result, sketch.to_string());", 289 | "description": "Return a string representation of the sketch", 290 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch)", 291 | "arguments": [ 292 | sketch_argument, 293 | ], 294 | "name": "describe", 295 | "return_type": "LogicalType::VARCHAR", 296 | }, 297 | ) 298 | 299 | if sketch_type in counting_sketch_names: 300 | result.extend( 301 | [ 302 | { 303 | "method": "return sketch.get_estimate();", 304 | "description": "Return the estimate of the number of distinct items seen by the sketch", 305 | "example": f"datasketch_{sketch_type.lower()}_estimate(sketch)", 306 | "arguments": [ 307 | sketch_argument, 308 | ], 309 | "name": "estimate", 310 | "return_type": "LogicalType::DOUBLE", 311 | }, 312 | { 313 | "description": "Return the lower bound of the number of distinct items seen by the sketch", 314 | "example": f"datasketch_{sketch_type.lower()}_lower_bound(sketch, std_dev)", 315 | "method": "return sketch.get_lower_bound(std_dev_data);", 316 | "arguments": [ 317 | 
sketch_argument, 318 | { 319 | "cpp_type": "uint8_t", 320 | "name": "std_dev", 321 | }, 322 | ], 323 | "name": "lower_bound", 324 | "return_type": "LogicalType::DOUBLE", 325 | }, 326 | { 327 | "description": "Return the upper bound of the number of distinct items seen by the sketch", 328 | "example": f"datasketch_{sketch_type.lower()}_upper_bound(sketch, std_dev)", 329 | "method": "return sketch.get_upper_bound(std_dev_data);", 330 | "arguments": [ 331 | sketch_argument, 332 | { 333 | "cpp_type": "uint8_t", 334 | "name": "std_dev", 335 | }, 336 | ], 337 | "name": "upper_bound", 338 | "return_type": "LogicalType::DOUBLE", 339 | }, 340 | ] 341 | ) 342 | 343 | if sketch_type == "TDigest": 344 | result.extend( 345 | [ 346 | { 347 | "description": "Return a description of this sketch", 348 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_centroids)", 349 | "method": "return StringVector::AddString(result, sketch.to_string(include_centroids_data));", 350 | "arguments": [ 351 | sketch_argument, 352 | { 353 | "cpp_type": "bool", 354 | "name": "include_centroids", 355 | }, 356 | ], 357 | "name": "describe", 358 | "return_type": "LogicalType::VARCHAR", 359 | }, 360 | { 361 | "description": "Return the rank of an item in the sketch", 362 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, item)", 363 | "method": "return sketch.get_rank(item_data);", 364 | "name": "rank", 365 | "arguments": [ 366 | sketch_argument, 367 | { 368 | "cpp_type_dynamic": True, 369 | "name": "item", 370 | }, 371 | ], 372 | "return_type": "LogicalType::DOUBLE", 373 | }, 374 | { 375 | "description": "Return the total weight of this sketch", 376 | "example": f"datasketch_{sketch_type.lower()}_total_weight(sketch)", 377 | "method": "return sketch.get_total_weight();", 378 | "name": "total_weight", 379 | "arguments": [ 380 | sketch_argument, 381 | ], 382 | "return_type": "LogicalType::UBIGINT", 383 | }, 384 | { 385 | "description": "Return the quantile of a rank in the 
sketch", 386 | "example": f"datasketch_{sketch_type.lower()}_quantile(sketch, rank)", 387 | "method": "return sketch.get_quantile(rank_data);", 388 | "name": "quantile", 389 | "arguments": [ 390 | sketch_argument, 391 | { 392 | "cpp_type": "double", 393 | "name": "rank", 394 | }, 395 | ], 396 | "dynamic_return_type": True, 397 | }, 398 | ] 399 | ) 400 | 401 | if sketch_type not in ("TDigest", "REQ", "HLL", "CPC"): 402 | result.extend( 403 | [ 404 | { 405 | "description": "Return the normalized rank error of the sketch", 406 | "example": f"datasketch_{sketch_type.lower()}_normalized_rank_error(sketch, is_pmf)", 407 | "method": "return sketch.get_normalized_rank_error(is_pmf_data);", 408 | "name": "normalized_rank_error", 409 | "arguments": [ 410 | sketch_argument, 411 | { 412 | "cpp_type": "bool", 413 | "name": "is_pmf", 414 | }, 415 | ], 416 | "return_type": "LogicalType::DOUBLE", 417 | }, 418 | ] 419 | ) 420 | 421 | if sketch_type != "TDigest" and sketch_type not in counting_sketch_names: 422 | result.extend( 423 | [ 424 | { 425 | "description": "Return a description of this sketch", 426 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_levels, include_items)", 427 | "method": "return StringVector::AddString(result, sketch.to_string(include_levels_data, include_items_data));", 428 | "arguments": [ 429 | sketch_argument, 430 | { 431 | "cpp_type": "bool", 432 | "name": "include_levels", 433 | }, 434 | { 435 | "cpp_type": "bool", 436 | "name": "include_items", 437 | }, 438 | ], 439 | "name": "describe", 440 | "return_type": "LogicalType::VARCHAR", 441 | }, 442 | { 443 | "description": "Return the rank of an item in the sketch", 444 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, item, inclusive)", 445 | "method": "return sketch.get_rank(item_data, inclusive_data);", 446 | "name": "rank", 447 | "arguments": [ 448 | sketch_argument, 449 | { 450 | "cpp_type_dynamic": True, 451 | "name": "item", 452 | }, 453 | { 454 | "cpp_type": 
"bool", 455 | "name": "inclusive", 456 | }, 457 | ], 458 | "return_type": "LogicalType::DOUBLE", 459 | }, 460 | { 461 | "description": "Return the quantile of a rank in the sketch", 462 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, rank, inclusive)", 463 | "method": "return sketch.get_quantile(rank_data, inclusive_data);", 464 | "name": "quantile", 465 | "arguments": [ 466 | sketch_argument, 467 | { 468 | "cpp_type": "double", 469 | "name": "rank", 470 | }, 471 | { 472 | "cpp_type": "bool", 473 | "name": "inclusive", 474 | }, 475 | ], 476 | "dynamic_return_type": True, 477 | }, 478 | { 479 | "description": "Return the number of items contained in the sketch", 480 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch)", 481 | "method": "return sketch.get_n();", 482 | "name": "n", 483 | "arguments": [ 484 | sketch_argument, 485 | ], 486 | "return_type": "LogicalType::UBIGINT", 487 | }, 488 | { 489 | "description": "Return a boolean indicating if the sketch is in estimation mode", 490 | "example": f"datasketch_{sketch_type.lower()}_is_estimation_mode(sketch)", 491 | "method": "return sketch.is_estimation_mode();", 492 | "name": "is_estimation_mode", 493 | "arguments": [ 494 | sketch_argument, 495 | ], 496 | "return_type": "LogicalType::BOOLEAN", 497 | }, 498 | { 499 | "description": "Return the number of retained items in the sketch", 500 | "example": f"datasketch_{sketch_type.lower()}_num_retained(sketch)", 501 | "method": "return sketch.get_num_retained();", 502 | "name": "num_retained", 503 | "arguments": [ 504 | sketch_argument, 505 | ], 506 | "return_type": "LogicalType::UBIGINT", 507 | }, 508 | { 509 | "description": "Return the minimum item in the sketch", 510 | "example": f"datasketch_{sketch_type.lower()}_min_item(sketch)", 511 | "method": "return sketch.get_min_item();", 512 | "name": "min_item", 513 | "arguments": [ 514 | sketch_argument, 515 | ], 516 | "dynamic_return_type": True, 517 | }, 518 | { 519 | "description": "Return the 
maxium item in the sketch", 520 | "example": f"datasketch_{sketch_type.lower()}_max_item(sketch)", 521 | "method": "return sketch.get_max_item();", 522 | "name": "max_item", 523 | "arguments": [sketch_argument], 524 | "dynamic_return_type": True, 525 | }, 526 | ] 527 | ) 528 | return result 529 | 530 | 531 | def get_executor_name(arguments: list) -> str: 532 | if len(arguments) == 1: 533 | return "UnaryExecutor" 534 | elif len(arguments) == 2: 535 | return "BinaryExecutor" 536 | elif len(arguments) == 3: 537 | return "TernaryExecutor" 538 | else: 539 | raise NotImplementedError(f"Unhandled number of arguments {len(arguments)}") 540 | 541 | 542 | def get_scalar_function_args( 543 | function_info: Any, logical_type: str, cpp_type: str 544 | ) -> str: 545 | input_parameters = [] 546 | for arg in function_info["arguments"]: 547 | if "duckdb_type" in arg: 548 | input_parameters.append(arg["duckdb_type"](logical_type)) 549 | elif "cpp_type_dynamic" in arg: 550 | input_parameters.append(logical_type) 551 | else: 552 | input_parameters.append(cpp_type_mapping[arg["cpp_type"]]) 553 | 554 | joined_input_parameters = ",".join(input_parameters) 555 | 556 | all_args = [f"{{{joined_input_parameters}}}"] 557 | 558 | if function_info.get("dynamic_return_type"): 559 | all_args.append(logical_type) 560 | elif function_info.get("return_type_dynamic_list"): 561 | all_args.append(f"LogicalType::LIST({logical_type})") 562 | else: 563 | all_args.append(function_info["return_type"]) 564 | 565 | return ",".join(all_args) 566 | 567 | 568 | def get_function_block(function_info: Any) -> str: 569 | cpp_types = [] 570 | for value in function_info["arguments"]: 571 | if value.get("cpp_type_dynamic"): 572 | cpp_types.append("T") 573 | else: 574 | cpp_types.append(value["cpp_type"]) 575 | 576 | if function_info.get("return_type_dynamic_list"): 577 | cpp_types.append("list_entry_t") 578 | elif function_info.get("dynamic_return_type"): 579 | cpp_types.append("T") 580 | else: 581 | 
cpp_types.append(logical_type_mapping[function_info["return_type"]]) 582 | 583 | joined_cpp_types = ",".join(cpp_types) 584 | 585 | executor_args = [] 586 | lambda_args = [] 587 | lambda_lines = [] 588 | pre_executor_lines = [] 589 | 590 | for argument in function_info["arguments"]: 591 | executor_args.append(f"{argument['name']}_vector") 592 | 593 | if argument.get("cpp_type_dynamic"): 594 | lambda_args.append(f"T {argument['name']}_data") 595 | else: 596 | lambda_args.append(f"{argument['cpp_type']} {argument['name']}_data") 597 | 598 | if "process" in argument: 599 | lambda_lines.append(argument["process"]) 600 | 601 | if "pre_executor" in argument: 602 | pre_executor_lines.append(argument["pre_executor"]) 603 | 604 | executor_args.append("result") 605 | executor_args.append("args.size()") 606 | 607 | joined_executor_args = ",".join(executor_args) 608 | joined_lambda_args = ",".join(lambda_args) 609 | 610 | lambda_lines.append(function_info["method"]) 611 | 612 | lambda_body = "\n".join(lambda_lines) 613 | pre_executor_body = "\n".join(pre_executor_lines) 614 | 615 | result = f""" 616 | {pre_executor_body} 617 | {get_executor_name(function_info["arguments"])}::Execute 618 | <{joined_cpp_types}> 619 | ( 620 | {joined_executor_args}, 621 | [&]({joined_lambda_args}) {{ 622 | 623 | {lambda_body} 624 | }});""" 625 | 626 | return result 627 | 628 | 629 | # Data to render the template 630 | data = { 631 | "sketch_class_name": get_sketch_class_name, 632 | "counting_sketch_names": counting_sketch_names, 633 | # "function_names_per_sketch": get_sketch_function_names, 634 | "sketch_types": ["Quantiles", "KLL", "REQ", "TDigest", "HLL", "CPC"], 635 | "logical_type_to_cplusplus_type": sketch_type_to_allowed_logical_types, 636 | "functions_per_sketch_type": unary_functions_per_sketch_type, 637 | "get_function_block": get_function_block, 638 | "get_scalar_function_args": get_scalar_function_args, 639 | "logical_type_mapping": logical_type_mapping, 640 | "to_type_id": lambda v: 
v.replace("LogicalType", "LogicalTypeId"), 641 | "sketch_k_cpp_type": { 642 | "Quantiles": "int32_t", 643 | "KLL": "int32_t", 644 | "REQ": "int32_t", 645 | "TDigest": "int32_t", 646 | "HLL": "int32_t", 647 | "CPC": "int32_t", 648 | }, 649 | "cpp_type_mapping": cpp_type_mapping, 650 | } 651 | 652 | 653 | # Render the template 654 | output = template.render(data) 655 | 656 | # Write the generated C++ code to a file 657 | with open("src/generated.cpp", "w") as f: 658 | f.write(output) 659 | 660 | print("C++ file generated successfully!") 661 | -------------------------------------------------------------------------------- /src/theta_sketch.cpp: -------------------------------------------------------------------------------- 1 | #include "datasketches_extension.hpp" 2 | #include "duckdb/function/scalar_function.hpp" 3 | #include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" 4 | #include "duckdb/parser/parsed_data/create_aggregate_function_info.hpp" 5 | 6 | // Apache DataSketches Headers 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace duckdb 13 | { 14 | namespace 15 | { 16 | // ============================================================ 17 | // 1. 
Helpers & Bind Data 18 | // ============================================================ 19 | 20 | struct DSThetaBindData : public FunctionData 21 | { 22 | DSThetaBindData() : lg_k(12) {} 23 | explicit DSThetaBindData(uint8_t lg_k) : lg_k(lg_k) {} 24 | 25 | unique_ptr Copy() const override 26 | { 27 | return make_uniq(lg_k); 28 | } 29 | 30 | bool Equals(const FunctionData &other_p) const override 31 | { 32 | auto &other = other_p.Cast(); 33 | return lg_k == other.lg_k; 34 | } 35 | 36 | uint8_t lg_k; 37 | }; 38 | 39 | unique_ptr DSThetaBindWithK(ClientContext &context, AggregateFunction &function, 40 | vector> &arguments) 41 | { 42 | if (arguments[0]->HasParameter()) 43 | throw ParameterNotResolvedException(); 44 | if (!arguments[0]->IsFoldable()) 45 | throw BinderException("Theta Sketch lg_k must be constant"); 46 | 47 | Value k_val = ExpressionExecutor::EvaluateScalar(context, *arguments[0]); 48 | if (k_val.IsNull()) 49 | throw BinderException("Theta Sketch lg_k cannot be NULL"); 50 | 51 | auto lg_k = (uint8_t)k_val.GetValue(); 52 | Function::EraseArgument(function, arguments, 0); 53 | return make_uniq(lg_k); 54 | } 55 | 56 | unique_ptr DSThetaBindDefault(ClientContext &context, AggregateFunction &function, 57 | vector> &arguments) 58 | { 59 | return make_uniq(12); 60 | } 61 | 62 | // ============================================================ 63 | // 2. 
State & Operations 64 | // ============================================================ 65 | 66 | struct DSThetaState 67 | { 68 | datasketches::update_theta_sketch *update_sketch = nullptr; 69 | datasketches::theta_union *union_sketch = nullptr; 70 | 71 | ~DSThetaState() 72 | { 73 | if (update_sketch) 74 | delete update_sketch; 75 | if (union_sketch) 76 | delete union_sketch; 77 | } 78 | 79 | void CreateUpdateSketch(uint8_t lg_k) 80 | { 81 | if (!update_sketch) 82 | { 83 | datasketches::update_theta_sketch::builder b; 84 | b.set_lg_k(lg_k); 85 | update_sketch = new datasketches::update_theta_sketch(b.build()); 86 | } 87 | } 88 | 89 | void CreateUnionSketch(uint8_t lg_k) 90 | { 91 | if (!union_sketch) 92 | { 93 | datasketches::theta_union::builder b; 94 | b.set_lg_k(lg_k); 95 | union_sketch = new datasketches::theta_union(b.build()); 96 | } 97 | } 98 | }; 99 | 100 | struct DSThetaOperationBase 101 | { 102 | template 103 | static void Initialize(STATE &state) 104 | { 105 | state.update_sketch = nullptr; 106 | state.union_sketch = nullptr; 107 | } 108 | template 109 | static void Destroy(STATE &state, AggregateInputData &) 110 | { 111 | if (state.update_sketch) 112 | delete state.update_sketch; 113 | if (state.union_sketch) 114 | delete state.union_sketch; 115 | } 116 | static bool IgnoreNull() { return true; } 117 | 118 | template 119 | static void Combine(const STATE &source, STATE &target, AggregateInputData &aggr_input_data) 120 | { 121 | if (!source.update_sketch && !source.union_sketch) 122 | return; 123 | 124 | if (!target.union_sketch) 125 | { 126 | auto &bind_data = aggr_input_data.bind_data->template Cast(); 127 | target.CreateUnionSketch(bind_data.lg_k); 128 | if (target.update_sketch) 129 | { 130 | target.union_sketch->update(*target.update_sketch); 131 | delete target.update_sketch; 132 | target.update_sketch = nullptr; 133 | } 134 | } 135 | if (source.update_sketch) 136 | target.union_sketch->update(*source.update_sketch); 137 | if (source.union_sketch) 
138 | target.union_sketch->update(source.union_sketch->get_result()); 139 | } 140 | 141 | template 142 | static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) 143 | { 144 | if (state.union_sketch) 145 | { 146 | auto compact = state.union_sketch->get_result(); 147 | auto serialized = compact.serialize(); 148 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 149 | } 150 | else if (state.update_sketch) 151 | { 152 | auto compact = state.update_sketch->compact(); 153 | auto serialized = compact.serialize(); 154 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 155 | } 156 | else 157 | { 158 | auto &bind_data = finalize_data.input.bind_data->template Cast(); 159 | datasketches::update_theta_sketch::builder b; 160 | b.set_lg_k(bind_data.lg_k); 161 | auto empty_sketch = b.build(); 162 | auto compact = empty_sketch.compact(); 163 | auto serialized = compact.serialize(); 164 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 165 | } 166 | } 167 | }; 168 | 169 | struct DSThetaCreateOperation : DSThetaOperationBase 170 | { 171 | template 172 | static void Operation(STATE &state, const A_TYPE &a_data, AggregateUnaryInput &idata) 173 | { 174 | auto &bind_data = idata.input.bind_data->template Cast(); 175 | state.CreateUpdateSketch(bind_data.lg_k); 176 | if constexpr (std::is_same_v) 177 | { 178 | state.update_sketch->update(a_data.GetData(), a_data.GetSize()); 179 | } 180 | else 181 | { 182 | state.update_sketch->update(a_data); 183 | } 184 | } 185 | 186 | template 187 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, idx_t count) 188 | { 189 | for (idx_t i = 0; i < count; i++) 190 | { 191 | Operation(state, input, unary_input); 192 | } 193 | } 194 | }; 195 | 196 | struct DSThetaMergeOperation : 
DSThetaOperationBase 197 | { 198 | template 199 | static void Operation(STATE &state, const A_TYPE &a_data, AggregateUnaryInput &idata) 200 | { 201 | auto &bind_data = idata.input.bind_data->template Cast(); 202 | state.CreateUnionSketch(bind_data.lg_k); 203 | auto sketch = datasketches::compact_theta_sketch::deserialize(a_data.GetDataUnsafe(), a_data.GetSize()); 204 | state.union_sketch->update(sketch); 205 | } 206 | 207 | template 208 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, idx_t count) 209 | { 210 | for (idx_t i = 0; i < count; i++) 211 | { 212 | Operation(state, input, unary_input); 213 | } 214 | } 215 | }; 216 | 217 | // ============================================================ 218 | // 3. Scalar Functions 219 | // ============================================================ 220 | 221 | static void DSThetaUnion(DataChunk &args, ExpressionState &state, Vector &result) 222 | { 223 | BinaryExecutor::Execute( 224 | args.data[0], args.data[1], result, args.size(), 225 | [&](string_t a_blob, string_t b_blob) 226 | { 227 | datasketches::theta_union::builder b; 228 | b.set_lg_k(12); 229 | auto union_obj = b.build(); 230 | auto sketch_a = datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 231 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 232 | union_obj.update(sketch_a); 233 | union_obj.update(sketch_b); 234 | auto res = union_obj.get_result(); 235 | auto serialized = res.serialize(); 236 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 237 | }); 238 | } 239 | 240 | static void DSThetaIntersect(DataChunk &args, ExpressionState &state, Vector &result) 241 | { 242 | BinaryExecutor::Execute( 243 | args.data[0], args.data[1], result, args.size(), 244 | [&](string_t a_blob, string_t b_blob) 245 | { 246 | auto sketch_a = 
datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 247 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 248 | datasketches::theta_intersection intersection; 249 | intersection.update(sketch_a); 250 | intersection.update(sketch_b); 251 | auto res = intersection.get_result(); 252 | auto serialized = res.serialize(); 253 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 254 | }); 255 | } 256 | 257 | static void DSThetaANotB(DataChunk &args, ExpressionState &state, Vector &result) 258 | { 259 | BinaryExecutor::Execute( 260 | args.data[0], args.data[1], result, args.size(), 261 | [&](string_t a_blob, string_t b_blob) 262 | { 263 | auto sketch_a = datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 264 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 265 | datasketches::theta_a_not_b a_not_b; 266 | auto res = a_not_b.compute(sketch_a, sketch_b); 267 | auto serialized = res.serialize(); 268 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 269 | }); 270 | } 271 | 272 | static void DSThetaEstimate(DataChunk &args, ExpressionState &state, Vector &result) 273 | { 274 | UnaryExecutor::Execute(args.data[0], result, args.size(), 275 | [&](string_t sketch_blob) 276 | { 277 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_estimate(); 278 | }); 279 | } 280 | 281 | static void DSThetaLowerBound(DataChunk &args, ExpressionState &state, Vector &result) 282 | { 283 | BinaryExecutor::Execute(args.data[0], args.data[1], result, args.size(), 284 | [&](string_t sketch_blob, int32_t num_std_devs) 285 | { 286 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), 
sketch_blob.GetSize()).get_lower_bound(static_cast(num_std_devs)); 287 | }); 288 | } 289 | 290 | static void DSThetaUpperBound(DataChunk &args, ExpressionState &state, Vector &result) 291 | { 292 | BinaryExecutor::Execute(args.data[0], args.data[1], result, args.size(), 293 | [&](string_t sketch_blob, int32_t num_std_devs) 294 | { 295 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_upper_bound(static_cast(num_std_devs)); 296 | }); 297 | } 298 | 299 | static void DSThetaDescribe(DataChunk &args, ExpressionState &state, Vector &result) 300 | { 301 | UnaryExecutor::Execute(args.data[0], result, args.size(), 302 | [&](string_t sketch_blob) 303 | { 304 | return StringVector::AddString(result, datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).to_string(false)); 305 | }); 306 | } 307 | 308 | // --- METADATA FUNCTIONS 309 | 310 | static void DSThetaIsEmpty(DataChunk &args, ExpressionState &state, Vector &result) 311 | { 312 | UnaryExecutor::Execute(args.data[0], result, args.size(), 313 | [&](string_t sketch_blob) 314 | { 315 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).is_empty(); 316 | }); 317 | } 318 | 319 | static void DSThetaIsEstimation(DataChunk &args, ExpressionState &state, Vector &result) 320 | { 321 | UnaryExecutor::Execute(args.data[0], result, args.size(), 322 | [&](string_t sketch_blob) 323 | { 324 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).is_estimation_mode(); 325 | }); 326 | } 327 | 328 | static void DSThetaGetTheta(DataChunk &args, ExpressionState &state, Vector &result) 329 | { 330 | UnaryExecutor::Execute(args.data[0], result, args.size(), 331 | [&](string_t sketch_blob) 332 | { 333 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_theta(); 334 | }); 
335 | } 336 | 337 | static void DSThetaNumRetained(DataChunk &args, ExpressionState &state, Vector &result) 338 | { 339 | UnaryExecutor::Execute(args.data[0], result, args.size(), 340 | [&](string_t sketch_blob) 341 | { 342 | return (int64_t)datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_num_retained(); 343 | }); 344 | } 345 | 346 | static void DSThetaGetSeed(DataChunk &args, ExpressionState &state, Vector &result) 347 | { 348 | // Note: Compact sketches typically store the Seed HASH, not the full seed. 349 | UnaryExecutor::Execute(args.data[0], result, args.size(), 350 | [&](string_t sketch_blob) 351 | { 352 | return (int64_t)datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_seed_hash(); 353 | }); 354 | } 355 | 356 | // ============================================================ 357 | // 4. Type Creation & Registration Helpers 358 | // ============================================================ 359 | 360 | static LogicalType CreateThetaSketchType(ExtensionLoader &loader) 361 | { 362 | auto new_type = LogicalType(LogicalTypeId::BLOB); 363 | auto new_type_name = "sketch_theta"; 364 | auto type_info = CreateTypeInfo(new_type_name, LogicalType::BLOB); 365 | type_info.temporary = false; 366 | type_info.internal = true; 367 | type_info.comment = "Sketch type for Theta Sketch"; 368 | new_type.SetAlias(new_type_name); 369 | 370 | auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance()); 371 | auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance()); 372 | system_catalog.CreateType(data, type_info); 373 | 374 | loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1); 375 | loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1); 376 | return new_type; 377 | } 378 | 379 | template 380 | static void RegisterThetaAggregates(AggregateFunctionSet 
&set, const LogicalType &input_type, const LogicalType &result_type) 381 | { 382 | auto fun_default = AggregateFunction::UnaryAggregateDestructor( 383 | input_type, result_type); 384 | fun_default.bind = DSThetaBindDefault; 385 | fun_default.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; 386 | set.AddFunction(fun_default); 387 | 388 | auto fun_with_k = AggregateFunction::UnaryAggregateDestructor( 389 | input_type, result_type); 390 | fun_with_k.bind = DSThetaBindWithK; 391 | fun_with_k.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; 392 | fun_with_k.arguments.insert(fun_with_k.arguments.begin(), LogicalType::INTEGER); 393 | set.AddFunction(fun_with_k); 394 | } 395 | } 396 | 397 | // ============================================================ 398 | // 5. Main Loader 399 | // ============================================================ 400 | 401 | void LoadThetaSketch(ExtensionLoader &loader) 402 | { 403 | auto sketch_type = CreateThetaSketchType(loader); 404 | AggregateFunctionSet sketch_agg("datasketch_theta"); 405 | 406 | // 1. RAW DATA - Register specific types 407 | // IMPORTANT: DO NOT register LogicalType::BLOB here! 408 | // If we do, it shadows the Merge operation for "sketch_theta". 409 | RegisterThetaAggregates(sketch_agg, LogicalType::TINYINT, sketch_type); 410 | RegisterThetaAggregates(sketch_agg, LogicalType::SMALLINT, sketch_type); 411 | RegisterThetaAggregates(sketch_agg, LogicalType::INTEGER, sketch_type); 412 | RegisterThetaAggregates(sketch_agg, LogicalType::BIGINT, sketch_type); 413 | RegisterThetaAggregates(sketch_agg, LogicalType::FLOAT, sketch_type); 414 | RegisterThetaAggregates(sketch_agg, LogicalType::DOUBLE, sketch_type); 415 | RegisterThetaAggregates(sketch_agg, LogicalType::VARCHAR, sketch_type); 416 | 417 | // 2. 
MERGE SKETCHES (sketch_theta / BLOB) 418 | auto fun_merge = AggregateFunction::UnaryAggregateDestructor( 419 | sketch_type, sketch_type); 420 | fun_merge.bind = DSThetaBindDefault; 421 | fun_merge.arguments = {sketch_type}; 422 | sketch_agg.AddFunction(fun_merge); 423 | 424 | auto fun_merge_k = AggregateFunction::UnaryAggregateDestructor( 425 | sketch_type, sketch_type); 426 | fun_merge_k.bind = DSThetaBindWithK; 427 | fun_merge_k.arguments = {LogicalType::INTEGER, sketch_type}; 428 | sketch_agg.AddFunction(fun_merge_k); 429 | 430 | { 431 | CreateAggregateFunctionInfo info(sketch_agg); 432 | FunctionDescription desc; 433 | desc.description = "Creates a Theta sketch for estimating set cardinality and performing set operations"; 434 | desc.examples.push_back("datasketch_theta(column)"); 435 | desc.examples.push_back("datasketch_theta(12, column)"); 436 | info.descriptions.push_back(desc); 437 | loader.RegisterFunction(info); 438 | } 439 | 440 | // --- SCALAR FUNCTIONS --- 441 | { 442 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_intersect", {sketch_type, sketch_type}, sketch_type, DSThetaIntersect)); 443 | FunctionDescription desc; 444 | desc.description = "Returns a new Theta sketch representing the intersection of two sketches"; 445 | desc.examples.push_back("datasketch_theta_intersect(sketch1, sketch2)"); 446 | info.descriptions.push_back(desc); 447 | loader.RegisterFunction(info); 448 | } 449 | { 450 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_union", {sketch_type, sketch_type}, sketch_type, DSThetaUnion)); 451 | FunctionDescription desc; 452 | desc.description = "Returns a new Theta sketch representing the union of two sketches"; 453 | desc.examples.push_back("datasketch_theta_union(sketch1, sketch2)"); 454 | info.descriptions.push_back(desc); 455 | loader.RegisterFunction(info); 456 | } 457 | { 458 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_a_not_b", {sketch_type, sketch_type}, sketch_type, 
DSThetaANotB)); 459 | FunctionDescription desc; 460 | desc.description = "Returns a new Theta sketch representing elements in sketch A but not in sketch B (set difference)"; 461 | desc.examples.push_back("datasketch_theta_a_not_b(sketch_a, sketch_b)"); 462 | info.descriptions.push_back(desc); 463 | loader.RegisterFunction(info); 464 | } 465 | { 466 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_estimate", {sketch_type}, LogicalType::DOUBLE, DSThetaEstimate)); 467 | FunctionDescription desc; 468 | desc.description = "Returns the estimated number of distinct values in the Theta sketch"; 469 | desc.examples.push_back("datasketch_theta_estimate(sketch)"); 470 | info.descriptions.push_back(desc); 471 | loader.RegisterFunction(info); 472 | } 473 | { 474 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_lower_bound", {sketch_type, LogicalType::INTEGER}, LogicalType::DOUBLE, DSThetaLowerBound)); 475 | FunctionDescription desc; 476 | desc.description = "Returns the lower bound estimate at the given number of standard deviations (1, 2, or 3)"; 477 | desc.examples.push_back("datasketch_theta_lower_bound(sketch, 2)"); 478 | info.descriptions.push_back(desc); 479 | loader.RegisterFunction(info); 480 | } 481 | { 482 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_upper_bound", {sketch_type, LogicalType::INTEGER}, LogicalType::DOUBLE, DSThetaUpperBound)); 483 | FunctionDescription desc; 484 | desc.description = "Returns the upper bound estimate at the given number of standard deviations (1, 2, or 3)"; 485 | desc.examples.push_back("datasketch_theta_upper_bound(sketch, 2)"); 486 | info.descriptions.push_back(desc); 487 | loader.RegisterFunction(info); 488 | } 489 | { 490 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_describe", {sketch_type}, LogicalType::VARCHAR, DSThetaDescribe)); 491 | FunctionDescription desc; 492 | desc.description = "Returns a human-readable description of the Theta sketch"; 493 | 
desc.examples.push_back("datasketch_theta_describe(sketch)"); 494 | info.descriptions.push_back(desc); 495 | loader.RegisterFunction(info); 496 | } 497 | 498 | // Metadata 499 | { 500 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_is_empty", {sketch_type}, LogicalType::BOOLEAN, DSThetaIsEmpty)); 501 | FunctionDescription desc; 502 | desc.description = "Returns true if the Theta sketch is empty"; 503 | desc.examples.push_back("datasketch_theta_is_empty(sketch)"); 504 | info.descriptions.push_back(desc); 505 | loader.RegisterFunction(info); 506 | } 507 | { 508 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_is_estimation_mode", {sketch_type}, LogicalType::BOOLEAN, DSThetaIsEstimation)); 509 | FunctionDescription desc; 510 | desc.description = "Returns true if the sketch is in estimation mode (has exceeded exact counting capacity)"; 511 | desc.examples.push_back("datasketch_theta_is_estimation_mode(sketch)"); 512 | info.descriptions.push_back(desc); 513 | loader.RegisterFunction(info); 514 | } 515 | { 516 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_get_theta", {sketch_type}, LogicalType::DOUBLE, DSThetaGetTheta)); 517 | FunctionDescription desc; 518 | desc.description = "Returns the theta value of the sketch (sampling probability)"; 519 | desc.examples.push_back("datasketch_theta_get_theta(sketch)"); 520 | info.descriptions.push_back(desc); 521 | loader.RegisterFunction(info); 522 | } 523 | { 524 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_num_retained", {sketch_type}, LogicalType::BIGINT, DSThetaNumRetained)); 525 | FunctionDescription desc; 526 | desc.description = "Returns the number of hash values retained in the sketch"; 527 | desc.examples.push_back("datasketch_theta_num_retained(sketch)"); 528 | info.descriptions.push_back(desc); 529 | loader.RegisterFunction(info); 530 | } 531 | { 532 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_get_seed", {sketch_type}, 
LogicalType::BIGINT, DSThetaGetSeed)); 533 | FunctionDescription desc; 534 | desc.description = "Returns the seed hash used by the sketch"; 535 | desc.examples.push_back("datasketch_theta_get_seed(sketch)"); 536 | info.descriptions.push_back(desc); 537 | loader.RegisterFunction(info); 538 | } 539 | } 540 | 541 | } 542 | -------------------------------------------------------------------------------- /codegen/generated.cpp.j2: -------------------------------------------------------------------------------- 1 | #include "datasketches_extension.hpp" 2 | 3 | 4 | #include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" 5 | #include "duckdb/parser/parsed_data/create_aggregate_function_info.hpp" 6 | #include "duckdb/function/scalar_function.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace duckdb 17 | { 18 | 19 | 20 | static std::string toLowerCase(const std::string& input) { 21 | std::string result = input; 22 | std::transform(result.begin(), result.end(), result.begin(), [](unsigned char c) { 23 | return std::tolower(c); 24 | }); 25 | return result; 26 | } 27 | 28 | {% for sketch_type in sketch_types %} 29 | 30 | 31 | struct DS{{sketch_type}}BindData : public FunctionData { 32 | DS{{sketch_type}}BindData() { 33 | } 34 | explicit DS{{sketch_type}}BindData({{sketch_k_cpp_type[sketch_type]}} k) : k(k) { 35 | } 36 | 37 | unique_ptr Copy() const override { 38 | return make_uniq(k); 39 | } 40 | 41 | bool Equals(const FunctionData &other_p) const override { 42 | auto &other = other_p.Cast(); 43 | return k == other.k; 44 | } 45 | 46 | {{sketch_k_cpp_type[sketch_type]}} k; 47 | }; 48 | 49 | 50 | unique_ptr DS{{sketch_type}}Bind(ClientContext &context, AggregateFunction &function, 51 | vector> &arguments) { 52 | if (arguments[0]->HasParameter()) { 53 | throw ParameterNotResolvedException(); 54 | } 55 | if (!arguments[0]->IsFoldable()) { 56 | throw BinderException("{{sketch_type}} can only take 
a constant K value"); 57 | } 58 | Value k_val = ExpressionExecutor::EvaluateScalar(context, *arguments[0]); 59 | if (k_val.IsNull()) { 60 | throw BinderException("{{sketch_type}} K value cannot be NULL"); 61 | } 62 | 63 | auto actual_k = k_val.GetValue<{{sketch_k_cpp_type[sketch_type]}}>(); 64 | 65 | {% if sketch_type in ["Quantiles", "KLL"] %} 66 | // Validate K parameter: must be in range (0, 32768] 67 | if (actual_k <= 0 || actual_k > 32768) { 68 | throw BinderException("{{sketch_type}} K value must be between 1 and 32768, got: " + std::to_string(actual_k)); 69 | } 70 | {% elif sketch_type == "REQ" %} 71 | // Validate K parameter: must be in range [4, 1024] 72 | if (actual_k < 4 || actual_k > 1024) { 73 | throw BinderException("REQ K value must be between 4 and 1024, got: " + std::to_string(actual_k)); 74 | } 75 | {% elif sketch_type == "TDigest" %} 76 | // Validate K parameter: must be positive (TDigest compression parameter) 77 | if (actual_k <= 0) { 78 | throw BinderException("TDigest K (compression) value must be positive, got: " + std::to_string(actual_k)); 79 | } 80 | {% elif sketch_type == "HLL" %} 81 | // Validate K parameter: lg_k must be in range [4, 21] for HLL 82 | if (actual_k < 4 || actual_k > 21) { 83 | throw BinderException("HLL K (lg_k) value must be between 4 and 21, got: " + std::to_string(actual_k)); 84 | } 85 | {% elif sketch_type == "CPC" %} 86 | // Validate K parameter: lg_k must be in range [4, 26] for CPC 87 | if (actual_k < 4 || actual_k > 26) { 88 | throw BinderException("CPC K (lg_k) value must be between 4 and 26, got: " + std::to_string(actual_k)); 89 | } 90 | {% endif %} 91 | 92 | Function::EraseArgument(function, arguments, 0); 93 | return make_uniq(actual_k); 94 | } 95 | 96 | 97 | {% if sketch_type not in counting_sketch_names %} 98 | template 99 | {% endif %} 100 | struct DS{{sketch_type}}State 101 | { 102 | {% if sketch_type in counting_sketch_names %} 103 | {{sketch_class_name(sketch_type)}} *sketch = nullptr; 104 | {% else %} 
{{sketch_class_name(sketch_type)}} *sketch = nullptr;
{% endif %}

// Free the heap-allocated sketch owned by this aggregate state.
~DS{{sketch_type}}State()
{
	// delete on a null pointer is a safe no-op, so no guard is needed.
	delete sketch;
}

{% if sketch_type in ["Quantiles", "KLL"] %}
// Allocate the underlying sketch; k controls accuracy/size.
void CreateSketch(int32_t k)
{
	D_ASSERT(!sketch);
	D_ASSERT(k > 0);
	D_ASSERT(k <= 32768);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "REQ" %}
void CreateSketch(int32_t k)
{
	D_ASSERT(!sketch);
	// REQ sketches only accept k in [4, 1024].
	D_ASSERT(k >= 4);
	D_ASSERT(k <= 1024);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "TDigest" %}
void CreateSketch(uint16_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "HLL" %}
void CreateSketch(uint16_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "CPC" %}
void CreateSketch(uint8_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% endif %}

// Copy-construct this state's sketch from another state; a source state that
// never materialized a sketch leaves this state empty as well.
// (The former counting/non-counting template branches emitted identical code,
// so a single unconditional definition is generated instead.)
void CreateSketch(const DS{{sketch_type}}State &existing)
{
	if (existing.sketch)
	{
		sketch = new {{sketch_class_name(sketch_type)}}(*existing.sketch);
	}
}

// Deserialize a sketch from a serialized BLOB value, converting any
// DataSketches failure into a DuckDB InvalidInputException.
// (Formerly duplicated verbatim in both template branches; collapsed.)
{{sketch_class_name(sketch_type)}} deserialize_sketch(const string_t &data)
{
	try {
		return {{sketch_class_name(sketch_type)}}::deserialize(data.GetDataUnsafe(), data.GetSize());
	} catch (const std::exception &e) {
		throw InvalidInputException("Failed to deserialize {{sketch_type}} sketch: %s", e.what());
	}
}
};


{% if sketch_type in counting_sketch_names %}
// Register the DuckDB logical type carrying serialized {{sketch_type}} sketches
// (an alias of BLOB) plus free reinterpret casts in both directions.
static LogicalType Create{{sketch_type}}CountingSketchType(ExtensionLoader &loader)
{
	auto new_type = LogicalType(LogicalTypeId::BLOB);
	auto new_type_name = "sketch_{{sketch_type|lower}}";
	auto type_info = CreateTypeInfo(new_type_name, LogicalType::BLOB);
	type_info.temporary = false;
	type_info.internal = true;
	type_info.comment = "Sketch type for {{sketch_type}} sketch";
	new_type.SetAlias(new_type_name);
	auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance());
	auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance());
	system_catalog.CreateType(data, type_info);
	// Alias and BLOB share the same physical layout, so casts are free.
	loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1);
	loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1);
	return new_type;
}
{% else %}
// Same as the counting variant, but the alias also encodes the embedded value
// type (e.g. "sketch_kll_double"), since these sketches are type-parameterized.
static LogicalType Create{{sketch_type}}SketchType(ExtensionLoader &loader, LogicalType embedded_type)
{
	auto new_type = LogicalType(LogicalTypeId::BLOB);
	auto type_suffix = toLowerCase(embedded_type.ToString());
	auto new_type_name = "sketch_{{sketch_type|lower}}_" + type_suffix;

	new_type.SetAlias(new_type_name);
	// NOTE(review): CreateTypeInfo receives the aliased type here, while the
	// counting variant passes plain LogicalType::BLOB — confirm intentional.
	auto type_info = CreateTypeInfo(new_type_name, new_type);
	type_info.temporary = false;
	type_info.internal = true;
	type_info.comment = "Sketch type for {{sketch_type}} sketch with embedded type " + embedded_type.ToString();
	auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance());
	auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance());
	system_catalog.CreateType(data, type_info);
	loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1);
	loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1);
	return new_type;
}
{% endif %}

{%- endfor %}

// Shared pieces of every sketch aggregate: zero-initialize the state, free the
// sketch on destroy, and skip NULL inputs.
struct DSSketchOperationBase {
	template <class STATE>
	static void Initialize(STATE &state)
	{
		state.sketch = nullptr;
	}

	template <class STATE>
	static void Destroy(STATE &state, AggregateInputData &aggr_input_data) {
		if (state.sketch) {
			delete state.sketch;
			state.sketch = nullptr;
		}
	}

	static bool IgnoreNull() { return true; }
};

// Merges already-serialized quantile-family sketches (Quantiles/KLL/REQ/TDigest).
template <class BIND_DATA_TYPE>
struct DSQuantilesMergeOperation : DSSketchOperationBase
{
	template <class A_TYPE, class STATE, class OP>
	static void Operation(STATE &state,
	                      const A_TYPE &a_data,
	                      AggregateUnaryInput &idata)
	{
		if (!state.sketch)
		{
			// Lazily create the sketch with the bound "k" parameter.
			auto &bind_data = idata.input.bind_data->template Cast<BIND_DATA_TYPE>();
			state.CreateSketch(bind_data.k);
		}

		// this is a sketch in b_data, so we need to deserialize it.
262 | state.sketch->merge(state.deserialize_sketch(a_data)); 263 | } 264 | 265 | template 266 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 267 | idx_t count) { 268 | for (idx_t i = 0; i < count; i++) { 269 | Operation(state, input, unary_input); 270 | } 271 | } 272 | 273 | template 274 | static void Combine(const STATE &source, STATE &target, 275 | AggregateInputData &aggr_input_data) 276 | { 277 | if (!target.sketch) 278 | { 279 | target.CreateSketch(source); 280 | } 281 | else 282 | { 283 | target.sketch->merge(*source.sketch); 284 | } 285 | } 286 | 287 | template 288 | static void Finalize(STATE &state, T &target, 289 | AggregateFinalizeData &finalize_data) 290 | { 291 | if (!state.sketch) 292 | { 293 | finalize_data.ReturnNull(); 294 | } 295 | else 296 | { 297 | auto serialized_data = state.sketch->serialize(); 298 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 299 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 300 | } 301 | } 302 | }; 303 | 304 | template 305 | struct DSQuantilesCreateOperation : DSSketchOperationBase 306 | { 307 | template 308 | static void Operation(STATE &state, 309 | const A_TYPE &a_data, 310 | AggregateUnaryInput &idata) 311 | { 312 | if (!state.sketch) 313 | { 314 | auto &bind_data = idata.input.bind_data->template Cast(); 315 | state.CreateSketch(bind_data.k); 316 | } 317 | 318 | state.sketch->update(a_data); 319 | } 320 | 321 | template 322 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 323 | idx_t count) { 324 | for (idx_t i = 0; i < count; i++) { 325 | Operation(state, input, unary_input); 326 | } 327 | } 328 | 329 | template 330 | static void Combine(const STATE &source, STATE &target, 331 | AggregateInputData &aggr_input_data) 332 | { 333 | if (!target.sketch) 334 | { 335 | target.CreateSketch(source); 336 | } 337 | else 338 | { 339 | 
target.sketch->merge(*source.sketch); 340 | } 341 | } 342 | 343 | template 344 | static void Finalize(STATE &state, T &target, 345 | AggregateFinalizeData &finalize_data) 346 | { 347 | if (!state.sketch) 348 | { 349 | finalize_data.ReturnNull(); 350 | } 351 | else 352 | { 353 | auto serialized_data = state.sketch->serialize(); 354 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 355 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 356 | } 357 | } 358 | }; 359 | 360 | template 361 | struct DSHLLCreateOperation : DSSketchOperationBase 362 | { 363 | template 364 | static void Operation(STATE &state, 365 | const A_TYPE &a_data, 366 | AggregateUnaryInput &idata) 367 | { 368 | if (!state.sketch) 369 | { 370 | auto &bind_data = idata.input.bind_data->template Cast(); 371 | state.CreateSketch(bind_data.k); 372 | } 373 | 374 | if constexpr (std::is_same_v) { 375 | state.sketch->update(a_data.GetData(), a_data.GetSize()); 376 | } else { 377 | state.sketch->update(a_data); 378 | } 379 | } 380 | 381 | template 382 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 383 | idx_t count) { 384 | for (idx_t i = 0; i < count; i++) { 385 | Operation(state, input, unary_input); 386 | } 387 | } 388 | 389 | template 390 | static void Combine(const STATE &source, STATE &target, 391 | AggregateInputData &aggr_input_data) 392 | { 393 | if (!target.sketch) 394 | { 395 | target.CreateSketch(source); 396 | } 397 | else 398 | { 399 | datasketches::hll_union u(target.sketch->get_lg_config_k()); 400 | u.update(*target.sketch); 401 | if(source.sketch) { 402 | u.update(*source.sketch); 403 | } 404 | *target.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 405 | } 406 | } 407 | 408 | template 409 | static void Finalize(STATE &state, T &target, 410 | AggregateFinalizeData &finalize_data) 411 | { 412 | if (!state.sketch) 413 | { 414 | finalize_data.ReturnNull(); 415 | } 
416 | else 417 | { 418 | auto serialized_data = state.sketch->serialize_updatable(); 419 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 420 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 421 | } 422 | } 423 | }; 424 | 425 | template 426 | struct DSHLLMergeOperation : DSSketchOperationBase 427 | { 428 | 429 | template 430 | static void Operation(STATE &state, 431 | const A_TYPE &a_data, 432 | AggregateUnaryInput &idata) 433 | { 434 | auto &bind_data = idata.input.bind_data->template Cast(); 435 | 436 | if (!state.sketch) 437 | { 438 | state.CreateSketch(bind_data.k); 439 | } 440 | 441 | auto a_sketch = state.deserialize_sketch(a_data); 442 | 443 | datasketches::hll_union u(bind_data.k); 444 | if(state.sketch) { 445 | u.update(*state.sketch); 446 | } 447 | u.update(a_sketch); 448 | 449 | *state.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 450 | } 451 | 452 | template 453 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 454 | idx_t count) { 455 | for (idx_t i = 0; i < count; i++) { 456 | Operation(state, input, unary_input); 457 | } 458 | } 459 | 460 | template 461 | static void Combine(const STATE &source, STATE &target, 462 | AggregateInputData &aggr_input_data) 463 | { 464 | if (!target.sketch) 465 | { 466 | target.CreateSketch(source); 467 | } 468 | else 469 | { 470 | datasketches::hll_union u(target.sketch->get_lg_config_k()); 471 | if(source.sketch) { 472 | u.update(*source.sketch); 473 | } 474 | u.update(*target.sketch); 475 | 476 | *target.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 477 | } 478 | } 479 | 480 | template 481 | static void Finalize(STATE &state, T &target, 482 | AggregateFinalizeData &finalize_data) 483 | { 484 | if (!state.sketch) 485 | { 486 | finalize_data.ReturnNull(); 487 | } 488 | else 489 | { 490 | auto serialized_data = state.sketch->serialize_updatable(); 491 | auto sketch_string = 
std::string(serialized_data.begin(), serialized_data.end()); 492 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 493 | } 494 | } 495 | }; 496 | 497 | 498 | template 499 | struct DSCPCMergeOperation : DSSketchOperationBase 500 | { 501 | template 502 | static void Operation(STATE &state, 503 | const A_TYPE &a_data, 504 | AggregateUnaryInput &idata) 505 | { 506 | auto &bind_data = idata.input.bind_data->template Cast(); 507 | 508 | if (!state.sketch) 509 | { 510 | state.CreateSketch(bind_data.k); 511 | } 512 | 513 | auto a_sketch = state.deserialize_sketch(a_data); 514 | datasketches::cpc_union u(bind_data.k); 515 | if(state.sketch) { 516 | u.update(*state.sketch); 517 | } 518 | u.update(a_sketch); 519 | 520 | *state.sketch = u.get_result(); 521 | } 522 | 523 | template 524 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 525 | idx_t count) { 526 | for (idx_t i = 0; i < count; i++) { 527 | Operation(state, input, unary_input); 528 | } 529 | } 530 | 531 | template 532 | static void Combine(const STATE &source, STATE &target, 533 | AggregateInputData &aggr_input_data) 534 | { 535 | if (!target.sketch) 536 | { 537 | target.CreateSketch(source); 538 | } 539 | else 540 | { 541 | datasketches::cpc_union u(target.sketch->get_lg_k()); 542 | if(source.sketch) { 543 | u.update(*source.sketch); 544 | } 545 | u.update(*target.sketch); 546 | *target.sketch = u.get_result(); 547 | } 548 | } 549 | 550 | template 551 | static void Finalize(STATE &state, T &target, 552 | AggregateFinalizeData &finalize_data) 553 | { 554 | if (!state.sketch) 555 | { 556 | finalize_data.ReturnNull(); 557 | } 558 | else 559 | { 560 | auto serialized_data = state.sketch->serialize(); 561 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 562 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 563 | } 564 | } 565 | }; 566 | 567 | template 568 | struct 
DSCPCCreateOperation : DSSketchOperationBase
{
	template <class A_TYPE, class STATE, class OP>
	static void Operation(STATE &state,
	                      const A_TYPE &a_data,
	                      AggregateUnaryInput &idata)
	{
		if (!state.sketch)
		{
			auto &bind_data = idata.input.bind_data->template Cast<BIND_DATA_TYPE>();
			state.CreateSketch(bind_data.k);
		}

		if constexpr (std::is_same_v<A_TYPE, string_t>) {
			// Feed the raw string bytes, not the string_t wrapper object.
			state.sketch->update(a_data.GetData(), a_data.GetSize());
		} else {
			state.sketch->update(a_data);
		}
	}

	template <class INPUT_TYPE, class STATE, class OP>
	static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input,
	                              idx_t count) {
		for (idx_t i = 0; i < count; i++) {
			Operation<INPUT_TYPE, STATE, OP>(state, input, unary_input);
		}
	}

	template <class STATE, class OP>
	static void Combine(const STATE &source, STATE &target,
	                    AggregateInputData &aggr_input_data)
	{
		if (!target.sketch)
		{
			target.CreateSketch(source);
		}
		else
		{
			// cpc_sketch has no in-place merge; route both through a cpc_union.
			datasketches::cpc_union u(target.sketch->get_lg_k());
			u.update(*target.sketch);
			if (source.sketch) {
				u.update(*source.sketch);
			}
			*target.sketch = u.get_result();
		}
	}

	template <class T, class STATE>
	static void Finalize(STATE &state, T &target,
	                     AggregateFinalizeData &finalize_data)
	{
		if (!state.sketch)
		{
			finalize_data.ReturnNull();
		}
		else
		{
			auto serialized_data = state.sketch->serialize();
			auto sketch_string = std::string(serialized_data.begin(), serialized_data.end());
			target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string);
		}
	}
};


{% for sketch_type in sketch_types %}
{% for unary_function in functions_per_sketch_type(sketch_type) %}

{% if sketch_type not in counting_sketch_names %}
template <class T>
{% endif %}
// Scalar entry point for datasketch_{{sketch_type|lower}}_{{unary_function.name|lower}}.
static inline void DS{{sketch_type}}{{unary_function.name}}(DataChunk &args, ExpressionState &state, Vector &result)
{
	// Get the references to the incoming vectors.
	D_ASSERT(args.ColumnCount() == {{unary_function.arguments|length}});

	{% for a in unary_function.arguments %}
	auto &{{a.name}}_vector = args.data[{{loop.index0}}];
	{%- endfor %}

	{{ get_function_block(unary_function) }}
}

{% endfor %}


{% if sketch_type not in counting_sketch_names %}
template <class T>
auto static DS{{sketch_type}}MergeAggregate(const LogicalType &type, const LogicalType &result_type) -> AggregateFunction
{% else %}
auto static DS{{sketch_type}}MergeAggregate(const LogicalType &result_type) -> AggregateFunction
{% endif %}
{
	{% if sketch_type == "HLL" %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, string_t, string_t, DSHLLMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% elif sketch_type == "CPC" %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, string_t, string_t, DSCPCMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% else %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State<T>, string_t, string_t, DSQuantilesMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% endif %}
}


template <class T>
auto static DS{{sketch_type}}CreateAggregate(const LogicalType &type, const LogicalType &result_type) -> AggregateFunction
{
	{% if sketch_type == 'HLL' %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, T, string_t, DSHLLCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% elif sketch_type == 'CPC' %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, T, string_t, DSCPCCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% else %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State<T>, T, string_t, DSQuantilesCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% endif %}
}


// Registers the logical type, scalar functions, and aggregates for {{sketch_type}}.
void Load{{sketch_type}}Sketch(ExtensionLoader &loader) {

	{% if sketch_type in counting_sketch_names %}
	auto sketch_type = Create{{sketch_type}}CountingSketchType(loader);
	{% else %}
	// One alias type per supported embedded value type.
	std::unordered_map<LogicalTypeId, LogicalType> sketch_map_types;
	{% for logical_type in logical_type_to_cplusplus_type(sketch_type).keys() %}
	sketch_map_types.insert({ {{to_type_id(logical_type)}}, Create{{sketch_type}}SketchType(loader, LogicalType({{to_type_id(logical_type)}}))});
	{%- endfor %}
	{% endif %}

	{% for unary_function in functions_per_sketch_type(sketch_type) %}
	{
		ScalarFunctionSet fs("datasketch_{{sketch_type|lower}}_{{unary_function.name|lower}}");
		{% if sketch_type in counting_sketch_names %}
		fs.AddFunction(ScalarFunction(
		    {{get_scalar_function_args(unary_function, None, None)}}
		    , DS{{sketch_type}}{{unary_function.name}}));
		{% else %}
		{% for logical_type, cpp_type in logical_type_to_cplusplus_type(sketch_type).items() %}
		fs.AddFunction(ScalarFunction(
		    {{get_scalar_function_args(unary_function, logical_type, cpp_type)}}
		    , DS{{sketch_type}}{{unary_function.name}}<{{cpp_type}}>));
		{%- endfor %}
		{% endif %}

		CreateScalarFunctionInfo info(std::move(fs));

		{
			FunctionDescription desc;
			desc.description = "{{unary_function.description}}";
			desc.examples.push_back("{{unary_function.example}}");
			info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(info);
	}
	{%- endfor %}

	// This function creates the sketches.
	{
		AggregateFunctionSet sketch("datasketch_{{sketch_type|lower}}");
		{% for logical_type, cpp_type in logical_type_to_cplusplus_type(sketch_type).items() %}
		{% if sketch_type in counting_sketch_names %}
		{
			auto fun = DS{{sketch_type}}CreateAggregate<{{cpp_type}}>({{logical_type}}, sketch_type);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			// Prepend the leading "k" parameter shared by every overload.
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{% else %}
		{
			auto fun = DS{{sketch_type}}CreateAggregate<{{cpp_type}}>({{logical_type}}, sketch_map_types[{{to_type_id(logical_type)}}]);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{
			auto fun = DS{{sketch_type}}MergeAggregate<{{cpp_type}}>({{logical_type}}, sketch_map_types[{{to_type_id(logical_type)}}]);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{% endif %}
		{%- endfor %}
		CreateAggregateFunctionInfo sketch_info(sketch);

		{
			FunctionDescription desc;
			desc.description = "Creates a sketch_{{sketch_type|lower}} data sketch by aggregating values or by aggregating other {{sketch_type}} data sketches";
			desc.examples.push_back("datasketch_{{sketch_type|lower}}(k, data)");
			sketch_info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(sketch_info);
	}


	{% if sketch_type in counting_sketch_names %}
	// Counting sketches additionally get a dedicated union aggregate.
	{
		AggregateFunctionSet sketch("datasketch_{{sketch_type|lower}}_union");
		auto fun = DS{{sketch_type}}MergeAggregate(sketch_type);
		fun.bind = DS{{sketch_type}}Bind;
		fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
		fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
		sketch.AddFunction(fun);
		CreateAggregateFunctionInfo sketch_info(sketch);

		{
			FunctionDescription desc;
			// FIX: use the lowered alias name for consistency with the main
			// aggregate's description above.
			desc.description = "Creates a sketch_{{sketch_type|lower}} data sketch by aggregating other {{sketch_type}} data sketches";
			desc.examples.push_back("datasketch_{{sketch_type|lower}}_union(k, data)");
			sketch_info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(sketch_info);
	}
	{% endif %}

}
{%- endfor %}

} // end of enclosing scope (presumably namespace duckdb — confirm)