├── .editorconfig ├── docs └── duck-sketches-2.jpg ├── .gitignore ├── vcpkg-overlay └── apache-datasketches │ ├── usage │ ├── vcpkg.json │ └── portfile.cmake ├── vcpkg.json ├── .gitmodules ├── Makefile ├── src ├── include │ ├── datasketches_extension.hpp │ └── query_farm_telemetry.hpp ├── generated.h ├── datasketches_extension.cpp ├── query_farm_telemetry.cpp └── theta_sketch.cpp ├── extension_config.cmake ├── test ├── README.md └── sql │ ├── datasketch_tdigest.test │ ├── datasketch_cpc.test │ ├── datasketch_kll.test │ ├── datasketch_req.test │ ├── datasketch_quantiles.test │ ├── datasketch_hll.test │ ├── datasketch_theta.test │ └── datasketch_frequent.test ├── scripts ├── setup-custom-toolchain.sh ├── extension-upload.sh └── bootstrap-template.py ├── .github └── workflows │ ├── scheduled-1.4.yml │ └── MainDistributionPipeline.yml ├── .vscode ├── c_cpp_properties.json └── settings.json ├── LICENSE ├── CMakeLists.txt └── codegen ├── generator.py └── generated.cpp.j2 /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /docs/duck-sketches-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Query-farm/datasketches/HEAD/docs/duck-sketches-2.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/usage: -------------------------------------------------------------------------------- 1 | apache-datasketches provides CMake targets: 2 | 3 | find_package(DataSketches CONFIG 
REQUIRED) 4 | target_link_libraries(main PRIVATE datasketches) 5 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "apache-datasketches" 4 | ], 5 | "vcpkg-configuration": { 6 | "overlay-ports": [ 7 | "./vcpkg-overlay/apache-datasketches" 8 | ] 9 | } 10 | } -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=datasketches 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile -------------------------------------------------------------------------------- /src/include/datasketches_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb 6 | { 7 | 8 | class DatasketchesExtension : public Extension 9 | { 10 | public: 11 | void Load(ExtensionLoader &loader) override; 12 | std::string Name() override; 13 | }; 14 | 15 | } // namespace duckdb 16 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is 
included by DuckDB's build system. It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(datasketches 5 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 6 | LOAD_TESTS 7 | ) 8 | 9 | # Any extra extensions that should be built 10 | # e.g.: duckdb_extension_load(json) -------------------------------------------------------------------------------- /src/generated.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "datasketches_extension.hpp" 4 | 5 | using namespace duckdb; 6 | namespace duckdb_datasketches 7 | { 8 | void LoadQuantilesSketch(ExtensionLoader &loader); 9 | void LoadKLLSketch(ExtensionLoader &loader); 10 | void LoadREQSketch(ExtensionLoader &loader); 11 | void LoadTDigestSketch(ExtensionLoader &loader); 12 | void LoadHLLSketch(ExtensionLoader &loader); 13 | void LoadCPCSketch(ExtensionLoader &loader); 14 | } -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "apache-datasketches", 3 | "version": "5.1.0", 4 | "port-version": 1, 5 | "description": "Apache DataSketches Core C++ Library Component.", 6 | "homepage": "https://datasketches.apache.org/", 7 | "license": "Apache-2.0", 8 | "dependencies": [ 9 | { 10 | "name": "vcpkg-cmake", 11 | "host": true 12 | }, 13 | { 14 | "name": "vcpkg-cmake-config", 15 | "host": true 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). 
DuckDB aims to have most its tests in this format as SQL statements, so for the datasketches extension, this should probably be the goal too. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | or 9 | ```bash 10 | make test_debug 11 | ``` -------------------------------------------------------------------------------- /src/include/query_farm_telemetry.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "duckdb.hpp" 4 | 5 | #if defined(_WIN32) || defined(_WIN64) 6 | // Windows: functions are hidden by default unless exported 7 | #define INTERNAL_FUNC 8 | #elif defined(__GNUC__) || defined(__clang__) 9 | // Linux / macOS: hide symbol using visibility attribute 10 | #define INTERNAL_FUNC __attribute__((visibility("hidden"))) 11 | #else 12 | #define INTERNAL_FUNC 13 | #endif 14 | 15 | namespace duckdb { 16 | void QueryFarmSendTelemetry(ExtensionLoader &loader, const string &extension_name, const string &extension_version); 17 | } -------------------------------------------------------------------------------- /scripts/setup-custom-toolchain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script 4 | # if no additional toolchains are required 5 | 6 | # To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow 7 | # `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools` 8 | 9 | # note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms 10 | echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the datasketches extension." 
11 | 12 | -------------------------------------------------------------------------------- /.github/workflows/scheduled-1.4.yml: -------------------------------------------------------------------------------- 1 | name: Scheduled Trigger for 1.4 2 | 3 | on: 4 | schedule: 5 | - cron: '0 12 * * *' # Runs at 12:00 UTC every day 6 | workflow_dispatch: # Allows manual trigger 7 | 8 | jobs: 9 | trigger: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | actions: write # Allow triggering workflows 13 | steps: 14 | - name: Checkout repository # Required for gh to work 15 | uses: actions/checkout@v4 16 | 17 | - name: Install GitHub CLI 18 | run: | 19 | sudo apt update && sudo apt install gh -y 20 | 21 | - name: Authenticate GH CLI 22 | run: | 23 | echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token 24 | 25 | - name: Trigger Workflow on my-branch 26 | run: | 27 | gh workflow run MainDistributionPipeline.yml --ref v1.4 28 | -------------------------------------------------------------------------------- /vcpkg-overlay/apache-datasketches/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO apache/datasketches-cpp 4 | REF dddc4a668cdc47ad8a221cf7d4cb5054e53a40ee 5 | SHA512 2fff76d978acecccadcf712389bc1b724569ab7f70c256473f59680f04473a7ffd6b1ed94125f6665bfad31a22874a18ea73d455eda49ff3e9f8a994548c52d5 6 | HEAD_REF master 7 | ) 8 | 9 | set(VCPKG_BUILD_TYPE release) # header-only port 10 | 11 | vcpkg_cmake_configure( 12 | SOURCE_PATH "${SOURCE_PATH}" 13 | OPTIONS 14 | -DBUILD_TESTS=OFF 15 | ) 16 | 17 | vcpkg_cmake_install() 18 | vcpkg_cmake_config_fixup(PACKAGE_NAME DataSketches CONFIG_PATH lib/DataSketches/cmake) 19 | 20 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") 21 | 22 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") 23 | file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 24 | 
-------------------------------------------------------------------------------- /.vscode/c_cpp_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Mac", 5 | "includePath": [ 6 | "${workspaceFolder}/**", 7 | "${workspaceFolder}/src/include/", 8 | "${workspaceFolder}/duckdb/src/include/", 9 | "${workspaceFolder}/duckdb/third_party/**/include/", 10 | "${workspaceFolder}/vcpkg_installed/arm64-osx/include" 11 | ], 12 | "defines": [], 13 | "macFrameworkPath": [ 14 | "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" 15 | ], 16 | "compilerPath": "/usr/bin/clang", 17 | "cStandard": "c17", 18 | "cppStandard": "c++17", 19 | "intelliSenseMode": "macos-clang-arm64", 20 | "configurationProvider": "ms-vscode.makefile-tools" 21 | } 22 | ], 23 | "version": 4 24 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Rusty Conover (rusty@conover.me) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | name: Main Extension Distribution Pipeline 2 | on: 3 | push: null 4 | pull_request: null 5 | workflow_dispatch: null 6 | schedule: 7 | - cron: 0 2 * * * 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ 10 | github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 11 | cancel-in-progress: true 12 | jobs: 13 | duckdb-stable-build: 14 | name: Build extension binaries 15 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 16 | with: 17 | duckdb_version: v1.4-andium 18 | ci_tools_version: main 19 | extension_name: datasketches 20 | exclude_archs: windows_amd64_rtools 21 | vcpkg_binary_sources: ${{ vars.VCPKG_BINARY_SOURCES }} 22 | secrets: 23 | VCPKG_CACHING_AWS_ACCESS_KEY_ID: ${{ secrets.VCPKG_CACHING_AWS_ACCESS_KEY_ID 24 | }} 25 | VCPKG_CACHING_AWS_SECRET_ACCESS_KEY: ${{ secrets.VCPKG_CACHING_AWS_SECRET_ACCESS_KEY 26 | }} 27 | VCPKG_CACHING_AWS_ENDPOINT_URL: ${{ secrets.VCPKG_CACHING_AWS_ENDPOINT_URL }} 28 | VCPKG_CACHING_AWS_DEFAULT_REGION: ${{ secrets.VCPKG_CACHING_AWS_DEFAULT_REGION}} 29 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | # Set extension name here 4 | set(TARGET_NAME datasketches) 5 | 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | 9 | if (WIN32 AND MSVC) 10 | add_compile_options(/bigobj) 11 | endif() 12 | 13 | # 
DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then 14 | # used in cmake with find_package. Feel free to remove or replace with other dependencies. 15 | # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 16 | find_package(DataSketches CONFIG REQUIRED) 17 | 18 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 19 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 20 | 21 | project(${TARGET_NAME}) 22 | include_directories(src/include) 23 | 24 | 25 | set(EXTENSION_SOURCES src/datasketches_extension.cpp src/generated.cpp 26 | src/query_farm_telemetry.cpp 27 | src/theta_sketch.cpp 28 | src/frequent_items_sketch.cpp) 29 | 30 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 31 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 32 | 33 | # Link OpenSSL in both the static library as the loadable extension 34 | target_link_libraries(${EXTENSION_NAME} datasketches) 35 | target_link_libraries(${LOADABLE_EXTENSION_NAME} datasketches) 36 | 37 | install( 38 | TARGETS ${EXTENSION_NAME} 39 | EXPORT "${DUCKDB_EXPORT_SET}" 40 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 41 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 42 | -------------------------------------------------------------------------------- /src/datasketches_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "datasketches_extension.hpp" 4 | #include "duckdb.hpp" 5 | #include "duckdb/common/exception.hpp" 6 | #include "duckdb/common/string_util.hpp" 7 | #include "duckdb/common/extra_type_info.hpp" 8 | #include "duckdb/function/scalar_function.hpp" 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include "generated.h" 15 | 16 | #include "query_farm_telemetry.hpp" 17 | 18 | namespace duckdb 19 | { 20 | 21 | void LoadQuantilesSketch(ExtensionLoader &loader); 22 | void 
LoadKLLSketch(ExtensionLoader &loader); 23 | void LoadREQSketch(ExtensionLoader &loader); 24 | void LoadTDigestSketch(ExtensionLoader &loader); 25 | void LoadHLLSketch(ExtensionLoader &loader); 26 | void LoadCPCSketch(ExtensionLoader &loader); 27 | void LoadThetaSketch(ExtensionLoader &loader); 28 | void LoadFrequentItemsSketch(ExtensionLoader &loader); 29 | 30 | static void LoadInternal(ExtensionLoader &loader) 31 | { 32 | LoadQuantilesSketch(loader); 33 | LoadKLLSketch(loader); 34 | LoadREQSketch(loader); 35 | LoadTDigestSketch(loader); 36 | LoadHLLSketch(loader); 37 | LoadCPCSketch(loader); 38 | LoadThetaSketch(loader); 39 | LoadFrequentItemsSketch(loader); 40 | QueryFarmSendTelemetry(loader, "datasketches", "2025121201"); 41 | } 42 | 43 | void DatasketchesExtension::Load(ExtensionLoader &loader) 44 | { 45 | LoadInternal(loader); 46 | } 47 | std::string DatasketchesExtension::Name() 48 | { 49 | return "datasketches"; 50 | } 51 | 52 | } // namespace duckdb 53 | 54 | extern "C" 55 | { 56 | DUCKDB_CPP_EXTENSION_ENTRY(datasketches, loader) 57 | { 58 | duckdb::LoadInternal(loader); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink 
-f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.." 
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /scripts/bootstrap-template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys, os, shutil, re 4 | from pathlib import Path 5 | 6 | 7 | def is_snake_case(s): 8 | # Define the regex pattern for snake case with numbers 9 | pattern = r"^[a-z0-9]+(_[a-z0-9]+)*$" 10 | 11 | # Use re.match to check if the string matches the pattern 12 | if re.match(pattern, s): 13 | return True 14 | else: 15 | return False 16 | 17 | 18 | def to_camel_case(snake_str): 19 | return "".join(x.capitalize() for x in snake_str.lower().split("_")) 20 | 21 | 22 | def replace(file_name, to_find, to_replace): 23 | with open(file_name, "r", encoding="utf8") as file: 24 | filedata = file.read() 25 | filedata = filedata.replace(to_find, to_replace) 26 | with open(file_name, "w", encoding="utf8") as file: 27 | file.write(filedata) 28 | 29 | 30 | def replace_everywhere(to_find, to_replace): 31 | for path in files_to_search: 32 | replace(path, to_find, to_replace) 33 | replace(path, to_find.capitalize(), to_camel_case(to_replace)) 34 | replace(path, to_find.upper(), to_replace.upper()) 35 | 36 | 
replace("./CMakeLists.txt", to_find, to_replace) 37 | replace("./Makefile", to_find, to_replace) 38 | replace("./Makefile", to_find.capitalize(), to_camel_case(to_replace)) 39 | replace("./Makefile", to_find.upper(), to_replace.upper()) 40 | replace("./README.md", to_find, to_replace) 41 | replace("./extension_config.cmake", to_find, to_replace) 42 | replace("./scripts/setup-custom-toolchain.sh", to_find, to_replace) 43 | 44 | 45 | if __name__ == "__main__": 46 | if len(sys.argv) != 2: 47 | raise Exception( 48 | "usage: python3 bootstrap-template.py " 49 | ) 50 | 51 | name_extension = sys.argv[1] 52 | 53 | if name_extension[0].isdigit(): 54 | raise Exception("Please dont start your extension name with a number.") 55 | 56 | if not is_snake_case(name_extension): 57 | raise Exception( 58 | "Please enter the name of your extension in valid snake_case containing only lower case letters and numbers" 59 | ) 60 | 61 | shutil.copyfile("docs/NEXT_README.md", "README.md") 62 | os.remove("docs/NEXT_README.md") 63 | os.remove("docs/README.md") 64 | 65 | files_to_search = [] 66 | files_to_search.extend(Path("./.github").rglob("./**/*.yml")) 67 | files_to_search.extend(Path("./test").rglob("./**/*.test")) 68 | files_to_search.extend(Path("./src").rglob("./**/*.hpp")) 69 | files_to_search.extend(Path("./src").rglob("./**/*.cpp")) 70 | files_to_search.extend(Path("./src").rglob("./**/*.txt")) 71 | files_to_search.extend(Path("./src").rglob("./*.md")) 72 | 73 | replace_everywhere("datasketches", name_extension) 74 | replace_everywhere("datasketches", name_extension.capitalize()) 75 | replace_everywhere("", name_extension) 76 | 77 | string_to_replace = name_extension 78 | string_to_find = "datasketches" 79 | 80 | # rename files 81 | os.rename(f"test/sql/{string_to_find}.test", f"test/sql/{string_to_replace}.test") 82 | os.rename( 83 | f"src/{string_to_find}_extension.cpp", f"src/{string_to_replace}_extension.cpp" 84 | ) 85 | os.rename( 86 | 
f"src/include/{string_to_find}_extension.hpp", 87 | f"src/include/{string_to_replace}_extension.hpp", 88 | ) 89 | 90 | # remove template-specific files 91 | os.remove(".github/workflows/ExtensionTemplate.yml") 92 | 93 | # finally, remove this bootstrap file 94 | os.remove(__file__) 95 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | "vcpkg/downloads": true, 4 | "vcpkg/packages": true, 5 | "vcpkg/ports": true, 6 | "vcpkg/scripts": true, 7 | "vcpkg/toolsrc": true, 8 | "vcpkg/versions": true, 9 | "vcpkg/triplets": true, 10 | "vcpkg_installed/**": true, 11 | "duckdb_unittest_tempdir": true, 12 | "build/**": true, 13 | ".tmp/**": true, 14 | "**/.git": true, 15 | "**/.svn": true, 16 | "**/.hg": true, 17 | "**/.trunk": true, 18 | "**/CVS": true, 19 | "**/.DS_Store": true, 20 | "**/Thumbs.db": true 21 | }, 22 | "cmake.environment": { 23 | "GEN": "ninja", 24 | "VCPKG_TOOLCHAIN_PATH": "${workspaceFolder}/vcpkg/scripts/buildsystems/vcpkg.cmake", 25 | }, 26 | "workbench.colorCustomizations": { 27 | "activityBar.activeBackground": "#b5933c", 28 | "activityBar.background": "#b5933c", 29 | "activityBar.foreground": "#15202b", 30 | "activityBar.inactiveForeground": "#15202b99", 31 | "activityBarBadge.background": "#b7e7da", 32 | "activityBarBadge.foreground": "#15202b", 33 | "commandCenter.border": "#e7e7e799", 34 | "sash.hoverBorder": "#b5933c", 35 | "statusBar.background": "#8f742f", 36 | "statusBar.foreground": "#e7e7e7", 37 | "statusBarItem.hoverBackground": "#b5933c", 38 | "statusBarItem.remoteBackground": "#8f742f", 39 | "statusBarItem.remoteForeground": "#e7e7e7", 40 | "titleBar.activeBackground": "#8f742f", 41 | "titleBar.activeForeground": "#e7e7e7", 42 | "titleBar.inactiveBackground": "#8f742f99", 43 | "titleBar.inactiveForeground": "#e7e7e799" 44 | }, 45 | "peacock.color": "#8f742f", 46 | 
"files.associations": { 47 | "optional": "cpp", 48 | "*.inc": "cpp", 49 | "__hash_table": "cpp", 50 | "__split_buffer": "cpp", 51 | "__tree": "cpp", 52 | "array": "cpp", 53 | "bitset": "cpp", 54 | "deque": "cpp", 55 | "hash_map": "cpp", 56 | "initializer_list": "cpp", 57 | "list": "cpp", 58 | "map": "cpp", 59 | "queue": "cpp", 60 | "regex": "cpp", 61 | "set": "cpp", 62 | "span": "cpp", 63 | "stack": "cpp", 64 | "string": "cpp", 65 | "string_view": "cpp", 66 | "unordered_map": "cpp", 67 | "unordered_set": "cpp", 68 | "valarray": "cpp", 69 | "vector": "cpp", 70 | "*.ipp": "cpp", 71 | "stdexcept": "cpp", 72 | "iomanip": "cpp", 73 | "__bit_reference": "cpp", 74 | "__locale": "cpp", 75 | "__node_handle": "cpp", 76 | "__verbose_abort": "cpp", 77 | "cctype": "cpp", 78 | "charconv": "cpp", 79 | "cmath": "cpp", 80 | "condition_variable": "cpp", 81 | "cstddef": "cpp", 82 | "cstdint": "cpp", 83 | "cstdio": "cpp", 84 | "cstdlib": "cpp", 85 | "cstring": "cpp", 86 | "ctime": "cpp", 87 | "cwchar": "cpp", 88 | "forward_list": "cpp", 89 | "fstream": "cpp", 90 | "future": "cpp", 91 | "ios": "cpp", 92 | "iosfwd": "cpp", 93 | "iostream": "cpp", 94 | "istream": "cpp", 95 | "limits": "cpp", 96 | "locale": "cpp", 97 | "mutex": "cpp", 98 | "new": "cpp", 99 | "ostream": "cpp", 100 | "print": "cpp", 101 | "ratio": "cpp", 102 | "sstream": "cpp", 103 | "streambuf": "cpp", 104 | "tuple": "cpp", 105 | "typeinfo": "cpp", 106 | "variant": "cpp", 107 | "algorithm": "cpp", 108 | "__threading_support": "cpp", 109 | "any": "cpp", 110 | "cfenv": "cpp", 111 | "cinttypes": "cpp", 112 | "clocale": "cpp", 113 | "codecvt": "cpp", 114 | "complex": "cpp", 115 | "csignal": "cpp", 116 | "cstdarg": "cpp", 117 | "cwctype": "cpp", 118 | "execution": "cpp", 119 | "memory": "cpp", 120 | "shared_mutex": "cpp", 121 | "source_location": "cpp", 122 | "strstream": "cpp", 123 | "typeindex": "cpp" 124 | } 125 | } -------------------------------------------------------------------------------- 
/src/query_farm_telemetry.cpp: -------------------------------------------------------------------------------- 1 | #include "query_farm_telemetry.hpp" 2 | #include 3 | #include "duckdb.hpp" 4 | #include "duckdb/common/http_util.hpp" 5 | #include "yyjson.hpp" 6 | #include "duckdb/main/extension_helper.hpp" 7 | #include "duckdb/main/config.hpp" 8 | #include 9 | #include 10 | using namespace duckdb_yyjson; // NOLINT 11 | 12 | namespace duckdb 13 | { 14 | 15 | namespace 16 | { 17 | 18 | // Function to send the actual HTTP request 19 | void sendHTTPRequest(shared_ptr db, char *json_body, size_t json_body_size) 20 | { 21 | const string TARGET_URL("https://duckdb-in.query-farm.services/"); 22 | 23 | HTTPHeaders headers; 24 | headers.Insert("Content-Type", "application/json"); 25 | 26 | auto &http_util = HTTPUtil::Get(*db); 27 | unique_ptr params = http_util.InitializeParameters(*db, TARGET_URL); 28 | 29 | PostRequestInfo post_request(TARGET_URL, headers, *params, reinterpret_cast(json_body), 30 | json_body_size); 31 | try 32 | { 33 | auto response = http_util.Request(post_request); 34 | } 35 | catch (const std::exception &e) 36 | { 37 | // ignore all errors. 38 | } 39 | 40 | free(json_body); 41 | return; 42 | } 43 | 44 | } // namespace 45 | 46 | INTERNAL_FUNC void QueryFarmSendTelemetry(ExtensionLoader &loader, const string &extension_name, 47 | const string &extension_version) 48 | { 49 | const char *opt_out = std::getenv("QUERY_FARM_TELEMETRY_OPT_OUT"); 50 | if (opt_out != nullptr) 51 | { 52 | return; 53 | } 54 | 55 | auto &dbconfig = DBConfig::GetConfig(loader.GetDatabaseInstance()); 56 | auto old_value = dbconfig.options.autoinstall_known_extensions; 57 | dbconfig.options.autoinstall_known_extensions = false; 58 | try 59 | { 60 | ExtensionHelper::AutoLoadExtension(loader.GetDatabaseInstance(), "httpfs"); 61 | } 62 | catch (...) 
63 | { 64 | dbconfig.options.autoinstall_known_extensions = old_value; 65 | return; 66 | } 67 | 68 | dbconfig.options.autoinstall_known_extensions = old_value; 69 | if (!loader.GetDatabaseInstance().ExtensionIsLoaded("httpfs")) 70 | { 71 | return; 72 | } 73 | 74 | // Initialize the telemetry sender 75 | auto doc = yyjson_mut_doc_new(nullptr); 76 | 77 | auto result_obj = yyjson_mut_obj(doc); 78 | yyjson_mut_doc_set_root(doc, result_obj); 79 | 80 | auto platform = DuckDB::Platform(); 81 | 82 | yyjson_mut_obj_add_str(doc, result_obj, "extension_name", extension_name.c_str()); 83 | yyjson_mut_obj_add_str(doc, result_obj, "extension_version", extension_version.c_str()); 84 | yyjson_mut_obj_add_str(doc, result_obj, "user_agent", "query-farm/20251011"); 85 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_platform", platform.c_str()); 86 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_library_version", DuckDB::LibraryVersion()); 87 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_release_codename", DuckDB::ReleaseCodename()); 88 | yyjson_mut_obj_add_str(doc, result_obj, "duckdb_source_id", DuckDB::SourceID()); 89 | 90 | size_t telemetry_len; 91 | auto telemetry_data = 92 | yyjson_mut_val_write_opts(result_obj, YYJSON_WRITE_ALLOW_INF_AND_NAN, NULL, &telemetry_len, nullptr); 93 | 94 | if (telemetry_data == nullptr) 95 | { 96 | throw SerializationException("Failed to serialize telemetry data."); 97 | } 98 | 99 | yyjson_mut_doc_free(doc); 100 | 101 | #ifndef __EMSCRIPTEN__ 102 | [[maybe_unused]] auto _ = std::async( 103 | std::launch::async, [db_ptr = loader.GetDatabaseInstance().shared_from_this(), json = telemetry_data, 104 | len = telemetry_len]() mutable 105 | { sendHTTPRequest(std::move(db_ptr), json, len); }); 106 | #else 107 | sendHTTPRequest(loader.GetDatabaseInstance().shared_from_this(), telemetry_data, telemetry_len); 108 | #endif 109 | } 110 | 111 | } // namespace duckdb -------------------------------------------------------------------------------- 
/test/sql/datasketch_tdigest.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_tdigest.test 2 | # description: test datasketch TDigest sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_tdigest_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_tdigest_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_tdigest(10, 5); 16 | ---- 17 | \x01\x01\x14\x0A\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x14@ 18 | 19 | query I 20 | SELECT datasketch_tdigest_is_empty('\x01\x01\x14\x0A\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x14@'::sketch_tdigest_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | # Rank of 500 should be close to 0.5 (value is in middle of 1-1000 range) 33 | query I 34 | SELECT datasketch_tdigest_rank(datasketch_tdigest(10, temp), 500.0) between 0.4 and 0.6 from readings 35 | ---- 36 | true 37 | 38 | # Median (0.5 quantile) should be close to 500 39 | query I 40 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(10, temp), 0.5) between 350 and 650 from readings 41 | ---- 42 | true 43 | 44 | # CDF and PMF outputs vary, just verify they execute successfully 45 | statement ok 46 | SELECT datasketch_tdigest_cdf(datasketch_tdigest(10, temp), [100, 200, 500]) from readings 47 | 48 | statement ok 49 | SELECT datasketch_tdigest_pmf(datasketch_tdigest(10, temp), [100, 200, 500]) from readings 50 | 51 | query I 52 | SELECT datasketch_tdigest_k(datasketch_tdigest(10, temp)) from readings 53 | ---- 54 | 10 55 | 56 | statement ok 57 | CREATE TABLE sketches (sketch 
sketch_tdigest_double) 58 | 59 | statement ok 60 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 0 61 | 62 | statement ok 63 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 1 64 | 65 | statement ok 66 | INSERT INTO sketches (sketch) select datasketch_tdigest(12, temp) from readings where mod(temp::int, 3) == 2 67 | 68 | query I 69 | select datasketch_tdigest_is_empty(datasketch_tdigest(12, sketch)) from sketches 70 | ---- 71 | False 72 | 73 | # Merged sketch median should be close to 500 74 | query I 75 | select datasketch_tdigest_quantile(datasketch_tdigest(12, sketch), 0.5)::int between 300 and 700 from sketches 76 | ---- 77 | true 78 | 79 | # Test error handling for invalid/corrupted sketch data 80 | statement error 81 | SELECT datasketch_tdigest_is_empty('\x00\x01\x02'::sketch_tdigest_float); 82 | ---- 83 | Invalid Input Error: Failed to deserialize TDigest sketch 84 | 85 | statement error 86 | SELECT datasketch_tdigest_k('\xDE\xAD\xBE\xEF'::sketch_tdigest_double); 87 | ---- 88 | Invalid Input Error: Failed to deserialize TDigest sketch 89 | 90 | # Test with empty blob 91 | statement error 92 | SELECT datasketch_tdigest_is_empty(''::sketch_tdigest_double); 93 | ---- 94 | Invalid Input Error: Failed to deserialize TDigest sketch 95 | 96 | # ============================================================================= 97 | # COMPREHENSIVE UNION/MERGE TESTS 98 | # ============================================================================= 99 | 100 | # Test merging multiple sketches from partitioned data 101 | statement ok 102 | CREATE TABLE merge_data(value double, partition_id int) 103 | 104 | statement ok 105 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 106 | 107 | statement ok 108 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 109 | 110 | statement ok 111 | INSERT INTO merge_data 
SELECT unnest(generate_series(601, 900))::double, 3 112 | 113 | # Create sketches per partition 114 | statement ok 115 | CREATE TABLE partition_sketches AS 116 | SELECT partition_id, datasketch_tdigest(100, value) as sketch 117 | FROM merge_data 118 | GROUP BY partition_id 119 | 120 | # Verify we have 3 partition sketches 121 | query I 122 | SELECT count(*) FROM partition_sketches 123 | ---- 124 | 3 125 | 126 | # Merge all partition sketches and verify total weight 127 | query I 128 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM partition_sketches 129 | ---- 130 | 900 131 | 132 | # Verify merged sketch is not empty 133 | query I 134 | SELECT datasketch_tdigest_is_empty(datasketch_tdigest(100, sketch)) FROM partition_sketches 135 | ---- 136 | False 137 | 138 | # Verify merged sketch median is approximately in the middle 139 | query I 140 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(100, sketch), 0.5) between 400 and 500 FROM partition_sketches 141 | ---- 142 | true 143 | 144 | # Test merging sketches with overlapping data ranges 145 | statement ok 146 | CREATE TABLE overlap_data(value double, group_id int) 147 | 148 | statement ok 149 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 150 | 151 | statement ok 152 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 153 | 154 | statement ok 155 | CREATE TABLE overlap_sketches AS 156 | SELECT group_id, datasketch_tdigest(100, value) as sketch 157 | FROM overlap_data 158 | GROUP BY group_id 159 | 160 | # Merged sketch should have correct total weight (500 + 501 = 1001) 161 | query I 162 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM overlap_sketches 163 | ---- 164 | 1001 165 | 166 | # Verify merged sketch is not empty 167 | query I 168 | SELECT datasketch_tdigest_is_empty(datasketch_tdigest(100, sketch)) FROM overlap_sketches 169 | ---- 170 | False 171 | 172 | # Test merge with different K 
values - merged sketch should use the K from bind 173 | statement ok 174 | CREATE TABLE k_test_sketches AS 175 | SELECT datasketch_tdigest(50, value) as sketch FROM merge_data WHERE partition_id = 1 176 | UNION ALL 177 | SELECT datasketch_tdigest(200, value) as sketch FROM merge_data WHERE partition_id = 2 178 | 179 | query I 180 | SELECT datasketch_tdigest_k(datasketch_tdigest(100, sketch)) FROM k_test_sketches 181 | ---- 182 | 100 183 | 184 | # Test merging a single sketch (edge case) 185 | query I 186 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(50, sketch)) 187 | FROM (SELECT datasketch_tdigest(50, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 188 | ---- 189 | 300 190 | 191 | # Test CDF on merged sketch 192 | statement ok 193 | SELECT datasketch_tdigest_cdf(datasketch_tdigest(100, sketch), [300, 600]) FROM partition_sketches 194 | 195 | # Test PMF on merged sketch 196 | statement ok 197 | SELECT datasketch_tdigest_pmf(datasketch_tdigest(100, sketch), [300, 600]) FROM partition_sketches 198 | 199 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 200 | query I 201 | SELECT datasketch_tdigest_rank(datasketch_tdigest(100, sketch), 450.0) between 0.45 and 0.55 FROM partition_sketches 202 | ---- 203 | true 204 | 205 | # Test merging sketches created with GROUP BY 206 | statement ok 207 | CREATE TABLE grouped_data(category varchar, value double) 208 | 209 | statement ok 210 | INSERT INTO grouped_data 211 | SELECT 'A', unnest(generate_series(1, 100))::double 212 | UNION ALL 213 | SELECT 'B', unnest(generate_series(101, 200))::double 214 | UNION ALL 215 | SELECT 'C', unnest(generate_series(201, 300))::double 216 | 217 | statement ok 218 | CREATE TABLE category_sketches AS 219 | SELECT category, datasketch_tdigest(50, value) as sketch 220 | FROM grouped_data 221 | GROUP BY category 222 | 223 | # Merge all category sketches 224 | query I 225 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(50, 
sketch)) FROM category_sketches 226 | ---- 227 | 300 228 | 229 | # Verify merged quantiles span all categories 230 | query I 231 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(50, sketch), 0.0) between 1 and 5 FROM category_sketches 232 | ---- 233 | true 234 | 235 | query I 236 | SELECT datasketch_tdigest_quantile(datasketch_tdigest(50, sketch), 1.0) between 295 and 300 FROM category_sketches 237 | ---- 238 | true 239 | 240 | # Test total_weight matches n for merged sketch 241 | query I 242 | SELECT datasketch_tdigest_total_weight(datasketch_tdigest(100, sketch)) FROM partition_sketches 243 | ---- 244 | 900.0 245 | 246 | -------------------------------------------------------------------------------- /test/sql/datasketch_cpc.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_cpc.test 2 | # description: test datasketch CPC sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_cpc_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_cpc_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_cpc(8, 5); 16 | ---- 17 | \x08\x01\x10\x08\x00\x0E\xCC\x93\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\xF8o@\x00\x00\x00\x00\x00\x00\xF0?\xDD\x03\x00\x00 18 | 19 | query I 20 | SELECT datasketch_cpc_is_empty('\x08\x01\x10\x08\x00\x0E\xCC\x93\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\xF8o@\x00\x00\x00\x00\x00\x00\xF0?\xDD\x03\x00\x00'); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE items(id integer) 28 | 29 | statement ok 30 | INSERT INTO items(id) select unnest(generate_series(1, 100000)) order by random() 31 | 32 | # Duplicate items shouldn't affect the count. 
33 | 34 | statement ok 35 | INSERT INTO items(id) select unnest(generate_series(1, 100000)) order by random() 36 | 37 | query I 38 | SELECT datasketch_cpc_is_empty(datasketch_cpc(12, id)) from items 39 | ---- 40 | False 41 | 42 | 43 | query I 44 | SELECT datasketch_cpc_describe(datasketch_cpc(4, id)) like '%CPC sketch summary%' from items 45 | ---- 46 | True 47 | 48 | # Test with strings 49 | 50 | statement ok 51 | CREATE TABLE employees(name string) 52 | 53 | statement ok 54 | INSERT INTO employees(name) VALUES 55 | ('John Doe'), ('Jane Smith'), ('Michael Johnson'), ('Emily Davis'), ('Chris Brown'), ('Sarah Wilson'), ('David Martinez'),('Sophia Anderson'), ('Daniel Lee'),('Olivia Taylor'); 56 | 57 | # 10 distinct names, estimate should be close 58 | query I 59 | SELECT datasketch_cpc_estimate(datasketch_cpc(4, name))::int between 8 and 15 from employees 60 | ---- 61 | true 62 | 63 | statement ok 64 | CREATE TABLE sketches (sketch sketch_cpc) 65 | 66 | statement ok 67 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 0 68 | 69 | statement ok 70 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 1 71 | 72 | statement ok 73 | INSERT INTO sketches (sketch) select datasketch_cpc(12, id) from items where mod(id, 3) == 2 74 | 75 | query I 76 | select datasketch_cpc_is_empty(datasketch_cpc_union(12, sketch)) from sketches 77 | ---- 78 | False 79 | 80 | # Test error handling for invalid/corrupted sketch data 81 | statement error 82 | SELECT datasketch_cpc_estimate('\x00\x01\x02\x03'::blob); 83 | ---- 84 | Invalid Input Error: Failed to deserialize CPC sketch 85 | 86 | statement error 87 | SELECT datasketch_cpc_is_empty('\xDE\xAD\xBE\xEF'::blob); 88 | ---- 89 | Invalid Input Error: Failed to deserialize CPC sketch 90 | 91 | # Test with empty blob 92 | statement error 93 | SELECT datasketch_cpc_estimate(''::blob); 94 | ---- 95 | Invalid Input Error: Failed to deserialize CPC sketch 96 | 97 | 
# ============================================================================= 98 | # COMPREHENSIVE UNION TESTS 99 | # ============================================================================= 100 | 101 | # Test union of multiple sketches from partitioned data with non-overlapping values 102 | statement ok 103 | CREATE TABLE union_data(value int, partition_id int) 104 | 105 | statement ok 106 | INSERT INTO union_data SELECT unnest(generate_series(1, 10000)), 1 107 | 108 | statement ok 109 | INSERT INTO union_data SELECT unnest(generate_series(10001, 20000)), 2 110 | 111 | statement ok 112 | INSERT INTO union_data SELECT unnest(generate_series(20001, 30000)), 3 113 | 114 | # Create sketches per partition 115 | statement ok 116 | CREATE TABLE partition_sketches AS 117 | SELECT partition_id, datasketch_cpc(12, value) as sketch 118 | FROM union_data 119 | GROUP BY partition_id 120 | 121 | # Verify we have 3 partition sketches 122 | query I 123 | SELECT count(*) FROM partition_sketches 124 | ---- 125 | 3 126 | 127 | # Each partition has 10000 distinct values 128 | query I 129 | SELECT datasketch_cpc_estimate(sketch)::int between 9500 and 10500 FROM partition_sketches ORDER BY partition_id LIMIT 1 130 | ---- 131 | true 132 | 133 | # Union all partition sketches - should have ~30000 distinct values 134 | query I 135 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 28500 and 31500 FROM partition_sketches 136 | ---- 137 | true 138 | 139 | # Verify union is not empty 140 | query I 141 | SELECT datasketch_cpc_is_empty(datasketch_cpc_union(12, sketch)) FROM partition_sketches 142 | ---- 143 | False 144 | 145 | # Verify the union estimate is positive (consistency with the emptiness check above) 146 | query I 147 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int > 0 FROM partition_sketches 148 | ---- 149 | true 150 | 151 | # Test union with overlapping data 152 | statement ok 153 | CREATE TABLE overlap_union_data(value int, 
group_id int) 154 | 155 | statement ok 156 | INSERT INTO overlap_union_data SELECT unnest(generate_series(1, 50000)), 1 157 | 158 | statement ok 159 | INSERT INTO overlap_union_data SELECT unnest(generate_series(25000, 75000)), 2 160 | 161 | statement ok 162 | CREATE TABLE overlap_union_sketches AS 163 | SELECT group_id, datasketch_cpc(14, value) as sketch 164 | FROM overlap_union_data 165 | GROUP BY group_id 166 | 167 | # Group 1 has 50000 distinct, Group 2 has 50001 distinct 168 | # Union should have 75000 distinct (1-75000) 169 | query I 170 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(14, sketch))::int between 72000 and 78000 FROM overlap_union_sketches 171 | ---- 172 | true 173 | 174 | # Test union with different K values 175 | statement ok 176 | CREATE TABLE k_union_sketches AS 177 | SELECT datasketch_cpc(8, value) as sketch FROM union_data WHERE partition_id = 1 178 | UNION ALL 179 | SELECT datasketch_cpc(14, value) as sketch FROM union_data WHERE partition_id = 2 180 | 181 | # Verify union works with different K values 182 | query I 183 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(10, sketch))::int between 18000 and 22000 FROM k_union_sketches 184 | ---- 185 | true 186 | 187 | # Test union of single sketch (edge case) 188 | query I 189 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 9500 and 10500 190 | FROM (SELECT datasketch_cpc(12, value) as sketch FROM union_data WHERE partition_id = 1) single_sketch 191 | ---- 192 | true 193 | 194 | # Test union preserves accuracy - lower_bound/upper_bound on union 195 | query I 196 | SELECT datasketch_cpc_lower_bound(datasketch_cpc_union(12, sketch), 1)::int between 27000 and 31000 FROM partition_sketches 197 | ---- 198 | true 199 | 200 | query I 201 | SELECT datasketch_cpc_upper_bound(datasketch_cpc_union(12, sketch), 1)::int between 29000 and 33000 FROM partition_sketches 202 | ---- 203 | true 204 | 205 | # Test union with string values 206 | statement ok 207 | 
CREATE TABLE string_union_data(name varchar, source_id int) 208 | 209 | statement ok 210 | INSERT INTO string_union_data 211 | SELECT 'user_' || x, 1 FROM generate_series(1, 1000) t(x) 212 | UNION ALL 213 | SELECT 'user_' || x, 2 FROM generate_series(500, 1500) t(x) 214 | 215 | statement ok 216 | CREATE TABLE string_union_sketches AS 217 | SELECT source_id, datasketch_cpc(10, name) as sketch 218 | FROM string_union_data 219 | GROUP BY source_id 220 | 221 | # Union should have ~1500 distinct strings (user_1 to user_1500) 222 | query I 223 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(10, sketch))::int between 1400 and 1600 FROM string_union_sketches 224 | ---- 225 | true 226 | 227 | # Test union with GROUP BY categories 228 | statement ok 229 | CREATE TABLE category_union_data(category varchar, user_id int) 230 | 231 | statement ok 232 | INSERT INTO category_union_data 233 | SELECT 'electronics', unnest(generate_series(1, 5000)) 234 | UNION ALL 235 | SELECT 'clothing', unnest(generate_series(2500, 7500)) 236 | UNION ALL 237 | SELECT 'food', unnest(generate_series(5000, 10000)) 238 | 239 | statement ok 240 | CREATE TABLE category_union_sketches AS 241 | SELECT category, datasketch_cpc(12, user_id) as sketch 242 | FROM category_union_data 243 | GROUP BY category 244 | 245 | # Each category has ~5000 distinct users 246 | # Union should have ~10000 distinct users (1-10000) 247 | query I 248 | SELECT datasketch_cpc_estimate(datasketch_cpc_union(12, sketch))::int between 9500 and 10500 FROM category_union_sketches 249 | ---- 250 | true 251 | 252 | # Test describe on union result 253 | query I 254 | SELECT datasketch_cpc_describe(datasketch_cpc_union(12, sketch)) like '%CPC sketch summary%' FROM partition_sketches 255 | ---- 256 | True 257 | 258 | -------------------------------------------------------------------------------- /test/sql/datasketch_kll.test: -------------------------------------------------------------------------------- 1 | # name: 
test/sql/datasketch_kll.test 2 | # description: test datasketch KLL sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_kll_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_kll_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_kll(16, 5.0::float); 16 | ---- 17 | \x02\x02\x0F\x04\x10\x00\x08\x00\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_kll_is_empty('\x02\x02\x0F\x04\x10\x00\x08\x00\x00\x00\xA0@'::sketch_kll_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_kll_rank(datasketch_kll(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_kll_quantile(datasketch_kll(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_kll_cdf(datasketch_kll(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_kll_pmf(datasketch_kll(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_kll_k(datasketch_kll(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_kll_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) 
select datasketch_kll(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_kll_is_empty(datasketch_kll(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_kll_quantile(datasketch_kll(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_kll_n(datasketch_kll(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_kll_is_estimation_mode(datasketch_kll(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and internal compaction 86 | statement ok 87 | select datasketch_kll_num_retained(datasketch_kll(16, sketch)) from sketches 88 | 89 | query I 90 | select datasketch_kll_min_item(datasketch_kll(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_kll_max_item(datasketch_kll(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_kll_is_empty('\x00\x01\x02'::sketch_kll_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize KLL sketch 104 | 105 | statement error 106 | SELECT datasketch_kll_k('\xDE\xAD\xBE\xEF'::sketch_kll_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize KLL sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_kll_is_empty(''::sketch_kll_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize KLL sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 
300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_kll(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT datasketch_kll_n(datasketch_kll(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_kll_min_item(datasketch_kll(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_kll_max_item(datasketch_kll(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_kll_quantile(datasketch_kll(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_kll(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001) 187 | query I 188 | SELECT 
datasketch_kll_n(datasketch_kll(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_kll_min_item(datasketch_kll(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_kll_max_item(datasketch_kll(64, sketch)) FROM overlap_sketches 200 | ---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_kll(16, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_kll(64, value) as sketch FROM merge_data WHERE partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_kll_n(datasketch_kll(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_kll_n(datasketch_kll(16, sketch)) 219 | FROM (SELECT datasketch_kll(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_kll_is_estimation_mode(datasketch_kll(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_kll_cdf(datasketch_kll(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_kll_pmf(datasketch_kll(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_kll_rank(datasketch_kll(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE 
TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_kll(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_kll_n(datasketch_kll(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch min and max span all categories 268 | query I 269 | SELECT datasketch_kll_min_item(datasketch_kll(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_kll_max_item(datasketch_kll(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_req.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_req.test 2 | # description: test datasketch REQ sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_req_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_req_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_req(16, 5.0::float); 16 | ---- 17 | \x02\x01\x118\x10\x00\x01\x01\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_req_is_empty('\x02\x01\x118\x10\x00\x01\x01\x00\x00\xA0@'::sketch_req_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 
25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_req_rank(datasketch_req(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_req_quantile(datasketch_req(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_req_cdf(datasketch_req(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_req_pmf(datasketch_req(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_req_k(datasketch_req(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_req_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) select datasketch_req(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_req_is_empty(datasketch_req(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_req_quantile(datasketch_req(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_req_n(datasketch_req(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_req_is_estimation_mode(datasketch_req(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and internal compaction 86 | statement ok 87 | select datasketch_req_num_retained(datasketch_req(16, sketch)) from sketches 88 | 89 | 
query I 90 | select datasketch_req_min_item(datasketch_req(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_req_max_item(datasketch_req(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_req_is_empty('\x00\x01\x02'::sketch_req_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize REQ sketch 104 | 105 | statement error 106 | SELECT datasketch_req_k('\xDE\xAD\xBE\xEF'::sketch_req_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize REQ sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_req_is_empty(''::sketch_req_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize REQ sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition (REQ requires k between 4 and 1024) 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_req(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT 
datasketch_req_n(datasketch_req(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_req_min_item(datasketch_req(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_req_max_item(datasketch_req(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_req_quantile(datasketch_req(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_req(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001) 187 | query I 188 | SELECT datasketch_req_n(datasketch_req(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_req_min_item(datasketch_req(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_req_max_item(datasketch_req(64, sketch)) FROM overlap_sketches 200 | ---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_req(8, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_req(64, value) as sketch FROM merge_data WHERE 
partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_req_n(datasketch_req(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_req_n(datasketch_req(16, sketch)) 219 | FROM (SELECT datasketch_req(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_req_is_estimation_mode(datasketch_req(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_req_cdf(datasketch_req(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_req_pmf(datasketch_req(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_req_rank(datasketch_req(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_req(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_req_n(datasketch_req(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch 
min and max span all categories 268 | query I 269 | SELECT datasketch_req_min_item(datasketch_req(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_req_max_item(datasketch_req(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_quantiles.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_quantiles.test 2 | # description: test datasketch Quantiles sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_quantiles_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_quantiles_is_empty does not exist! 10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_quantiles(16, 5.0::float); 16 | ---- 17 | \x02\x03\x08\x18\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xA0@\x00\x00\xA0@\x00\x00\xA0@ 18 | 19 | query I 20 | SELECT datasketch_quantiles_is_empty('\x02\x03\x08\x18\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xA0@\x00\x00\xA0@\x00\x00\xA0@'::sketch_quantiles_float); 21 | ---- 22 | false 23 | 24 | # Do some tests with integers. 
25 | 26 | statement ok 27 | CREATE TABLE readings(temp double) 28 | 29 | statement ok 30 | INSERT INTO readings(temp) select unnest(generate_series(1, 1000))::double; 31 | 32 | query I 33 | SELECT datasketch_quantiles_rank(datasketch_quantiles(16, temp), 500.0, true) between 0.40 and 0.60 from readings 34 | ---- 35 | True 36 | 37 | query I 38 | SELECT datasketch_quantiles_quantile(datasketch_quantiles(16, temp), 0.5, true) between 400 and 600 from readings 39 | ---- 40 | True 41 | 42 | # Can't save results on these because they are random 43 | 44 | statement ok 45 | SELECT datasketch_quantiles_cdf(datasketch_quantiles(16, temp), [100, 200, 500], true) from readings 46 | 47 | statement ok 48 | SELECT datasketch_quantiles_pmf(datasketch_quantiles(16, temp), [100, 200, 500], true) from readings 49 | 50 | query I 51 | SELECT datasketch_quantiles_k(datasketch_quantiles(16, temp)) from readings 52 | ---- 53 | 16 54 | 55 | statement ok 56 | CREATE TABLE sketches (sketch sketch_quantiles_double) 57 | 58 | statement ok 59 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 0 60 | 61 | statement ok 62 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 1 63 | 64 | statement ok 65 | INSERT INTO sketches (sketch) select datasketch_quantiles(16, temp) from readings where mod(temp::int, 3) == 2 66 | 67 | query I 68 | select datasketch_quantiles_is_empty(datasketch_quantiles(16, sketch)) from sketches 69 | ---- 70 | False 71 | 72 | statement ok 73 | select datasketch_quantiles_quantile(datasketch_quantiles(16, sketch), 0.5, true)::int from sketches 74 | 75 | query I 76 | select datasketch_quantiles_n(datasketch_quantiles(16, sketch)) from sketches 77 | ---- 78 | 1000 79 | 80 | query I 81 | select datasketch_quantiles_is_estimation_mode(datasketch_quantiles(16, sketch)) from sketches 82 | ---- 83 | 1 84 | 85 | # num_retained varies based on data distribution and 
internal compaction 86 | statement ok 87 | select datasketch_quantiles_num_retained(datasketch_quantiles(16, sketch)) from sketches 88 | 89 | query I 90 | select datasketch_quantiles_min_item(datasketch_quantiles(16, sketch)) from sketches 91 | ---- 92 | 1.0 93 | 94 | query I 95 | select datasketch_quantiles_max_item(datasketch_quantiles(16, sketch)) from sketches 96 | ---- 97 | 1000.0 98 | 99 | # Test error handling for invalid/corrupted sketch data 100 | statement error 101 | SELECT datasketch_quantiles_is_empty('\x00\x01\x02'::sketch_quantiles_float); 102 | ---- 103 | Invalid Input Error: Failed to deserialize Quantiles sketch 104 | 105 | statement error 106 | SELECT datasketch_quantiles_k('\xDE\xAD\xBE\xEF'::sketch_quantiles_double); 107 | ---- 108 | Invalid Input Error: Failed to deserialize Quantiles sketch 109 | 110 | # Test with empty blob 111 | statement error 112 | SELECT datasketch_quantiles_is_empty(''::sketch_quantiles_integer); 113 | ---- 114 | Invalid Input Error: Failed to deserialize Quantiles sketch 115 | 116 | # ============================================================================= 117 | # COMPREHENSIVE UNION/MERGE TESTS 118 | # ============================================================================= 119 | 120 | # Test merging multiple sketches from partitioned data 121 | statement ok 122 | CREATE TABLE merge_data(value double, partition_id int) 123 | 124 | statement ok 125 | INSERT INTO merge_data SELECT unnest(generate_series(1, 300))::double, 1 126 | 127 | statement ok 128 | INSERT INTO merge_data SELECT unnest(generate_series(301, 600))::double, 2 129 | 130 | statement ok 131 | INSERT INTO merge_data SELECT unnest(generate_series(601, 900))::double, 3 132 | 133 | # Create sketches per partition 134 | statement ok 135 | CREATE TABLE partition_sketches AS 136 | SELECT partition_id, datasketch_quantiles(32, value) as sketch 137 | FROM merge_data 138 | GROUP BY partition_id 139 | 140 | # Verify we have 3 partition sketches 141 | query 
I 142 | SELECT count(*) FROM partition_sketches 143 | ---- 144 | 3 145 | 146 | # Merge all partition sketches and verify total count 147 | query I 148 | SELECT datasketch_quantiles_n(datasketch_quantiles(32, sketch)) FROM partition_sketches 149 | ---- 150 | 900 151 | 152 | # Verify merged sketch has correct min value 153 | query I 154 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(32, sketch)) FROM partition_sketches 155 | ---- 156 | 1.0 157 | 158 | # Verify merged sketch has correct max value 159 | query I 160 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(32, sketch)) FROM partition_sketches 161 | ---- 162 | 900.0 163 | 164 | # Verify merged sketch median is approximately in the middle 165 | query I 166 | SELECT datasketch_quantiles_quantile(datasketch_quantiles(32, sketch), 0.5, true) between 400 and 500 FROM partition_sketches 167 | ---- 168 | True 169 | 170 | # Test merging sketches with overlapping data ranges 171 | statement ok 172 | CREATE TABLE overlap_data(value double, group_id int) 173 | 174 | statement ok 175 | INSERT INTO overlap_data SELECT unnest(generate_series(1, 500))::double, 1 176 | 177 | statement ok 178 | INSERT INTO overlap_data SELECT unnest(generate_series(250, 750))::double, 2 179 | 180 | statement ok 181 | CREATE TABLE overlap_sketches AS 182 | SELECT group_id, datasketch_quantiles(64, value) as sketch 183 | FROM overlap_data 184 | GROUP BY group_id 185 | 186 | # Merged sketch should have correct total count (500 + 501 = 1001, even with overlap since we count all items) 187 | query I 188 | SELECT datasketch_quantiles_n(datasketch_quantiles(64, sketch)) FROM overlap_sketches 189 | ---- 190 | 1001 191 | 192 | # Verify min/max of merged overlapping sketches 193 | query I 194 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(64, sketch)) FROM overlap_sketches 195 | ---- 196 | 1.0 197 | 198 | query I 199 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(64, sketch)) FROM overlap_sketches 200 | 
---- 201 | 750.0 202 | 203 | # Test merge with different K values 204 | statement ok 205 | CREATE TABLE k_test_sketches AS 206 | SELECT datasketch_quantiles(16, value) as sketch FROM merge_data WHERE partition_id = 1 207 | UNION ALL 208 | SELECT datasketch_quantiles(64, value) as sketch FROM merge_data WHERE partition_id = 2 209 | 210 | # Verify merge works with different K values and produces correct count 211 | query I 212 | SELECT datasketch_quantiles_n(datasketch_quantiles(32, sketch)) FROM k_test_sketches 213 | ---- 214 | 600 215 | 216 | # Test merging a single sketch (edge case) 217 | query I 218 | SELECT datasketch_quantiles_n(datasketch_quantiles(16, sketch)) 219 | FROM (SELECT datasketch_quantiles(16, value) as sketch FROM merge_data WHERE partition_id = 1) single_sketch 220 | ---- 221 | 300 222 | 223 | # Test merge preserves estimation mode status 224 | query I 225 | SELECT datasketch_quantiles_is_estimation_mode(datasketch_quantiles(8, sketch)) FROM partition_sketches 226 | ---- 227 | 1 228 | 229 | # Test CDF on merged sketch 230 | statement ok 231 | SELECT datasketch_quantiles_cdf(datasketch_quantiles(32, sketch), [300, 600], true) FROM partition_sketches 232 | 233 | # Test PMF on merged sketch 234 | statement ok 235 | SELECT datasketch_quantiles_pmf(datasketch_quantiles(32, sketch), [300, 600], true) FROM partition_sketches 236 | 237 | # Test rank query on merged sketch - rank of 450 should be approximately 0.5 238 | query I 239 | SELECT datasketch_quantiles_rank(datasketch_quantiles(32, sketch), 450.0, true) between 0.45 and 0.55 FROM partition_sketches 240 | ---- 241 | True 242 | 243 | # Test merging sketches created with GROUP BY 244 | statement ok 245 | CREATE TABLE grouped_data(category varchar, value double) 246 | 247 | statement ok 248 | INSERT INTO grouped_data 249 | SELECT 'A', unnest(generate_series(1, 100))::double 250 | UNION ALL 251 | SELECT 'B', unnest(generate_series(101, 200))::double 252 | UNION ALL 253 | SELECT 'C', 
unnest(generate_series(201, 300))::double 254 | 255 | statement ok 256 | CREATE TABLE category_sketches AS 257 | SELECT category, datasketch_quantiles(16, value) as sketch 258 | FROM grouped_data 259 | GROUP BY category 260 | 261 | # Merge all category sketches 262 | query I 263 | SELECT datasketch_quantiles_n(datasketch_quantiles(16, sketch)) FROM category_sketches 264 | ---- 265 | 300 266 | 267 | # Verify merged sketch min and max span all categories 268 | query I 269 | SELECT datasketch_quantiles_min_item(datasketch_quantiles(16, sketch)) FROM category_sketches 270 | ---- 271 | 1.0 272 | 273 | query I 274 | SELECT datasketch_quantiles_max_item(datasketch_quantiles(16, sketch)) FROM category_sketches 275 | ---- 276 | 300.0 277 | -------------------------------------------------------------------------------- /test/sql/datasketch_hll.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_hll.test 2 | # description: test datasketch HLL sketches 3 | # group: [datasketches] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT datasketch_hll_is_empty(''::blob); 8 | ---- 9 | Catalog Error: Scalar Function with name datasketch_hll_is_empty does not exist! 
10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require datasketches 13 | 14 | query I 15 | SELECT datasketch_hll(8, 5); 16 | ---- 17 | \x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 18 | 19 | query I 20 | SELECT datasketch_hll_is_empty('\x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'); 21 | ---- 22 | false 23 | 24 | query I 25 | SELECT datasketch_hll_estimate('\x02\x01\x07\x08\x03\x00\x01\x00{e\xE6\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'); 26 | ---- 27 | 1 28 | 29 | # Do some tests with integers. 30 | 31 | statement ok 32 | CREATE TABLE items(id integer) 33 | 34 | statement ok 35 | INSERT INTO items(id) select unnest(generate_series(1, 100000)); 36 | 37 | # Duplicate items shouldn't affect the count. 
38 | 39 | statement ok 40 | INSERT INTO items(id) select unnest(generate_series(1, 100000)); 41 | 42 | # HLL estimates should be close to the true count (100000 distinct values) 43 | # With k=12, error should be within ~2-3% 44 | query I 45 | SELECT datasketch_hll_estimate(datasketch_hll(12, id))::int between 95000 and 105000 from items 46 | ---- 47 | true 48 | 49 | # With k=4, error can be much larger (~25%) 50 | query I 51 | SELECT datasketch_hll_estimate(datasketch_hll(4, id))::int between 75000 and 175000 from items 52 | ---- 53 | true 54 | 55 | query I 56 | SELECT datasketch_hll_is_empty(datasketch_hll(12, id)) from items 57 | ---- 58 | False 59 | 60 | # Lower bound should be less than true count but reasonable 61 | query I 62 | SELECT datasketch_hll_lower_bound(datasketch_hll(12, id), 1)::int between 90000 and 105000 from items 63 | ---- 64 | true 65 | 66 | # Upper bound should be greater than true count but reasonable 67 | query I 68 | SELECT datasketch_hll_upper_bound(datasketch_hll(12, id), 1)::int between 95000 and 115000 from items 69 | ---- 70 | true 71 | 72 | query I 73 | SELECT datasketch_hll_lg_config_k(datasketch_hll(12, id)) from items 74 | ---- 75 | 12 76 | 77 | query I 78 | SELECT datasketch_hll_is_compact(datasketch_hll(12, id)) from items 79 | ---- 80 | False 81 | 82 | 83 | query I 84 | SELECT datasketch_hll_describe(datasketch_hll(4, id), true, false) like '%HLL sketch summary%' from items 85 | ---- 86 | True 87 | 88 | # Test with strings 89 | 90 | statement ok 91 | CREATE TABLE employees(name string) 92 | 93 | statement ok 94 | INSERT INTO employees(name) VALUES 95 | ('John Doe'), ('Jane Smith'), ('Michael Johnson'), ('Emily Davis'), ('Chris Brown'), ('Sarah Wilson'), ('David Martinez'),('Sophia Anderson'), ('Daniel Lee'),('Olivia Taylor'); 96 | 97 | # 10 distinct names, estimate should be close 98 | query I 99 | SELECT datasketch_hll_estimate(datasketch_hll(4, name))::int between 8 and 15 from employees 100 | ---- 101 | true 102 | 103 | # 
Grouped query - each group has 50 distinct values, estimates should be close 104 | query I 105 | select datasketch_hll_estimate(datasketch_hll(14, x))::int between 45 and 55 from unnest(range(100)) t(x) group by x % 2 106 | ---- 107 | true 108 | true 109 | 110 | statement ok 111 | CREATE TABLE sketches (sketch sketch_hll) 112 | 113 | statement ok 114 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 0 115 | 116 | statement ok 117 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 1 118 | 119 | statement ok 120 | INSERT INTO sketches (sketch) select datasketch_hll(12, id) from items where mod(id, 3) == 2 121 | 122 | query I 123 | select datasketch_hll_is_empty(datasketch_hll_union(12, sketch)) from sketches 124 | ---- 125 | False 126 | 127 | statement ok 128 | select datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int from sketches 129 | 130 | # Test error handling for invalid/corrupted sketch data 131 | statement error 132 | SELECT datasketch_hll_estimate('\x00\x01\x02\x03'::blob); 133 | ---- 134 | Invalid Input Error: Failed to deserialize HLL sketch 135 | 136 | statement error 137 | SELECT datasketch_hll_is_empty('\xDE\xAD\xBE\xEF'::blob); 138 | ---- 139 | Invalid Input Error: Failed to deserialize HLL sketch 140 | 141 | # Test with empty blob 142 | statement error 143 | SELECT datasketch_hll_estimate(''::blob); 144 | ---- 145 | Invalid Input Error: Failed to deserialize HLL sketch 146 | 147 | # ============================================================================= 148 | # COMPREHENSIVE UNION TESTS 149 | # ============================================================================= 150 | 151 | # Test union of multiple sketches from partitioned data with non-overlapping values 152 | statement ok 153 | CREATE TABLE union_data(value int, partition_id int) 154 | 155 | statement ok 156 | INSERT INTO union_data SELECT unnest(generate_series(1, 10000)), 1 157 | 158 | 
statement ok 159 | INSERT INTO union_data SELECT unnest(generate_series(10001, 20000)), 2 160 | 161 | statement ok 162 | INSERT INTO union_data SELECT unnest(generate_series(20001, 30000)), 3 163 | 164 | # Create sketches per partition 165 | statement ok 166 | CREATE TABLE partition_sketches AS 167 | SELECT partition_id, datasketch_hll(12, value) as sketch 168 | FROM union_data 169 | GROUP BY partition_id 170 | 171 | # Verify we have 3 partition sketches 172 | query I 173 | SELECT count(*) FROM partition_sketches 174 | ---- 175 | 3 176 | 177 | # Each partition has 10000 distinct values 178 | query I 179 | SELECT datasketch_hll_estimate(sketch)::int between 9500 and 10500 FROM partition_sketches ORDER BY partition_id LIMIT 1 180 | ---- 181 | true 182 | 183 | # Union all partition sketches - should have ~30000 distinct values 184 | query I 185 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 28500 and 31500 FROM partition_sketches 186 | ---- 187 | true 188 | 189 | # Verify union is not empty 190 | query I 191 | SELECT datasketch_hll_is_empty(datasketch_hll_union(12, sketch)) FROM partition_sketches 192 | ---- 193 | False 194 | 195 | # Union should have lg_config_k from the bind parameter 196 | query I 197 | SELECT datasketch_hll_lg_config_k(datasketch_hll_union(12, sketch)) FROM partition_sketches 198 | ---- 199 | 12 200 | 201 | # Test union with overlapping data 202 | statement ok 203 | CREATE TABLE overlap_union_data(value int, group_id int) 204 | 205 | statement ok 206 | INSERT INTO overlap_union_data SELECT unnest(generate_series(1, 50000)), 1 207 | 208 | statement ok 209 | INSERT INTO overlap_union_data SELECT unnest(generate_series(25000, 75000)), 2 210 | 211 | statement ok 212 | CREATE TABLE overlap_union_sketches AS 213 | SELECT group_id, datasketch_hll(14, value) as sketch 214 | FROM overlap_union_data 215 | GROUP BY group_id 216 | 217 | # Group 1 has 50000 distinct, Group 2 has 50001 distinct 218 | # Union should have 75000 
distinct (1-75000) 219 | query I 220 | SELECT datasketch_hll_estimate(datasketch_hll_union(14, sketch))::int between 72000 and 78000 FROM overlap_union_sketches 221 | ---- 222 | true 223 | 224 | # Test union with different K values 225 | statement ok 226 | CREATE TABLE k_union_sketches AS 227 | SELECT datasketch_hll(8, value) as sketch FROM union_data WHERE partition_id = 1 228 | UNION ALL 229 | SELECT datasketch_hll(14, value) as sketch FROM union_data WHERE partition_id = 2 230 | 231 | # Verify union works with different K values and produces reasonable estimate 232 | query I 233 | SELECT datasketch_hll_estimate(datasketch_hll_union(10, sketch))::int between 18000 and 22000 FROM k_union_sketches 234 | ---- 235 | true 236 | 237 | # Test union of single sketch (edge case) 238 | query I 239 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 9500 and 10500 240 | FROM (SELECT datasketch_hll(12, value) as sketch FROM union_data WHERE partition_id = 1) single_sketch 241 | ---- 242 | true 243 | 244 | # Test union preserves accuracy - lower_bound/upper_bound on union 245 | query I 246 | SELECT datasketch_hll_lower_bound(datasketch_hll_union(12, sketch), 1)::int between 27000 and 31000 FROM partition_sketches 247 | ---- 248 | true 249 | 250 | query I 251 | SELECT datasketch_hll_upper_bound(datasketch_hll_union(12, sketch), 1)::int between 29000 and 33000 FROM partition_sketches 252 | ---- 253 | true 254 | 255 | # Test union with string values 256 | statement ok 257 | CREATE TABLE string_union_data(name varchar, source_id int) 258 | 259 | statement ok 260 | INSERT INTO string_union_data 261 | SELECT 'user_' || x, 1 FROM generate_series(1, 1000) t(x) 262 | UNION ALL 263 | SELECT 'user_' || x, 2 FROM generate_series(500, 1500) t(x) 264 | 265 | statement ok 266 | CREATE TABLE string_union_sketches AS 267 | SELECT source_id, datasketch_hll(10, name) as sketch 268 | FROM string_union_data 269 | GROUP BY source_id 270 | 271 | # Union should have ~1500 
distinct strings (user_1 to user_1500) 272 | query I 273 | SELECT datasketch_hll_estimate(datasketch_hll_union(10, sketch))::int between 1400 and 1600 FROM string_union_sketches 274 | ---- 275 | true 276 | 277 | # Test union with GROUP BY categories 278 | statement ok 279 | CREATE TABLE category_union_data(category varchar, user_id int) 280 | 281 | statement ok 282 | INSERT INTO category_union_data 283 | SELECT 'electronics', unnest(generate_series(1, 5000)) 284 | UNION ALL 285 | SELECT 'clothing', unnest(generate_series(2500, 7500)) 286 | UNION ALL 287 | SELECT 'food', unnest(generate_series(5000, 10000)) 288 | 289 | statement ok 290 | CREATE TABLE category_union_sketches AS 291 | SELECT category, datasketch_hll(12, user_id) as sketch 292 | FROM category_union_data 293 | GROUP BY category 294 | 295 | # Each category has ~5000 distinct users 296 | # Union should have ~10000 distinct users (1-10000) 297 | query I 298 | SELECT datasketch_hll_estimate(datasketch_hll_union(12, sketch))::int between 9500 and 10500 FROM category_union_sketches 299 | ---- 300 | true 301 | 302 | # Test describe on union result 303 | query I 304 | SELECT datasketch_hll_describe(datasketch_hll_union(12, sketch), true, false) like '%HLL sketch summary%' FROM partition_sketches 305 | ---- 306 | True 307 | 308 | # Test is_compact on union result 309 | query I 310 | SELECT datasketch_hll_is_compact(datasketch_hll_union(12, sketch)) FROM partition_sketches 311 | ---- 312 | False 313 | 314 | -------------------------------------------------------------------------------- /test/sql/datasketch_theta.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_theta.test 2 | # description: test datasketch Theta sketches 3 | # group: [datasketches] 4 | 5 | # Ensure the extension is loaded 6 | require datasketches 7 | 8 | # ------------------------------------------------------------------- 9 | # 1. 
Basic Build and Estimate 10 | # ------------------------------------------------------------------- 11 | 12 | # Test constant inputs 13 | query I 14 | SELECT datasketch_theta_estimate(datasketch_theta(1)); 15 | ---- 16 | 1 17 | 18 | # Test basic distinct count with small data 19 | statement ok 20 | CREATE TABLE simple_items(id INTEGER); 21 | 22 | statement ok 23 | INSERT INTO simple_items VALUES (1), (2), (3), (3), (4), (5); 24 | 25 | # Should count 5 distinct items 26 | query I 27 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM simple_items; 28 | ---- 29 | 5 30 | 31 | # ------------------------------------------------------------------- 32 | # 2. Large Data Accuracy (Standard Error) 33 | # ------------------------------------------------------------------- 34 | 35 | statement ok 36 | CREATE TABLE large_data AS SELECT * FROM range(0, 100000) t(i); 37 | 38 | # Duplicate the data to ensure distinct counting works 39 | statement ok 40 | INSERT INTO large_data SELECT * FROM range(0, 100000) t(i); 41 | 42 | # Check estimate (Standard Theta K=4096 has error ~1.5-2%) 43 | query I 44 | SELECT datasketch_theta_estimate(datasketch_theta(i))::int BETWEEN 98000 AND 102000 FROM large_data; 45 | ---- 46 | true 47 | 48 | # Check Lower/Upper bounds (Standard Deviation) 49 | # Note: We didn't strictly implement lower/upper bound scalar functions in the C++ code 50 | # provided in previous steps, but if you did, add tests here. 51 | # If only 'estimate' was implemented, skip this. 52 | 53 | # ------------------------------------------------------------------- 54 | # 3. 
Set Operations: Intersection & Difference 55 | # ------------------------------------------------------------------- 56 | 57 | statement ok 58 | CREATE TABLE set_a AS SELECT * FROM range(1, 6) t(i); -- {1, 2, 3, 4, 5} 59 | 60 | statement ok 61 | CREATE TABLE set_b AS SELECT * FROM range(4, 9) t(i); -- {4, 5, 6, 7, 8} 62 | 63 | # Create a table to hold the sketches 64 | statement ok 65 | CREATE TABLE sketches (name VARCHAR, data sketch_theta); 66 | 67 | # Build sketches for A and B 68 | statement ok 69 | INSERT INTO sketches VALUES 70 | ('A', (SELECT datasketch_theta(i) FROM set_a)), 71 | ('B', (SELECT datasketch_theta(i) FROM set_b)); 72 | 73 | # --- INTERSECTION Test --- 74 | # Intersection of {1,2,3,4,5} and {4,5,6,7,8} is {4,5} -> Count: 2 75 | query I 76 | SELECT datasketch_theta_estimate( 77 | datasketch_theta_intersect(s1.data, s2.data) 78 | )::int 79 | FROM sketches s1, sketches s2 80 | WHERE s1.name = 'A' AND s2.name = 'B'; 81 | ---- 82 | 2 83 | 84 | # --- A NOT B Test --- 85 | # {1,2,3,4,5} NOT {4,5,6,7,8} is {1,2,3} -> Count: 3 86 | query I 87 | SELECT datasketch_theta_estimate( 88 | datasketch_theta_a_not_b(s1.data, s2.data) 89 | )::int 90 | FROM sketches s1, sketches s2 91 | WHERE s1.name = 'A' AND s2.name = 'B'; 92 | ---- 93 | 3 94 | 95 | # --- B NOT A Test --- 96 | # {4,5,6,7,8} NOT {1,2,3,4,5} is {6,7,8} -> Count: 3 97 | query I 98 | SELECT datasketch_theta_estimate( 99 | datasketch_theta_a_not_b(s2.data, s1.data) 100 | )::int 101 | FROM sketches s1, sketches s2 102 | WHERE s1.name = 'A' AND s2.name = 'B'; 103 | ---- 104 | 3 105 | 106 | # ------------------------------------------------------------------- 107 | # 4. 
String Types 108 | # ------------------------------------------------------------------- 109 | 110 | statement ok 111 | CREATE TABLE strings(s VARCHAR); 112 | 113 | statement ok 114 | INSERT INTO strings VALUES ('apple'), ('banana'), ('apple'), ('cherry'); 115 | 116 | query I 117 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM strings; 118 | ---- 119 | 3 120 | 121 | # ------------------------------------------------------------------- 122 | # 5. Configuration (Log K) 123 | # ------------------------------------------------------------------- 124 | 125 | # Test creating a sketch with a smaller K (Low accuracy) vs Large K 126 | # We verify the blobs are different sizes using octet_length. 127 | # NOTE: Minimum lg_k allowed by DataSketches is 5. 128 | query I 129 | SELECT octet_length(datasketch_theta(5, i)::BLOB) < octet_length(datasketch_theta(12, i)::BLOB) 130 | FROM range(0, 1000) t(i); 131 | ---- 132 | true 133 | 134 | # ------------------------------------------------------------------- 135 | # 6. Bounds and Describe 136 | # ------------------------------------------------------------------- 137 | 138 | query I 139 | SELECT datasketch_theta_describe(datasketch_theta(1)) LIKE '%Theta sketch summary%'; 140 | ---- 141 | true 142 | 143 | # Check lower bound for 2 standard deviations (approx 95% confidence) 144 | query I 145 | SELECT datasketch_theta_lower_bound(datasketch_theta(i), 2) <= 100000 FROM range(0, 100000) t(i); 146 | ---- 147 | true 148 | 149 | # Check upper bound 150 | query I 151 | SELECT datasketch_theta_upper_bound(datasketch_theta(i), 2) >= 100000 FROM range(0, 100000) t(i); 152 | ---- 153 | true 154 | 155 | # ------------------------------------------------------------------- 156 | # 7. 
Edge Cases - Empty and NULL Values 157 | # ------------------------------------------------------------------- 158 | 159 | statement ok 160 | CREATE TABLE empty_table(id INTEGER); 161 | 162 | # Empty sketch should estimate 0 163 | query I 164 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM empty_table; 165 | ---- 166 | 0 167 | 168 | # Test NULL handling 169 | statement ok 170 | CREATE TABLE with_nulls(id INTEGER); 171 | 172 | statement ok 173 | INSERT INTO with_nulls VALUES (1), (NULL), (2), (NULL), (3); 174 | 175 | # NULLs should be skipped, count only 3 distinct 176 | query I 177 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM with_nulls; 178 | ---- 179 | 3 180 | 181 | # All NULLs table 182 | statement ok 183 | CREATE TABLE all_nulls(id INTEGER); 184 | 185 | statement ok 186 | INSERT INTO all_nulls VALUES (NULL), (NULL), (NULL); 187 | 188 | query I 189 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM all_nulls; 190 | ---- 191 | 0 192 | 193 | # ------------------------------------------------------------------- 194 | # 8. Single Item Edge Cases 195 | # ------------------------------------------------------------------- 196 | 197 | statement ok 198 | CREATE TABLE single_item(id INTEGER); 199 | 200 | statement ok 201 | INSERT INTO single_item VALUES (42); 202 | 203 | query I 204 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM single_item; 205 | ---- 206 | 1 207 | 208 | # Many duplicates of single item 209 | statement ok 210 | CREATE TABLE many_dupes AS SELECT 42 as id FROM range(0, 10000); 211 | 212 | query I 213 | SELECT datasketch_theta_estimate(datasketch_theta(id))::int FROM many_dupes; 214 | ---- 215 | 1 216 | 217 | # ------------------------------------------------------------------- 218 | # 9. 
Union Operations 219 | # ------------------------------------------------------------------- 220 | 221 | # Union of {1,2,3,4,5} and {4,5,6,7,8} is {1,2,3,4,5,6,7,8} -> Count: 8 222 | query I 223 | SELECT datasketch_theta_estimate( 224 | datasketch_theta_union(s1.data, s2.data) 225 | )::int 226 | FROM sketches s1, sketches s2 227 | WHERE s1.name = 'A' AND s2.name = 'B'; 228 | ---- 229 | 8 230 | 231 | # Union of a sketch with itself should equal the original 232 | query I 233 | SELECT datasketch_theta_estimate( 234 | datasketch_theta_union(s1.data, s1.data) 235 | )::int 236 | FROM sketches s1 237 | WHERE s1.name = 'A'; 238 | ---- 239 | 5 240 | 241 | # Union with empty sketch should equal the non-empty sketch 242 | statement ok 243 | INSERT INTO sketches VALUES ('EMPTY', (SELECT datasketch_theta(id) FROM empty_table)); 244 | 245 | query I 246 | SELECT datasketch_theta_estimate( 247 | datasketch_theta_union(s1.data, s2.data) 248 | )::int 249 | FROM sketches s1, sketches s2 250 | WHERE s1.name = 'A' AND s2.name = 'EMPTY'; 251 | ---- 252 | 5 253 | 254 | # ------------------------------------------------------------------- 255 | # 10. 
Multiple Set Operations (Chaining) 256 | # ------------------------------------------------------------------- 257 | 258 | statement ok 259 | CREATE TABLE set_c AS SELECT * FROM range(1, 4) t(i); -- {1, 2, 3} 260 | 261 | statement ok 262 | INSERT INTO sketches VALUES 263 | ('C', (SELECT datasketch_theta(i) FROM set_c)); 264 | 265 | # (A UNION B) INTERSECT C 266 | # A = {1,2,3,4,5}, B = {4,5,6,7,8}, C = {1,2,3} 267 | # A UNION B = {1,2,3,4,5,6,7,8} 268 | # (A UNION B) INTERSECT C = {1,2,3} -> Count: 3 269 | query I 270 | SELECT datasketch_theta_estimate( 271 | datasketch_theta_intersect( 272 | datasketch_theta_union( 273 | (SELECT data FROM sketches WHERE name = 'A'), 274 | (SELECT data FROM sketches WHERE name = 'B') 275 | ), 276 | (SELECT data FROM sketches WHERE name = 'C') 277 | ) 278 | )::int; 279 | ---- 280 | 3 281 | 282 | # A INTERSECT B INTERSECT C 283 | # A ∩ B = {4,5}, {4,5} ∩ {1,2,3} = {} -> Count: 0 284 | query I 285 | SELECT datasketch_theta_estimate( 286 | datasketch_theta_intersect( 287 | datasketch_theta_intersect( 288 | (SELECT data FROM sketches WHERE name = 'A'), 289 | (SELECT data FROM sketches WHERE name = 'B') 290 | ), 291 | (SELECT data FROM sketches WHERE name = 'C') 292 | ) 293 | )::int; 294 | ---- 295 | 0 296 | 297 | # ------------------------------------------------------------------- 298 | # 11. 
Symmetric Difference (A NOT B + B NOT A) 299 | # ------------------------------------------------------------------- 300 | 301 | # A XOR B = (A - B) ∪ (B - A) 302 | # A = {1,2,3,4,5}, B = {4,5,6,7,8} 303 | # A - B = {1,2,3}, B - A = {6,7,8} 304 | # XOR = {1,2,3,6,7,8} -> Count: 6 305 | query I 306 | SELECT datasketch_theta_estimate( 307 | datasketch_theta_union( 308 | datasketch_theta_a_not_b(s1.data, s2.data), 309 | datasketch_theta_a_not_b(s2.data, s1.data) 310 | ) 311 | )::int 312 | FROM sketches s1, sketches s2 313 | WHERE s1.name = 'A' AND s2.name = 'B'; 314 | ---- 315 | 6 316 | 317 | # ------------------------------------------------------------------- 318 | # 12. Merging Sketches via Aggregate 319 | # ------------------------------------------------------------------- 320 | 321 | statement ok 322 | CREATE TABLE partitions(partition_id INTEGER, value INTEGER); 323 | 324 | statement ok 325 | INSERT INTO partitions VALUES 326 | (1, 1), (1, 2), (1, 3), 327 | (2, 3), (2, 4), (2, 5), 328 | (3, 5), (3, 6), (3, 7); 329 | 330 | # Build sketches per partition, then merge them 331 | # Total distinct values: {1,2,3,4,5,6,7} -> 7 332 | statement ok 333 | CREATE TABLE partition_sketches AS 334 | SELECT partition_id, datasketch_theta(value) as sketch 335 | FROM partitions 336 | GROUP BY partition_id; 337 | 338 | query I 339 | SELECT datasketch_theta_estimate( 340 | datasketch_theta(sketch) 341 | )::int 342 | FROM partition_sketches; 343 | ---- 344 | 7 345 | 346 | # ------------------------------------------------------------------- 347 | # 13. 
Different Data Types 348 | # ------------------------------------------------------------------- 349 | 350 | # BIGINT 351 | statement ok 352 | CREATE TABLE bigints(val BIGINT); 353 | 354 | statement ok 355 | INSERT INTO bigints VALUES 356 | (9223372036854775807), 357 | (9223372036854775806), 358 | (-9223372036854775808), 359 | (9223372036854775807); -- duplicate 360 | 361 | query I 362 | SELECT datasketch_theta_estimate(datasketch_theta(val))::int FROM bigints; 363 | ---- 364 | 3 365 | 366 | # VARCHAR with special characters 367 | statement ok 368 | CREATE TABLE special_strings(s VARCHAR); 369 | 370 | statement ok 371 | INSERT INTO special_strings VALUES 372 | ('hello'), ('world'), ('hello world'), 373 | ('emoji 🎉'), (''), ('hello'); 374 | 375 | query I 376 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM special_strings; 377 | ---- 378 | 5 379 | 380 | # Empty string should count as distinct 381 | query I 382 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int 383 | FROM (VALUES (''), ('')) t(s); 384 | ---- 385 | 1 386 | 387 | # ------------------------------------------------------------------- 388 | # 14. 
Metadata and Diagnostic Functions 389 | # ------------------------------------------------------------------- 390 | 391 | statement ok 392 | CREATE TABLE meta_test AS SELECT * FROM range(0, 1000) t(i); 393 | 394 | statement ok 395 | CREATE TABLE meta_sketch AS SELECT datasketch_theta(i) as sketch FROM meta_test; 396 | 397 | # is_empty should be false for non-empty sketch 398 | query I 399 | SELECT datasketch_theta_is_empty(sketch) FROM meta_sketch; 400 | ---- 401 | false 402 | 403 | query I 404 | SELECT datasketch_theta_is_empty(datasketch_theta(id)) FROM empty_table; 405 | ---- 406 | true 407 | 408 | # num_retained should be positive for data within sketch capacity 409 | query I 410 | SELECT datasketch_theta_num_retained(sketch) > 0 FROM meta_sketch; 411 | ---- 412 | true 413 | 414 | # theta value should be in (0, 1] range 415 | query I 416 | SELECT datasketch_theta_get_theta(sketch) > 0 AND 417 | datasketch_theta_get_theta(sketch) <= 1 418 | FROM meta_sketch; 419 | ---- 420 | true 421 | 422 | # seed should match default or custom seed 423 | query I 424 | SELECT datasketch_theta_get_seed(sketch) = datasketch_theta_get_seed(sketch) 425 | FROM meta_sketch; 426 | ---- 427 | true 428 | 429 | # is_estimation_mode - small data should be exact 430 | query I 431 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 432 | FROM range(0, 10) t(i); 433 | ---- 434 | false 435 | 436 | # is_estimation_mode - large data should be in estimation mode 437 | query I 438 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 439 | FROM range(0, 100000) t(i); 440 | ---- 441 | true 442 | 443 | # ------------------------------------------------------------------- 444 | # 15. 
GROUP BY with Multiple Groups 445 | # ------------------------------------------------------------------- 446 | 447 | statement ok 448 | CREATE TABLE events(user_id INTEGER, event_type VARCHAR, item_id INTEGER); 449 | 450 | statement ok 451 | INSERT INTO events VALUES 452 | (1, 'view', 100), (1, 'view', 101), (1, 'click', 100), 453 | (2, 'view', 102), (2, 'click', 102), (2, 'view', 103), 454 | (3, 'view', 100), (3, 'view', 100), (3, 'click', 104); 455 | 456 | # Distinct items viewed per user 457 | statement ok 458 | CREATE TABLE user_sketches AS 459 | SELECT user_id, datasketch_theta(item_id) as sketch 460 | FROM events 461 | WHERE event_type = 'view' 462 | GROUP BY user_id; 463 | 464 | query II 465 | SELECT user_id, datasketch_theta_estimate(sketch)::int 466 | FROM user_sketches 467 | ORDER BY user_id; 468 | ---- 469 | 1 2 470 | 2 2 471 | 3 1 472 | 473 | # ------------------------------------------------------------------- 474 | # 16. Serialization and Persistence 475 | # ------------------------------------------------------------------- 476 | 477 | statement ok 478 | CREATE TABLE sketch_storage(id INTEGER, sketch_data sketch_theta); 479 | 480 | statement ok 481 | INSERT INTO sketch_storage 482 | SELECT 1, datasketch_theta(i) FROM range(0, 100) t(i); 483 | 484 | # Retrieve and use stored sketch 485 | query I 486 | SELECT datasketch_theta_estimate(sketch_data)::int 487 | FROM sketch_storage 488 | WHERE id = 1; 489 | ---- 490 | 100 491 | 492 | # Store result of set operation 493 | statement ok 494 | INSERT INTO sketch_storage 495 | SELECT 2, datasketch_theta_union(s1.data, s2.data) 496 | FROM sketches s1, sketches s2 497 | WHERE s1.name = 'A' AND s2.name = 'B'; 498 | 499 | query I 500 | SELECT datasketch_theta_estimate(sketch_data)::int 501 | FROM sketch_storage 502 | WHERE id = 2; 503 | ---- 504 | 8 505 | 506 | # ------------------------------------------------------------------- 507 | # 17. 
Bounds with Different Standard Deviations 508 | # ------------------------------------------------------------------- 509 | 510 | statement ok 511 | CREATE TABLE bounds_test AS 512 | SELECT datasketch_theta(i) as sketch FROM range(0, 50000) t(i); 513 | 514 | # 1 SD (~68% confidence) 515 | query I 516 | SELECT 517 | datasketch_theta_lower_bound(sketch, 1) <= 50000 AND 518 | datasketch_theta_upper_bound(sketch, 1) >= 50000 519 | FROM bounds_test; 520 | ---- 521 | true 522 | 523 | # 2 SD (~95% confidence) - wider interval 524 | query I 525 | SELECT 526 | datasketch_theta_lower_bound(sketch, 2) <= datasketch_theta_lower_bound(sketch, 1) AND 527 | datasketch_theta_upper_bound(sketch, 2) >= datasketch_theta_upper_bound(sketch, 1) 528 | FROM bounds_test; 529 | ---- 530 | true 531 | 532 | # 3 SD (~99.7% confidence) - widest interval 533 | query I 534 | SELECT 535 | datasketch_theta_lower_bound(sketch, 3) <= datasketch_theta_lower_bound(sketch, 2) AND 536 | datasketch_theta_upper_bound(sketch, 3) >= datasketch_theta_upper_bound(sketch, 2) 537 | FROM bounds_test; 538 | ---- 539 | true 540 | 541 | # Estimate should be within bounds 542 | query I 543 | SELECT 544 | datasketch_theta_estimate(sketch) >= datasketch_theta_lower_bound(sketch, 2) AND 545 | datasketch_theta_estimate(sketch) <= datasketch_theta_upper_bound(sketch, 2) 546 | FROM bounds_test; 547 | ---- 548 | true 549 | 550 | # ------------------------------------------------------------------- 551 | # 18. 
Set Operations Commutativity and Identity 552 | # ------------------------------------------------------------------- 553 | 554 | # Union is commutative: A ∪ B = B ∪ A 555 | query I 556 | SELECT 557 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data))::int = 558 | datasketch_theta_estimate(datasketch_theta_union(s2.data, s1.data))::int 559 | FROM sketches s1, sketches s2 560 | WHERE s1.name = 'A' AND s2.name = 'B'; 561 | ---- 562 | true 563 | 564 | # Intersection is commutative: A ∩ B = B ∩ A 565 | query I 566 | SELECT 567 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s2.data))::int = 568 | datasketch_theta_estimate(datasketch_theta_intersect(s2.data, s1.data))::int 569 | FROM sketches s1, sketches s2 570 | WHERE s1.name = 'A' AND s2.name = 'B'; 571 | ---- 572 | true 573 | 574 | # A ∪ ∅ = A (identity) 575 | query I 576 | SELECT 577 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data))::int = 578 | datasketch_theta_estimate(s1.data)::int 579 | FROM sketches s1, sketches s2 580 | WHERE s1.name = 'A' AND s2.name = 'EMPTY'; 581 | ---- 582 | true 583 | 584 | # A ∩ A = A (idempotence) 585 | query I 586 | SELECT 587 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s1.data))::int = 588 | datasketch_theta_estimate(s1.data)::int 589 | FROM sketches s1 590 | WHERE s1.name = 'A'; 591 | ---- 592 | true 593 | 594 | # A - A = ∅ 595 | query I 596 | SELECT datasketch_theta_estimate(datasketch_theta_a_not_b(s1.data, s1.data))::int 597 | FROM sketches s1 598 | WHERE s1.name = 'A'; 599 | ---- 600 | 0 601 | 602 | # A NOT B ≠ B NOT A (not commutative) 603 | # Create separate tables for this test only 604 | statement ok 605 | CREATE TABLE noncomm_set_x AS SELECT * FROM range(1, 11) t(i); -- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 606 | 607 | statement ok 608 | CREATE TABLE noncomm_set_y AS SELECT * FROM range(8, 12) t(i); -- {8, 9, 10, 11} 609 | 610 | # Create separate sketch table for this test 611 | statement ok 612 | CREATE 
TABLE noncomm_sketches(name VARCHAR, sketch sketch_theta); 613 | 614 | statement ok 615 | INSERT INTO noncomm_sketches VALUES 616 | ('X', (SELECT datasketch_theta(i) FROM noncomm_set_x)), 617 | ('Y', (SELECT datasketch_theta(i) FROM noncomm_set_y)); 618 | 619 | # X NOT Y = {1, 2, 3, 4, 5, 6, 7} -> Count: 7 620 | query I 621 | SELECT datasketch_theta_estimate( 622 | datasketch_theta_a_not_b(sx.sketch, sy.sketch) 623 | )::int 624 | FROM noncomm_sketches sx, noncomm_sketches sy 625 | WHERE sx.name = 'X' AND sy.name = 'Y'; 626 | ---- 627 | 7 628 | 629 | # Y NOT X = {11} -> Count: 1 630 | query I 631 | SELECT datasketch_theta_estimate( 632 | datasketch_theta_a_not_b(sy.sketch, sx.sketch) 633 | )::int 634 | FROM noncomm_sketches sx, noncomm_sketches sy 635 | WHERE sx.name = 'X' AND sy.name = 'Y'; 636 | ---- 637 | 1 638 | 639 | # Verify they are NOT equal (7 != 1 proves non-commutativity) 640 | query I 641 | SELECT 642 | datasketch_theta_estimate(datasketch_theta_a_not_b(sx.sketch, sy.sketch))::int != 643 | datasketch_theta_estimate(datasketch_theta_a_not_b(sy.sketch, sx.sketch))::int 644 | FROM noncomm_sketches sx, noncomm_sketches sy 645 | WHERE sx.name = 'X' AND sy.name = 'Y'; 646 | ---- 647 | true 648 | 649 | # ------------------------------------------------------------------- 650 | # 19. 
Jaccard Similarity (if implemented) 651 | # ------------------------------------------------------------------- 652 | # Jaccard = |A ∩ B| / |A ∪ B| 653 | # A = {1,2,3,4,5}, B = {4,5,6,7,8} 654 | # |A ∩ B| = 2, |A ∪ B| = 8 655 | # Jaccard = 2/8 = 0.25 656 | 657 | # Manual calculation if no direct Jaccard function 658 | query I 659 | SELECT ( 660 | datasketch_theta_estimate(datasketch_theta_intersect(s1.data, s2.data)) / 661 | datasketch_theta_estimate(datasketch_theta_union(s1.data, s2.data)) 662 | )::decimal(4,2) 663 | FROM sketches s1, sketches s2 664 | WHERE s1.name = 'A' AND s2.name = 'B'; 665 | ---- 666 | 0.25 667 | 668 | # ------------------------------------------------------------------- 669 | # 20. Very Small K Parameter 670 | # ------------------------------------------------------------------- 671 | 672 | # Test minimum lg_k = 5 (K = 32) 673 | statement ok 674 | CREATE TABLE small_k_test AS SELECT * FROM range(0, 1000) t(i); 675 | 676 | statement ok 677 | CREATE TABLE small_k_sketch AS 678 | SELECT datasketch_theta(5, i) as sketch FROM small_k_test; 679 | 680 | # Should still give reasonable estimate (with high error) 681 | query I 682 | SELECT datasketch_theta_estimate(sketch)::int BETWEEN 500 AND 1500 683 | FROM small_k_sketch; 684 | ---- 685 | true 686 | 687 | # Should be in estimation mode 688 | query I 689 | SELECT datasketch_theta_is_estimation_mode(sketch) FROM small_k_sketch; 690 | ---- 691 | true 692 | 693 | # Theta should be less than 1.0 (sampling occurred) 694 | query I 695 | SELECT datasketch_theta_get_theta(sketch) < 1.0 FROM small_k_sketch; 696 | ---- 697 | true 698 | 699 | # ------------------------------------------------------------------- 700 | # 21. 
De-duplication Use Case 701 | # ------------------------------------------------------------------- 702 | 703 | statement ok 704 | CREATE TABLE raw_events(session_id VARCHAR, user_id INTEGER); 705 | 706 | statement ok 707 | INSERT INTO raw_events VALUES 708 | ('s1', 1), ('s1', 1), ('s1', 1), -- Same session/user repeated 709 | ('s2', 2), ('s2', 2), 710 | ('s3', 3), ('s4', 1); -- User 1 appears again in different session 711 | 712 | # Distinct sessions 713 | query I 714 | SELECT datasketch_theta_estimate(datasketch_theta(session_id))::int 715 | FROM raw_events; 716 | ---- 717 | 4 718 | 719 | # Distinct users 720 | query I 721 | SELECT datasketch_theta_estimate(datasketch_theta(user_id))::int 722 | FROM raw_events; 723 | ---- 724 | 3 725 | 726 | # ------------------------------------------------------------------- 727 | # 22. Extremely Large Cardinality 728 | # ------------------------------------------------------------------- 729 | 730 | statement ok 731 | CREATE TABLE million_items AS SELECT * FROM range(0, 1000000) t(i); 732 | 733 | # Should estimate close to 1M (within ~1-2% with default K) 734 | query I 735 | SELECT datasketch_theta_estimate(datasketch_theta(i))::int 736 | BETWEEN 980000 AND 1020000 737 | FROM million_items; 738 | ---- 739 | true 740 | 741 | # Should be in estimation mode 742 | query I 743 | SELECT datasketch_theta_is_estimation_mode(datasketch_theta(i)) 744 | FROM million_items; 745 | ---- 746 | true 747 | 748 | # ------------------------------------------------------------------- 749 | # 23. 
Describe Function Detail 750 | # ------------------------------------------------------------------- 751 | 752 | # Verify describe contains key information 753 | query I 754 | SELECT datasketch_theta_describe(sketch) LIKE '%Empty%' OR 755 | datasketch_theta_describe(sketch) LIKE '%estimate%' 756 | FROM meta_sketch; 757 | ---- 758 | true 759 | 760 | # Describe empty sketch 761 | # query I 762 | # SELECT datasketch_theta_describe(datasketch_theta(id)) LIKE '%Empty%' 763 | # FROM empty_table; 764 | # ---- 765 | # true 766 | 767 | # ------------------------------------------------------------------- 768 | # 24. CTE and Subquery Integration 769 | # ------------------------------------------------------------------- 770 | 771 | 772 | query I 773 | WITH daily_users AS ( 774 | SELECT 1 as day, unnest([1, 2, 3, 3, 4]) as user_id 775 | UNION ALL 776 | SELECT 2 as day, unnest([3, 4, 5, 6]) as user_id 777 | UNION ALL 778 | SELECT 3 as day, unnest([1, 5, 7, 8, 9]) as user_id 779 | ), 780 | daily_sketches AS ( 781 | SELECT day, datasketch_theta(user_id) as sketch 782 | FROM daily_users 783 | GROUP BY day 784 | ) 785 | SELECT datasketch_theta_estimate( 786 | datasketch_theta(sketch) 787 | )::int as total_unique_users 788 | FROM daily_sketches; 789 | ---- 790 | 9 791 | 792 | # ------------------------------------------------------------------- 793 | # 25. Case Sensitivity (Strings) 794 | # ------------------------------------------------------------------- 795 | 796 | statement ok 797 | CREATE TABLE case_test(s VARCHAR); 798 | 799 | statement ok 800 | INSERT INTO case_test VALUES ('Apple'), ('apple'), ('APPLE'); 801 | 802 | # Should count as 3 distinct (case-sensitive) 803 | query I 804 | SELECT datasketch_theta_estimate(datasketch_theta(s))::int FROM case_test; 805 | ---- 806 | 3 807 | 808 | # ------------------------------------------------------------------- 809 | # 26. 
Window Functions (if supported) 810 | # ------------------------------------------------------------------- 811 | 812 | statement ok 813 | CREATE TABLE time_series(ts INTEGER, user_id INTEGER); 814 | 815 | statement ok 816 | INSERT INTO time_series VALUES 817 | (1, 10), (2, 11), (3, 12), (4, 10), (5, 13), 818 | (6, 14), (7, 15), (8, 10), (9, 16), (10, 17); 819 | 820 | # Running distinct count (if window aggregate is supported) 821 | # This may not work depending on implementation - include if supported 822 | query II 823 | SELECT ts, datasketch_theta_estimate( 824 | datasketch_theta(user_id) OVER (ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) 825 | )::int 826 | FROM time_series 827 | ORDER BY ts; 828 | ---- 829 | 1 1 830 | 2 2 831 | 3 3 832 | 4 3 833 | 5 4 834 | 6 5 835 | 7 6 836 | 8 6 837 | 9 7 838 | 10 8 839 | -------------------------------------------------------------------------------- /test/sql/datasketch_frequent.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/datasketch_frequent.test 2 | # description: Test DataSketches Frequent Items Sketch 3 | # group: [datasketches] 4 | 5 | # Ensure the extension is loaded 6 | require datasketches 7 | 8 | # ------------------------------------------------------------------- 9 | # 1. Basic Estimates (Exact Mode) 10 | # ------------------------------------------------------------------- 11 | # For small datasets that fit entirely in the map, counts should be exact. 
12 | 13 | statement ok 14 | CREATE TABLE fruits(name VARCHAR); 15 | 16 | statement ok 17 | INSERT INTO fruits VALUES 18 | ('apple'), ('apple'), ('apple'), 19 | ('banana'), ('banana'), 20 | ('cherry'); 21 | 22 | # Build the sketch 23 | statement ok 24 | CREATE TABLE fruit_sketch AS SELECT datasketch_frequent_items(name) as sketch FROM fruits; 25 | 26 | # Check specific estimates 27 | query I 28 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM fruit_sketch; 29 | ---- 30 | 3 31 | 32 | query I 33 | SELECT datasketch_frequent_items_estimate(sketch, 'banana') FROM fruit_sketch; 34 | ---- 35 | 2 36 | 37 | query I 38 | SELECT datasketch_frequent_items_estimate(sketch, 'cherry') FROM fruit_sketch; 39 | ---- 40 | 1 41 | 42 | query I 43 | SELECT datasketch_frequent_items_estimate(sketch, 'dragonfruit') FROM fruit_sketch; 44 | ---- 45 | 0 46 | 47 | # ------------------------------------------------------------------- 48 | # 2. Get Frequent Items (Complex Return Type) 49 | # ------------------------------------------------------------------- 50 | # Returns: LIST(STRUCT(item VARCHAR, estimate BIGINT, lower_bound BIGINT, upper_bound BIGINT)) 51 | 52 | # We unnest the list to verify the contents easily 53 | query ITII 54 | SELECT 55 | f.item, 56 | f.estimate, 57 | f.lower_bound, 58 | f.upper_bound 59 | FROM fruit_sketch, 60 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 61 | ORDER BY f.estimate DESC; 62 | ---- 63 | apple 3 3 3 64 | banana 2 2 2 65 | cherry 1 1 1 66 | 67 | # ------------------------------------------------------------------- 68 | # 3. 
Custom K Parameter 69 | # ------------------------------------------------------------------- 70 | # Verify we can pass the lg_max_k parameter (log2 of map size) 71 | 72 | statement ok 73 | DROP TABLE fruit_sketch; 74 | 75 | # Create with lg_k = 4 (small map) 76 | statement ok 77 | CREATE TABLE fruit_sketch AS SELECT datasketch_frequent_items(4, name) as sketch FROM fruits; 78 | 79 | query I 80 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM fruit_sketch; 81 | ---- 82 | 3 83 | 84 | # ------------------------------------------------------------------- 85 | # 4. Merging Sketches 86 | # ------------------------------------------------------------------- 87 | 88 | statement ok 89 | CREATE TABLE logs_part1(ip VARCHAR); 90 | 91 | statement ok 92 | CREATE TABLE logs_part2(ip VARCHAR); 93 | 94 | statement ok 95 | INSERT INTO logs_part1 VALUES ('192.168.1.1'), ('192.168.1.1'), ('10.0.0.1'); 96 | 97 | statement ok 98 | INSERT INTO logs_part2 VALUES ('192.168.1.1'), ('10.0.0.5'); 99 | 100 | # Create a table of partial sketches 101 | statement ok 102 | CREATE TABLE partial_sketches(grp INT, sketch sketch_frequent_items); 103 | 104 | statement ok 105 | INSERT INTO partial_sketches VALUES 106 | (1, (SELECT datasketch_frequent_items(ip) FROM logs_part1)), 107 | (2, (SELECT datasketch_frequent_items(ip) FROM logs_part2)); 108 | 109 | # Merge them using the aggregate function 110 | # Total '192.168.1.1' count should be 2 + 1 = 3 111 | query I 112 | SELECT datasketch_frequent_items_estimate( 113 | datasketch_frequent_items(sketch), 114 | '192.168.1.1' 115 | ) 116 | FROM partial_sketches; 117 | ---- 118 | 3 119 | 120 | # ------------------------------------------------------------------- 121 | # 5. Heavy Hitters (Approximate Mode) 122 | # ------------------------------------------------------------------- 123 | # We generate a dataset where 'heavy_hitter' appears 100 times, 124 | # and 2000 random distinct items appear once. 
125 | # With a small K, the sketch should drop the singletons but keep the heavy hitter. 126 | 127 | statement ok 128 | CREATE TABLE stream(item VARCHAR); 129 | 130 | # Insert heavy hitter 100 times 131 | statement ok 132 | INSERT INTO stream SELECT 'heavy_hitter' FROM range(0, 100); 133 | 134 | # Insert 2000 noise items 135 | statement ok 136 | INSERT INTO stream SELECT 'noise_' || i::VARCHAR FROM range(0, 2000) t(i); 137 | 138 | # Use lg_k = 6 (Map size ~64). 139 | # This is too small to hold 2100 items, so it will purge the noise. 140 | statement ok 141 | CREATE TABLE stream_sketch AS SELECT datasketch_frequent_items(6, item) as sketch FROM stream; 142 | 143 | # 1. Check Heavy Hitter is found 144 | query I 145 | SELECT datasketch_frequent_items_estimate(sketch, 'heavy_hitter') >= 100 FROM stream_sketch; 146 | ---- 147 | true 148 | 149 | # 2. Check Noise is likely dropped (estimate 0 or very close to 0) 150 | # Note: exact behavior depends on the purge algorithm, but for lg_k=6, noise should be evicted. 151 | query I 152 | SELECT datasketch_frequent_items_estimate(sketch, 'noise_1') < 5 FROM stream_sketch; 153 | ---- 154 | true 155 | 156 | # 3. Get Frequent List - Should definitely contain heavy_hitter 157 | query T 158 | SELECT f.item 159 | FROM stream_sketch, 160 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 161 | WHERE f.item = 'heavy_hitter'; 162 | ---- 163 | heavy_hitter 164 | 165 | # ------------------------------------------------------------------- 166 | # 6. 
Error Type Enum Check 167 | # ------------------------------------------------------------------- 168 | 169 | # NO_FALSE_NEGATIVES usually returns more items (potentially including noise) 170 | # NO_FALSE_POSITIVES usually returns fewer items (stricter) 171 | 172 | query I 173 | SELECT ( 174 | (SELECT count(*) FROM stream_sketch, UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_NEGATIVES')) t(f)) 175 | >= 176 | (SELECT count(*) FROM stream_sketch, UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) t(f)) 177 | ); 178 | ---- 179 | true 180 | 181 | # ------------------------------------------------------------------- 182 | # 7. Edge Cases - Empty Tables and NULL Values 183 | # ------------------------------------------------------------------- 184 | 185 | statement ok 186 | CREATE TABLE empty_table(item VARCHAR); 187 | 188 | statement ok 189 | CREATE TABLE empty_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM empty_table; 190 | 191 | # Empty sketch should return 0 for any item 192 | query I 193 | SELECT datasketch_frequent_items_estimate(sketch, 'anything') FROM empty_sketch; 194 | ---- 195 | 0 196 | 197 | # Test NULL handling - NULLs should be skipped 198 | statement ok 199 | CREATE TABLE with_nulls(item VARCHAR); 200 | 201 | statement ok 202 | INSERT INTO with_nulls VALUES ('apple'), (NULL), ('apple'), (NULL), ('banana'); 203 | 204 | statement ok 205 | CREATE TABLE nulls_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM with_nulls; 206 | 207 | # Should only count non-NULL values 208 | query I 209 | SELECT datasketch_frequent_items_estimate(sketch, 'apple') FROM nulls_sketch; 210 | ---- 211 | 2 212 | 213 | query I 214 | SELECT datasketch_frequent_items_estimate(sketch, 'banana') FROM nulls_sketch; 215 | ---- 216 | 1 217 | 218 | # ------------------------------------------------------------------- 219 | # 8. 
Metadata Functions 220 | # ------------------------------------------------------------------- 221 | 222 | # Test epsilon (error bound) - should be related to map size 223 | query I 224 | SELECT datasketch_frequent_items_epsilon(sketch) > 0 FROM fruit_sketch; 225 | ---- 226 | true 227 | 228 | # Test total weight - should match total items 229 | query I 230 | SELECT datasketch_frequent_items_total_weight(sketch) FROM fruit_sketch; 231 | ---- 232 | 6 233 | 234 | # Test is_empty on non-empty sketch 235 | query I 236 | SELECT datasketch_frequent_items_is_empty(sketch) FROM fruit_sketch; 237 | ---- 238 | false 239 | 240 | # Test is_empty on empty sketch 241 | query I 242 | SELECT datasketch_frequent_items_is_empty(sketch) FROM empty_sketch; 243 | ---- 244 | true 245 | 246 | # Test num_active_items 247 | query I 248 | SELECT datasketch_frequent_items_num_active(sketch) FROM fruit_sketch; 249 | ---- 250 | 3 251 | 252 | # ------------------------------------------------------------------- 253 | # 9. 
Upper and Lower Bounds 254 | # ------------------------------------------------------------------- 255 | 256 | # In exact mode, bounds should equal the estimate 257 | query III 258 | SELECT 259 | datasketch_frequent_items_estimate(sketch, 'apple'), 260 | datasketch_frequent_items_lower_bound(sketch, 'apple'), 261 | datasketch_frequent_items_upper_bound(sketch, 'apple') 262 | FROM fruit_sketch; 263 | ---- 264 | 3 3 3 265 | 266 | # In approximate mode with heavy_hitter 267 | query I 268 | SELECT 269 | datasketch_frequent_items_upper_bound(sketch, 'heavy_hitter') >= 270 | datasketch_frequent_items_lower_bound(sketch, 'heavy_hitter') 271 | FROM stream_sketch; 272 | ---- 273 | true 274 | 275 | # Bounds should satisfy: lower <= estimate <= upper 276 | query I 277 | SELECT 278 | datasketch_frequent_items_lower_bound(sketch, 'heavy_hitter') <= 279 | datasketch_frequent_items_estimate(sketch, 'heavy_hitter') AND 280 | datasketch_frequent_items_estimate(sketch, 'heavy_hitter') <= 281 | datasketch_frequent_items_upper_bound(sketch, 'heavy_hitter') 282 | FROM stream_sketch; 283 | ---- 284 | true 285 | 286 | # ------------------------------------------------------------------- 287 | # 11. 
All Items Same Frequency 288 | # ------------------------------------------------------------------- 289 | 290 | statement ok 291 | CREATE TABLE uniform(item VARCHAR); 292 | 293 | statement ok 294 | INSERT INTO uniform VALUES ('a'), ('b'), ('c'), ('d'), ('e'); 295 | 296 | statement ok 297 | CREATE TABLE uniform_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM uniform; 298 | 299 | # All should have estimate = 1 300 | query I 301 | SELECT datasketch_frequent_items_estimate(sketch, 'a') = 302 | datasketch_frequent_items_estimate(sketch, 'b') AND 303 | datasketch_frequent_items_estimate(sketch, 'b') = 304 | datasketch_frequent_items_estimate(sketch, 'c') 305 | FROM uniform_sketch; 306 | ---- 307 | true 308 | 309 | # ------------------------------------------------------------------- 310 | # 12. Single Item Dataset 311 | # ------------------------------------------------------------------- 312 | 313 | statement ok 314 | CREATE TABLE singleton(item VARCHAR); 315 | 316 | statement ok 317 | INSERT INTO singleton VALUES ('only_one'); 318 | 319 | statement ok 320 | CREATE TABLE singleton_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM singleton; 321 | 322 | query I 323 | SELECT datasketch_frequent_items_estimate(sketch, 'only_one') FROM singleton_sketch; 324 | ---- 325 | 1 326 | 327 | query I 328 | SELECT datasketch_frequent_items_total_weight(sketch) FROM singleton_sketch; 329 | ---- 330 | 1 331 | 332 | query I 333 | SELECT datasketch_frequent_items_num_active(sketch) FROM singleton_sketch; 334 | ---- 335 | 1 336 | 337 | # ------------------------------------------------------------------- 338 | # 13. 
GROUP BY with Multiple Sketches 339 | # ------------------------------------------------------------------- 340 | 341 | statement ok 342 | CREATE TABLE events(category VARCHAR, event_type VARCHAR); 343 | 344 | statement ok 345 | INSERT INTO events VALUES 346 | ('web', 'click'), ('web', 'click'), ('web', 'view'), 347 | ('mobile', 'click'), ('mobile', 'swipe'), ('mobile', 'swipe'), ('mobile', 'swipe'); 348 | 349 | # Create sketches per category 350 | statement ok 351 | CREATE TABLE category_sketches AS 352 | SELECT 353 | category, 354 | datasketch_frequent_items(event_type) as sketch 355 | FROM events 356 | GROUP BY category; 357 | 358 | # Verify web category 359 | query I 360 | SELECT datasketch_frequent_items_estimate(sketch, 'click') 361 | FROM category_sketches 362 | WHERE category = 'web'; 363 | ---- 364 | 2 365 | 366 | # Verify mobile category 367 | query I 368 | SELECT datasketch_frequent_items_estimate(sketch, 'swipe') 369 | FROM category_sketches 370 | WHERE category = 'mobile'; 371 | ---- 372 | 3 373 | 374 | # ------------------------------------------------------------------- 375 | # 14. Serialization Persistence 376 | # ------------------------------------------------------------------- 377 | 378 | # Create a sketch, insert it, retrieve it, and verify it works 379 | statement ok 380 | CREATE TABLE sketch_storage(id INTEGER, sketch_data sketch_frequent_items); 381 | 382 | statement ok 383 | INSERT INTO sketch_storage 384 | SELECT 1, datasketch_frequent_items(name) FROM fruits; 385 | 386 | # Query from stored sketch 387 | query I 388 | SELECT datasketch_frequent_items_estimate(sketch_data, 'apple') 389 | FROM sketch_storage 390 | WHERE id = 1; 391 | ---- 392 | 3 393 | 394 | # ------------------------------------------------------------------- 395 | # 15. 
Very Large K Parameter 396 | # ------------------------------------------------------------------- 397 | 398 | statement ok 399 | CREATE TABLE large_k_data(item VARCHAR); 400 | 401 | statement ok 402 | INSERT INTO large_k_data VALUES ('a'), ('b'), ('c'); 403 | 404 | # Use lg_k = 20 (very large map: 2^20 = ~1M entries) 405 | statement ok 406 | CREATE TABLE large_k_sketch AS SELECT datasketch_frequent_items(20, item) as sketch FROM large_k_data; 407 | 408 | query I 409 | SELECT datasketch_frequent_items_estimate(sketch, 'a') FROM large_k_sketch; 410 | ---- 411 | 1 412 | 413 | # Epsilon should be very small for large K 414 | query I 415 | SELECT datasketch_frequent_items_epsilon(sketch) < 0.001 FROM large_k_sketch; 416 | ---- 417 | true 418 | 419 | # ------------------------------------------------------------------- 420 | # 16. Duplicate Item Streams 421 | # ------------------------------------------------------------------- 422 | 423 | statement ok 424 | CREATE TABLE duplicates(item VARCHAR); 425 | 426 | statement ok 427 | INSERT INTO duplicates SELECT 'repeated' FROM range(0, 1000); 428 | 429 | statement ok 430 | CREATE TABLE dup_sketch AS SELECT datasketch_frequent_items(item) as sketch FROM duplicates; 431 | 432 | query I 433 | SELECT datasketch_frequent_items_estimate(sketch, 'repeated') FROM dup_sketch; 434 | ---- 435 | 1000 436 | 437 | query I 438 | SELECT datasketch_frequent_items_total_weight(sketch) FROM dup_sketch; 439 | ---- 440 | 1000 441 | 442 | # Only 1 unique item 443 | query I 444 | SELECT datasketch_frequent_items_num_active(sketch) FROM dup_sketch; 445 | ---- 446 | 1 447 | 448 | # ------------------------------------------------------------------- 449 | # 17. 
Zipfian Distribution (Realistic Workload) 450 | # ------------------------------------------------------------------- 451 | # Create a Zipf-like distribution where a few items are very frequent 452 | 453 | statement ok 454 | CREATE TABLE zipf_data(item VARCHAR); 455 | 456 | statement ok 457 | INSERT INTO zipf_data 458 | SELECT 'rank_' || (i % 10)::VARCHAR 459 | FROM range(0, 1000) t(i) 460 | WHERE i % 2 = 0 -- 'rank_0' appears most 461 | UNION ALL 462 | SELECT 'rank_' || ((i % 100) / 10)::VARCHAR 463 | FROM range(0, 1000) t(i) 464 | WHERE i % 3 = 0; -- Add more skew 465 | 466 | statement ok 467 | CREATE TABLE zipf_sketch AS SELECT datasketch_frequent_items(8, item) as sketch FROM zipf_data; 468 | 469 | # Verify top items are captured 470 | query I 471 | SELECT count(*) > 0 472 | FROM zipf_sketch, 473 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 474 | WHERE f.item = 'rank_0'; 475 | ---- 476 | true 477 | 478 | # ------------------------------------------------------------------- 479 | # 18. 
Error Type Difference Verification 480 | # ------------------------------------------------------------------- 481 | 482 | statement ok 483 | CREATE TABLE error_test(item VARCHAR); 484 | 485 | statement ok 486 | INSERT INTO error_test 487 | SELECT 'freq_' || (i % 5)::VARCHAR FROM range(0, 100) t(i) 488 | UNION ALL 489 | SELECT 'rare_' || i::VARCHAR FROM range(0, 50) t(i); 490 | 491 | statement ok 492 | CREATE TABLE error_sketch AS SELECT datasketch_frequent_items(6, item) as sketch FROM error_test; 493 | 494 | # NO_FALSE_NEGATIVES should include items that might not be heavy hitters 495 | # NO_FALSE_POSITIVES should only include confirmed heavy hitters 496 | 497 | # Get counts for each error type 498 | query I 499 | SELECT count(*) > 0 500 | FROM error_sketch, 501 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_NEGATIVES')) as t(f); 502 | ---- 503 | true 504 | 505 | query I 506 | SELECT count(*) > 0 507 | FROM error_sketch, 508 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f); 509 | ---- 510 | true 511 | 512 | # ------------------------------------------------------------------- 513 | # 19. Non-Existent Item Queries 514 | # ------------------------------------------------------------------- 515 | 516 | # Querying items that were never added should return 0 517 | query I 518 | SELECT datasketch_frequent_items_estimate(sketch, 'never_existed') FROM fruit_sketch; 519 | ---- 520 | 0 521 | 522 | query II 523 | SELECT 524 | datasketch_frequent_items_lower_bound(sketch, 'never_existed'), 525 | datasketch_frequent_items_upper_bound(sketch, 'never_existed') 526 | FROM fruit_sketch; 527 | ---- 528 | 0 0 529 | 530 | 531 | # ------------------------------------------------------------------- 532 | # 10. 
Integer Type Support 533 | # ------------------------------------------------------------------- 534 | 535 | statement ok 536 | CREATE TABLE user_ids(id INTEGER); 537 | 538 | statement ok 539 | INSERT INTO user_ids VALUES (101), (101), (101), (202), (202), (303); 540 | 541 | statement ok 542 | CREATE TABLE int_sketch AS SELECT datasketch_frequent_items(id) as sketch FROM user_ids; 543 | 544 | query I 545 | SELECT datasketch_frequent_items_estimate(sketch, 101) FROM int_sketch; 546 | ---- 547 | 3 548 | 549 | query I 550 | SELECT datasketch_frequent_items_estimate(sketch, 202) FROM int_sketch; 551 | ---- 552 | 2 553 | 554 | query I 555 | SELECT datasketch_frequent_items_estimate(sketch, 303) FROM int_sketch; 556 | ---- 557 | 1 558 | 559 | # Get frequent integers 560 | query ITII 561 | SELECT 562 | f.item, 563 | f.estimate, 564 | f.lower_bound, 565 | f.upper_bound 566 | FROM int_sketch, 567 | UNNEST(datasketch_frequent_items_get_frequent(sketch, 'NO_FALSE_POSITIVES')) as t(f) 568 | ORDER BY f.estimate DESC; 569 | ---- 570 | 101 3 3 3 571 | 202 2 2 2 572 | 303 1 1 1 573 | 574 | # ------------------------------------------------------------------- 575 | # 20. BIGINT Support 576 | # ------------------------------------------------------------------- 577 | 578 | statement ok 579 | CREATE TABLE big_numbers(val BIGINT); 580 | 581 | statement ok 582 | INSERT INTO big_numbers VALUES 583 | (9223372036854775807), 584 | (9223372036854775807), 585 | (-9223372036854775808); 586 | 587 | statement ok 588 | CREATE TABLE bigint_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM big_numbers; 589 | 590 | query I 591 | SELECT datasketch_frequent_items_estimate(sketch, 9223372036854775807) FROM bigint_sketch; 592 | ---- 593 | 2 594 | 595 | query I 596 | SELECT datasketch_frequent_items_estimate(sketch, -9223372036854775808) FROM bigint_sketch; 597 | ---- 598 | 1 599 | 600 | # ------------------------------------------------------------------- 601 | # 21. 
TINYINT Support 602 | # ------------------------------------------------------------------- 603 | 604 | statement ok 605 | CREATE TABLE tiny_vals(val TINYINT); 606 | 607 | statement ok 608 | INSERT INTO tiny_vals VALUES (1), (1), (1), (2), (2), (-128), (127); 609 | 610 | statement ok 611 | CREATE TABLE tiny_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM tiny_vals; 612 | 613 | query I 614 | SELECT datasketch_frequent_items_estimate(sketch, 1::TINYINT) FROM tiny_sketch; 615 | ---- 616 | 3 617 | 618 | query I 619 | SELECT datasketch_frequent_items_estimate(sketch, (-128)::TINYINT) FROM tiny_sketch; 620 | ---- 621 | 1 622 | 623 | query I 624 | SELECT datasketch_frequent_items_estimate(sketch, 127::TINYINT) FROM tiny_sketch; 625 | ---- 626 | 1 627 | 628 | # ------------------------------------------------------------------- 629 | # 22. SMALLINT Support 630 | # ------------------------------------------------------------------- 631 | 632 | statement ok 633 | CREATE TABLE small_vals(val SMALLINT); 634 | 635 | statement ok 636 | INSERT INTO small_vals VALUES (1000), (1000), (-32768), (32767); 637 | 638 | statement ok 639 | CREATE TABLE small_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM small_vals; 640 | 641 | query I 642 | SELECT datasketch_frequent_items_estimate(sketch, 1000::SMALLINT) FROM small_sketch; 643 | ---- 644 | 2 645 | 646 | query I 647 | SELECT datasketch_frequent_items_estimate(sketch, (-32768)::SMALLINT) FROM small_sketch; 648 | ---- 649 | 1 650 | 651 | # ------------------------------------------------------------------- 652 | # 23. 
Unsigned Integer Types (UTINYINT, USMALLINT, UINTEGER, UBIGINT) 653 | # ------------------------------------------------------------------- 654 | 655 | statement ok 656 | CREATE TABLE unsigned_vals( 657 | ut UTINYINT, 658 | us USMALLINT, 659 | ui UINTEGER, 660 | ub UBIGINT 661 | ); 662 | 663 | statement ok 664 | INSERT INTO unsigned_vals VALUES 665 | (255, 65535, 4294967295, 18446744073709551615), 666 | (255, 65535, 4294967295, 18446744073709551615), 667 | (0, 0, 0, 0); 668 | 669 | # UTINYINT 670 | statement ok 671 | CREATE TABLE ut_sketch AS SELECT datasketch_frequent_items(ut) as sketch FROM unsigned_vals; 672 | 673 | query I 674 | SELECT datasketch_frequent_items_estimate(sketch, 255::UTINYINT) FROM ut_sketch; 675 | ---- 676 | 2 677 | 678 | query I 679 | SELECT datasketch_frequent_items_estimate(sketch, 0::UTINYINT) FROM ut_sketch; 680 | ---- 681 | 1 682 | 683 | # USMALLINT 684 | statement ok 685 | CREATE TABLE us_sketch AS SELECT datasketch_frequent_items(us) as sketch FROM unsigned_vals; 686 | 687 | query I 688 | SELECT datasketch_frequent_items_estimate(sketch, 65535::USMALLINT) FROM us_sketch; 689 | ---- 690 | 2 691 | 692 | # UINTEGER 693 | statement ok 694 | CREATE TABLE ui_sketch AS SELECT datasketch_frequent_items(ui) as sketch FROM unsigned_vals; 695 | 696 | query I 697 | SELECT datasketch_frequent_items_estimate(sketch, 4294967295::UINTEGER) FROM ui_sketch; 698 | ---- 699 | 2 700 | 701 | # UBIGINT 702 | statement ok 703 | CREATE TABLE ub_sketch AS SELECT datasketch_frequent_items(ub) as sketch FROM unsigned_vals; 704 | 705 | query I 706 | SELECT datasketch_frequent_items_estimate(sketch, 18446744073709551615::UBIGINT) FROM ub_sketch; 707 | ---- 708 | 2 709 | 710 | # ------------------------------------------------------------------- 711 | # 24. 
FLOAT Support 712 | # ------------------------------------------------------------------- 713 | 714 | statement ok 715 | CREATE TABLE float_vals(val FLOAT); 716 | 717 | statement ok 718 | INSERT INTO float_vals VALUES (3.14), (3.14), (3.14), (2.71), (-0.5); 719 | 720 | statement ok 721 | CREATE TABLE float_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM float_vals; 722 | 723 | query I 724 | SELECT datasketch_frequent_items_estimate(sketch, 3.14::FLOAT) FROM float_sketch; 725 | ---- 726 | 3 727 | 728 | query I 729 | SELECT datasketch_frequent_items_estimate(sketch, 2.71::FLOAT) FROM float_sketch; 730 | ---- 731 | 1 732 | 733 | query I 734 | SELECT datasketch_frequent_items_estimate(sketch, (-0.5)::FLOAT) FROM float_sketch; 735 | ---- 736 | 1 737 | 738 | # ------------------------------------------------------------------- 739 | # 25. DOUBLE Support 740 | # ------------------------------------------------------------------- 741 | 742 | statement ok 743 | CREATE TABLE double_vals(val DOUBLE); 744 | 745 | statement ok 746 | INSERT INTO double_vals VALUES 747 | (3.141592653589793), 748 | (3.141592653589793), 749 | (2.718281828459045), 750 | (1.7976931348623157e+308); 751 | 752 | statement ok 753 | CREATE TABLE double_sketch AS SELECT datasketch_frequent_items(val) as sketch FROM double_vals; 754 | 755 | query I 756 | SELECT datasketch_frequent_items_estimate(sketch, 3.141592653589793::DOUBLE) FROM double_sketch; 757 | ---- 758 | 2 759 | 760 | query I 761 | SELECT datasketch_frequent_items_estimate(sketch, 1.7976931348623157e+308::DOUBLE) FROM double_sketch; 762 | ---- 763 | 1 764 | 765 | # ------------------------------------------------------------------- 766 | # 26. 
Mixed Types in GROUP BY 767 | # ------------------------------------------------------------------- 768 | 769 | statement ok 770 | CREATE TABLE mixed_events(category VARCHAR, count_val INTEGER, amount DOUBLE); 771 | 772 | statement ok 773 | INSERT INTO mixed_events VALUES 774 | ('A', 100, 9.99), ('A', 100, 9.99), ('A', 200, 19.99), 775 | ('B', 100, 9.99), ('B', 300, 29.99), ('B', 300, 29.99); 776 | 777 | # Integer sketch per category 778 | statement ok 779 | CREATE TABLE mixed_int_sketch AS 780 | SELECT category, datasketch_frequent_items(count_val) as sketch 781 | FROM mixed_events GROUP BY category; 782 | 783 | query I 784 | SELECT datasketch_frequent_items_estimate(sketch, 100) 785 | FROM mixed_int_sketch WHERE category = 'A'; 786 | ---- 787 | 2 788 | 789 | query I 790 | SELECT datasketch_frequent_items_estimate(sketch, 300) 791 | FROM mixed_int_sketch WHERE category = 'B'; 792 | ---- 793 | 2 794 | 795 | # Double sketch per category 796 | statement ok 797 | CREATE TABLE mixed_dbl_sketch AS 798 | SELECT category, datasketch_frequent_items(amount) as sketch 799 | FROM mixed_events GROUP BY category; 800 | 801 | query I 802 | SELECT datasketch_frequent_items_estimate(sketch, 9.99::DOUBLE) 803 | FROM mixed_dbl_sketch WHERE category = 'A'; 804 | ---- 805 | 2 806 | 807 | # ------------------------------------------------------------------- 808 | # 27. 
Bounds Work for All Types 809 | # ------------------------------------------------------------------- 810 | 811 | query III 812 | SELECT 813 | datasketch_frequent_items_estimate(sketch, 255::UTINYINT), 814 | datasketch_frequent_items_lower_bound(sketch, 255::UTINYINT), 815 | datasketch_frequent_items_upper_bound(sketch, 255::UTINYINT) 816 | FROM ut_sketch; 817 | ---- 818 | 2 2 2 819 | 820 | query III 821 | SELECT 822 | datasketch_frequent_items_estimate(sketch, 3.14::FLOAT), 823 | datasketch_frequent_items_lower_bound(sketch, 3.14::FLOAT), 824 | datasketch_frequent_items_upper_bound(sketch, 3.14::FLOAT) 825 | FROM float_sketch; 826 | ---- 827 | 3 3 3 828 | 829 | -------------------------------------------------------------------------------- /codegen/generator.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, FileSystemLoader 2 | from typing import Any 3 | 4 | # Set up the Jinja2 environment 5 | env = Environment(loader=FileSystemLoader(searchpath="./codegen/"), autoescape=False) 6 | template = env.get_template("generated.cpp.j2") 7 | 8 | 9 | counting_sketch_names = ["CPC", "HLL"] 10 | 11 | logical_type_mapping = { 12 | "LogicalType::BOOLEAN": "bool", 13 | "LogicalType::TINYINT": "int8_t", 14 | "LogicalType::SMALLINT": "int16_t", 15 | "LogicalType::INTEGER": "int32_t", 16 | "LogicalType::BIGINT": "int64_t", 17 | "LogicalType::FLOAT": "float", 18 | "LogicalType::DOUBLE": "double", 19 | "LogicalType::UTINYINT": "uint8_t", 20 | "LogicalType::USMALLINT": "uint16_t", 21 | "LogicalType::UINTEGER": "uint32_t", 22 | "LogicalType::UBIGINT": "uint64_t", 23 | "LogicalType::VARCHAR": "string_t", 24 | } 25 | 26 | cpp_type_mapping = {value: key for key, value in logical_type_mapping.items()} 27 | 28 | 29 | def sketch_type_to_allowed_logical_types(sketch_type): 30 | if sketch_type in counting_sketch_names: 31 | return { 32 | "LogicalType::TINYINT": "int8_t", 33 | "LogicalType::SMALLINT": "int16_t", 34 | 
"LogicalType::INTEGER": "int32_t", 35 | "LogicalType::BIGINT": "int64_t", 36 | "LogicalType::FLOAT": "float", 37 | "LogicalType::DOUBLE": "double", 38 | "LogicalType::UTINYINT": "uint8_t", 39 | "LogicalType::USMALLINT": "uint16_t", 40 | "LogicalType::UINTEGER": "uint32_t", 41 | "LogicalType::UBIGINT": "uint64_t", 42 | "LogicalType::VARCHAR": "string_t", 43 | "LogicalType::BLOB": "string_t", 44 | } 45 | 46 | if sketch_type == "TDigest": 47 | return {"LogicalType::FLOAT": "float", "LogicalType::DOUBLE": "double"} 48 | 49 | return { 50 | "LogicalType::TINYINT": "int8_t", 51 | "LogicalType::SMALLINT": "int16_t", 52 | "LogicalType::INTEGER": "int32_t", 53 | "LogicalType::BIGINT": "int64_t", 54 | "LogicalType::FLOAT": "float", 55 | "LogicalType::DOUBLE": "double", 56 | "LogicalType::UTINYINT": "uint8_t", 57 | "LogicalType::USMALLINT": "uint16_t", 58 | "LogicalType::UINTEGER": "uint32_t", 59 | "LogicalType::UBIGINT": "uint64_t", 60 | } 61 | 62 | 63 | def get_sketch_class_name(sketch_type: str): 64 | if sketch_type == "TDigest": 65 | return "datasketches::tdigest" 66 | return f"datasketches::{sketch_type.lower()}_sketch" 67 | 68 | 69 | def unary_functions_per_sketch_type(sketch_type: str): 70 | if sketch_type not in counting_sketch_names: 71 | deserialize_sketch = f""" 72 | auto sketch = [&]() {{ 73 | try {{ 74 | return {get_sketch_class_name(sketch_type)}::deserialize(sketch_data.GetDataUnsafe(), sketch_data.GetSize()); 75 | }} catch (const std::exception &e) {{ 76 | throw InvalidInputException("Failed to deserialize {sketch_type} sketch: %s", e.what()); 77 | }} 78 | }}();""" 79 | else: 80 | deserialize_sketch = f""" 81 | auto sketch = [&]() {{ 82 | try {{ 83 | return {get_sketch_class_name(sketch_type)}::deserialize(sketch_data.GetDataUnsafe(), sketch_data.GetSize()); 84 | }} catch (const std::exception &e) {{ 85 | throw InvalidInputException("Failed to deserialize {sketch_type} sketch: %s", e.what()); 86 | }} 87 | }}();""" 88 | 89 | if sketch_type in 
counting_sketch_names: 90 | sketch_argument = { 91 | "cpp_type": "string_t", 92 | "duckdb_type": lambda contained_type: "sketch_type", 93 | "name": "sketch", 94 | "process": deserialize_sketch, 95 | } 96 | else: 97 | sketch_argument = { 98 | "cpp_type": "string_t", 99 | "duckdb_type": lambda contained_type: f"sketch_map_types[{contained_type.replace('LogicalType', 'LogicalTypeId')}]", 100 | "name": "sketch", 101 | "process": deserialize_sketch, 102 | } 103 | 104 | cdf_points_argument = { 105 | "cpp_type": "list_entry_t", 106 | "duckdb_type": lambda contained_type: f"LogicalType::LIST({contained_type})", 107 | "name": "split_points", 108 | "pre_executor": """ 109 | UnifiedVectorFormat unified_split_points; 110 | split_points_vector.ToUnifiedFormat(args.size(), unified_split_points); 111 | 112 | // auto split_points_list_entries = UnifiedVectorFormat::GetData(unified_split_points); 113 | // auto split_points_validitiy = FlatVector::Validity(split_points_vector); 114 | 115 | auto &split_points_list_children = ListVector::GetEntry(split_points_vector); 116 | 117 | UnifiedVectorFormat split_points_children_unified; 118 | split_points_list_children.ToUnifiedFormat(args.size(), split_points_children_unified); 119 | 120 | const T *split_points_list_children_data = UnifiedVectorFormat::GetData(split_points_children_unified); 121 | """, 122 | "process": """ 123 | std::vector passing_points(split_points_data.length); 124 | for (idx_t i = 0; i < split_points_data.length; i++) 125 | { 126 | passing_points[i] = split_points_list_children_data[i + split_points_data.offset]; 127 | } 128 | """, 129 | } 130 | pmf_points_argument = cdf_points_argument 131 | 132 | result = [ 133 | { 134 | "method": "return sketch.is_empty();", 135 | "name": "is_empty", 136 | "description": "Return a boolean indicating if the sketch is empty", 137 | "example": f"datasketch_{sketch_type.lower()}_is_empty(sketch)", 138 | "arguments": [ 139 | sketch_argument, 140 | ], 141 | "return_type": 
"LogicalType::BOOLEAN", 142 | }, 143 | ] 144 | 145 | if sketch_type not in counting_sketch_names: 146 | result.extend( 147 | [ 148 | { 149 | "method": "return sketch.get_k();", 150 | "arguments": [sketch_argument], 151 | "name": "k", 152 | "description": "Return the value of K for this sketch", 153 | "example": f"datasketch_{sketch_type.lower()}_k(sketch)", 154 | "return_type": "LogicalType::USMALLINT", 155 | }, 156 | { 157 | "name": "cdf", 158 | "description": "Return the Cumulative Distribution Function (CDF) of the sketch for a series of points", 159 | "example": f"datasketch_{sketch_type.lower()}_cdf(sketch, points, inclusive)" 160 | if sketch_type != "TDigest" 161 | else f"datasketch_{sketch_type.lower()}_cdf(sketch, points)", 162 | "method": ( 163 | "auto cdf_result = sketch.get_CDF(passing_points.data(), split_points_data.length, inclusive_data);" 164 | if sketch_type != "TDigest" 165 | else "auto cdf_result = sketch.get_CDF(passing_points.data(), split_points_data.length);" 166 | ) 167 | + """ 168 | auto current_size = ListVector::GetListSize(result); 169 | auto new_size = current_size + cdf_result.size(); 170 | if (ListVector::GetListCapacity(result) < new_size) 171 | { 172 | ListVector::Reserve(result, new_size); 173 | } 174 | 175 | auto &child_entry = ListVector::GetEntry(result); 176 | auto child_vals = FlatVector::GetData(child_entry); 177 | //auto &child_validity = FlatVector::Validity(child_entry); 178 | for (idx_t i = 0; i < cdf_result.size(); i++) 179 | { 180 | child_vals[current_size + i] = cdf_result[i]; 181 | } 182 | ListVector::SetListSize(result, new_size); 183 | return list_entry_t{current_size, cdf_result.size()}; 184 | """, 185 | "arguments": [ 186 | sketch_argument, 187 | cdf_points_argument, 188 | { 189 | "cpp_type": "bool", 190 | "name": "inclusive", 191 | }, 192 | ] 193 | if sketch_type != "TDigest" 194 | else [sketch_argument, cdf_points_argument], 195 | "return_type_dynamic_list": True, 196 | }, 197 | { 198 | "name": "pmf", 199 | 
"description": "Return the Probability Mass Function (PMF) of the sketch for a series of points", 200 | "example": f"datasketch_{sketch_type.lower()}_pmf(sketch, points, inclusive)" 201 | if sketch_type != "TDigest" 202 | else f"datasketch_{sketch_type.lower()}_pmf(sketch, points)", 203 | "method": ( 204 | "auto pmf_result = sketch.get_PMF(passing_points.data(), split_points_data.length, inclusive_data);" 205 | if sketch_type != "TDigest" 206 | else "auto pmf_result = sketch.get_PMF(passing_points.data(), split_points_data.length);" 207 | ) 208 | + """ 209 | auto current_size = ListVector::GetListSize(result); 210 | auto new_size = current_size + pmf_result.size(); 211 | if (ListVector::GetListCapacity(result) < new_size) 212 | { 213 | ListVector::Reserve(result, new_size); 214 | } 215 | 216 | auto &child_entry = ListVector::GetEntry(result); 217 | auto child_vals = FlatVector::GetData(child_entry); 218 | //auto &child_validity = FlatVector::Validity(child_entry); 219 | for (idx_t i = 0; i < pmf_result.size(); i++) 220 | { 221 | child_vals[current_size + i] = pmf_result[i]; 222 | } 223 | ListVector::SetListSize(result, new_size); 224 | return list_entry_t{current_size, pmf_result.size()}; 225 | """, 226 | "arguments": [ 227 | sketch_argument, 228 | pmf_points_argument, 229 | { 230 | "cpp_type": "bool", 231 | "name": "inclusive", 232 | }, 233 | ] 234 | if sketch_type != "TDigest" 235 | else [sketch_argument, pmf_points_argument], 236 | "return_type_dynamic_list": True, 237 | }, 238 | ] 239 | ) 240 | 241 | if sketch_type == "HLL": 242 | result.extend( 243 | [ 244 | { 245 | "method": "return StringVector::AddString(result, sketch.to_string(summary_data, detail_data, false, false));", 246 | "description": "Return a string representation of the sketch", 247 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_summary, include_detail)", 248 | "arguments": [ 249 | sketch_argument, 250 | { 251 | "cpp_type": "bool", 252 | "name": "summary", 253 | }, 254 
| { 255 | "cpp_type": "bool", 256 | "name": "detail", 257 | }, 258 | ], 259 | "name": "describe", 260 | "return_type": "LogicalType::VARCHAR", 261 | }, 262 | { 263 | "method": "return sketch.get_lg_config_k();", 264 | "description": "Return the value of log base 2 K for this sketch", 265 | "example": f"datasketch_{sketch_type.lower()}_lg_config_k(sketch)", 266 | "arguments": [ 267 | sketch_argument, 268 | ], 269 | "name": "lg_config_k", 270 | "return_type": "LogicalType::UTINYINT", 271 | }, 272 | { 273 | "method": "return sketch.is_compact();", 274 | "description": "Return whether the sketch is in compact form", 275 | "example": f"datasketch_{sketch_type.lower()}_is_compact(sketch)", 276 | "arguments": [ 277 | sketch_argument, 278 | ], 279 | "name": "is_compact", 280 | "return_type": "LogicalType::BOOLEAN", 281 | }, 282 | ] 283 | ) 284 | 285 | if sketch_type == "CPC": 286 | result.append( 287 | { 288 | "method": "return StringVector::AddString(result, sketch.to_string());", 289 | "description": "Return a string representation of the sketch", 290 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch)", 291 | "arguments": [ 292 | sketch_argument, 293 | ], 294 | "name": "describe", 295 | "return_type": "LogicalType::VARCHAR", 296 | }, 297 | ) 298 | 299 | if sketch_type in counting_sketch_names: 300 | result.extend( 301 | [ 302 | { 303 | "method": "return sketch.get_estimate();", 304 | "description": "Return the estimate of the number of distinct items seen by the sketch", 305 | "example": f"datasketch_{sketch_type.lower()}_estimate(sketch)", 306 | "arguments": [ 307 | sketch_argument, 308 | ], 309 | "name": "estimate", 310 | "return_type": "LogicalType::DOUBLE", 311 | }, 312 | { 313 | "description": "Return the lower bound of the number of distinct items seen by the sketch", 314 | "example": f"datasketch_{sketch_type.lower()}_lower_bound(sketch, std_dev)", 315 | "method": "return sketch.get_lower_bound(std_dev_data);", 316 | "arguments": [ 317 | 
sketch_argument, 318 | { 319 | "cpp_type": "uint8_t", 320 | "name": "std_dev", 321 | }, 322 | ], 323 | "name": "lower_bound", 324 | "return_type": "LogicalType::DOUBLE", 325 | }, 326 | { 327 | "description": "Return the upper bound of the number of distinct items seen by the sketch", 328 | "example": f"datasketch_{sketch_type.lower()}_upper_bound(sketch, std_dev)", 329 | "method": "return sketch.get_upper_bound(std_dev_data);", 330 | "arguments": [ 331 | sketch_argument, 332 | { 333 | "cpp_type": "uint8_t", 334 | "name": "std_dev", 335 | }, 336 | ], 337 | "name": "upper_bound", 338 | "return_type": "LogicalType::DOUBLE", 339 | }, 340 | ] 341 | ) 342 | 343 | if sketch_type == "TDigest": 344 | result.extend( 345 | [ 346 | { 347 | "description": "Return a description of this sketch", 348 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_centroids)", 349 | "method": "return StringVector::AddString(result, sketch.to_string(include_centroids_data));", 350 | "arguments": [ 351 | sketch_argument, 352 | { 353 | "cpp_type": "bool", 354 | "name": "include_centroids", 355 | }, 356 | ], 357 | "name": "describe", 358 | "return_type": "LogicalType::VARCHAR", 359 | }, 360 | { 361 | "description": "Return the rank of an item in the sketch", 362 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, item)", 363 | "method": "return sketch.get_rank(item_data);", 364 | "name": "rank", 365 | "arguments": [ 366 | sketch_argument, 367 | { 368 | "cpp_type_dynamic": True, 369 | "name": "item", 370 | }, 371 | ], 372 | "return_type": "LogicalType::DOUBLE", 373 | }, 374 | { 375 | "description": "Return the total weight of this sketch", 376 | "example": f"datasketch_{sketch_type.lower()}_total_weight(sketch)", 377 | "method": "return sketch.get_total_weight();", 378 | "name": "total_weight", 379 | "arguments": [ 380 | sketch_argument, 381 | ], 382 | "return_type": "LogicalType::UBIGINT", 383 | }, 384 | { 385 | "description": "Return the quantile of a rank in the 
sketch", 386 | "example": f"datasketch_{sketch_type.lower()}_quantile(sketch, rank)", 387 | "method": "return sketch.get_quantile(rank_data);", 388 | "name": "quantile", 389 | "arguments": [ 390 | sketch_argument, 391 | { 392 | "cpp_type": "double", 393 | "name": "rank", 394 | }, 395 | ], 396 | "dynamic_return_type": True, 397 | }, 398 | ] 399 | ) 400 | 401 | if sketch_type not in ("TDigest", "REQ", "HLL", "CPC"): 402 | result.extend( 403 | [ 404 | { 405 | "description": "Return the normalized rank error of the sketch", 406 | "example": f"datasketch_{sketch_type.lower()}_normalized_rank_error(sketch, is_pmf)", 407 | "method": "return sketch.get_normalized_rank_error(is_pmf_data);", 408 | "name": "normalized_rank_error", 409 | "arguments": [ 410 | sketch_argument, 411 | { 412 | "cpp_type": "bool", 413 | "name": "is_pmf", 414 | }, 415 | ], 416 | "return_type": "LogicalType::DOUBLE", 417 | }, 418 | ] 419 | ) 420 | 421 | if sketch_type != "TDigest" and sketch_type not in counting_sketch_names: 422 | result.extend( 423 | [ 424 | { 425 | "description": "Return a description of this sketch", 426 | "example": f"datasketch_{sketch_type.lower()}_describe(sketch, include_levels, include_items)", 427 | "method": "return StringVector::AddString(result, sketch.to_string(include_levels_data, include_items_data));", 428 | "arguments": [ 429 | sketch_argument, 430 | { 431 | "cpp_type": "bool", 432 | "name": "include_levels", 433 | }, 434 | { 435 | "cpp_type": "bool", 436 | "name": "include_items", 437 | }, 438 | ], 439 | "name": "describe", 440 | "return_type": "LogicalType::VARCHAR", 441 | }, 442 | { 443 | "description": "Return the rank of an item in the sketch", 444 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, item, inclusive)", 445 | "method": "return sketch.get_rank(item_data, inclusive_data);", 446 | "name": "rank", 447 | "arguments": [ 448 | sketch_argument, 449 | { 450 | "cpp_type_dynamic": True, 451 | "name": "item", 452 | }, 453 | { 454 | "cpp_type": 
"bool", 455 | "name": "inclusive", 456 | }, 457 | ], 458 | "return_type": "LogicalType::DOUBLE", 459 | }, 460 | { 461 | "description": "Return the quantile of a rank in the sketch", 462 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch, rank, inclusive)", 463 | "method": "return sketch.get_quantile(rank_data, inclusive_data);", 464 | "name": "quantile", 465 | "arguments": [ 466 | sketch_argument, 467 | { 468 | "cpp_type": "double", 469 | "name": "rank", 470 | }, 471 | { 472 | "cpp_type": "bool", 473 | "name": "inclusive", 474 | }, 475 | ], 476 | "dynamic_return_type": True, 477 | }, 478 | { 479 | "description": "Return the number of items contained in the sketch", 480 | "example": f"datasketch_{sketch_type.lower()}_rank(sketch)", 481 | "method": "return sketch.get_n();", 482 | "name": "n", 483 | "arguments": [ 484 | sketch_argument, 485 | ], 486 | "return_type": "LogicalType::UBIGINT", 487 | }, 488 | { 489 | "description": "Return a boolean indicating if the sketch is in estimation mode", 490 | "example": f"datasketch_{sketch_type.lower()}_is_estimation_mode(sketch)", 491 | "method": "return sketch.is_estimation_mode();", 492 | "name": "is_estimation_mode", 493 | "arguments": [ 494 | sketch_argument, 495 | ], 496 | "return_type": "LogicalType::BOOLEAN", 497 | }, 498 | { 499 | "description": "Return the number of retained items in the sketch", 500 | "example": f"datasketch_{sketch_type.lower()}_num_retained(sketch)", 501 | "method": "return sketch.get_num_retained();", 502 | "name": "num_retained", 503 | "arguments": [ 504 | sketch_argument, 505 | ], 506 | "return_type": "LogicalType::UBIGINT", 507 | }, 508 | { 509 | "description": "Return the minimum item in the sketch", 510 | "example": f"datasketch_{sketch_type.lower()}_min_item(sketch)", 511 | "method": "return sketch.get_min_item();", 512 | "name": "min_item", 513 | "arguments": [ 514 | sketch_argument, 515 | ], 516 | "dynamic_return_type": True, 517 | }, 518 | { 519 | "description": "Return the 
maxium item in the sketch", 520 | "example": f"datasketch_{sketch_type.lower()}_max_item(sketch)", 521 | "method": "return sketch.get_max_item();", 522 | "name": "max_item", 523 | "arguments": [sketch_argument], 524 | "dynamic_return_type": True, 525 | }, 526 | ] 527 | ) 528 | return result 529 | 530 | 531 | def get_executor_name(arguments: list) -> str: 532 | if len(arguments) == 1: 533 | return "UnaryExecutor" 534 | elif len(arguments) == 2: 535 | return "BinaryExecutor" 536 | elif len(arguments) == 3: 537 | return "TernaryExecutor" 538 | else: 539 | raise NotImplementedError(f"Unhandled number of arguments {len(arguments)}") 540 | 541 | 542 | def get_scalar_function_args( 543 | function_info: Any, logical_type: str, cpp_type: str 544 | ) -> str: 545 | input_parameters = [] 546 | for arg in function_info["arguments"]: 547 | if "duckdb_type" in arg: 548 | input_parameters.append(arg["duckdb_type"](logical_type)) 549 | elif "cpp_type_dynamic" in arg: 550 | input_parameters.append(logical_type) 551 | else: 552 | input_parameters.append(cpp_type_mapping[arg["cpp_type"]]) 553 | 554 | joined_input_parameters = ",".join(input_parameters) 555 | 556 | all_args = [f"{{{joined_input_parameters}}}"] 557 | 558 | if function_info.get("dynamic_return_type"): 559 | all_args.append(logical_type) 560 | elif function_info.get("return_type_dynamic_list"): 561 | all_args.append(f"LogicalType::LIST({logical_type})") 562 | else: 563 | all_args.append(function_info["return_type"]) 564 | 565 | return ",".join(all_args) 566 | 567 | 568 | def get_function_block(function_info: Any) -> str: 569 | cpp_types = [] 570 | for value in function_info["arguments"]: 571 | if value.get("cpp_type_dynamic"): 572 | cpp_types.append("T") 573 | else: 574 | cpp_types.append(value["cpp_type"]) 575 | 576 | if function_info.get("return_type_dynamic_list"): 577 | cpp_types.append("list_entry_t") 578 | elif function_info.get("dynamic_return_type"): 579 | cpp_types.append("T") 580 | else: 581 | 
cpp_types.append(logical_type_mapping[function_info["return_type"]]) 582 | 583 | joined_cpp_types = ",".join(cpp_types) 584 | 585 | executor_args = [] 586 | lambda_args = [] 587 | lambda_lines = [] 588 | pre_executor_lines = [] 589 | 590 | for argument in function_info["arguments"]: 591 | executor_args.append(f"{argument['name']}_vector") 592 | 593 | if argument.get("cpp_type_dynamic"): 594 | lambda_args.append(f"T {argument['name']}_data") 595 | else: 596 | lambda_args.append(f"{argument['cpp_type']} {argument['name']}_data") 597 | 598 | if "process" in argument: 599 | lambda_lines.append(argument["process"]) 600 | 601 | if "pre_executor" in argument: 602 | pre_executor_lines.append(argument["pre_executor"]) 603 | 604 | executor_args.append("result") 605 | executor_args.append("args.size()") 606 | 607 | joined_executor_args = ",".join(executor_args) 608 | joined_lambda_args = ",".join(lambda_args) 609 | 610 | lambda_lines.append(function_info["method"]) 611 | 612 | lambda_body = "\n".join(lambda_lines) 613 | pre_executor_body = "\n".join(pre_executor_lines) 614 | 615 | result = f""" 616 | {pre_executor_body} 617 | {get_executor_name(function_info["arguments"])}::Execute 618 | <{joined_cpp_types}> 619 | ( 620 | {joined_executor_args}, 621 | [&]({joined_lambda_args}) {{ 622 | 623 | {lambda_body} 624 | }});""" 625 | 626 | return result 627 | 628 | 629 | # Data to render the template 630 | data = { 631 | "sketch_class_name": get_sketch_class_name, 632 | "counting_sketch_names": counting_sketch_names, 633 | # "function_names_per_sketch": get_sketch_function_names, 634 | "sketch_types": ["Quantiles", "KLL", "REQ", "TDigest", "HLL", "CPC"], 635 | "logical_type_to_cplusplus_type": sketch_type_to_allowed_logical_types, 636 | "functions_per_sketch_type": unary_functions_per_sketch_type, 637 | "get_function_block": get_function_block, 638 | "get_scalar_function_args": get_scalar_function_args, 639 | "logical_type_mapping": logical_type_mapping, 640 | "to_type_id": lambda v: 
v.replace("LogicalType", "LogicalTypeId"), 641 | "sketch_k_cpp_type": { 642 | "Quantiles": "int32_t", 643 | "KLL": "int32_t", 644 | "REQ": "int32_t", 645 | "TDigest": "int32_t", 646 | "HLL": "int32_t", 647 | "CPC": "int32_t", 648 | }, 649 | "cpp_type_mapping": cpp_type_mapping, 650 | } 651 | 652 | 653 | # Render the template 654 | output = template.render(data) 655 | 656 | # Write the generated C++ code to a file 657 | with open("src/generated.cpp", "w") as f: 658 | f.write(output) 659 | 660 | print("C++ file generated successfully!") 661 | -------------------------------------------------------------------------------- /src/theta_sketch.cpp: -------------------------------------------------------------------------------- 1 | #include "datasketches_extension.hpp" 2 | #include "duckdb/function/scalar_function.hpp" 3 | #include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" 4 | #include "duckdb/parser/parsed_data/create_aggregate_function_info.hpp" 5 | 6 | // Apache DataSketches Headers 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace duckdb 13 | { 14 | namespace 15 | { 16 | // ============================================================ 17 | // 1. 
Helpers & Bind Data 18 | // ============================================================ 19 | 20 | struct DSThetaBindData : public FunctionData 21 | { 22 | DSThetaBindData() : lg_k(12) {} 23 | explicit DSThetaBindData(uint8_t lg_k) : lg_k(lg_k) {} 24 | 25 | unique_ptr Copy() const override 26 | { 27 | return make_uniq(lg_k); 28 | } 29 | 30 | bool Equals(const FunctionData &other_p) const override 31 | { 32 | auto &other = other_p.Cast(); 33 | return lg_k == other.lg_k; 34 | } 35 | 36 | uint8_t lg_k; 37 | }; 38 | 39 | unique_ptr DSThetaBindWithK(ClientContext &context, AggregateFunction &function, 40 | vector> &arguments) 41 | { 42 | if (arguments[0]->HasParameter()) 43 | throw ParameterNotResolvedException(); 44 | if (!arguments[0]->IsFoldable()) 45 | throw BinderException("Theta Sketch lg_k must be constant"); 46 | 47 | Value k_val = ExpressionExecutor::EvaluateScalar(context, *arguments[0]); 48 | if (k_val.IsNull()) 49 | throw BinderException("Theta Sketch lg_k cannot be NULL"); 50 | 51 | auto lg_k = (uint8_t)k_val.GetValue(); 52 | Function::EraseArgument(function, arguments, 0); 53 | return make_uniq(lg_k); 54 | } 55 | 56 | unique_ptr DSThetaBindDefault(ClientContext &context, AggregateFunction &function, 57 | vector> &arguments) 58 | { 59 | return make_uniq(12); 60 | } 61 | 62 | // ============================================================ 63 | // 2. 
State & Operations 64 | // ============================================================ 65 | 66 | struct DSThetaState 67 | { 68 | datasketches::update_theta_sketch *update_sketch = nullptr; 69 | datasketches::theta_union *union_sketch = nullptr; 70 | 71 | ~DSThetaState() 72 | { 73 | if (update_sketch) 74 | delete update_sketch; 75 | if (union_sketch) 76 | delete union_sketch; 77 | } 78 | 79 | void CreateUpdateSketch(uint8_t lg_k) 80 | { 81 | if (!update_sketch) 82 | { 83 | datasketches::update_theta_sketch::builder b; 84 | b.set_lg_k(lg_k); 85 | update_sketch = new datasketches::update_theta_sketch(b.build()); 86 | } 87 | } 88 | 89 | void CreateUnionSketch(uint8_t lg_k) 90 | { 91 | if (!union_sketch) 92 | { 93 | datasketches::theta_union::builder b; 94 | b.set_lg_k(lg_k); 95 | union_sketch = new datasketches::theta_union(b.build()); 96 | } 97 | } 98 | }; 99 | 100 | struct DSThetaOperationBase 101 | { 102 | template 103 | static void Initialize(STATE &state) 104 | { 105 | state.update_sketch = nullptr; 106 | state.union_sketch = nullptr; 107 | } 108 | template 109 | static void Destroy(STATE &state, AggregateInputData &) 110 | { 111 | if (state.update_sketch) 112 | delete state.update_sketch; 113 | if (state.union_sketch) 114 | delete state.union_sketch; 115 | } 116 | static bool IgnoreNull() { return true; } 117 | 118 | template 119 | static void Combine(const STATE &source, STATE &target, AggregateInputData &aggr_input_data) 120 | { 121 | if (!source.update_sketch && !source.union_sketch) 122 | return; 123 | 124 | if (!target.union_sketch) 125 | { 126 | auto &bind_data = aggr_input_data.bind_data->template Cast(); 127 | target.CreateUnionSketch(bind_data.lg_k); 128 | if (target.update_sketch) 129 | { 130 | target.union_sketch->update(*target.update_sketch); 131 | delete target.update_sketch; 132 | target.update_sketch = nullptr; 133 | } 134 | } 135 | if (source.update_sketch) 136 | target.union_sketch->update(*source.update_sketch); 137 | if (source.union_sketch) 
138 | target.union_sketch->update(source.union_sketch->get_result()); 139 | } 140 | 141 | template 142 | static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) 143 | { 144 | if (state.union_sketch) 145 | { 146 | auto compact = state.union_sketch->get_result(); 147 | auto serialized = compact.serialize(); 148 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 149 | } 150 | else if (state.update_sketch) 151 | { 152 | auto compact = state.update_sketch->compact(); 153 | auto serialized = compact.serialize(); 154 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 155 | } 156 | else 157 | { 158 | auto &bind_data = finalize_data.input.bind_data->template Cast(); 159 | datasketches::update_theta_sketch::builder b; 160 | b.set_lg_k(bind_data.lg_k); 161 | auto empty_sketch = b.build(); 162 | auto compact = empty_sketch.compact(); 163 | auto serialized = compact.serialize(); 164 | target = StringVector::AddStringOrBlob(finalize_data.result, std::string(serialized.begin(), serialized.end())); 165 | } 166 | } 167 | }; 168 | 169 | struct DSThetaCreateOperation : DSThetaOperationBase 170 | { 171 | template 172 | static void Operation(STATE &state, const A_TYPE &a_data, AggregateUnaryInput &idata) 173 | { 174 | auto &bind_data = idata.input.bind_data->template Cast(); 175 | state.CreateUpdateSketch(bind_data.lg_k); 176 | if constexpr (std::is_same_v) 177 | { 178 | state.update_sketch->update(a_data.GetData(), a_data.GetSize()); 179 | } 180 | else 181 | { 182 | state.update_sketch->update(a_data); 183 | } 184 | } 185 | 186 | template 187 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, idx_t count) 188 | { 189 | for (idx_t i = 0; i < count; i++) 190 | { 191 | Operation(state, input, unary_input); 192 | } 193 | } 194 | }; 195 | 196 | struct DSThetaMergeOperation : 
DSThetaOperationBase 197 | { 198 | template 199 | static void Operation(STATE &state, const A_TYPE &a_data, AggregateUnaryInput &idata) 200 | { 201 | auto &bind_data = idata.input.bind_data->template Cast(); 202 | state.CreateUnionSketch(bind_data.lg_k); 203 | auto sketch = datasketches::compact_theta_sketch::deserialize(a_data.GetDataUnsafe(), a_data.GetSize()); 204 | state.union_sketch->update(sketch); 205 | } 206 | 207 | template 208 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, idx_t count) 209 | { 210 | for (idx_t i = 0; i < count; i++) 211 | { 212 | Operation(state, input, unary_input); 213 | } 214 | } 215 | }; 216 | 217 | // ============================================================ 218 | // 3. Scalar Functions 219 | // ============================================================ 220 | 221 | static void DSThetaUnion(DataChunk &args, ExpressionState &state, Vector &result) 222 | { 223 | BinaryExecutor::Execute( 224 | args.data[0], args.data[1], result, args.size(), 225 | [&](string_t a_blob, string_t b_blob) 226 | { 227 | datasketches::theta_union::builder b; 228 | b.set_lg_k(12); 229 | auto union_obj = b.build(); 230 | auto sketch_a = datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 231 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 232 | union_obj.update(sketch_a); 233 | union_obj.update(sketch_b); 234 | auto res = union_obj.get_result(); 235 | auto serialized = res.serialize(); 236 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 237 | }); 238 | } 239 | 240 | static void DSThetaIntersect(DataChunk &args, ExpressionState &state, Vector &result) 241 | { 242 | BinaryExecutor::Execute( 243 | args.data[0], args.data[1], result, args.size(), 244 | [&](string_t a_blob, string_t b_blob) 245 | { 246 | auto sketch_a = 
datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 247 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 248 | datasketches::theta_intersection intersection; 249 | intersection.update(sketch_a); 250 | intersection.update(sketch_b); 251 | auto res = intersection.get_result(); 252 | auto serialized = res.serialize(); 253 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 254 | }); 255 | } 256 | 257 | static void DSThetaANotB(DataChunk &args, ExpressionState &state, Vector &result) 258 | { 259 | BinaryExecutor::Execute( 260 | args.data[0], args.data[1], result, args.size(), 261 | [&](string_t a_blob, string_t b_blob) 262 | { 263 | auto sketch_a = datasketches::compact_theta_sketch::deserialize(a_blob.GetDataUnsafe(), a_blob.GetSize()); 264 | auto sketch_b = datasketches::compact_theta_sketch::deserialize(b_blob.GetDataUnsafe(), b_blob.GetSize()); 265 | datasketches::theta_a_not_b a_not_b; 266 | auto res = a_not_b.compute(sketch_a, sketch_b); 267 | auto serialized = res.serialize(); 268 | return StringVector::AddStringOrBlob(result, std::string(serialized.begin(), serialized.end())); 269 | }); 270 | } 271 | 272 | static void DSThetaEstimate(DataChunk &args, ExpressionState &state, Vector &result) 273 | { 274 | UnaryExecutor::Execute(args.data[0], result, args.size(), 275 | [&](string_t sketch_blob) 276 | { 277 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_estimate(); 278 | }); 279 | } 280 | 281 | static void DSThetaLowerBound(DataChunk &args, ExpressionState &state, Vector &result) 282 | { 283 | BinaryExecutor::Execute(args.data[0], args.data[1], result, args.size(), 284 | [&](string_t sketch_blob, int32_t num_std_devs) 285 | { 286 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), 
sketch_blob.GetSize()).get_lower_bound(static_cast(num_std_devs)); 287 | }); 288 | } 289 | 290 | static void DSThetaUpperBound(DataChunk &args, ExpressionState &state, Vector &result) 291 | { 292 | BinaryExecutor::Execute(args.data[0], args.data[1], result, args.size(), 293 | [&](string_t sketch_blob, int32_t num_std_devs) 294 | { 295 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_upper_bound(static_cast(num_std_devs)); 296 | }); 297 | } 298 | 299 | static void DSThetaDescribe(DataChunk &args, ExpressionState &state, Vector &result) 300 | { 301 | UnaryExecutor::Execute(args.data[0], result, args.size(), 302 | [&](string_t sketch_blob) 303 | { 304 | return StringVector::AddString(result, datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).to_string(false)); 305 | }); 306 | } 307 | 308 | // --- METADATA FUNCTIONS 309 | 310 | static void DSThetaIsEmpty(DataChunk &args, ExpressionState &state, Vector &result) 311 | { 312 | UnaryExecutor::Execute(args.data[0], result, args.size(), 313 | [&](string_t sketch_blob) 314 | { 315 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).is_empty(); 316 | }); 317 | } 318 | 319 | static void DSThetaIsEstimation(DataChunk &args, ExpressionState &state, Vector &result) 320 | { 321 | UnaryExecutor::Execute(args.data[0], result, args.size(), 322 | [&](string_t sketch_blob) 323 | { 324 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).is_estimation_mode(); 325 | }); 326 | } 327 | 328 | static void DSThetaGetTheta(DataChunk &args, ExpressionState &state, Vector &result) 329 | { 330 | UnaryExecutor::Execute(args.data[0], result, args.size(), 331 | [&](string_t sketch_blob) 332 | { 333 | return datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_theta(); 334 | }); 
335 | } 336 | 337 | static void DSThetaNumRetained(DataChunk &args, ExpressionState &state, Vector &result) 338 | { 339 | UnaryExecutor::Execute(args.data[0], result, args.size(), 340 | [&](string_t sketch_blob) 341 | { 342 | return (int64_t)datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_num_retained(); 343 | }); 344 | } 345 | 346 | static void DSThetaGetSeed(DataChunk &args, ExpressionState &state, Vector &result) 347 | { 348 | // Note: Compact sketches typically store the Seed HASH, not the full seed. 349 | UnaryExecutor::Execute(args.data[0], result, args.size(), 350 | [&](string_t sketch_blob) 351 | { 352 | return (int64_t)datasketches::compact_theta_sketch::deserialize(sketch_blob.GetDataUnsafe(), sketch_blob.GetSize()).get_seed_hash(); 353 | }); 354 | } 355 | 356 | // ============================================================ 357 | // 4. Type Creation & Registration Helpers 358 | // ============================================================ 359 | 360 | static LogicalType CreateThetaSketchType(ExtensionLoader &loader) 361 | { 362 | auto new_type = LogicalType(LogicalTypeId::BLOB); 363 | auto new_type_name = "sketch_theta"; 364 | auto type_info = CreateTypeInfo(new_type_name, LogicalType::BLOB); 365 | type_info.temporary = false; 366 | type_info.internal = true; 367 | type_info.comment = "Sketch type for Theta Sketch"; 368 | new_type.SetAlias(new_type_name); 369 | 370 | auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance()); 371 | auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance()); 372 | system_catalog.CreateType(data, type_info); 373 | 374 | loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1); 375 | loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1); 376 | return new_type; 377 | } 378 | 379 | template 380 | static void RegisterThetaAggregates(AggregateFunctionSet 
&set, const LogicalType &input_type, const LogicalType &result_type) 381 | { 382 | auto fun_default = AggregateFunction::UnaryAggregateDestructor( 383 | input_type, result_type); 384 | fun_default.bind = DSThetaBindDefault; 385 | fun_default.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; 386 | set.AddFunction(fun_default); 387 | 388 | auto fun_with_k = AggregateFunction::UnaryAggregateDestructor( 389 | input_type, result_type); 390 | fun_with_k.bind = DSThetaBindWithK; 391 | fun_with_k.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; 392 | fun_with_k.arguments.insert(fun_with_k.arguments.begin(), LogicalType::INTEGER); 393 | set.AddFunction(fun_with_k); 394 | } 395 | } 396 | 397 | // ============================================================ 398 | // 5. Main Loader 399 | // ============================================================ 400 | 401 | void LoadThetaSketch(ExtensionLoader &loader) 402 | { 403 | auto sketch_type = CreateThetaSketchType(loader); 404 | AggregateFunctionSet sketch_agg("datasketch_theta"); 405 | 406 | // 1. RAW DATA - Register specific types 407 | // IMPORTANT: DO NOT register LogicalType::BLOB here! 408 | // If we do, it shadows the Merge operation for "sketch_theta". 409 | RegisterThetaAggregates(sketch_agg, LogicalType::TINYINT, sketch_type); 410 | RegisterThetaAggregates(sketch_agg, LogicalType::SMALLINT, sketch_type); 411 | RegisterThetaAggregates(sketch_agg, LogicalType::INTEGER, sketch_type); 412 | RegisterThetaAggregates(sketch_agg, LogicalType::BIGINT, sketch_type); 413 | RegisterThetaAggregates(sketch_agg, LogicalType::FLOAT, sketch_type); 414 | RegisterThetaAggregates(sketch_agg, LogicalType::DOUBLE, sketch_type); 415 | RegisterThetaAggregates(sketch_agg, LogicalType::VARCHAR, sketch_type); 416 | 417 | // 2. 
MERGE SKETCHES (sketch_theta / BLOB) 418 | auto fun_merge = AggregateFunction::UnaryAggregateDestructor( 419 | sketch_type, sketch_type); 420 | fun_merge.bind = DSThetaBindDefault; 421 | fun_merge.arguments = {sketch_type}; 422 | sketch_agg.AddFunction(fun_merge); 423 | 424 | auto fun_merge_k = AggregateFunction::UnaryAggregateDestructor( 425 | sketch_type, sketch_type); 426 | fun_merge_k.bind = DSThetaBindWithK; 427 | fun_merge_k.arguments = {LogicalType::INTEGER, sketch_type}; 428 | sketch_agg.AddFunction(fun_merge_k); 429 | 430 | { 431 | CreateAggregateFunctionInfo info(sketch_agg); 432 | FunctionDescription desc; 433 | desc.description = "Creates a Theta sketch for estimating set cardinality and performing set operations"; 434 | desc.examples.push_back("datasketch_theta(column)"); 435 | desc.examples.push_back("datasketch_theta(12, column)"); 436 | info.descriptions.push_back(desc); 437 | loader.RegisterFunction(info); 438 | } 439 | 440 | // --- SCALAR FUNCTIONS --- 441 | { 442 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_intersect", {sketch_type, sketch_type}, sketch_type, DSThetaIntersect)); 443 | FunctionDescription desc; 444 | desc.description = "Returns a new Theta sketch representing the intersection of two sketches"; 445 | desc.examples.push_back("datasketch_theta_intersect(sketch1, sketch2)"); 446 | info.descriptions.push_back(desc); 447 | loader.RegisterFunction(info); 448 | } 449 | { 450 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_union", {sketch_type, sketch_type}, sketch_type, DSThetaUnion)); 451 | FunctionDescription desc; 452 | desc.description = "Returns a new Theta sketch representing the union of two sketches"; 453 | desc.examples.push_back("datasketch_theta_union(sketch1, sketch2)"); 454 | info.descriptions.push_back(desc); 455 | loader.RegisterFunction(info); 456 | } 457 | { 458 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_a_not_b", {sketch_type, sketch_type}, sketch_type, 
DSThetaANotB)); 459 | FunctionDescription desc; 460 | desc.description = "Returns a new Theta sketch representing elements in sketch A but not in sketch B (set difference)"; 461 | desc.examples.push_back("datasketch_theta_a_not_b(sketch_a, sketch_b)"); 462 | info.descriptions.push_back(desc); 463 | loader.RegisterFunction(info); 464 | } 465 | { 466 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_estimate", {sketch_type}, LogicalType::DOUBLE, DSThetaEstimate)); 467 | FunctionDescription desc; 468 | desc.description = "Returns the estimated number of distinct values in the Theta sketch"; 469 | desc.examples.push_back("datasketch_theta_estimate(sketch)"); 470 | info.descriptions.push_back(desc); 471 | loader.RegisterFunction(info); 472 | } 473 | { 474 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_lower_bound", {sketch_type, LogicalType::INTEGER}, LogicalType::DOUBLE, DSThetaLowerBound)); 475 | FunctionDescription desc; 476 | desc.description = "Returns the lower bound estimate at the given number of standard deviations (1, 2, or 3)"; 477 | desc.examples.push_back("datasketch_theta_lower_bound(sketch, 2)"); 478 | info.descriptions.push_back(desc); 479 | loader.RegisterFunction(info); 480 | } 481 | { 482 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_upper_bound", {sketch_type, LogicalType::INTEGER}, LogicalType::DOUBLE, DSThetaUpperBound)); 483 | FunctionDescription desc; 484 | desc.description = "Returns the upper bound estimate at the given number of standard deviations (1, 2, or 3)"; 485 | desc.examples.push_back("datasketch_theta_upper_bound(sketch, 2)"); 486 | info.descriptions.push_back(desc); 487 | loader.RegisterFunction(info); 488 | } 489 | { 490 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_describe", {sketch_type}, LogicalType::VARCHAR, DSThetaDescribe)); 491 | FunctionDescription desc; 492 | desc.description = "Returns a human-readable description of the Theta sketch"; 493 | 
desc.examples.push_back("datasketch_theta_describe(sketch)"); 494 | info.descriptions.push_back(desc); 495 | loader.RegisterFunction(info); 496 | } 497 | 498 | // Metadata 499 | { 500 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_is_empty", {sketch_type}, LogicalType::BOOLEAN, DSThetaIsEmpty)); 501 | FunctionDescription desc; 502 | desc.description = "Returns true if the Theta sketch is empty"; 503 | desc.examples.push_back("datasketch_theta_is_empty(sketch)"); 504 | info.descriptions.push_back(desc); 505 | loader.RegisterFunction(info); 506 | } 507 | { 508 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_is_estimation_mode", {sketch_type}, LogicalType::BOOLEAN, DSThetaIsEstimation)); 509 | FunctionDescription desc; 510 | desc.description = "Returns true if the sketch is in estimation mode (has exceeded exact counting capacity)"; 511 | desc.examples.push_back("datasketch_theta_is_estimation_mode(sketch)"); 512 | info.descriptions.push_back(desc); 513 | loader.RegisterFunction(info); 514 | } 515 | { 516 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_get_theta", {sketch_type}, LogicalType::DOUBLE, DSThetaGetTheta)); 517 | FunctionDescription desc; 518 | desc.description = "Returns the theta value of the sketch (sampling probability)"; 519 | desc.examples.push_back("datasketch_theta_get_theta(sketch)"); 520 | info.descriptions.push_back(desc); 521 | loader.RegisterFunction(info); 522 | } 523 | { 524 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_num_retained", {sketch_type}, LogicalType::BIGINT, DSThetaNumRetained)); 525 | FunctionDescription desc; 526 | desc.description = "Returns the number of hash values retained in the sketch"; 527 | desc.examples.push_back("datasketch_theta_num_retained(sketch)"); 528 | info.descriptions.push_back(desc); 529 | loader.RegisterFunction(info); 530 | } 531 | { 532 | CreateScalarFunctionInfo info(ScalarFunction("datasketch_theta_get_seed", {sketch_type}, 
LogicalType::BIGINT, DSThetaGetSeed)); 533 | FunctionDescription desc; 534 | desc.description = "Returns the seed hash used by the sketch"; 535 | desc.examples.push_back("datasketch_theta_get_seed(sketch)"); 536 | info.descriptions.push_back(desc); 537 | loader.RegisterFunction(info); 538 | } 539 | } 540 | 541 | } 542 | -------------------------------------------------------------------------------- /codegen/generated.cpp.j2: -------------------------------------------------------------------------------- 1 | #include "datasketches_extension.hpp" 2 | 3 | 4 | #include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" 5 | #include "duckdb/parser/parsed_data/create_aggregate_function_info.hpp" 6 | #include "duckdb/function/scalar_function.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace duckdb 17 | { 18 | 19 | 20 | static std::string toLowerCase(const std::string& input) { 21 | std::string result = input; 22 | std::transform(result.begin(), result.end(), result.begin(), [](unsigned char c) { 23 | return std::tolower(c); 24 | }); 25 | return result; 26 | } 27 | 28 | {% for sketch_type in sketch_types %} 29 | 30 | 31 | struct DS{{sketch_type}}BindData : public FunctionData { 32 | DS{{sketch_type}}BindData() { 33 | } 34 | explicit DS{{sketch_type}}BindData({{sketch_k_cpp_type[sketch_type]}} k) : k(k) { 35 | } 36 | 37 | unique_ptr Copy() const override { 38 | return make_uniq(k); 39 | } 40 | 41 | bool Equals(const FunctionData &other_p) const override { 42 | auto &other = other_p.Cast(); 43 | return k == other.k; 44 | } 45 | 46 | {{sketch_k_cpp_type[sketch_type]}} k; 47 | }; 48 | 49 | 50 | unique_ptr DS{{sketch_type}}Bind(ClientContext &context, AggregateFunction &function, 51 | vector> &arguments) { 52 | if (arguments[0]->HasParameter()) { 53 | throw ParameterNotResolvedException(); 54 | } 55 | if (!arguments[0]->IsFoldable()) { 56 | throw BinderException("{{sketch_type}} can only take 
a constant K value"); 57 | } 58 | Value k_val = ExpressionExecutor::EvaluateScalar(context, *arguments[0]); 59 | if (k_val.IsNull()) { 60 | throw BinderException("{{sketch_type}} K value cannot be NULL"); 61 | } 62 | 63 | auto actual_k = k_val.GetValue<{{sketch_k_cpp_type[sketch_type]}}>(); 64 | 65 | {% if sketch_type in ["Quantiles", "KLL"] %} 66 | // Validate K parameter: must be in range (0, 32768] 67 | if (actual_k <= 0 || actual_k > 32768) { 68 | throw BinderException("{{sketch_type}} K value must be between 1 and 32768, got: " + std::to_string(actual_k)); 69 | } 70 | {% elif sketch_type == "REQ" %} 71 | // Validate K parameter: must be in range [4, 1024] 72 | if (actual_k < 4 || actual_k > 1024) { 73 | throw BinderException("REQ K value must be between 4 and 1024, got: " + std::to_string(actual_k)); 74 | } 75 | {% elif sketch_type == "TDigest" %} 76 | // Validate K parameter: must be positive (TDigest compression parameter) 77 | if (actual_k <= 0) { 78 | throw BinderException("TDigest K (compression) value must be positive, got: " + std::to_string(actual_k)); 79 | } 80 | {% elif sketch_type == "HLL" %} 81 | // Validate K parameter: lg_k must be in range [4, 21] for HLL 82 | if (actual_k < 4 || actual_k > 21) { 83 | throw BinderException("HLL K (lg_k) value must be between 4 and 21, got: " + std::to_string(actual_k)); 84 | } 85 | {% elif sketch_type == "CPC" %} 86 | // Validate K parameter: lg_k must be in range [4, 26] for CPC 87 | if (actual_k < 4 || actual_k > 26) { 88 | throw BinderException("CPC K (lg_k) value must be between 4 and 26, got: " + std::to_string(actual_k)); 89 | } 90 | {% endif %} 91 | 92 | Function::EraseArgument(function, arguments, 0); 93 | return make_uniq(actual_k); 94 | } 95 | 96 | 97 | {% if sketch_type not in counting_sketch_names %} 98 | template 99 | {% endif %} 100 | struct DS{{sketch_type}}State 101 | { 102 | {% if sketch_type in counting_sketch_names %} 103 | {{sketch_class_name(sketch_type)}} *sketch = nullptr; 104 | {% else %} 
{{sketch_class_name(sketch_type)}} *sketch = nullptr;
{% endif %}

// Free the heap-allocated sketch owned by this aggregate state.
~DS{{sketch_type}}State()
{
	// delete on a null pointer is a safe no-op, so no guard is needed.
	delete sketch;
}

{% if sketch_type in ["Quantiles", "KLL"] %}
// Allocate the underlying sketch; k controls accuracy/size.
void CreateSketch(int32_t k)
{
	D_ASSERT(!sketch);
	D_ASSERT(k > 0);
	D_ASSERT(k <= 32768);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "REQ" %}
void CreateSketch(int32_t k)
{
	D_ASSERT(!sketch);
	// REQ sketches only accept k in [4, 1024].
	D_ASSERT(k >= 4);
	D_ASSERT(k <= 1024);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "TDigest" %}
void CreateSketch(uint16_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "HLL" %}
void CreateSketch(uint16_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% elif sketch_type == "CPC" %}
void CreateSketch(uint8_t k)
{
	D_ASSERT(!sketch);
	sketch = new {{sketch_class_name(sketch_type)}}(k);
}
{% endif %}

// Copy-construct this state's sketch from another state; a source state that
// never materialized a sketch leaves this state empty as well.
// (The former counting/non-counting template branches emitted identical code,
// so a single unconditional definition is generated instead.)
void CreateSketch(const DS{{sketch_type}}State &existing)
{
	if (existing.sketch)
	{
		sketch = new {{sketch_class_name(sketch_type)}}(*existing.sketch);
	}
}

// Deserialize a sketch from a serialized BLOB value, converting any
// DataSketches failure into a DuckDB InvalidInputException.
// (Formerly duplicated verbatim in both template branches; collapsed.)
{{sketch_class_name(sketch_type)}} deserialize_sketch(const string_t &data)
{
	try {
		return {{sketch_class_name(sketch_type)}}::deserialize(data.GetDataUnsafe(), data.GetSize());
	} catch (const std::exception &e) {
		throw InvalidInputException("Failed to deserialize {{sketch_type}} sketch: %s", e.what());
	}
}
};


{% if sketch_type in counting_sketch_names %}
// Register the DuckDB logical type carrying serialized {{sketch_type}} sketches
// (an alias of BLOB) plus free reinterpret casts in both directions.
static LogicalType Create{{sketch_type}}CountingSketchType(ExtensionLoader &loader)
{
	auto new_type = LogicalType(LogicalTypeId::BLOB);
	auto new_type_name = "sketch_{{sketch_type|lower}}";
	auto type_info = CreateTypeInfo(new_type_name, LogicalType::BLOB);
	type_info.temporary = false;
	type_info.internal = true;
	type_info.comment = "Sketch type for {{sketch_type}} sketch";
	new_type.SetAlias(new_type_name);
	auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance());
	auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance());
	system_catalog.CreateType(data, type_info);
	// Alias and BLOB share the same physical layout, so casts are free.
	loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1);
	loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1);
	return new_type;
}
{% else %}
// Same as the counting variant, but the alias also encodes the embedded value
// type (e.g. "sketch_kll_double"), since these sketches are type-parameterized.
static LogicalType Create{{sketch_type}}SketchType(ExtensionLoader &loader, LogicalType embedded_type)
{
	auto new_type = LogicalType(LogicalTypeId::BLOB);
	auto type_suffix = toLowerCase(embedded_type.ToString());
	auto new_type_name = "sketch_{{sketch_type|lower}}_" + type_suffix;

	new_type.SetAlias(new_type_name);
	// NOTE(review): CreateTypeInfo receives the aliased type here, while the
	// counting variant passes plain LogicalType::BLOB — confirm intentional.
	auto type_info = CreateTypeInfo(new_type_name, new_type);
	type_info.temporary = false;
	type_info.internal = true;
	type_info.comment = "Sketch type for {{sketch_type}} sketch with embedded type " + embedded_type.ToString();
	auto &system_catalog = Catalog::GetSystemCatalog(loader.GetDatabaseInstance());
	auto data = CatalogTransaction::GetSystemTransaction(loader.GetDatabaseInstance());
	system_catalog.CreateType(data, type_info);
	loader.RegisterCastFunction(LogicalType::BLOB, new_type, DefaultCasts::ReinterpretCast, 1);
	loader.RegisterCastFunction(new_type, LogicalType::BLOB, DefaultCasts::ReinterpretCast, 1);
	return new_type;
}
{% endif %}

{%- endfor %}

// Shared pieces of every sketch aggregate: zero-initialize the state, free the
// sketch on destroy, and skip NULL inputs.
struct DSSketchOperationBase {
	template <class STATE>
	static void Initialize(STATE &state)
	{
		state.sketch = nullptr;
	}

	template <class STATE>
	static void Destroy(STATE &state, AggregateInputData &aggr_input_data) {
		if (state.sketch) {
			delete state.sketch;
			state.sketch = nullptr;
		}
	}

	static bool IgnoreNull() { return true; }
};

// Merges already-serialized quantile-family sketches (Quantiles/KLL/REQ/TDigest).
template <class BIND_DATA_TYPE>
struct DSQuantilesMergeOperation : DSSketchOperationBase
{
	template <class A_TYPE, class STATE, class OP>
	static void Operation(STATE &state,
	                      const A_TYPE &a_data,
	                      AggregateUnaryInput &idata)
	{
		if (!state.sketch)
		{
			// Lazily create the sketch with the bound "k" parameter.
			auto &bind_data = idata.input.bind_data->template Cast<BIND_DATA_TYPE>();
			state.CreateSketch(bind_data.k);
		}

		// this is a sketch in b_data, so we need to deserialize it.
262 | state.sketch->merge(state.deserialize_sketch(a_data)); 263 | } 264 | 265 | template 266 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 267 | idx_t count) { 268 | for (idx_t i = 0; i < count; i++) { 269 | Operation(state, input, unary_input); 270 | } 271 | } 272 | 273 | template 274 | static void Combine(const STATE &source, STATE &target, 275 | AggregateInputData &aggr_input_data) 276 | { 277 | if (!target.sketch) 278 | { 279 | target.CreateSketch(source); 280 | } 281 | else 282 | { 283 | target.sketch->merge(*source.sketch); 284 | } 285 | } 286 | 287 | template 288 | static void Finalize(STATE &state, T &target, 289 | AggregateFinalizeData &finalize_data) 290 | { 291 | if (!state.sketch) 292 | { 293 | finalize_data.ReturnNull(); 294 | } 295 | else 296 | { 297 | auto serialized_data = state.sketch->serialize(); 298 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 299 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 300 | } 301 | } 302 | }; 303 | 304 | template 305 | struct DSQuantilesCreateOperation : DSSketchOperationBase 306 | { 307 | template 308 | static void Operation(STATE &state, 309 | const A_TYPE &a_data, 310 | AggregateUnaryInput &idata) 311 | { 312 | if (!state.sketch) 313 | { 314 | auto &bind_data = idata.input.bind_data->template Cast(); 315 | state.CreateSketch(bind_data.k); 316 | } 317 | 318 | state.sketch->update(a_data); 319 | } 320 | 321 | template 322 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 323 | idx_t count) { 324 | for (idx_t i = 0; i < count; i++) { 325 | Operation(state, input, unary_input); 326 | } 327 | } 328 | 329 | template 330 | static void Combine(const STATE &source, STATE &target, 331 | AggregateInputData &aggr_input_data) 332 | { 333 | if (!target.sketch) 334 | { 335 | target.CreateSketch(source); 336 | } 337 | else 338 | { 339 | 
target.sketch->merge(*source.sketch); 340 | } 341 | } 342 | 343 | template 344 | static void Finalize(STATE &state, T &target, 345 | AggregateFinalizeData &finalize_data) 346 | { 347 | if (!state.sketch) 348 | { 349 | finalize_data.ReturnNull(); 350 | } 351 | else 352 | { 353 | auto serialized_data = state.sketch->serialize(); 354 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 355 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 356 | } 357 | } 358 | }; 359 | 360 | template 361 | struct DSHLLCreateOperation : DSSketchOperationBase 362 | { 363 | template 364 | static void Operation(STATE &state, 365 | const A_TYPE &a_data, 366 | AggregateUnaryInput &idata) 367 | { 368 | if (!state.sketch) 369 | { 370 | auto &bind_data = idata.input.bind_data->template Cast(); 371 | state.CreateSketch(bind_data.k); 372 | } 373 | 374 | if constexpr (std::is_same_v) { 375 | state.sketch->update(a_data.GetData(), a_data.GetSize()); 376 | } else { 377 | state.sketch->update(a_data); 378 | } 379 | } 380 | 381 | template 382 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 383 | idx_t count) { 384 | for (idx_t i = 0; i < count; i++) { 385 | Operation(state, input, unary_input); 386 | } 387 | } 388 | 389 | template 390 | static void Combine(const STATE &source, STATE &target, 391 | AggregateInputData &aggr_input_data) 392 | { 393 | if (!target.sketch) 394 | { 395 | target.CreateSketch(source); 396 | } 397 | else 398 | { 399 | datasketches::hll_union u(target.sketch->get_lg_config_k()); 400 | u.update(*target.sketch); 401 | if(source.sketch) { 402 | u.update(*source.sketch); 403 | } 404 | *target.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 405 | } 406 | } 407 | 408 | template 409 | static void Finalize(STATE &state, T &target, 410 | AggregateFinalizeData &finalize_data) 411 | { 412 | if (!state.sketch) 413 | { 414 | finalize_data.ReturnNull(); 415 | } 
416 | else 417 | { 418 | auto serialized_data = state.sketch->serialize_updatable(); 419 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 420 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 421 | } 422 | } 423 | }; 424 | 425 | template 426 | struct DSHLLMergeOperation : DSSketchOperationBase 427 | { 428 | 429 | template 430 | static void Operation(STATE &state, 431 | const A_TYPE &a_data, 432 | AggregateUnaryInput &idata) 433 | { 434 | auto &bind_data = idata.input.bind_data->template Cast(); 435 | 436 | if (!state.sketch) 437 | { 438 | state.CreateSketch(bind_data.k); 439 | } 440 | 441 | auto a_sketch = state.deserialize_sketch(a_data); 442 | 443 | datasketches::hll_union u(bind_data.k); 444 | if(state.sketch) { 445 | u.update(*state.sketch); 446 | } 447 | u.update(a_sketch); 448 | 449 | *state.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 450 | } 451 | 452 | template 453 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 454 | idx_t count) { 455 | for (idx_t i = 0; i < count; i++) { 456 | Operation(state, input, unary_input); 457 | } 458 | } 459 | 460 | template 461 | static void Combine(const STATE &source, STATE &target, 462 | AggregateInputData &aggr_input_data) 463 | { 464 | if (!target.sketch) 465 | { 466 | target.CreateSketch(source); 467 | } 468 | else 469 | { 470 | datasketches::hll_union u(target.sketch->get_lg_config_k()); 471 | if(source.sketch) { 472 | u.update(*source.sketch); 473 | } 474 | u.update(*target.sketch); 475 | 476 | *target.sketch = u.get_result(datasketches::target_hll_type::HLL_4); 477 | } 478 | } 479 | 480 | template 481 | static void Finalize(STATE &state, T &target, 482 | AggregateFinalizeData &finalize_data) 483 | { 484 | if (!state.sketch) 485 | { 486 | finalize_data.ReturnNull(); 487 | } 488 | else 489 | { 490 | auto serialized_data = state.sketch->serialize_updatable(); 491 | auto sketch_string = 
std::string(serialized_data.begin(), serialized_data.end()); 492 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 493 | } 494 | } 495 | }; 496 | 497 | 498 | template 499 | struct DSCPCMergeOperation : DSSketchOperationBase 500 | { 501 | template 502 | static void Operation(STATE &state, 503 | const A_TYPE &a_data, 504 | AggregateUnaryInput &idata) 505 | { 506 | auto &bind_data = idata.input.bind_data->template Cast(); 507 | 508 | if (!state.sketch) 509 | { 510 | state.CreateSketch(bind_data.k); 511 | } 512 | 513 | auto a_sketch = state.deserialize_sketch(a_data); 514 | datasketches::cpc_union u(bind_data.k); 515 | if(state.sketch) { 516 | u.update(*state.sketch); 517 | } 518 | u.update(a_sketch); 519 | 520 | *state.sketch = u.get_result(); 521 | } 522 | 523 | template 524 | static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input, 525 | idx_t count) { 526 | for (idx_t i = 0; i < count; i++) { 527 | Operation(state, input, unary_input); 528 | } 529 | } 530 | 531 | template 532 | static void Combine(const STATE &source, STATE &target, 533 | AggregateInputData &aggr_input_data) 534 | { 535 | if (!target.sketch) 536 | { 537 | target.CreateSketch(source); 538 | } 539 | else 540 | { 541 | datasketches::cpc_union u(target.sketch->get_lg_k()); 542 | if(source.sketch) { 543 | u.update(*source.sketch); 544 | } 545 | u.update(*target.sketch); 546 | *target.sketch = u.get_result(); 547 | } 548 | } 549 | 550 | template 551 | static void Finalize(STATE &state, T &target, 552 | AggregateFinalizeData &finalize_data) 553 | { 554 | if (!state.sketch) 555 | { 556 | finalize_data.ReturnNull(); 557 | } 558 | else 559 | { 560 | auto serialized_data = state.sketch->serialize(); 561 | auto sketch_string = std::string(serialized_data.begin(), serialized_data.end()); 562 | target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string); 563 | } 564 | } 565 | }; 566 | 567 | template 568 | struct 
DSCPCCreateOperation : DSSketchOperationBase
{
	template <class A_TYPE, class STATE, class OP>
	static void Operation(STATE &state,
	                      const A_TYPE &a_data,
	                      AggregateUnaryInput &idata)
	{
		if (!state.sketch)
		{
			auto &bind_data = idata.input.bind_data->template Cast<BIND_DATA_TYPE>();
			state.CreateSketch(bind_data.k);
		}

		if constexpr (std::is_same_v<A_TYPE, string_t>) {
			// Feed the raw string bytes, not the string_t wrapper object.
			state.sketch->update(a_data.GetData(), a_data.GetSize());
		} else {
			state.sketch->update(a_data);
		}
	}

	template <class INPUT_TYPE, class STATE, class OP>
	static void ConstantOperation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input,
	                              idx_t count) {
		for (idx_t i = 0; i < count; i++) {
			Operation<INPUT_TYPE, STATE, OP>(state, input, unary_input);
		}
	}

	template <class STATE, class OP>
	static void Combine(const STATE &source, STATE &target,
	                    AggregateInputData &aggr_input_data)
	{
		if (!target.sketch)
		{
			target.CreateSketch(source);
		}
		else
		{
			// cpc_sketch has no in-place merge; route both through a cpc_union.
			datasketches::cpc_union u(target.sketch->get_lg_k());
			u.update(*target.sketch);
			if (source.sketch) {
				u.update(*source.sketch);
			}
			*target.sketch = u.get_result();
		}
	}

	template <class T, class STATE>
	static void Finalize(STATE &state, T &target,
	                     AggregateFinalizeData &finalize_data)
	{
		if (!state.sketch)
		{
			finalize_data.ReturnNull();
		}
		else
		{
			auto serialized_data = state.sketch->serialize();
			auto sketch_string = std::string(serialized_data.begin(), serialized_data.end());
			target = StringVector::AddStringOrBlob(finalize_data.result, sketch_string);
		}
	}
};


{% for sketch_type in sketch_types %}
{% for unary_function in functions_per_sketch_type(sketch_type) %}

{% if sketch_type not in counting_sketch_names %}
template <class T>
{% endif %}
// Scalar entry point for datasketch_{{sketch_type|lower}}_{{unary_function.name|lower}}.
static inline void DS{{sketch_type}}{{unary_function.name}}(DataChunk &args, ExpressionState &state, Vector &result)
{
	// Get the references to the incoming vectors.
	D_ASSERT(args.ColumnCount() == {{unary_function.arguments|length}});

	{% for a in unary_function.arguments %}
	auto &{{a.name}}_vector = args.data[{{loop.index0}}];
	{%- endfor %}

	{{ get_function_block(unary_function) }}
}

{% endfor %}


{% if sketch_type not in counting_sketch_names %}
template <class T>
auto static DS{{sketch_type}}MergeAggregate(const LogicalType &type, const LogicalType &result_type) -> AggregateFunction
{% else %}
auto static DS{{sketch_type}}MergeAggregate(const LogicalType &result_type) -> AggregateFunction
{% endif %}
{
	{% if sketch_type == "HLL" %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, string_t, string_t, DSHLLMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% elif sketch_type == "CPC" %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, string_t, string_t, DSCPCMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% else %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State<T>, string_t, string_t, DSQuantilesMergeOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    result_type, result_type);
	{% endif %}
}


template <class T>
auto static DS{{sketch_type}}CreateAggregate(const LogicalType &type, const LogicalType &result_type) -> AggregateFunction
{
	{% if sketch_type == 'HLL' %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, T, string_t, DSHLLCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% elif sketch_type == 'CPC' %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State, T, string_t, DSCPCCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% else %}
	return AggregateFunction::UnaryAggregateDestructor<DS{{sketch_type}}State<T>, T, string_t, DSQuantilesCreateOperation<DS{{sketch_type}}BindData>, AggregateDestructorType::LEGACY>(
	    type, result_type);
	{% endif %}
}


// Registers the logical type, scalar functions, and aggregates for {{sketch_type}}.
void Load{{sketch_type}}Sketch(ExtensionLoader &loader) {

	{% if sketch_type in counting_sketch_names %}
	auto sketch_type = Create{{sketch_type}}CountingSketchType(loader);
	{% else %}
	// One alias type per supported embedded value type.
	std::unordered_map<LogicalTypeId, LogicalType> sketch_map_types;
	{% for logical_type in logical_type_to_cplusplus_type(sketch_type).keys() %}
	sketch_map_types.insert({ {{to_type_id(logical_type)}}, Create{{sketch_type}}SketchType(loader, LogicalType({{to_type_id(logical_type)}}))});
	{%- endfor %}
	{% endif %}

	{% for unary_function in functions_per_sketch_type(sketch_type) %}
	{
		ScalarFunctionSet fs("datasketch_{{sketch_type|lower}}_{{unary_function.name|lower}}");
		{% if sketch_type in counting_sketch_names %}
		fs.AddFunction(ScalarFunction(
		    {{get_scalar_function_args(unary_function, None, None)}}
		    , DS{{sketch_type}}{{unary_function.name}}));
		{% else %}
		{% for logical_type, cpp_type in logical_type_to_cplusplus_type(sketch_type).items() %}
		fs.AddFunction(ScalarFunction(
		    {{get_scalar_function_args(unary_function, logical_type, cpp_type)}}
		    , DS{{sketch_type}}{{unary_function.name}}<{{cpp_type}}>));
		{%- endfor %}
		{% endif %}

		CreateScalarFunctionInfo info(std::move(fs));

		{
			FunctionDescription desc;
			desc.description = "{{unary_function.description}}";
			desc.examples.push_back("{{unary_function.example}}");
			info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(info);
	}
	{%- endfor %}

	// This function creates the sketches.
	{
		AggregateFunctionSet sketch("datasketch_{{sketch_type|lower}}");
		{% for logical_type, cpp_type in logical_type_to_cplusplus_type(sketch_type).items() %}
		{% if sketch_type in counting_sketch_names %}
		{
			auto fun = DS{{sketch_type}}CreateAggregate<{{cpp_type}}>({{logical_type}}, sketch_type);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			// Prepend the leading "k" parameter shared by every overload.
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{% else %}
		{
			auto fun = DS{{sketch_type}}CreateAggregate<{{cpp_type}}>({{logical_type}}, sketch_map_types[{{to_type_id(logical_type)}}]);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{
			auto fun = DS{{sketch_type}}MergeAggregate<{{cpp_type}}>({{logical_type}}, sketch_map_types[{{to_type_id(logical_type)}}]);
			fun.bind = DS{{sketch_type}}Bind;
			fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
			fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
			sketch.AddFunction(fun);
		}
		{% endif %}
		{%- endfor %}
		CreateAggregateFunctionInfo sketch_info(sketch);

		{
			FunctionDescription desc;
			desc.description = "Creates a sketch_{{sketch_type|lower}} data sketch by aggregating values or by aggregating other {{sketch_type}} data sketches";
			desc.examples.push_back("datasketch_{{sketch_type|lower}}(k, data)");
			sketch_info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(sketch_info);
	}


	{% if sketch_type in counting_sketch_names %}
	// Counting sketches additionally get a dedicated union aggregate.
	{
		AggregateFunctionSet sketch("datasketch_{{sketch_type|lower}}_union");
		auto fun = DS{{sketch_type}}MergeAggregate(sketch_type);
		fun.bind = DS{{sketch_type}}Bind;
		fun.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT;
		fun.arguments.insert(fun.arguments.begin(), {{cpp_type_mapping[sketch_k_cpp_type[sketch_type]]}});
		sketch.AddFunction(fun);
		CreateAggregateFunctionInfo sketch_info(sketch);

		{
			FunctionDescription desc;
			// FIX: use the lowered alias name for consistency with the main
			// aggregate's description above.
			desc.description = "Creates a sketch_{{sketch_type|lower}} data sketch by aggregating other {{sketch_type}} data sketches";
			desc.examples.push_back("datasketch_{{sketch_type|lower}}_union(k, data)");
			sketch_info.descriptions.push_back(desc);
		}

		loader.RegisterFunction(sketch_info);
	}
	{% endif %}

}
{%- endfor %}

} // end of enclosing scope (presumably namespace duckdb — confirm)