├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── MainDistributionPipeline.yml │ └── schedule-1.2.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── docs └── space-filling-curve-ducks.jpg ├── duckdb_lindel_rust ├── Cargo.lock ├── Cargo.toml ├── cbindgen.toml └── src │ └── lib.rs ├── extension_config.cmake ├── scripts ├── bootstrap-template.py └── extension-upload.sh ├── src ├── include │ ├── lindel_extension.hpp │ └── rust.h └── lindel_extension.cpp ├── test ├── README.md └── sql │ └── lindel.test └── vcpkg.json /.editorconfig: -------------------------------------------------------------------------------- 1 | # Unix-style newlines with a newline ending every file 2 | [*.{c,cpp,h,hpp}] 3 | end_of_line = lf 4 | insert_final_newline = true 5 | indent_style = tab 6 | tab_width = 4 7 | indent_size = tab 8 | trim_trailing_whitespace = true 9 | charset = utf-8 10 | max_line_length = 120 11 | x-soft-wrap-text = true 12 | x-soft-wrap-mode = CharacterWidth 13 | x-soft-wrap-limit = 120 14 | x-show-invisibles = false 15 | x-show-spaces = false 16 | 17 | [*.{java}] 18 | end_of_line = lf 19 | insert_final_newline = true 20 | indent_style = tab 21 | tab_width = 4 22 | indent_size = tab 23 | trim_trailing_whitespace = false 24 | charset = utf-8 25 | max_line_length = 120 26 | x-soft-wrap-text = true 27 | x-soft-wrap-mode = CharacterWidth 28 | x-soft-wrap-limit = 120 29 | x-show-invisibles = false 30 | x-show-spaces = false 31 | 32 | [*.{test,test_slow,test_coverage,benchmark}] 33 | end_of_line = lf 34 | insert_final_newline = true 35 | indent_style = tab 36 | tab_width = 4 37 | indent_size = tab 38 | trim_trailing_whitespace = false 39 | charset = utf-8 40 | x-soft-wrap-text = false 41 | 42 | [Makefile] 43 | end_of_line = lf 44 | insert_final_newline = true 45 | indent_style = tab 46 | tab_width = 4 47 | indent_size = tab 48 | trim_trailing_whitespace = true 49 | charset = utf-8 50 | x-soft-wrap-text = false 51 | 
52 | [*keywords.list] 53 | insert_final_newline = false 54 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/duckdb_lindel_rust" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | schedule: 10 | - cron: '0 2 * * *' # Runs every night at 02:00 UTC 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | # duckdb-next-build: 18 | # name: Build extension binaries 19 | # uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 20 | # with: 21 | # duckdb_version: main 22 | # ci_tools_version: main 23 | # enable_rust: true 24 | # extension_name: lindel 25 | # exclude_archs: "windows_amd64_rtools" 26 | 27 | duckdb-stable-build: 28 | name: Build extension binaries 29 | uses: 
duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 30 | with: 31 | duckdb_version: main 32 | ci_tools_version: main 33 | extension_name: lindel 34 | enable_rust: true 35 | exclude_archs: "windows_amd64_rtools;wasm_mvp;wasm_eh;wasm_threads" 36 | -------------------------------------------------------------------------------- /.github/workflows/schedule-1.2.yml: -------------------------------------------------------------------------------- 1 | name: Scheduled Trigger for 1.2 2 | 3 | on: 4 | schedule: 5 | - cron: '0 12 * * *' # Runs at 12:00 UTC every day 6 | workflow_dispatch: # Allows manual trigger 7 | 8 | jobs: 9 | trigger: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | actions: write # Allow triggering workflows 13 | steps: 14 | - name: Checkout repository # Required for gh to work 15 | uses: actions/checkout@v4 16 | 17 | - name: Install GitHub CLI 18 | run: | 19 | sudo apt update && sudo apt install gh -y 20 | 21 | - name: Authenticate GH CLI 22 | run: | 23 | echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token 24 | 25 | - name: Trigger Workflow on my-branch 26 | run: | 27 | gh workflow run MainDistributionPipeline.yml --ref v1.2 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | set(CORROSION_VERBOSE_OUTPUT ON) 4 | set(CMAKE_CXX_STANDARD 11) 5 | set(CMAKE_CXX_STANDARD_REQUIRED 1) 6 | 7 | 8 | set(prefix_to_check "wasm") 9 | # Get the length of the prefix 10 | string(LENGTH "${prefix_to_check}" prefix_length) 11 | # Extract the leading characters of DUCKDB_PLATFORM, same length as the prefix 12 | string(SUBSTRING "${DUCKDB_PLATFORM}" 0 ${prefix_length} extracted_platform_prefix) 13 | 14 | # Ask rustup which targets are installed; the output is searched below for musl and wasm targets 15 | execute_process( 16 | COMMAND rustup target list --installed 17 | OUTPUT_VARIABLE RUST_TARGETS 18 | ) 19 | 20 | # Propagate arch to rust build for CI 21 | set(Rust_CARGO_TARGET "") 22 | if("${OS_NAME}" STREQUAL "linux") 23 | if ("${OS_ARCH}" STREQUAL "arm64") 24 | set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") 25 | elseif("${CMAKE_CXX_COMPILER}" MATCHES "aarch64") 26 | set(RUST_ENV_VARS ${RUST_ENV_VARS} CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc) # cross-linker env var, kept out of Rust_CARGO_TARGET; NOTE(review): RUST_ENV_VARS is not read again in this file - confirm it is forwarded to the cargo build 27 | set(Rust_CARGO_TARGET "aarch64-unknown-linux-gnu") 28 | else() 29 | string(FIND "${RUST_TARGETS}" "musl" MUSL_TARGET_FOUND) 30 | if(NOT MUSL_TARGET_FOUND EQUAL -1) 31 | set(Rust_CARGO_TARGET "x86_64-unknown-linux-musl") 32 | else() 33 | set(Rust_CARGO_TARGET "x86_64-unknown-linux-gnu") 34 | endif() 35 | endif() 36 | elseif("${OS_NAME}" STREQUAL "osx") 37 | if ("${OSX_BUILD_ARCH}" STREQUAL "arm64") 38 | set(Rust_CARGO_TARGET "aarch64-apple-darwin") 39 | elseif ("${OSX_BUILD_ARCH}" STREQUAL "x86_64") 40 | set(Rust_CARGO_TARGET "x86_64-apple-darwin") 41 | elseif ("${OS_ARCH}" STREQUAL "arm64") 42 | set(Rust_CARGO_TARGET "aarch64-apple-darwin") 43 | endif() 44 | elseif(WIN32) 45 | if (MINGW AND "${OS_ARCH}" STREQUAL "arm64") 46 | set(Rust_CARGO_TARGET "aarch64-pc-windows-gnu") 47 | elseif (MINGW AND "${OS_ARCH}" STREQUAL "amd64") 48 | set(Rust_CARGO_TARGET "x86_64-pc-windows-gnu") 49 | 
elseif (MSVC AND "${OS_ARCH}" STREQUAL "arm64") 50 | set(Rust_CARGO_TARGET "aarch64-pc-windows-msvc") 51 | elseif (MSVC AND "${OS_ARCH}" STREQUAL "amd64") 52 | set(Rust_CARGO_TARGET "x86_64-pc-windows-msvc") 53 | endif() 54 | endif() 55 | # If rustup reports the wasm32-unknown-emscripten target as installed, it overrides the target chosen above 56 | string(FIND "${RUST_TARGETS}" "wasm32-unknown-emscripten" WASM_TARGET_FOUND) 57 | 58 | if (NOT WASM_TARGET_FOUND EQUAL -1) 59 | set(Rust_CARGO_TARGET "wasm32-unknown-emscripten") 60 | endif() 61 | 62 | message(STATUS "RUST_TARGETS: ${RUST_TARGETS}") 63 | message(STATUS "WASM_TARGET_FOUND: ${WASM_TARGET_FOUND}") 64 | message(STATUS "TARGET: ${TARGET}") 65 | message(STATUS "DUCKDB_BUILD_TYPE: ${DUCKDB_BUILD_TYPE}") 66 | message(STATUS "TARGET NAME: ${TARGET_NAME}") 67 | message(STATUS "DUCKDB_PLATFORM: ${DUCKDB_PLATFORM}") 68 | message(STATUS "OS_ARCH: ${OS_ARCH}") 69 | message(STATUS "OS_NAME: ${OS_NAME}") 70 | message(STATUS "Rust_CARGO_TARGET: ${Rust_CARGO_TARGET}") 71 | # We currently only support the predefined targets. 72 | #if ("${Rust_CARGO_TARGET}" STREQUAL "") 73 | # message(FATAL_ERROR "Failed to detect the correct platform") 74 | #endif() 75 | 76 | 77 | include(FetchContent) 78 | 79 | FetchContent_Declare( 80 | Corrosion 81 | GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git 82 | GIT_TAG v0.5 83 | ) 84 | # Set any global configuration variables such as `Rust_TOOLCHAIN` before this line! 
85 | FetchContent_MakeAvailable(Corrosion) 86 | 87 | # Import targets defined in a package or workspace manifest `Cargo.toml` file 88 | corrosion_import_crate(MANIFEST_PATH "${CMAKE_SOURCE_DIR}/../duckdb_lindel_rust/Cargo.toml" 89 | CRATES "duckdb_lindel_rust" 90 | ) 91 | 92 | # Set extension name here 93 | set(TARGET_NAME lindel) 94 | 95 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 96 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 97 | 98 | project(${TARGET_NAME}) 99 | 100 | include_directories(src/include) 101 | 102 | set(EXTENSION_SOURCES src/lindel_extension.cpp) 103 | # Both extension variants are built from the same source list 104 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 105 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 106 | # NOTE(review): fake_includes is not read anywhere else in this file - confirm whether this query is still needed 107 | get_target_property(fake_includes duckdb_lindel_rust INCLUDE_DIRECTORIES) 108 | # Link the imported Rust crate targets into the static and the loadable extension 109 | target_link_libraries(${EXTENSION_NAME} duckdb_lindel_rust-static) 110 | target_link_libraries(${LOADABLE_EXTENSION_NAME} duckdb_lindel_rust) 111 | 112 | install( 113 | TARGETS ${EXTENSION_NAME} 114 | EXPORT "${DUCKDB_EXPORT_SET}" 115 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 116 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 117 | 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Rusty Conover 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=lindel 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile 9 | 10 | rust_binding_headers: 11 | cd duckdb_lindel_rust && cbindgen --config ./cbindgen.toml --crate duckdb_lindel_rust --output ../src/include/rust.h 12 | 13 | clean_all: clean 14 | cd duckdb_lindel_rust && cargo clean -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lindel (linearizer-delinearizer) Extension for DuckDB 2 | 3 | ![Ducks filling Space-Filling Curves](./docs/space-filling-curve-ducks.jpg) 4 | 5 | This `lindel` extension adds functions for the [linearization](https://en.wikipedia.org/wiki/Linearization) and delinearization of numeric arrays in [DuckDB](https://www.duckdb.org). It allows you to order multi-dimensional data using space-filling curves. 
6 | 7 | ## Installation 8 | 9 | **`lindel` is a [DuckDB Community Extension](https://github.com/duckdb/community-extensions).** 10 | 11 | You can now use this by using this SQL: 12 | 13 | ```sql 14 | install lindel from community; 15 | load lindel; 16 | ``` 17 | 18 | ## What is linearization? 19 | 20 | An animation of the Hilbert Curve from Wikipedia 21 | 22 | [Linearization](https://en.wikipedia.org/wiki/Linearization) maps multi-dimensional data into a one-dimensional sequence while [preserving locality](https://en.wikipedia.org/wiki/Locality_of_reference), enhancing the efficiency of data structures and algorithms for spatial data, such as in databases, GIS, and memory caches. 23 | 24 | > "The principle of locality states that programs tend to reuse data and instructions they have used recently." 25 | 26 | In SQL, sorting by a single column (e.g., time or identifier) is often sufficient, but sometimes queries involve multiple fields, such as: 27 | 28 | - Time and identifier (historical trading data) 29 | - Latitude and Longitude (GIS applications) 30 | - Latitude, Longitude, and Altitude (flight tracking) 31 | - Latitude, Longitude, Altitude, and Time (flight history) 32 | 33 | Sorting by a single field isn't optimal for multi-field queries. Linearization maps multiple fields into a single value, while preserving locality—meaning values close in the original representation remain close in the mapped representation. 34 | 35 | #### Where has this been used before? 36 | 37 | DataBricks has long supported Z-Ordering (they also now default to using the Hilbert curve for the ordering). This [video explains how Delta Lake queries are faster when the data is Z-Ordered.](https://www.youtube.com/watch?v=A1aR1A8OwOU) This extension also allows DuckDB to write files with the same ordering optimization. 38 | 39 | Numerous articles describe the benefits of applying a Z-Ordering/Hilbert ordering to data for query performance. 
40 | 41 | - [https://delta.io/blog/2023-06-03-delta-lake-z-order/](https://delta.io/blog/2023-06-03-delta-lake-z-order/) 42 | - [https://blog.cloudera.com/speeding-up-queries-with-z-order/](https://blog.cloudera.com/speeding-up-queries-with-z-order/) 43 | - [https://www.linkedin.com/pulse/z-order-visualization-implementation-nick-karpov/](https://www.linkedin.com/pulse/z-order-visualization-implementation-nick-karpov/) 44 | 45 | From one of the articles: 46 | 47 | ![Delta Lake Query Speed Improvement from using Z-Ordering](https://delta.io/static/c1801cd120999d77de0ee51b227acccb/a13c9/image1.png) 48 | 49 | Your particular performance improvements will vary, but for some query patterns Z-Ordering and Hilbert ordering will make quite a big difference. 50 | 51 | ## When would I use this? 52 | 53 | For query patterns across multiple numeric or short text columns, consider sorting rows using Hilbert encoding when storing data in Parquet: 54 | 55 | ```sql 56 | COPY ( 57 | select * from 'source.csv' 58 | order by 59 | hilbert_encode([source_data.time, source_data.symbol_id]::integer[2]) 60 | ) 61 | TO 'example.parquet' (FORMAT PARQUET) 62 | 63 | -- or if dealing with latitude and longitude 64 | 65 | COPY ( 66 | select * from 'source.csv' 67 | order by 68 | hilbert_encode([source_data.lat, source_data.lon]::double[2]) 69 | ) TO 'example.parquet' (FORMAT PARQUET) 70 | ``` 71 | 72 | The Parquet file format stores statistics for each row group. Since rows are sorted with locality into these row groups the query execution may be able to skip row groups that contain no relevant rows, leading to faster query execution times. 73 | 74 | ## Encoding Types 75 | 76 | This extension offers two different encoding types, [Hilbert](https://en.wikipedia.org/wiki/Hilbert_curve) and [Morton](https://en.wikipedia.org/wiki/Z-order_curve) encoding. 
77 | 78 | ### Hilbert Encoding 79 | 80 | Hilbert encoding uses the Hilbert curve, a continuous fractal space-filling curve named after [David Hilbert](https://en.wikipedia.org/wiki/David_Hilbert). It rearranges coordinates based on the Hilbert curve's path, preserving spatial locality better than Morton encoding. 81 | 82 | This is a great explanation of the [Hilbert curve](https://www.youtube.com/watch?v=3s7h2MHQtxc). 83 | 84 | 85 | 86 | ### Morton Encoding (Z-order Curve) 87 | 88 | Morton encoding, also known as the Z-order curve, interleaves the binary representations of coordinates into a single integer. It is named after Guy Macdonald Morton, who first applied the ordering to file sequencing in 1966. 89 | 90 | **Locality:** Hilbert encoding generally preserves locality better than Morton encoding, making it preferable for applications where spatial proximity matters. 91 | 92 | ## API 93 | 94 | ### Encoding 95 | 96 | **Supported types:** Any signed or unsigned integer, float, or double (`INPUT_TYPE`). 97 | **Output:** The smallest unsigned integer type that can represent the input array. 98 | 99 | ### Encoding Functions 100 | 101 | * `hilbert_encode(ARRAY[INPUT_TYPE, 1-16])` 102 | * `morton_encode(ARRAY[INPUT_TYPE, 1-16])` 103 | 104 | Output is limited to a 128-bit `UHUGEINT`. The input array size is validated to ensure it fits within this limit. 105 | 106 | | Input Type | Maximum Number of Elements | Output Type (depends on number of elements) | 107 | |---|--|-------------| 108 | | `UTINYINT` | 16 | 1: `UTINYINT`
2: `USMALLINT`
3-4: `UINTEGER`
5-8: `UBIGINT`
9-16: `UHUGEINT`| 109 | | `USMALLINT` | 8 | 1: `USMALLINT`
2: `UINTEGER`
3-4: `UBIGINT`
5-8: `UHUGEINT` | 110 | | `UINTEGER` | 4 | 1: `UINTEGER`
2: `UBIGINT`
3-4: `UHUGEINT` | 111 | | `UBIGINT` | 2 | 1: `UBIGINT`
2: `UHUGEINT` | 112 | | `FLOAT` | 4 | 1: `UINTEGER`
2: `UBIGINT`
3-4: `UHUGEINT` | 113 | | `DOUBLE` | 2 | 1: `UBIGINT`
2: `UHUGEINT` | 114 | 115 | ### Encoding examples 116 | 117 | ```sql 118 | install lindel from community; 119 | load lindel; 120 | 121 | with elements as ( 122 | select * as id from range(3) 123 | ) 124 | select 125 | a.id as a, 126 | b.id as b, 127 | hilbert_encode([a.id, b.id]::tinyint[2]) as hilbert, 128 | morton_encode([a.id, b.id]::tinyint[2]) as morton 129 | from 130 | elements as a cross join elements as b; 131 | ┌───────┬───────┬─────────┬────────┐ 132 | │ a │ b │ hilbert │ morton │ 133 | │ int64 │ int64 │ uint16 │ uint16 │ 134 | ├───────┼───────┼─────────┼────────┤ 135 | │ 0 │ 0 │ 0 │ 0 │ 136 | │ 0 │ 1 │ 3 │ 1 │ 137 | │ 0 │ 2 │ 4 │ 4 │ 138 | │ 1 │ 0 │ 1 │ 2 │ 139 | │ 1 │ 1 │ 2 │ 3 │ 140 | │ 1 │ 2 │ 7 │ 6 │ 141 | │ 2 │ 0 │ 14 │ 8 │ 142 | │ 2 │ 1 │ 13 │ 9 │ 143 | │ 2 │ 2 │ 8 │ 12 │ 144 | └───────┴───────┴─────────┴────────┘ 145 | 146 | -- Now sort that same table using Hilbert encoding 147 | 148 | ┌───────┬───────┬─────────┬────────┐ 149 | │ a │ b │ hilbert │ morton │ 150 | │ int64 │ int64 │ uint16 │ uint16 │ 151 | ├───────┼───────┼─────────┼────────┤ 152 | │ 0 │ 0 │ 0 │ 0 │ 153 | │ 1 │ 0 │ 1 │ 2 │ 154 | │ 1 │ 1 │ 2 │ 3 │ 155 | │ 0 │ 1 │ 3 │ 1 │ 156 | │ 0 │ 2 │ 4 │ 4 │ 157 | │ 1 │ 2 │ 7 │ 6 │ 158 | │ 2 │ 2 │ 8 │ 12 │ 159 | │ 2 │ 1 │ 13 │ 9 │ 160 | │ 2 │ 0 │ 14 │ 8 │ 161 | └───────┴───────┴─────────┴────────┘ 162 | 163 | -- Do you notice how when A and B are closer to 2 the rows are "closer"? 164 | ``` 165 | 166 | Encoding doesn't only work with integers it can also be used with floats. 
167 | 168 | ```sql 169 | install lindel from community; 170 | load lindel; 171 | 172 | -- Encode two 32-bit floats into one uint64 173 | select hilbert_encode([37.8, .2]::float[2]) as hilbert; 174 | ┌─────────────────────┐ 175 | │ hilbert │ 176 | │ uint64 │ 177 | ├─────────────────────┤ 178 | │ 2303654869236839926 │ 179 | └─────────────────────┘ 180 | 181 | -- Since doubles use 64 bits of precision the encoding 182 | -- must result in a uint128 183 | 184 | select hilbert_encode([37.8, .2]::double[2]) as hilbert; 185 | ┌────────────────────────────────────────┐ 186 | │ hilbert │ 187 | │ uint128 │ 188 | ├────────────────────────────────────────┤ 189 | │ 42534209309512799991913666633619307890 │ 190 | └────────────────────────────────────────┘ 191 | 192 | -- 3 dimensional encoding. 193 | select hilbert_encode([1.0, 5.0, 6.0]::float[3]) as hilbert; 194 | ┌──────────────────────────────┐ 195 | │ hilbert │ 196 | │ uint128 │ 197 | ├──────────────────────────────┤ 198 | │ 8002395622101954260073409974 │ 199 | └──────────────────────────────┘ 200 | ``` 201 | 202 | Not to be left out you can also encode strings. 203 | 204 | ```sql 205 | 206 | select hilbert_encode([ord(x) for x in split('abcd', '')]::tinyint[4]) as hilbert; 207 | ┌───────────┐ 208 | │ hilbert │ 209 | │ uint32 │ 210 | ├───────────┤ 211 | │ 178258816 │ 212 | └───────────┘ 213 | 214 | --- This splits the string 'abcd' by character, then converts each character into 215 | --- its ordinal representation, finally converts them all to 8 bit integers and then 216 | --- performs encoding. 217 | 218 | ``` 219 | 220 | Currently, the input for `hilbert_encode()` and `morton_encode()` functions in DuckDB requires that all elements in the input array be of the same size. If you need to encode different-sized types, you must break up larger data types into units of the smallest data type. Results may vary. 
221 | 222 | ### Decoding Functions 223 | 224 | * `hilbert_decode(ANY_UNSIGNED_INTEGER_TYPE, TINYINT, BOOLEAN, BOOLEAN)` 225 | * `morton_decode(ANY_UNSIGNED_INTEGER_TYPE, TINYINT, BOOLEAN, BOOLEAN)` 226 | 227 | The decoding functions take four parameters: 228 | 229 | 1. **Value to be decoded:** This is always an unsigned integer type. 230 | 2. **Number of elements to decode:** This is a `TINYINT` specifying how many elements should be decoded. 231 | 3. **Float return type:** This `BOOLEAN` indicates whether the values should be returned as floats (REAL or DOUBLE). Set to true to enable this. 232 | 4. **Unsigned return type:** This `BOOLEAN` indicates whether the values should be unsigned if not using floats. 233 | 234 | The return type of these functions is always an array, with the element type determined by the number of elements requested and whether "float" handling is enabled by the third parameter. 235 | 236 | ### Examples 237 | 238 | ```sql 239 | -- Start out just by encoding two values. 240 | select hilbert_encode([1, 2]::tinyint[2]) as hilbert; 241 | ┌─────────┐ 242 | │ hilbert │ 243 | │ uint16 │ 244 | ├─────────┤ 245 | │ 7 │ 246 | └─────────┘ 247 | D select hilbert_decode(7::uint16, 2, false, true) as values; 248 | ┌─────────────┐ 249 | │ values │ 250 | │ utinyint[2] │ 251 | ├─────────────┤ 252 | │ [1, 2] │ 253 | └─────────────┘ 254 | 255 | -- Show that the decoder works with the encoder. 256 | select hilbert_decode(hilbert_encode([1, 2]::tinyint[2]), 2, false, false) as values; 257 | ┌─────────────┐ 258 | │ values │ 259 | │ utinyint[2] │ 260 | ├─────────────┤ 261 | │ [1, 2] │ 262 | └─────────────┘ 263 | 264 | -- FIXME: need to implement a signed or unsigned flag on the decoder function.
265 | select hilbert_decode(hilbert_encode([1, -2]::bigint[2]), 2, false, false) as values; 266 | ┌───────────┐ 267 | │ values │ 268 | │ bigint[2] │ 269 | ├───────────┤ 270 | │ [1, -2] │ 271 | └───────────┘ 272 | 273 | select hilbert_encode([1.0, 5.0, 6.0]::float[3]) as hilbert; 274 | ┌──────────────────────────────┐ 275 | │ hilbert │ 276 | │ uint128 │ 277 | ├──────────────────────────────┤ 278 | │ 8002395622101954260073409974 │ 279 | └──────────────────────────────┘ 280 | 281 | select hilbert_decode(8002395622101954260073409974::UHUGEINT, 3, True, False) as values; 282 | ┌─────────────────┐ 283 | │ values │ 284 | │ float[3] │ 285 | ├─────────────────┤ 286 | │ [1.0, 5.0, 6.0] │ 287 | └─────────────────┘ 288 | ``` 289 | ## Credits 290 | 291 | 1. This DuckDB extension utilizes and is named after the [`lindel`](https://crates.io/crates/lindel) Rust crate created by [DoubleHyphen](https://github.com/DoubleHyphen). 292 | 293 | 2. It also uses the [DuckDB Extension Template](https://github.com/duckdb/extension-template). 294 | 295 | 3. This extension uses [Corrosion](https://github.com/corrosion-rs/corrosion) to combine CMake with a Rust/Cargo build process. 296 | 297 | 4. I've gotten a lot of help from the generous DuckDB developer community. 298 | 299 | ### Build Architecture 300 | 301 | For the DuckDB extension to call the Rust code a tool called `cbindgen` is used to write the C++ headers for the exposed Rust interface. 302 | 303 | The headers can be updated by running `make rust_binding_headers`. 
304 | 305 | #### Building on MacOS X 306 | 307 | Example setup + build steps for macOS users: 308 | 309 | ```sh 310 | # Remove rust if previously installed via brew 311 | brew uninstall rust 312 | 313 | # Install rustup + cbindgen 314 | # (use rustup to switch versions of Rust without extra fuss) 315 | brew install cbindgen rustup 316 | 317 | rustup toolchain install stable 318 | 319 | # Initialize rustup 320 | # Zsh users: customize installation, answer n to "Modify PATH variable?", 321 | # and continue with defaults for everything else 322 | rustup-init 323 | 324 | # OPTIONAL step for zsh users: add rust + cargo env setup to zshrc: 325 | echo '. "$HOME/.cargo/env"' >> ~/.zshrc 326 | 327 | # Use rustc stable version by default 328 | rustup default stable 329 | 330 | # Build headers 331 | make rust_binding_headers 332 | 333 | GEN=ninja make 334 | ``` 335 | 336 | ### Build steps 337 | Now to build the extension, run: 338 | ```sh 339 | make 340 | ``` 341 | The main binaries that will be built are: 342 | ```sh 343 | ./build/release/duckdb 344 | ./build/release/test/unittest 345 | ./build/release/extension/lindel/lindel.duckdb_extension 346 | ``` 347 | - `duckdb` is the binary for the duckdb shell with the extension code automatically loaded. 348 | - `unittest` is the test runner of duckdb. Again, the extension is already linked into the binary. 349 | - `lindel.duckdb_extension` is the loadable binary as it would be distributed. 350 | 351 | ## Running the extension 352 | To run the extension code, simply start the shell with `./build/release/duckdb`. 353 | 354 | Now we can use the features from the extension directly in DuckDB. 
355 | 356 | ``` 357 | D select hilbert_encode([1.0, 5.0, 6.0]::float[3]) as hilbert; 358 | ┌──────────────────────────────┐ 359 | │ hilbert │ 360 | │ uint128 │ 361 | ├──────────────────────────────┤ 362 | │ 8002395622101954260073409974 │ 363 | └──────────────────────────────┘ 364 | ``` 365 | 366 | ## Running the tests 367 | Different tests can be created for DuckDB extensions. The primary way of testing DuckDB extensions should be the SQL tests in `./test/sql`. These SQL tests can be run using: 368 | ```sh 369 | make test 370 | ``` 371 | 372 | ### Installing the deployed binaries 373 | To install your extension binaries from S3, you will need to do two things. Firstly, DuckDB should be launched with the 374 | `allow_unsigned_extensions` option set to true. How to set this will depend on the client you're using. Some examples: 375 | 376 | CLI: 377 | ```shell 378 | duckdb -unsigned 379 | ``` 380 | 381 | Python: 382 | ```python 383 | con = duckdb.connect(':memory:', config={'allow_unsigned_extensions' : 'true'}) 384 | ``` 385 | 386 | NodeJS: 387 | ```js 388 | db = new duckdb.Database(':memory:', {"allow_unsigned_extensions": "true"}); 389 | ``` 390 | 391 | Secondly, you will need to set the repository endpoint in DuckDB to the HTTP url of your bucket + version of the extension 392 | you want to install. To do this run the following SQL query in DuckDB: 393 | ```sql 394 | SET custom_extension_repository='bucket.s3.us-east-1.amazonaws.com/lindel/latest'; 395 | ``` 396 | Note that the `/latest` path will allow you to install the latest extension version available for your current version of 397 | DuckDB. To specify a specific version, you can pass the version instead. 
398 | 399 | After running these steps, you can install and load your extension using the regular INSTALL/LOAD commands in DuckDB: 400 | ```sql 401 | INSTALL lindel 402 | LOAD lindel 403 | ``` 404 | -------------------------------------------------------------------------------- /docs/space-filling-curve-ducks.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Query-farm/lindel/ebf029f3039986fb01822dfefad31732a072c440/docs/space-filling-curve-ducks.jpg -------------------------------------------------------------------------------- /duckdb_lindel_rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.4.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 10 | 11 | [[package]] 12 | name = "duckdb_lindel_rust" 13 | version = "0.1.0" 14 | dependencies = [ 15 | "lindel", 16 | ] 17 | 18 | [[package]] 19 | name = "lindel" 20 | version = "0.1.1" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "e049ba2901c1380bbc3d9a10646d1eea9a478ec72e8de8cabb6d67e589aca99c" 23 | dependencies = [ 24 | "morton-encoding", 25 | "num", 26 | "num-traits", 27 | ] 28 | 29 | [[package]] 30 | name = "morton-encoding" 31 | version = "2.0.1" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "f66c953d92a578cd98a4598021e3b473520d214665917eb51dba49dc227936c8" 34 | dependencies = [ 35 | "num", 36 | "num-traits", 37 | ] 38 | 39 | [[package]] 40 | name = "num" 41 | version = "0.2.1" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36" 44 | dependencies = [ 
45 | "num-bigint", 46 | "num-complex", 47 | "num-integer", 48 | "num-iter", 49 | "num-rational", 50 | "num-traits", 51 | ] 52 | 53 | [[package]] 54 | name = "num-bigint" 55 | version = "0.2.6" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" 58 | dependencies = [ 59 | "autocfg", 60 | "num-integer", 61 | "num-traits", 62 | ] 63 | 64 | [[package]] 65 | name = "num-complex" 66 | version = "0.2.4" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95" 69 | dependencies = [ 70 | "autocfg", 71 | "num-traits", 72 | ] 73 | 74 | [[package]] 75 | name = "num-integer" 76 | version = "0.1.46" 77 | source = "registry+https://github.com/rust-lang/crates.io-index" 78 | checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" 79 | dependencies = [ 80 | "num-traits", 81 | ] 82 | 83 | [[package]] 84 | name = "num-iter" 85 | version = "0.1.45" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" 88 | dependencies = [ 89 | "autocfg", 90 | "num-integer", 91 | "num-traits", 92 | ] 93 | 94 | [[package]] 95 | name = "num-rational" 96 | version = "0.2.4" 97 | source = "registry+https://github.com/rust-lang/crates.io-index" 98 | checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef" 99 | dependencies = [ 100 | "autocfg", 101 | "num-bigint", 102 | "num-integer", 103 | "num-traits", 104 | ] 105 | 106 | [[package]] 107 | name = "num-traits" 108 | version = "0.2.19" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 111 | dependencies = [ 112 | "autocfg", 113 | ] 114 | -------------------------------------------------------------------------------- 
/duckdb_lindel_rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "duckdb_lindel_rust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | [lib] 8 | name = "duckdb_lindel_rust" 9 | crate-type = ["staticlib"] 10 | 11 | [dependencies] 12 | lindel = "0.1.1" 13 | -------------------------------------------------------------------------------- /duckdb_lindel_rust/cbindgen.toml: -------------------------------------------------------------------------------- 1 | # This is a template cbindgen.toml file with all of the default values. 2 | # Some values are commented out because their absence is the real default. 3 | # 4 | # See https://github.com/mozilla/cbindgen/blob/master/docs.md#cbindgentoml 5 | # for detailed documentation of every option here. 6 | 7 | 8 | 9 | language = "C++" 10 | 11 | 12 | 13 | ############## Options for Wrapping the Contents of the Header ################# 14 | 15 | # header = "/* Text to put at the beginning of the generated file. Probably a license. */" 16 | # trailer = "/* Text to put at the end of the generated file */" 17 | # include_guard = "my_bindings_h" 18 | # pragma_once = true 19 | # autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. 
*/" 20 | include_version = false 21 | # namespace = "my_namespace" 22 | namespaces = [] 23 | using_namespaces = [] 24 | sys_includes = [] 25 | includes = [] 26 | no_includes = false 27 | # cpp_compat = true 28 | after_includes = "" 29 | 30 | 31 | 32 | 33 | ############################ Code Style Options ################################ 34 | 35 | braces = "SameLine" 36 | line_length = 100 37 | tab_width = 2 38 | documentation = true 39 | documentation_style = "auto" 40 | documentation_length = "full" 41 | line_endings = "LF" # also "CR", "CRLF", "Native" 42 | 43 | 44 | 45 | 46 | ############################# Codegen Options ################################## 47 | 48 | style = "both" 49 | sort_by = "Name" # default for `fn.sort_by` and `const.sort_by` 50 | usize_is_size_t = true 51 | 52 | 53 | 54 | [defines] 55 | # "target_os = freebsd" = "DEFINE_FREEBSD" 56 | # "feature = serde" = "DEFINE_SERDE" 57 | 58 | 59 | 60 | [export] 61 | include = [] 62 | exclude = [] 63 | # prefix = "CAPI_" 64 | item_types = [] 65 | renaming_overrides_prefixing = false 66 | 67 | 68 | 69 | [export.rename] 70 | 71 | 72 | 73 | [export.body] 74 | 75 | 76 | [export.mangle] 77 | 78 | 79 | [fn] 80 | rename_args = "None" 81 | # must_use = "MUST_USE_FUNC" 82 | # deprecated = "DEPRECATED_FUNC" 83 | # deprecated_with_note = "DEPRECATED_FUNC_WITH_NOTE" 84 | # no_return = "NO_RETURN" 85 | # prefix = "START_FUNC" 86 | # postfix = "END_FUNC" 87 | args = "auto" 88 | sort_by = "Name" 89 | 90 | 91 | 92 | 93 | [struct] 94 | rename_fields = "None" 95 | # must_use = "MUST_USE_STRUCT" 96 | # deprecated = "DEPRECATED_STRUCT" 97 | # deprecated_with_note = "DEPRECATED_STRUCT_WITH_NOTE" 98 | derive_constructor = false 99 | derive_eq = false 100 | derive_neq = false 101 | derive_lt = false 102 | derive_lte = false 103 | derive_gt = false 104 | derive_gte = false 105 | 106 | 107 | 108 | 109 | [enum] 110 | rename_variants = "None" 111 | # must_use = "MUST_USE_ENUM" 112 | # deprecated = "DEPRECATED_ENUM" 113 | # 
deprecated_with_note = "DEPRECATED_ENUM_WITH_NOTE" 114 | add_sentinel = false 115 | prefix_with_name = false 116 | derive_helper_methods = false 117 | derive_const_casts = false 118 | derive_mut_casts = false 119 | # cast_assert_name = "ASSERT" 120 | derive_tagged_enum_destructor = false 121 | derive_tagged_enum_copy_constructor = false 122 | enum_class = true 123 | private_default_tagged_enum_constructor = false 124 | 125 | 126 | 127 | 128 | [const] 129 | allow_static_const = true 130 | allow_constexpr = false 131 | sort_by = "Name" 132 | 133 | 134 | 135 | 136 | [macro_expansion] 137 | bitflags = false 138 | 139 | 140 | 141 | 142 | 143 | 144 | ############## Options for How Your Rust library Should Be Parsed ############## 145 | 146 | [parse] 147 | parse_deps = false 148 | # include = [] 149 | exclude = [] 150 | clean = false 151 | extra_bindings = [] 152 | 153 | 154 | 155 | [parse.expand] 156 | crates = ["duckdb_lindel_rust"] 157 | all_features = false 158 | default_features = true 159 | features = [] -------------------------------------------------------------------------------- /duckdb_lindel_rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | // duckdb_lindel_rust 2 | // Copyright 2024 Rusty Conover 3 | // Licensed under the MIT License 4 | 5 | use std::ffi::c_void; 6 | 7 | // Decode an encoded value and store it in the destination pointer. 8 | #[no_mangle] 9 | pub extern "C" fn perform_decode( 10 | encoding_type: u8, 11 | element_bit_width: u8, 12 | src: *const c_void, 13 | dest: *mut c_void, 14 | dest_len: usize, 15 | ) { 16 | macro_rules! 
decode_and_copy { 17 | ($dest_type: ty, $src_type:ty, $len:expr) => {{ 18 | unsafe { 19 | let dest_ptr = dest as *mut $dest_type; 20 | let function = match encoding_type { 21 | 0 => lindel::hilbert_decode, 22 | 1 => lindel::morton_decode, 23 | _ => panic!("Invalid encoding type"), 24 | }; 25 | let values: [$dest_type; $len] = function(*(src as *const $src_type)); 26 | for i in 0..$len { 27 | *dest_ptr.add(i) = values[i]; 28 | } 29 | }; 30 | }}; 31 | } 32 | 33 | match element_bit_width { 34 | 8 => match dest_len { 35 | 1 => decode_and_copy!(u8, u8, 1), 36 | 2 => decode_and_copy!(u8, u16, 2), 37 | 3 => decode_and_copy!(u8, u32, 3), 38 | 4 => decode_and_copy!(u8, u32, 4), 39 | 5 => decode_and_copy!(u8, u64, 5), 40 | 6 => decode_and_copy!(u8, u64, 6), 41 | 7 => decode_and_copy!(u8, u64, 7), 42 | 8 => decode_and_copy!(u8, u64, 8), 43 | 9 => decode_and_copy!(u8, u128, 9), 44 | 10 => decode_and_copy!(u8, u128, 10), 45 | 11 => decode_and_copy!(u8, u128, 11), 46 | 12 => decode_and_copy!(u8, u128, 12), 47 | 13 => decode_and_copy!(u8, u128, 13), 48 | 14 => decode_and_copy!(u8, u128, 14), 49 | 15 => decode_and_copy!(u8, u128, 15), 50 | 16 => decode_and_copy!(u8, u128, 16), 51 | _ => panic!("Invalid length"), 52 | }, 53 | 16 => match dest_len { 54 | 1 => decode_and_copy!(u16, u16, 1), 55 | 2 => decode_and_copy!(u16, u32, 2), 56 | 3 => decode_and_copy!(u16, u64, 3), 57 | 4 => decode_and_copy!(u16, u64, 4), 58 | 5 => decode_and_copy!(u16, u128, 5), 59 | 6 => decode_and_copy!(u16, u128, 6), 60 | 7 => decode_and_copy!(u16, u128, 7), 61 | 8 => decode_and_copy!(u16, u128, 8), 62 | _ => panic!("Invalid length"), 63 | }, 64 | 32 => match dest_len { 65 | 1 => decode_and_copy!(u32, u32, 1), 66 | 2 => decode_and_copy!(u32, u64, 2), 67 | 3 => decode_and_copy!(u32, u128, 3), 68 | 4 => decode_and_copy!(u32, u128, 4), 69 | _ => panic!("Invalid length"), 70 | }, 71 | 64 => match dest_len { 72 | 1 => decode_and_copy!(u64, u64, 1), 73 | 2 => decode_and_copy!(u64, u128, 2), 74 | _ => 
panic!("Invalid length"), 75 | }, 76 | _ => panic!("Invalid element bit width"), 77 | } 78 | } 79 | 80 | // Create a macro to handle the repetitive part 81 | macro_rules! encode_and_store { 82 | ($function:expr, $array:expr, $type:ty, $result:expr) => {{ 83 | let calculated_result = $function($array); 84 | let result_ptr = $result as *mut $type; 85 | unsafe { 86 | *result_ptr = calculated_result; 87 | } 88 | }}; 89 | } 90 | 91 | macro_rules! generic_encode_u8_var { 92 | ($func_name:ident, $encoding_expr: expr) => { 93 | /// # Safety 94 | /// 95 | /// This function is unsafe because it dereferences raw pointers. 96 | #[no_mangle] 97 | pub unsafe extern "C" fn $func_name(ptr: *const u8, len: usize, result: *mut c_void) -> () { 98 | let args = unsafe { 99 | assert!(!ptr.is_null()); 100 | std::slice::from_raw_parts(ptr, len) 101 | }; 102 | 103 | match args.len() { 104 | 1 => encode_and_store!($encoding_expr, [args[0]], u8, result), 105 | 2 => encode_and_store!($encoding_expr, [args[0], args[1]], u16, result), 106 | 3 => encode_and_store!($encoding_expr, [args[0], args[1], args[2]], u32, result), 107 | 4 => encode_and_store!( 108 | $encoding_expr, 109 | [args[0], args[1], args[2], args[3]], 110 | u32, 111 | result 112 | ), 113 | 5 => encode_and_store!( 114 | $encoding_expr, 115 | [args[0], args[1], args[2], args[3], args[4]], 116 | u64, 117 | result 118 | ), 119 | 6 => encode_and_store!( 120 | $encoding_expr, 121 | [args[0], args[1], args[2], args[3], args[4], args[5]], 122 | u64, 123 | result 124 | ), 125 | 7 => encode_and_store!( 126 | $encoding_expr, 127 | [args[0], args[1], args[2], args[3], args[4], args[5], args[6]], 128 | u64, 129 | result 130 | ), 131 | 8 => encode_and_store!( 132 | $encoding_expr, 133 | [args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7]], 134 | u64, 135 | result 136 | ), 137 | 9 => encode_and_store!( 138 | $encoding_expr, 139 | [ 140 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 141 | args[8] 142 
| ], 143 | u128, 144 | result 145 | ), 146 | 10 => encode_and_store!( 147 | $encoding_expr, 148 | [ 149 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 150 | args[8], args[9] 151 | ], 152 | u128, 153 | result 154 | ), 155 | 11 => encode_and_store!( 156 | $encoding_expr, 157 | [ 158 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 159 | args[8], args[9], args[10] 160 | ], 161 | u128, 162 | result 163 | ), 164 | 12 => encode_and_store!( 165 | $encoding_expr, 166 | [ 167 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 168 | args[8], args[9], args[10], args[11] 169 | ], 170 | u128, 171 | result 172 | ), 173 | 13 => encode_and_store!( 174 | $encoding_expr, 175 | [ 176 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 177 | args[8], args[9], args[10], args[11], args[12] 178 | ], 179 | u128, 180 | result 181 | ), 182 | 14 => encode_and_store!( 183 | $encoding_expr, 184 | [ 185 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 186 | args[8], args[9], args[10], args[11], args[12], args[13] 187 | ], 188 | u128, 189 | result 190 | ), 191 | 15 => encode_and_store!( 192 | $encoding_expr, 193 | [ 194 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 195 | args[8], args[9], args[10], args[11], args[12], args[13], args[14] 196 | ], 197 | u128, 198 | result 199 | ), 200 | 16 => encode_and_store!( 201 | $encoding_expr, 202 | [ 203 | args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], 204 | args[8], args[9], args[10], args[11], args[12], args[13], args[14], 205 | args[15] // 16th and last coordinate; index 16 would be out of bounds for a 16-element slice 206 | ], 207 | u128, 208 | result 209 | ), 210 | _ => panic!("Invalid length"), 211 | } 212 | } 213 | }; 214 | } 215 | 216 | generic_encode_u8_var!(hilbert_encode_u8_var, lindel::hilbert_encode); 217 | generic_encode_u8_var!(morton_encode_u8_var, lindel::morton_encode); 218 | 219 | macro_rules!
generic_encode_u16_var { 220 | ($func_name:ident, $encoding_expr: expr) => { 221 | /// # Safety 222 | /// 223 | /// This function is unsafe because it dereferences raw pointers. 224 | #[no_mangle] 225 | pub unsafe extern "C" fn $func_name( 226 | ptr: *const u16, 227 | len: usize, 228 | result: *mut c_void, 229 | ) -> () { 230 | let args = unsafe { 231 | assert!(!ptr.is_null()); 232 | std::slice::from_raw_parts(ptr, len) 233 | }; 234 | 235 | match args.len() { 236 | 1 => encode_and_store!($encoding_expr, [args[0]], u16, result), // 16 237 | 2 => encode_and_store!($encoding_expr, [args[0], args[1]], u32, result), //32 238 | 3 => encode_and_store!($encoding_expr, [args[0], args[1], args[2]], u64, result), // 48 - 64 239 | 4 => encode_and_store!( 240 | $encoding_expr, 241 | [args[0], args[1], args[2], args[3]], 242 | u64, 243 | result 244 | ), // 64 - 64 245 | 5 => encode_and_store!( 246 | $encoding_expr, 247 | [args[0], args[1], args[2], args[3], args[4]], 248 | u128, 249 | result 250 | ), 251 | 6 => encode_and_store!( 252 | $encoding_expr, 253 | [args[0], args[1], args[2], args[3], args[4], args[5]], 254 | u128, 255 | result 256 | ), 257 | 7 => encode_and_store!( 258 | $encoding_expr, 259 | [args[0], args[1], args[2], args[3], args[4], args[5], args[6]], 260 | u128, 261 | result 262 | ), 263 | 8 => encode_and_store!( 264 | $encoding_expr, 265 | [args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7]], 266 | u128, 267 | result 268 | ), 269 | _ => panic!("Invalid length"), 270 | } 271 | } 272 | }; 273 | } 274 | 275 | generic_encode_u16_var!(hilbert_encode_u16_var, lindel::hilbert_encode); 276 | generic_encode_u16_var!(morton_encode_u16_var, lindel::morton_encode); 277 | 278 | macro_rules! generic_encode_u32_var { 279 | ($func_name:ident, $encoding_expr: expr) => { 280 | /// # Safety 281 | /// 282 | /// This function is unsafe because it dereferences raw pointers. 
283 | #[no_mangle] 284 | pub unsafe extern "C" fn $func_name( 285 | ptr: *const u32, 286 | len: usize, 287 | result: *mut c_void, 288 | ) -> () { 289 | let args = unsafe { 290 | assert!(!ptr.is_null()); 291 | std::slice::from_raw_parts(ptr, len) 292 | }; 293 | 294 | match args.len() { 295 | 1 => encode_and_store!($encoding_expr, [args[0]], u32, result), 296 | 2 => encode_and_store!($encoding_expr, [args[0], args[1]], u64, result), 297 | 3 => encode_and_store!($encoding_expr, [args[0], args[1], args[2]], u128, result), 298 | 4 => encode_and_store!( 299 | $encoding_expr, 300 | [args[0], args[1], args[2], args[3]], 301 | u128, 302 | result 303 | ), 304 | _ => panic!("Invalid length"), 305 | } 306 | } 307 | }; 308 | } 309 | 310 | generic_encode_u32_var!(hilbert_encode_u32_var, lindel::hilbert_encode); 311 | generic_encode_u32_var!(morton_encode_u32_var, lindel::morton_encode); 312 | 313 | macro_rules! generic_encode_u64_var { 314 | ($func_name:ident, $encoding_expr: expr) => { 315 | /// # Safety 316 | /// 317 | /// This function is unsafe because it dereferences raw pointers. 
318 | #[no_mangle] 319 | pub unsafe extern "C" fn $func_name( 320 | ptr: *const u64, 321 | len: usize, 322 | result: *mut c_void, 323 | ) -> () { 324 | let args = unsafe { 325 | assert!(!ptr.is_null()); 326 | std::slice::from_raw_parts(ptr, len) 327 | }; 328 | 329 | match args.len() { 330 | 1 => encode_and_store!($encoding_expr, [args[0]], u64, result), 331 | 2 => encode_and_store!($encoding_expr, [args[0], args[1]], u128, result), 332 | _ => panic!("Invalid length"), 333 | } 334 | } 335 | }; 336 | } 337 | 338 | generic_encode_u64_var!(hilbert_encode_u64_var, lindel::hilbert_encode); 339 | generic_encode_u64_var!(morton_encode_u64_var, lindel::morton_encode); 340 | 341 | #[cfg(test)] 342 | mod tests {} 343 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(lindel 5 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 6 | LOAD_TESTS 7 | LINKED_LIBS "../../cargo/build/wasm32-unknown-emscripten/release/libduckdb_lindel_rust.a" 8 | ) 9 | 10 | # Any extra extensions that should be built 11 | # e.g.: duckdb_extension_load(json) -------------------------------------------------------------------------------- /scripts/bootstrap-template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys, os, shutil, re 4 | from pathlib import Path 5 | 6 | shutil.copyfile(f'docs/NEXT_README.md', f'README.md') 7 | os.remove(f'docs/NEXT_README.md') 8 | os.remove(f'docs/README.md') 9 | 10 | if (len(sys.argv) != 2): 11 | raise Exception('usage: python3 bootstrap-template.py ') 12 | 13 | name_extension = sys.argv[1] 14 | 15 | def is_snake_case(s): 16 | # Define the regex pattern for snake case with numbers 17 | pattern = r'^[a-z0-9]+(_[a-z0-9]+)*$' 
18 | 19 | # Use re.match to check if the string matches the pattern 20 | if re.match(pattern, s): 21 | return True 22 | else: 23 | return False 24 | 25 | if name_extension[0].isdigit(): 26 | raise Exception('Please dont start your extension name with a number.') 27 | 28 | if not is_snake_case(name_extension): 29 | raise Exception('Please enter the name of your extension in valid snake_case containing only lower case letters and numbers') 30 | 31 | def to_camel_case(snake_str): 32 | return "".join(x.capitalize() for x in snake_str.lower().split("_")) 33 | 34 | def replace(file_name, to_find, to_replace): 35 | with open(file_name, 'r', encoding="utf8") as file : 36 | filedata = file.read() 37 | filedata = filedata.replace(to_find, to_replace) 38 | with open(file_name, 'w', encoding="utf8") as file: 39 | file.write(filedata) 40 | 41 | files_to_search = [] 42 | files_to_search.extend(Path('./.github').rglob('./**/*.yml')) 43 | files_to_search.extend(Path('./test').rglob('./**/*.test')) 44 | files_to_search.extend(Path('./src').rglob('./**/*.hpp')) 45 | files_to_search.extend(Path('./src').rglob('./**/*.cpp')) 46 | files_to_search.extend(Path('./src').rglob('./**/*.txt')) 47 | files_to_search.extend(Path('./src').rglob('./*.md')) 48 | 49 | def replace_everywhere(to_find, to_replace): 50 | for path in files_to_search: 51 | replace(path, to_find, to_replace) 52 | replace(path, to_find.capitalize(), to_camel_case(to_replace)) 53 | replace(path, to_find.upper(), to_replace.upper()) 54 | 55 | replace("./CMakeLists.txt", to_find, to_replace) 56 | replace("./Makefile", to_find, to_replace) 57 | replace("./Makefile", to_find.capitalize(), to_camel_case(to_replace)) 58 | replace("./Makefile", to_find.upper(), to_replace.upper()) 59 | replace("./README.md", to_find, to_replace) 60 | replace("./extension_config.cmake", to_find, to_replace) 61 | 62 | replace_everywhere("quack", name_extension) 63 | replace_everywhere("Quack", name_extension.capitalize()) 64 | replace_everywhere("", 
name_extension) 65 | 66 | string_to_replace = name_extension 67 | string_to_find = "quack" 68 | 69 | # rename files 70 | os.rename(f'test/sql/{string_to_find}.test', f'test/sql/{string_to_replace}.test') 71 | os.rename(f'src/{string_to_find}_extension.cpp', f'src/{string_to_replace}_extension.cpp') 72 | os.rename(f'src/include/{string_to_find}_extension.hpp', f'src/include/{string_to_replace}_extension.hpp') 73 | 74 | # remove template-specific files 75 | os.remove('.github/workflows/ExtensionTemplate.yml') 76 | 77 | # finally, remove this bootstrap file 78 | os.remove(__file__) -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 
'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal, length of the signature payload as LEB128 40 | # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary (same wasm* test as the upload step, which declares brotli encoding) 60 | if [[ $4 == wasm* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.."
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /src/include/lindel_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb { 6 | 7 | class LindelExtension : public Extension { 8 | public: 9 | void Load(DuckDB &db) override; 10 | std::string Name() override; 11 | std::string Version() const override; 12 | }; 13 | 14 | } // namespace duckdb 15 | -------------------------------------------------------------------------------- /src/include/rust.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | 10 | extern "C" { 11 | 12 | ///Free a value returned from `duckdb_malloc`, `duckdb_value_varchar`, `duckdb_value_blob`, or `duckdb_value_string`. 13 | /// 14 | /// ptr: The memory region to de-allocate. 15 | extern void duckdb_free(void *ptr); 16 | 17 | ///Allocate `size` bytes of memory using the duckdb internal malloc function. Any memory allocated in this manner should be freed using `duckdb_free`. 18 | /// 19 | /// size: The number of bytes to allocate. 
returns: A pointer to the allocated memory region. 20 | extern void *duckdb_malloc(size_t size); 21 | 22 | void hilbert_encode_u16_var(const uint16_t *ptr, size_t len, void *result); 23 | 24 | void hilbert_encode_u32_var(const uint32_t *ptr, size_t len, void *result); 25 | 26 | void hilbert_encode_u64_var(const uint64_t *ptr, size_t len, void *result); 27 | 28 | void hilbert_encode_u8_var(const uint8_t *ptr, size_t len, void *result); 29 | 30 | void morton_encode_u16_var(const uint16_t *ptr, size_t len, void *result); 31 | 32 | void morton_encode_u32_var(const uint32_t *ptr, size_t len, void *result); 33 | 34 | void morton_encode_u64_var(const uint64_t *ptr, size_t len, void *result); 35 | 36 | void morton_encode_u8_var(const uint8_t *ptr, size_t len, void *result); 37 | 38 | void perform_decode(uint8_t encoding_type, 39 | uint8_t element_bit_width, 40 | const void *src, 41 | void *dest, 42 | size_t dest_len); 43 | 44 | } // extern "C" 45 | -------------------------------------------------------------------------------- /src/lindel_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "lindel_extension.hpp" 4 | #include "duckdb.hpp" 5 | #include "duckdb/common/exception.hpp" 6 | #include "duckdb/common/optional_idx.hpp" 7 | #include "duckdb/common/string_util.hpp" 8 | #include "duckdb/function/scalar_function.hpp" 9 | #include "duckdb/planner/expression/bound_function_expression.hpp" 10 | #include "duckdb/main/extension_util.hpp" 11 | #include 12 | 13 | // Include the declarations of things from Rust. 14 | #include "rust.h" 15 | 16 | namespace duckdb 17 | { 18 | 19 | // Since we have functions that can decode or encode using two different types of encoding to reduce 20 | // the number of functions we need to write we'll use a single function to handle both. 21 | // and just store the encoding type in the bind_info. 
22 | // 23 | // The encoding type is 0 for Hilbert and 1 for Morton. 24 | // 25 | // This extension supports two different types of encoding, Hilbert and Morton. 26 | // 27 | // In both cases the encoding is done in a similar way, the only difference is the 28 | // encoding function that is called. 29 | // 30 | // Rather than writing two separate functions for each encoding type we'll write a single 31 | // function that can handle both and just store the encoding type in the bind_info object. 32 | // 33 | // The bind_info object is created by the bind function, before the scalar function itself is called 34 | // on any data. 35 | struct lindelEncodingBindData : public FunctionData 36 | { 37 | uint8_t encoding_type; // 0 = Hilbert, 1 = Morton 38 | lindelEncodingBindData(uint8_t encoding_type_p) : FunctionData(), encoding_type(encoding_type_p) 39 | { 40 | } 41 | // Returns a new bind data object carrying the same encoding type. 42 | duckdb::unique_ptr Copy() const override 43 | { 44 | return make_uniq(encoding_type); 45 | } 46 | // Two bind data objects are equal when their encoding types match. 47 | bool Equals(const FunctionData &other_p) const override 48 | { 49 | auto &other = other_p.Cast(); 50 | return encoding_type == other.encoding_type; 51 | } 52 | }; 53 | 54 | // This is the "bind" function that is called when we are decoding an array of values. 55 | // 56 | // In SQL this will be a function of the form: 57 | // 58 | // hilbert_decode(UTINYINT|USMALLINT|UINTEGER|UBIGINT|UHUGEINT, TINYINT, BOOLEAN, BOOLEAN) 59 | // morton_decode(UTINYINT|USMALLINT|UINTEGER|UBIGINT|UHUGEINT, TINYINT, BOOLEAN, BOOLEAN) 60 | // 61 | // The arguments are as follows: 62 | // 63 | // 1. The value to decode. 64 | // 2. The number of parts to return. 65 | // 3. Whether or not to return the parts as floats or integers. 66 | // 4. Whether or not to return unsigned integers (true if unsigned) 67 | // 68 | // This binding function also needs to determine the encoding type by looking at the bound function name.
69 | // 70 | // This function also determines the actual type that will be returned by the function, it will always be an array 71 | // but the type of element and number of elements will depend on the input type and what the caller requests. 72 | // 73 | static unique_ptr lindelDecodeToArrayBind(ClientContext &context, ScalarFunction &bound_function, 74 | vector> &arguments) 75 | { 76 | unique_ptr bind_data = make_uniq(0); 77 | if (bound_function.name == "hilbert_decode") 78 | { 79 | bind_data->encoding_type = 0; 80 | } 81 | else if (bound_function.name == "morton_decode") 82 | { 83 | bind_data->encoding_type = 1; 84 | } 85 | else 86 | { 87 | throw NotImplementedException("Unknown function name in lindelDecodeToArrayBind, expected either hilbert_decode() or morton_decode()"); 88 | } 89 | 90 | auto &left_type = arguments[0]->return_type; 91 | 92 | auto get_foldable_value = [&](size_t index, LogicalType expected_type, const string &error_msg) -> Value 93 | { 94 | if (!arguments[index]->IsFoldable()) 95 | { 96 | throw NotImplementedException(error_msg); 97 | } 98 | Value val = ExpressionExecutor::EvaluateScalar(context, *arguments[index]).CastAs(context, expected_type); 99 | if (val.IsNull()) 100 | { 101 | throw NotImplementedException(error_msg + " expected a not-null value"); 102 | } 103 | return val; 104 | }; 105 | 106 | auto return_number_of_parts = UTinyIntValue::Get(get_foldable_value(1, LogicalType::UTINYINT, "hilbert_decode(ANY, TINYINT, BOOLEAN, BOOLEAN)")); 107 | auto return_float = BooleanValue::Get(get_foldable_value(2, LogicalType::BOOLEAN, "hilbert_decode(ANY, TINYINT, BOOLEAN, BOOLEAN)")); 108 | auto return_unsigned = BooleanValue::Get(get_foldable_value(3, LogicalType::BOOLEAN, "hilbert_decode(ANY, TINYINT, BOOLEAN, BOOLEAN)")); 109 | 110 | if (return_number_of_parts == 0) 111 | { 112 | throw InvalidInputException("Number of parts to return must be greater than 0."); 113 | } 114 | 115 | auto set_return_type = [&](LogicalType base_type, size_t parts, 
string_t allowed_types, const vector &type_options) 116 | { 117 | if (find(type_options.begin(), type_options.end(), left_type.id()) == type_options.end()) 118 | { 119 | throw InvalidInputException("Expected one of the following types:" + allowed_types.GetString()); 120 | } 121 | bound_function.return_type = LogicalType::ARRAY(base_type, parts); 122 | }; 123 | 124 | if (return_float) 125 | { 126 | switch (left_type.id()) 127 | { 128 | case LogicalType::UINTEGER: 129 | set_return_type(LogicalType::FLOAT, 1, "UINTEGER", {LogicalType::UINTEGER}); 130 | break; 131 | case LogicalType::UBIGINT: 132 | if (return_number_of_parts == 1) 133 | { 134 | set_return_type(LogicalType::DOUBLE, 1, "UBIGINT", {LogicalType::UBIGINT}); 135 | } 136 | else if (return_number_of_parts == 2) 137 | { 138 | set_return_type(LogicalType::FLOAT, 2, "UBIGINT", {LogicalType::UBIGINT}); 139 | } 140 | else 141 | { 142 | throw InvalidInputException("Expected 1 or 2 parts for UBIGINT"); 143 | } 144 | break; 145 | case LogicalType::UHUGEINT: 146 | if (return_number_of_parts == 2) 147 | { 148 | set_return_type(LogicalType::DOUBLE, 2, "UHUGEINT", {LogicalType::UHUGEINT}); 149 | } 150 | else if (return_number_of_parts >= 3 && return_number_of_parts <= 4) 151 | { 152 | set_return_type(LogicalType::FLOAT, return_number_of_parts, "UHUGEINT", {LogicalType::UHUGEINT}); 153 | } 154 | else 155 | { 156 | throw InvalidInputException("Expected 2-4 parts for UHUGEINT"); 157 | } 158 | break; 159 | default: 160 | throw InvalidInputException("Expected UINTEGER, UBIGINT, or UHUGEINT"); 161 | } 162 | return bind_data; 163 | } 164 | 165 | if (return_number_of_parts == 1) 166 | { 167 | set_return_type(left_type.id(), 1, "UINTEGER, USMALLINT, UTINYINT, UBIGINT, UHUGEINT", { 168 | (return_unsigned ? LogicalType::UINTEGER : LogicalType::INTEGER), 169 | (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT), 170 | (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT), 171 | (return_unsigned ? 
LogicalType::UBIGINT : LogicalType::BIGINT), 172 | }); 173 | return bind_data; 174 | } 175 | 176 | auto set_integer_return_type = [&](LogicalType base_type, size_t parts, string_t allowed_types, string_t bounds, const map &type_map) 177 | { 178 | if (type_map.find(return_number_of_parts) != type_map.end()) 179 | { 180 | set_return_type(type_map.at(return_number_of_parts), return_number_of_parts, allowed_types, {base_type}); 181 | } 182 | else 183 | { 184 | throw InvalidInputException("Expected " + bounds.GetString() + " parts for " + base_type.ToString()); 185 | } 186 | }; 187 | 188 | // The number of parts in the output array is determined by the number of parts requested and the datatype passed 189 | // to decode. 190 | // Each map below goes from the requested number of parts to the element type of the returned array. 191 | switch (left_type.id()) 192 | { 193 | case LogicalType::UTINYINT: 194 | throw InvalidInputException("Expected 1 parts for UTINYINT"); 195 | case LogicalType::USMALLINT: 196 | set_integer_return_type(LogicalType::USMALLINT, return_number_of_parts, "UTINYINT", "2", {{2, return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT}}); 197 | break; 198 | case LogicalType::UINTEGER: 199 | set_integer_return_type(LogicalType::UINTEGER, return_number_of_parts, "UTINYINT, USMALLINT", "2-4", {{2, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {3, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {4, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}}); // 4 x 8 bits fills the 32-bit input, matching the advertised "2-4" bound 200 | break; 201 | case LogicalType::UBIGINT: 202 | set_integer_return_type(LogicalType::UBIGINT, return_number_of_parts, "UTINYINT, USMALLINT, UINTEGER", "2-8", {{2, (return_unsigned ? LogicalType::UINTEGER : LogicalType::INTEGER)}, {3, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {4, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {5, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {6, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {7, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {8, (return_unsigned ?
LogicalType::UTINYINT : LogicalType::TINYINT)}}); 203 | break; 204 | case LogicalType::UHUGEINT: 205 | set_integer_return_type(LogicalType::UHUGEINT, return_number_of_parts, "UTINYINT, USMALLINT, UINTEGER, UBIGINT", "2-16", {{2, (return_unsigned ? LogicalType::UBIGINT : LogicalType::BIGINT)}, {3, (return_unsigned ? LogicalType::UINTEGER : LogicalType::INTEGER)}, {4, (return_unsigned ? LogicalType::UINTEGER : LogicalType::INTEGER)}, {5, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {6, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {7, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {8, (return_unsigned ? LogicalType::USMALLINT : LogicalType::SMALLINT)}, {9, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {10, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {11, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {12, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {13, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {14, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {15, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}, {16, (return_unsigned ? LogicalType::UTINYINT : LogicalType::TINYINT)}}); 206 | break; 207 | default: 208 | throw InvalidInputException("Expected UINTEGER, USMALLINT, UTINYINT, UBIGINT, or UHUGEINT"); 209 | } 210 | 211 | return bind_data; 212 | } 213 | 214 | // This function performs the actual decoding of values as a DuckDB scalar function. 215 | // 216 | inline void lindelDecodeArrayFun(DataChunk &args, ExpressionState &state, Vector &result) 217 | { 218 | // This is the number of elements in the output array, not the number of rows being procssed. 219 | auto output_number_of_elements = ArrayType::GetSize(result.GetType()); 220 | 221 | // The type of the elements in the output array this will either be an integer type or a float type. 
222 | auto output_child_type = ArrayType::GetChildType(result.GetType()); 223 | 224 | // Get a reference to the bind data that was already created that will determine the type 225 | // of encoding to use. 226 | auto &func_expr = state.expr.Cast(); 227 | auto &bind_info = func_expr.bind_info->Cast(); 228 | 229 | // Reference the source data. 230 | auto left = args.data[0]; 231 | 232 | // Standardize the vectors to a unified format, so it can be iterated. 233 | UnifiedVectorFormat left_format; 234 | left.ToUnifiedFormat(args.size(), left_format); 235 | 236 | // Since this function can take a variety of input types with different sizes, get different 237 | // pointers to the different data types of the input. 238 | auto left_data_8 = FlatVector::GetData(left); 239 | 240 | // So the output type changes based on the number of inputs and the type of inputs. 241 | 242 | // Get the reference to the children of the result. 243 | auto &result_data_children = ArrayVector::GetEntry(result); 244 | 245 | // Since this function can produce a variety of output types with different sizes follow 246 | // the same pattern that was used for the input types. All of these are just pointers. 
247 | auto result_data_u8 = FlatVector::GetData(result_data_children); 248 | 249 | uint8_t output_element_bit_width; 250 | 251 | switch (output_child_type.id()) 252 | { 253 | case LogicalTypeId::UTINYINT: 254 | case LogicalTypeId::TINYINT: 255 | { 256 | output_element_bit_width = 8; 257 | } 258 | break; 259 | case LogicalTypeId::USMALLINT: 260 | case LogicalTypeId::SMALLINT: 261 | { 262 | output_element_bit_width = 16; 263 | } 264 | break; 265 | case LogicalTypeId::UINTEGER: 266 | case LogicalTypeId::INTEGER: 267 | case LogicalTypeId::FLOAT: 268 | { 269 | output_element_bit_width = 32; 270 | } 271 | break; 272 | case LogicalTypeId::UBIGINT: 273 | case LogicalTypeId::BIGINT: 274 | case LogicalTypeId::DOUBLE: 275 | { 276 | output_element_bit_width = 64; 277 | } 278 | break; 279 | case LogicalTypeId::UHUGEINT: 280 | case LogicalTypeId::HUGEINT: 281 | { 282 | output_element_bit_width = 128; 283 | } 284 | break; 285 | default: 286 | throw NotImplementedException("hilbert_decode()/morton_decode() only supports destination types of UTINYINT, USMALLINT, UINTEGER, UBIGINT, UHUGEINT types"); 287 | } 288 | size_t input_pointer_increment; 289 | switch (left.GetType().id()) 290 | { 291 | case LogicalTypeId::UTINYINT: 292 | case LogicalTypeId::TINYINT: 293 | { 294 | input_pointer_increment = 1; 295 | } 296 | break; 297 | case LogicalTypeId::USMALLINT: 298 | case LogicalTypeId::SMALLINT: 299 | { 300 | input_pointer_increment = 2; 301 | } 302 | break; 303 | case LogicalTypeId::UINTEGER: 304 | case LogicalTypeId::INTEGER: 305 | { 306 | input_pointer_increment = 4; 307 | } 308 | break; 309 | case LogicalTypeId::UBIGINT: 310 | case LogicalTypeId::BIGINT: 311 | { 312 | input_pointer_increment = 8; 313 | } 314 | break; 315 | case LogicalTypeId::UHUGEINT: 316 | case LogicalTypeId::HUGEINT: 317 | { 318 | input_pointer_increment = 16; 319 | } 320 | break; 321 | default: 322 | throw NotImplementedException("hilbert_decode()/morton_decode() only supports incoming sources of UTINYINT, 
USMALLINT, UINTEGER, UBIGINT, UHUGEINT types"); 323 | } 324 | 325 | const size_t output_pointer_increment = output_element_bit_width / 8; 326 | 327 | for (idx_t i = 0; i < args.size(); i++) 328 | { 329 | auto left_idx = left_format.sel->get_index(i); 330 | 331 | // If the input value is NULL then the output value should be NULL. 332 | if (!left_format.validity.RowIsValid(left_idx)) 333 | { 334 | FlatVector::SetNull(result, i, true); 335 | continue; 336 | } 337 | 338 | // Get the offset of where the result for this row should begin, since 339 | // there is always a fixed number of result elements, its pretty simple. 340 | auto result_offset = i * output_number_of_elements; 341 | 342 | // Depending on the output type call the appropriate decode function with the appropriate 343 | // result location. 344 | 345 | void *output_location = result_data_u8 + result_offset * output_pointer_increment; 346 | void *source_location = left_data_8 + (left_idx * input_pointer_increment); 347 | 348 | perform_decode(bind_info.encoding_type, output_element_bit_width, source_location, output_location, output_number_of_elements); 349 | } 350 | 351 | if (args.size() == 1) 352 | { 353 | result.SetVectorType(VectorType::CONSTANT_VECTOR); 354 | } 355 | } 356 | 357 | // This is the "bind" function that is called for encoding an array of values. 358 | // 359 | // It doesn't have to do anything with the return type right now but it may in the future. 
360 | static unique_ptr lindelEncodeArrayBind(ClientContext &context, ScalarFunction &bound_function, 361 | vector> &arguments) 362 | { 363 | unique_ptr bind_data = make_uniq(0); 364 | if (bound_function.name == "hilbert_encode") 365 | { 366 | bind_data->encoding_type = 0; 367 | } 368 | else if (bound_function.name == "morton_encode") 369 | { 370 | bind_data->encoding_type = 1; 371 | } 372 | else 373 | { 374 | throw NotImplementedException("Unknown function name in lindelEncodeBind"); 375 | } 376 | 377 | // Now deal with validating the input type 378 | auto &left_type = arguments[0]->return_type; 379 | 380 | // This is the number of elements in the output array, not the number of rows being procssed. 381 | auto input_number_of_elements = ArrayType::GetSize(left_type); 382 | 383 | // The type of the elements in the output array this will either be an integer type or a float type. 384 | auto input_child_type = ArrayType::GetChildType(left_type); 385 | 386 | switch (input_child_type.id()) 387 | { 388 | case LogicalTypeId::DOUBLE: 389 | { 390 | switch (input_number_of_elements) 391 | { 392 | case 1: 393 | bound_function.return_type = LogicalType::UBIGINT; 394 | break; 395 | case 2: 396 | bound_function.return_type = LogicalType::UHUGEINT; 397 | break; 398 | default: 399 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths of 1 or 2 for DOUBLE."); 400 | } 401 | } 402 | break; 403 | case LogicalTypeId::FLOAT: 404 | { 405 | switch (input_number_of_elements) 406 | { 407 | case 1: 408 | bound_function.return_type = LogicalType::UINTEGER; 409 | break; 410 | case 2: 411 | bound_function.return_type = LogicalType::UBIGINT; 412 | break; 413 | case 3: 414 | case 4: 415 | bound_function.return_type = LogicalType::UHUGEINT; 416 | break; 417 | default: 418 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-4 for FLOAT."); 419 | } 420 | } 421 | break; 422 | case LogicalTypeId::UBIGINT: 423 | 
case LogicalTypeId::BIGINT: 424 | { 425 | switch (input_number_of_elements) 426 | { 427 | case 1: 428 | bound_function.return_type = LogicalType::UBIGINT; 429 | break; 430 | case 2: 431 | bound_function.return_type = LogicalType::UHUGEINT; 432 | break; 433 | default: 434 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths of 1 or 2 for BIGINT/UBIGINT."); 435 | } 436 | } 437 | break; 438 | case LogicalTypeId::UINTEGER: 439 | case LogicalTypeId::INTEGER: 440 | { 441 | switch (input_number_of_elements) 442 | { 443 | case 1: 444 | bound_function.return_type = LogicalType::UINTEGER; 445 | break; 446 | case 2: 447 | bound_function.return_type = LogicalType::UBIGINT; 448 | break; 449 | case 3: 450 | case 4: 451 | bound_function.return_type = LogicalType::UHUGEINT; 452 | break; 453 | default: 454 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-4 for UINTEGER/INTEGER."); 455 | } 456 | } 457 | break; 458 | case LogicalTypeId::USMALLINT: 459 | case LogicalTypeId::SMALLINT: 460 | { 461 | switch (input_number_of_elements) 462 | { 463 | case 1: // 16 464 | bound_function.return_type = LogicalType::USMALLINT; 465 | break; 466 | case 2: // 32 467 | bound_function.return_type = LogicalType::UINTEGER; 468 | break; 469 | case 3: 470 | case 4: 471 | bound_function.return_type = LogicalType::UBIGINT; 472 | break; 473 | case 5: 474 | case 6: 475 | case 7: 476 | case 8: 477 | bound_function.return_type = LogicalType::UHUGEINT; 478 | break; 479 | default: 480 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-8 for USMALLINT/SMALLINT."); 481 | } 482 | } 483 | break; 484 | case LogicalTypeId::UTINYINT: 485 | case LogicalTypeId::TINYINT: 486 | { 487 | switch (input_number_of_elements) 488 | { 489 | case 1: 490 | bound_function.return_type = LogicalType::UTINYINT; 491 | break; 492 | case 2: 493 | bound_function.return_type = LogicalType::USMALLINT; 494 
| break; 495 | case 3: 496 | case 4: 497 | bound_function.return_type = LogicalType::UINTEGER; 498 | break; 499 | case 5: 500 | case 6: 501 | case 7: 502 | case 8: 503 | bound_function.return_type = LogicalType::UBIGINT; 504 | break; 505 | case 9: 506 | case 10: 507 | case 11: 508 | case 12: 509 | case 13: 510 | case 14: 511 | case 15: 512 | case 16: 513 | bound_function.return_type = LogicalType::UHUGEINT; 514 | break; 515 | default: 516 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-16 for UTINYINT/TINYINT."); 517 | } 518 | } 519 | break; 520 | default: 521 | throw InvalidInputException("hilbert_encode()/morton_encode() only supports arrays of types DOUBLE, FLOAT, UBIGINT, BIGINT, UINTEGER, INTEGER, USMALLINT, SMALLINT, UTINYINT, TINYINT"); 522 | } 523 | 524 | return bind_data; 525 | } 526 | 527 | // Perform encoding for an array of values. 528 | inline void lindelEncodeArrayFunc(DataChunk &args, ExpressionState &state, Vector &result) 529 | { 530 | // Get a reference to the bind data. 531 | auto &func_expr = state.expr.Cast(); 532 | auto &bind_info = func_expr.bind_info->Cast(); 533 | 534 | // This is the size of the array 535 | auto array_number_of_elements = ArrayType::GetSize(args.data[0].GetType()); 536 | auto child_type = ArrayType::GetChildType(args.data[0].GetType()); 537 | 538 | // Get a pointer to the input data. 539 | auto left = args.data[0]; 540 | auto &left_child = ArrayVector::GetEntry(left); 541 | auto &left_child_validity = FlatVector::Validity(left_child); 542 | UnifiedVectorFormat left_format; 543 | 544 | left.ToUnifiedFormat(args.size(), left_format); 545 | 546 | // Need the different input types since we're doing pointer math below. 
547 | auto left_data_8 = FlatVector::GetData(left_child); 548 | auto left_data_16 = FlatVector::GetData(left_child); 549 | auto left_data_32 = FlatVector::GetData(left_child); 550 | auto left_data_64 = FlatVector::GetData(left_child); 551 | auto left_data_float = FlatVector::GetData(left_child); 552 | auto left_data_double = FlatVector::GetData(left_child); 553 | 554 | // So the output type changes based on the number of inputs and the type of inputs. 555 | auto result_data_u8 = FlatVector::GetData(result); 556 | auto result_data_u16 = FlatVector::GetData(result); 557 | auto result_data_u32 = FlatVector::GetData(result); 558 | auto result_data_u64 = FlatVector::GetData(result); 559 | auto result_data_u128 = FlatVector::GetData(result); 560 | 561 | for (idx_t i = 0; i < args.size(); i++) 562 | { 563 | auto left_idx = left_format.sel->get_index(i); 564 | if (!left_format.validity.RowIsValid(left_idx)) 565 | { 566 | FlatVector::SetNull(result, i, true); 567 | continue; 568 | } 569 | 570 | auto left_offset = left_idx * array_number_of_elements; 571 | if (!left_child_validity.CheckAllValid(left_offset + array_number_of_elements, left_offset)) 572 | { 573 | throw InvalidInputException(StringUtil::Format("%s: array can not contain NULL values", "hilbert_encode")); 574 | } 575 | 576 | switch (child_type.id()) 577 | { 578 | case LogicalTypeId::DOUBLE: 579 | { 580 | auto encoder = bind_info.encoding_type == 0 ? 
hilbert_encode_u64_var : morton_encode_u64_var; 581 | switch (array_number_of_elements) 582 | { 583 | case 1: 584 | { 585 | encoder((uint64_t *)(left_data_double + left_offset), array_number_of_elements, result_data_u64 + i); 586 | break; 587 | } 588 | case 2: 589 | { 590 | encoder((uint64_t *)(left_data_double + left_offset), array_number_of_elements, result_data_u128 + i); 591 | break; 592 | } 593 | default: 594 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of lengths of 1 or 2 for DOUBLE."); 595 | } 596 | } 597 | break; 598 | case LogicalTypeId::FLOAT: 599 | { 600 | // The number of elements in the array dictates the output type. 601 | auto encoder = bind_info.encoding_type == 0 ? hilbert_encode_u32_var : morton_encode_u32_var; 602 | switch (array_number_of_elements) 603 | { 604 | case 1: 605 | { 606 | encoder((uint32_t *)(left_data_float + left_offset), array_number_of_elements, result_data_u32 + i); 607 | break; 608 | } 609 | case 2: 610 | case 3: 611 | { 612 | encoder((uint32_t *)(left_data_float + left_offset), array_number_of_elements, result_data_u64 + i); 613 | break; 614 | } 615 | case 4: 616 | { 617 | hilbert_encode_u32_var((uint32_t *)(left_data_float + left_offset), array_number_of_elements, result_data_u128 + i); 618 | break; 619 | } 620 | default: 621 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-4 for FLOAT."); 622 | } 623 | } 624 | break; 625 | case LogicalTypeId::UBIGINT: 626 | case LogicalTypeId::BIGINT: 627 | { 628 | auto encoder = bind_info.encoding_type == 0 ? 
hilbert_encode_u64_var : morton_encode_u64_var; 629 | switch (array_number_of_elements) 630 | { 631 | case 1: 632 | { 633 | encoder((uint64_t *)(left_data_64 + left_offset), array_number_of_elements, result_data_u64 + i); 634 | break; 635 | } 636 | case 2: 637 | { 638 | encoder((uint64_t *)(left_data_64 + left_offset), array_number_of_elements, result_data_u128 + i); 639 | break; 640 | } 641 | default: 642 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of lengths of 1 or 2 for BIGINT/UBIGINT."); 643 | } 644 | } 645 | break; 646 | 647 | case LogicalTypeId::UINTEGER: 648 | case LogicalTypeId::INTEGER: 649 | { 650 | auto encoder = bind_info.encoding_type == 0 ? hilbert_encode_u32_var : morton_encode_u32_var; 651 | // The number of elements in the array dictates the output type. 652 | switch (array_number_of_elements) 653 | { 654 | case 1: 655 | { 656 | encoder((uint32_t *)(left_data_32 + left_offset), array_number_of_elements, result_data_u32 + i); 657 | break; 658 | } 659 | case 2: 660 | case 3: 661 | { 662 | encoder((uint32_t *)(left_data_32 + left_offset), array_number_of_elements, result_data_u64 + i); 663 | break; 664 | } 665 | case 4: 666 | { 667 | encoder((uint32_t *)(left_data_32 + left_offset), array_number_of_elements, result_data_u128 + i); 668 | break; 669 | } 670 | default: 671 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of lengths 1-4 for UINTEGER/INTEGER."); 672 | } 673 | } 674 | break; 675 | case LogicalTypeId::SMALLINT: 676 | case LogicalTypeId::USMALLINT: 677 | { 678 | // The number of elements in the array dictates the output type. 679 | auto encoder = bind_info.encoding_type == 0 ? 
hilbert_encode_u16_var : morton_encode_u16_var; 680 | switch (array_number_of_elements) 681 | { 682 | case 1: 683 | { 684 | encoder((uint16_t *)(left_data_16 + left_offset), array_number_of_elements, result_data_u16 + i); 685 | break; 686 | } 687 | case 2: 688 | { 689 | encoder((uint16_t *)(left_data_16 + left_offset), array_number_of_elements, result_data_u32 + i); 690 | break; 691 | } 692 | case 3: 693 | case 4: 694 | { 695 | encoder((uint16_t *)(left_data_16 + left_offset), array_number_of_elements, result_data_u64 + i); 696 | break; 697 | } 698 | case 5: 699 | case 6: 700 | case 7: 701 | case 8: 702 | { 703 | encoder((uint16_t *)(left_data_16 + left_offset), array_number_of_elements, result_data_u128 + i); 704 | break; 705 | } 706 | default: 707 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of length 1-8 for SMALLINT/USMALLINT."); 708 | } 709 | } 710 | break; 711 | case LogicalTypeId::TINYINT: 712 | case LogicalTypeId::UTINYINT: 713 | { 714 | // The number of elements in the array dictates the output type. 715 | auto encoder = bind_info.encoding_type == 0 ? 
hilbert_encode_u8_var : morton_encode_u8_var; 716 | switch (array_number_of_elements) 717 | { 718 | case 1: 719 | { 720 | encoder((uint8_t *)(left_data_8 + left_offset), array_number_of_elements, result_data_u8 + i); 721 | break; 722 | } 723 | case 2: 724 | { 725 | encoder((uint8_t *)(left_data_8 + left_offset), array_number_of_elements, result_data_u16 + i); 726 | break; 727 | } 728 | case 3: 729 | case 4: 730 | { 731 | encoder((uint8_t *)(left_data_8 + left_offset), array_number_of_elements, result_data_u32 + i); 732 | break; 733 | } 734 | case 5: 735 | case 6: 736 | case 7: 737 | case 8: 738 | { 739 | encoder((uint8_t *)(left_data_8 + left_offset), array_number_of_elements, result_data_u64 + i); 740 | break; 741 | } 742 | case 9: 743 | case 10: 744 | case 11: 745 | case 12: 746 | case 13: 747 | case 14: 748 | case 15: 749 | case 16: 750 | { 751 | encoder((uint8_t *)(left_data_8 + left_offset), array_number_of_elements, result_data_u128 + i); 752 | break; 753 | } 754 | default: 755 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of length 1-16 for UTINYINT/TINYINT."); 756 | } 757 | } 758 | break; 759 | default: 760 | throw NotImplementedException("hilbert_encode()/morton_encode() only supports arrays of FLOAT, DOUBLE, BIGINT, UBIGINT, INTEGER, UINTEGER, SMALLINT, USMALLINT, TINYINT, UTINYINT types"); 761 | } 762 | } 763 | 764 | if (args.size() == 1) 765 | { 766 | result.SetVectorType(VectorType::CONSTANT_VECTOR); 767 | } 768 | } 769 | 770 | // Extension initalization. 
771 | static void LoadInternal(DatabaseInstance &instance) 772 | { 773 | ScalarFunctionSet hilbert_encode("hilbert_encode"); 774 | ScalarFunctionSet morton_encode("morton_encode"); 775 | 776 | using SF = ScalarFunction; // Alias for ScalarFunction 777 | 778 | hilbert_encode.AddFunction(SF({LogicalType::ARRAY(LogicalType::ANY, optional_idx::Invalid())}, LogicalType::ANY, lindelEncodeArrayFunc, lindelEncodeArrayBind)); 779 | morton_encode.AddFunction(SF({LogicalType::ARRAY(LogicalType::ANY, optional_idx::Invalid())}, LogicalType::ANY, lindelEncodeArrayFunc, lindelEncodeArrayBind)); 780 | 781 | ExtensionUtil::RegisterFunction(instance, hilbert_encode); 782 | ExtensionUtil::RegisterFunction(instance, morton_encode); 783 | 784 | ScalarFunctionSet hilbert_decode = ScalarFunctionSet("hilbert_decode"); 785 | ScalarFunctionSet morton_decode = ScalarFunctionSet("morton_decode"); 786 | 787 | std::vector types_that_can_be_decoded = { 788 | LogicalType::UTINYINT, 789 | LogicalType::USMALLINT, 790 | LogicalType::UINTEGER, 791 | LogicalType::UBIGINT, 792 | LogicalType::UHUGEINT}; 793 | 794 | for (const auto &decodable_type : types_that_can_be_decoded) 795 | { 796 | hilbert_decode.AddFunction( 797 | ScalarFunction({decodable_type, LogicalType::UTINYINT, LogicalType::BOOLEAN, LogicalType::BOOLEAN}, LogicalType::ARRAY(LogicalType::ANY, optional_idx::Invalid()), 798 | lindelDecodeArrayFun, 799 | lindelDecodeToArrayBind)); 800 | 801 | morton_decode.AddFunction( 802 | ScalarFunction({decodable_type, LogicalType::UTINYINT, LogicalType::BOOLEAN, LogicalType::BOOLEAN}, LogicalType::ARRAY(LogicalType::ANY, optional_idx::Invalid()), 803 | lindelDecodeArrayFun, 804 | lindelDecodeToArrayBind)); 805 | } 806 | 807 | ExtensionUtil::RegisterFunction(instance, hilbert_decode); 808 | ExtensionUtil::RegisterFunction(instance, morton_decode); 809 | } 810 | 811 | void LindelExtension::Load(DuckDB &db) 812 | { 813 | LoadInternal(*db.instance); 814 | } 815 | std::string LindelExtension::Name() 816 | { 
817 | return "lindel"; 818 | } 819 | 820 | std::string LindelExtension::Version() const 821 | { 822 | #ifdef EXT_VERSION_QUACK 823 | return EXT_VERSION_QUACK; 824 | #else 825 | return ""; 826 | #endif 827 | } 828 | 829 | } // namespace duckdb 830 | 831 | extern "C" 832 | { 833 | 834 | DUCKDB_EXTENSION_API void lindel_init(duckdb::DatabaseInstance &db) 835 | { 836 | duckdb::DuckDB db_wrapper(db); 837 | db_wrapper.LoadExtension(); 838 | } 839 | 840 | DUCKDB_EXTENSION_API const char *lindel_version() 841 | { 842 | return "1.0.1"; 843 | } 844 | } 845 | 846 | #ifndef DUCKDB_EXTENSION_MAIN 847 | #error DUCKDB_EXTENSION_MAIN not defined 848 | #endif 849 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the lindel extension, this should probably be the goal too. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | or 9 | ```bash 10 | make test_debug 11 | ``` -------------------------------------------------------------------------------- /test/sql/lindel.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/lindel.test 2 | # description: test lindel extension 3 | # group: [lindel] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT hilbert_encode([1, 2, 3]::tinyint[3]); 8 | ---- 9 | Catalog Error: Scalar Function with name hilbert_encode does not exist! 
10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require lindel 13 | 14 | # Confirm the extension works 15 | query I 16 | SELECT hilbert_encode([1, 2, 3]::tinyint[3]); 17 | ---- 18 | 22 19 | 20 | query I 21 | SELECT morton_encode([1, 2, 3]::tinyint[3]); 22 | ---- 23 | 29 24 | 25 | query I 26 | select hilbert_decode(22::uinteger, 3, false, false) 27 | ---- 28 | [1, 2, 3] 29 | 30 | query IIIIII 31 | with elements as ( 32 | select * as id from range(5) 33 | ) 34 | select 35 | a.id as a, 36 | b.id as b, 37 | hilbert_encode([a.id, b.id]::tinyint[2]) as hilbert, 38 | morton_encode([a.id, b.id]::tinyint[2]) as morton, 39 | hilbert_decode(hilbert_encode([a.id, b.id]::tinyint[2]), 2, false, false) as hilbert_decoded, 40 | morton_decode(morton_encode([a.id, b.id]::tinyint[2]), 2, false, false) as morton_decoded, 41 | from 42 | elements as a cross join elements as b order by a, b; 43 | ---- 44 | 0 0 0 0 [0, 0] [0, 0] 45 | 0 1 3 1 [0, 1] [0, 1] 46 | 0 2 4 4 [0, 2] [0, 2] 47 | 0 3 5 5 [0, 3] [0, 3] 48 | 0 4 58 16 [0, 4] [0, 4] 49 | 1 0 1 2 [1, 0] [1, 0] 50 | 1 1 2 3 [1, 1] [1, 1] 51 | 1 2 7 6 [1, 2] [1, 2] 52 | 1 3 6 7 [1, 3] [1, 3] 53 | 1 4 57 18 [1, 4] [1, 4] 54 | 2 0 14 8 [2, 0] [2, 0] 55 | 2 1 13 9 [2, 1] [2, 1] 56 | 2 2 8 12 [2, 2] [2, 2] 57 | 2 3 9 13 [2, 3] [2, 3] 58 | 2 4 54 24 [2, 4] [2, 4] 59 | 3 0 15 10 [3, 0] [3, 0] 60 | 3 1 12 11 [3, 1] [3, 1] 61 | 3 2 11 14 [3, 2] [3, 2] 62 | 3 3 10 15 [3, 3] [3, 3] 63 | 3 4 53 26 [3, 4] [3, 4] 64 | 4 0 16 32 [4, 0] [4, 0] 65 | 4 1 17 33 [4, 1] [4, 1] 66 | 4 2 30 36 [4, 2] [4, 2] 67 | 4 3 31 37 [4, 3] [4, 3] 68 | 4 4 32 48 [4, 4] [4, 4] 69 | 70 | 71 | # Try to encode as integers. 
72 | 73 | query IIIIII 74 | with elements as ( 75 | select * as id from range(5) 76 | ) 77 | select 78 | a.id as a, 79 | b.id as b, 80 | hilbert_encode([a.id, b.id]::int[2]) as hilbert, 81 | morton_encode([a.id, b.id]::int[2]) as morton, 82 | hilbert_decode(hilbert_encode([a.id, b.id]::int[2]), 2, false, false) as hilbert_decoded, 83 | morton_decode(morton_encode([a.id, b.id]::int[2]), 2, false, false) as morton_decoded, 84 | from 85 | elements as a cross join elements as b order by a, b; 86 | ---- 87 | 0 0 0 0 [0, 0] [0, 0] 88 | 0 1 3 1 [0, 1] [0, 1] 89 | 0 2 4 4 [0, 2] [0, 2] 90 | 0 3 5 5 [0, 3] [0, 3] 91 | 0 4 58 16 [0, 4] [0, 4] 92 | 1 0 1 2 [1, 0] [1, 0] 93 | 1 1 2 3 [1, 1] [1, 1] 94 | 1 2 7 6 [1, 2] [1, 2] 95 | 1 3 6 7 [1, 3] [1, 3] 96 | 1 4 57 18 [1, 4] [1, 4] 97 | 2 0 14 8 [2, 0] [2, 0] 98 | 2 1 13 9 [2, 1] [2, 1] 99 | 2 2 8 12 [2, 2] [2, 2] 100 | 2 3 9 13 [2, 3] [2, 3] 101 | 2 4 54 24 [2, 4] [2, 4] 102 | 3 0 15 10 [3, 0] [3, 0] 103 | 3 1 12 11 [3, 1] [3, 1] 104 | 3 2 11 14 [3, 2] [3, 2] 105 | 3 3 10 15 [3, 3] [3, 3] 106 | 3 4 53 26 [3, 4] [3, 4] 107 | 4 0 16 32 [4, 0] [4, 0] 108 | 4 1 17 33 [4, 1] [4, 1] 109 | 4 2 30 36 [4, 2] [4, 2] 110 | 4 3 31 37 [4, 3] [4, 3] 111 | 4 4 32 48 [4, 4] [4, 4] -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [] 3 | } --------------------------------------------------------------------------------