├── .clang-format ├── .cmake-format ├── .editorconfig ├── .github └── workflows │ ├── MainDistributionPipeline.yml │ ├── NodeJS.yml │ ├── Python.yml │ ├── _extension_deploy.yml │ └── dev.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CMakeUserPresets.json ├── LICENSE ├── Makefile ├── README.md ├── benchmark └── lineitem.py ├── data ├── fruit.arrow ├── multifile │ ├── different_order.arrows │ ├── different_type.arrows │ ├── different_type_int.arrows │ ├── different_type_order.arrows │ ├── fruit_extra.arrows │ ├── glob │ │ ├── f1.arrow │ │ ├── f2.arrow │ │ └── f3.arrow │ └── hive │ │ ├── part=a │ │ ├── f1.arrow │ │ └── f2.arrow │ │ └── part=b │ │ ├── f1.arrow │ │ └── f3.arrow ├── parquet-testing │ └── lineitem_sf0_01.parquet ├── test.arrow └── test.arrows ├── docs └── UPDATING.md ├── extension_config.cmake ├── scripts ├── extension-upload.sh └── setup-custom-toolchain.sh ├── src ├── file_scanner │ ├── arrow_file_scan.cpp │ └── arrow_multi_file_info.cpp ├── include │ ├── file_scanner │ │ ├── arrow_file_scan.hpp │ │ └── arrow_multi_file_info.hpp │ ├── ipc │ │ ├── array_stream.hpp │ │ ├── stream_factory.hpp │ │ └── stream_reader │ │ │ ├── base_stream_reader.hpp │ │ │ ├── ipc_buffer_stream_reader.hpp │ │ │ └── ipc_file_stream_reader.hpp │ ├── nanoarrow_errors.hpp │ ├── nanoarrow_extension.hpp │ ├── table_function │ │ ├── arrow_ipc_function_data.hpp │ │ ├── read_arrow.hpp │ │ └── scan_arrow_ipc.hpp │ ├── write_arrow_stream.hpp │ └── writer │ │ ├── arrow_stream_writer.hpp │ │ ├── column_data_collection_serializer.hpp │ │ └── to_arrow_ipc.hpp ├── ipc │ ├── array_stream.cpp │ ├── stream_factory.cpp │ └── stream_reader │ │ ├── base_stream_reader.cpp │ │ ├── ipc_buffer_stream_reader.cpp │ │ └── ipc_file_stream_reader.cpp ├── nanoarrow_extension.cpp ├── scanner │ ├── read_arrow.cpp │ └── scan_arrow_ipc.cpp └── writer │ ├── arrow_stream_writer.cpp │ ├── column_data_collection_serializer.cpp │ ├── to_arrow_ipc.cpp │ └── write_arrow_stream.cpp ├── test ├── README.md ├── nodejs │ └── arrow_test.js ├── python │ ├── conftest.py │ ├── requirements-dev.txt │ ├── test_arrow_ipc_scan.py │ ├── test_arrow_ipc_writer.py │ └── test_integration.py └── sql │ ├── arrow_testing.test │ ├── multifile_reading.test │ ├── nanoarrow.test │ ├── read_arrow.test │ ├── read_arrow_file.test │ ├── test_copy_to.test │ ├── to_arrow_ipc.test │ └── write_arrow_stream.test ├── test_local.sh └── vcpkg.json /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | ColumnLimit: 90 4 | DerivePointerAlignment: false 5 | IncludeBlocks: Preserve 6 | -------------------------------------------------------------------------------- /.cmake-format: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # cmake-format configuration file 19 | # Use `archery lint --cmake-format --fix` to reformat all cmake files in the 20 | # source tree 21 | 22 | # ----------------------------- 23 | # Options affecting formatting. 24 | # ----------------------------- 25 | with section("format"): 26 | # How wide to allow formatted cmake files 27 | line_width = 90 28 | 29 | # How many spaces to tab for indent 30 | tab_size = 2 31 | 32 | # If a positional argument group contains more than this many arguments, 33 | # then force it to a vertical layout. 34 | max_pargs_hwrap = 4 35 | 36 | # If the statement spelling length (including space and parenthesis) is 37 | # smaller than this amount, then force reject nested layouts. 38 | # This value only comes into play when considering whether or not to nest 39 | # arguments below their parent. If the number of characters in the parent 40 | # is less than this value, we will not nest. 41 | min_prefix_chars = 32 42 | 43 | # If true, separate flow control names from their parentheses with a space 44 | separate_ctrl_name_with_space = False 45 | 46 | # If true, separate function names from parentheses with a space 47 | separate_fn_name_with_space = False 48 | 49 | # If a statement is wrapped to more than one line, than dangle the closing 50 | # parenthesis on it's own line 51 | dangle_parens = False 52 | 53 | # What style line endings to use in the output. 54 | line_ending = 'unix' 55 | 56 | # Format command names consistently as 'lower' or 'upper' case 57 | command_case = 'lower' 58 | 59 | # Format keywords consistently as 'lower' or 'upper' case 60 | keyword_case = 'unchanged' 61 | 62 | # ------------------------------------------------ 63 | # Options affecting comment reflow and formatting. 64 | # ------------------------------------------------ 65 | with section("markup"): 66 | # enable comment markup parsing and reflow 67 | enable_markup = False 68 | 69 | # If comment markup is enabled, don't reflow the first comment block in 70 | # eachlistfile. Use this to preserve formatting of your 71 | # copyright/licensestatements. 72 | first_comment_is_literal = True 73 | 74 | # If comment markup is enabled, don't reflow any comment block which 75 | # matchesthis (regex) pattern. Default is `None` (disabled). 
76 | literal_comment_pattern = None 77 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | duckdb-next-build: 16 | name: Build extension binaries 17 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 18 | with: 19 | duckdb_version: main 20 | ci_tools_version: main 21 | extension_name: nanoarrow 22 | 23 | duckdb-stable-build: 24 | name: Build extension binaries 25 | if: github.ref == 'refs/heads/stable' 26 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1 27 | with: 28 | duckdb_version: v1.2.1 29 | ci_tools_version: v1.2.1 30 | extension_name: nanoarrow 31 | -------------------------------------------------------------------------------- /.github/workflows/NodeJS.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request,repository_dispatch,workflow_dispatch] 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | 7 | jobs: 8 | nodejs: 9 | if: github.ref == 'refs/heads/stable' 10 | name: NodeJS 11 | runs-on: macos-latest 12 | env: 13 | GEN: ninja 14 | 15 | steps: 16 | - name: Install Ninja 17 | run: brew install ninja 18 | 19 | - uses: actions/checkout@v2 20 | with: 21 | fetch-depth: 0 22 | submodules: 'true' 23 | 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: '3.9' 27 | 28 | - uses: actions/setup-node@v4 29 | with: 30 | node-version: '20' 31 | 32 | - name: Install required node packages 33 | run: | 34 | sudo npm i duckdb@1.2.1 35 | sudo npm install -g apache-arrow mocha 36 | sudo npm install apache-arrow mocha 37 | npm -v 38 | node -v 39 | 40 | - name: Build duckdb 41 | run: | 42 | cd duckdb 43 | git checkout 8e52ec43959ab363643d63cb78ee214577111da4 #v1.2.1 44 | cd .. 45 | make 46 | 47 | - name: Run JS tests 48 | run: | 49 | make test_release_js 50 | -------------------------------------------------------------------------------- /.github/workflows/Python.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request,repository_dispatch] 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | 7 | jobs: 8 | python: 9 | name: Python 10 | runs-on: macos-latest 11 | env: 12 | GEN: ninja 13 | 14 | steps: 15 | - name: Install Ninja 16 | run: brew install ninja 17 | 18 | - uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 21 | submodules: true 22 | 23 | - uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.11' 26 | 27 | - name: Build DuckDB (Python) 28 | run: | 29 | cd duckdb 30 | git checkout main 31 | cd tools/pythonpkg 32 | python3 -m pip install . 
33 | 34 | - name: Build Arrow Extension 35 | run: make release 36 | 37 | - name: Install Python Dependencies 38 | shell: bash 39 | run: | 40 | pip install -r test/python/requirements-dev.txt 41 | 42 | - name: Test Python 43 | run: | 44 | (cd test/python && python -m pytest) 45 | -------------------------------------------------------------------------------- /.github/workflows/_extension_deploy.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml 3 | # 4 | # note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the 5 | # deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as 6 | # this workflow can be configured to use a custom deploy script. 7 | 8 | 9 | name: Extension Deployment 10 | on: 11 | workflow_call: 12 | inputs: 13 | # The name of the extension 14 | extension_name: 15 | required: true 16 | type: string 17 | # DuckDB version to build against 18 | duckdb_version: 19 | required: true 20 | type: string 21 | # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' 22 | exclude_archs: 23 | required: false 24 | type: string 25 | default: "" 26 | # Whether to upload this deployment as the latest. This may overwrite a previous deployment. 27 | deploy_latest: 28 | required: false 29 | type: boolean 30 | default: false 31 | # Whether to upload this deployment under a versioned path. These will not be deleted automatically 32 | deploy_versioned: 33 | required: false 34 | type: boolean 35 | default: false 36 | # Postfix added to artifact names. 
Can be used to guarantee unique names when this workflow is called multiple times 37 | artifact_postfix: 38 | required: false 39 | type: string 40 | default: "" 41 | # Override the default deploy script with a custom script 42 | deploy_script: 43 | required: false 44 | type: string 45 | default: "./scripts/extension-upload.sh" 46 | # Override the default matrix parse script with a custom script 47 | matrix_parse_script: 48 | required: false 49 | type: string 50 | default: "./duckdb/scripts/modify_distribution_matrix.py" 51 | 52 | jobs: 53 | generate_matrix: 54 | name: Generate matrix 55 | runs-on: ubuntu-latest 56 | outputs: 57 | deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} 58 | steps: 59 | - uses: actions/checkout@v3 60 | with: 61 | fetch-depth: 0 62 | submodules: 'true' 63 | 64 | - name: Checkout DuckDB to version 65 | run: | 66 | cd duckdb 67 | git checkout ${{ inputs.duckdb_version }} 68 | 69 | - id: parse-matrices 70 | run: | 71 | python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty 72 | deploy_matrix="`cat deploy_matrix.json`" 73 | echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT 74 | echo `cat $GITHUB_OUTPUT` 75 | 76 | deploy: 77 | name: Deploy 78 | runs-on: ubuntu-latest 79 | needs: generate_matrix 80 | if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} 81 | strategy: 82 | matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} 83 | 84 | steps: 85 | - uses: actions/checkout@v3 86 | with: 87 | fetch-depth: 0 88 | submodules: 'true' 89 | 90 | - name: Checkout DuckDB to version 91 | run: | 92 | cd duckdb 93 | git checkout ${{ inputs.duckdb_version }} 94 | 95 | - uses: actions/download-artifact@v3 96 | with: 97 | name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} 98 | path: | 99 | /tmp/extension 100 | 101 | - name: Deploy 102 | shell: bash 103 | env: 104 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 105 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 106 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 107 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 108 | DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} 109 | run: | 110 | pwd 111 | python3 -m pip install pip awscli 112 | git config --global --add safe.directory '*' 113 | cd duckdb 114 | git fetch --tags 115 | export DUCKDB_VERSION=`git tag --points-at HEAD` 116 | export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} 117 | cd .. 
118 | git fetch --tags 119 | export EXT_VERSION=`git tag --points-at HEAD` 120 | export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} 121 | ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} 122 | -------------------------------------------------------------------------------- /.github/workflows/dev.yaml: -------------------------------------------------------------------------------- 1 | name: dev 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | persist-credentials: false 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.x' 25 | - name: pre-commit (cache) 26 | uses: actions/cache@v4 27 | with: 28 | path: ~/.cache/pre-commit 29 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 30 | - name: pre-commit (--all-files) 31 | run: | 32 | python -m pip install pre-commit 33 | pre-commit run --show-diff-on-failure --color=always --all-files 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | .vscode/settings.json 10 | .cache 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main 9 | [submodule "arrow-testing"] 10 | path = arrow-testing 11 | url = https://github.com/apache/arrow-testing.git 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/pre-commit/mirrors-clang-format 9 | rev: v19.1.4 10 | hooks: 11 | - id: clang-format 12 | types_or: [c, c++] 13 | - repo: https://github.com/cheshirekow/cmake-format-precommit 14 | rev: v0.6.13 15 | hooks: 16 | - id: cmake-format 17 | args: [--in-place] 18 | - repo: https://github.com/codespell-project/codespell 19 | rev: v2.2.5 20 | hooks: 21 | - id: codespell 22 | types_or: [rst, markdown, c, c++] 23 | additional_dependencies: [tomli] 24 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | include(FetchContent) 3 | 4 | # Set extension name here 5 | set(TARGET_NAME nanoarrow) 6 | 7 | set(NANOARROW_IPC ON) 8 | set(NANOARROW_NAMESPACE "DuckDBExt${TARGET_NAME}") 9 | fetchcontent_declare(nanoarrow 10 | URL 
"https://github.com/apache/arrow-nanoarrow/archive/4bf5a9322626e95e3717e43de7616c0a256179eb.zip" 11 | URL_HASH SHA256=49d588ee758a2a1d099ed4525c583a04adf71ce40405011e0190aa1e75e61b59 12 | ) 13 | fetchcontent_makeavailable(nanoarrow) 14 | 15 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 16 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 17 | 18 | project(${TARGET_NAME}) 19 | include_directories(src/include) 20 | 21 | set(EXTENSION_SOURCES 22 | src/file_scanner/arrow_file_scan.cpp 23 | src/file_scanner/arrow_multi_file_info.cpp 24 | src/ipc/array_stream.cpp 25 | src/ipc/stream_factory.cpp 26 | src/ipc/stream_reader/base_stream_reader.cpp 27 | src/ipc/stream_reader/ipc_file_stream_reader.cpp 28 | src/ipc/stream_reader/ipc_buffer_stream_reader.cpp 29 | src/scanner/read_arrow.cpp 30 | src/scanner/scan_arrow_ipc.cpp 31 | src/nanoarrow_extension.cpp 32 | src/writer/arrow_stream_writer.cpp 33 | src/writer/column_data_collection_serializer.cpp 34 | src/writer/write_arrow_stream.cpp 35 | src/writer/to_arrow_ipc.cpp) 36 | 37 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 38 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 39 | 40 | # Link nanoarrow in both the static library as the loadable extension 41 | target_link_libraries(${EXTENSION_NAME} nanoarrow::nanoarrow nanoarrow::nanoarrow_ipc) 42 | target_link_libraries(${LOADABLE_EXTENSION_NAME} nanoarrow::nanoarrow 43 | nanoarrow::nanoarrow_ipc) 44 | 45 | install(TARGETS ${EXTENSION_NAME} 46 | EXPORT "${DUCKDB_EXPORT_SET}" 47 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 48 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 49 | -------------------------------------------------------------------------------- /CMakeUserPresets.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "cmakeMinimumRequired": { 4 | "major": 3, 5 | "minor": 21, 6 | "patch": 0 7 | }, 8 | "configurePresets": [ 9 | { 10 | "name": "extension", 11 | "displayName": "Extension", 12 | "generator": "Ninja", 13 | "binaryDir": "${sourceDir}/../build", 14 | "cacheVariables": { 15 | "EXTENSION_STATIC_BUILD": "1", 16 | "DUCKDB_EXTENSION_CONFIGS": "${sourceDir}/../extension_config.cmake" 17 | } 18 | }, 19 | { 20 | "name": "extension_debug", 21 | "displayName": "Extension (Debug build)", 22 | "inherits": ["extension"], 23 | "cacheVariables": { 24 | "CMAKE_BUILD_TYPE": "Debug" 25 | } 26 | }, 27 | { 28 | "name": "extension_vcpkg_config", 29 | "hidden": true, 30 | "cacheVariables": { 31 | "CMAKE_TOOLCHAIN_FILE": "/$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake", 32 | "VCPKG_MANIFEST_DIR": "${sourceDir}/..", 33 | "VCPKG_BUILD": "1" 34 | } 35 | }, 36 | { 37 | "name": "extension_vcpkg", 38 | "displayName": "Extension (using vcpkg)", 39 | "inherits": ["extension", "extension_vcpkg_config"] 40 | }, 41 | { 42 | "name": "extension_vcpkg_debug", 43 | "displayName": "Extension (Debug build using vcpkg)", 44 | "inherits": ["extension_debug", "extension_vcpkg_config"] 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2024 Stichting DuckDB Foundation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=nanoarrow 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile 9 | 10 | # Client tests 11 | DEBUG_EXT_PATH='$(PROJ_DIR)build/debug/extension/nanoarrow/nanoarrow.duckdb_extension' 12 | RELDEBUG_EXT_PATH='$(PROJ_DIR)build/release/extension/nanoarrow/nanoarrow.duckdb_extension' 13 | RELEASE_EXT_PATH='$(PROJ_DIR)build/release/extension/nanoarrow/nanoarrow.duckdb_extension' 14 | 15 | test_js: 16 | test_debug_js: 17 | ARROW_EXTENSION_BINARY_PATH=$(DEBUG_EXT_PATH) mocha -R spec --timeout 480000 -n expose-gc --exclude 'test/*.ts' -- "test/nodejs/**/*.js" 18 | test_release_js: 19 | ARROW_EXTENSION_BINARY_PATH=$(RELEASE_EXT_PATH) mocha -R spec --timeout 480000 -n expose-gc --exclude 'test/*.ts' -- "test/nodejs/**/*.js" 20 | 21 | run_benchmark: 22 | python3 benchmark/lineitem.py $(RELEASE_EXT_PATH) 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nanoarrow for DuckDB 2 | 3 | This extension, nanoarrow, allows you to read Arrow IPC streams and files. It serves a similar purpose as the now-deprecated [Arrow DuckDB core extension](https://github.com/duckdb/arrow). 4 | However, it comes with the added functionality to query Arrow IPC files and is much better tested. This extension is released as a DuckDB Community Extension. 5 | For compatibility reasons with the previous Arrow core extension, this extension is also aliased as `arrow`. 6 | 7 | You can install and load it as: 8 | 9 | ```sql 10 | -- arrow would also be a suitable name 11 | INSTALL nanoarrow FROM community; 12 | LOAD nanoarrow; 13 | ``` 14 | 15 | ## Usage 16 | Below is a complete example of how to use our extension to read an Arrow IPC file. 17 | In addition to our extension, you will also need the `httpfs` extension installed and loaded to fetch the data directly from GitHub. 
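If `httpfs` is not already available in your DuckDB installation, you can typically install it once before loading it (a minimal setup sketch; the example below assumes both extensions are installed):

```sql
INSTALL httpfs;
```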
18 | 19 | ```sql 20 | LOAD httpfs; 21 | LOAD nanoarrow; 22 | SELECT 23 | commit, message 24 | FROM 25 | 'https://github.com/apache/arrow-experiments/raw/refs/heads/main/data/arrow-commits/arrow-commits.arrows' 26 | LIMIT 10; 27 | ``` 28 | 29 | ``` 30 | ┌───────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────┐ 31 | │ commit │ message │ 32 | │ varchar │ varchar │ 33 | ├───────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────┤ 34 | │ 49cdb0fe4e98fda19031c86… │ GH-40370: [C++] Define ARROW_FORCE_INLINE for non-MSVC builds (#40372) │ 35 | │ 1d966e98e41ce817d1f8c51… │ GH-40386: [Python] Fix except clauses (#40387) │ 36 | │ 96f26a89bd73997f7532643… │ GH-40227: [R] ensure executable files in `create_package_with_all_dependencies` (#40232) │ 37 | │ ee1a8c39a55f3543a82fed9… │ GH-40366: [C++] Remove const qualifier from Buffer::mutable_span_as (#40367) │ 38 | │ 3d467ac7bfae03cf2db0980… │ GH-20127: [Python][CI] Remove legacy hdfs tests from hdfs and hypothesis setup (#40363) │ 39 | │ ef6ea6beed071ed070daf03… │ GH-40345: [FlightRPC][C++][Java][Go] Add URI scheme to reuse connection (#40084) │ 40 | │ 53e0c745ad491af98a5bf18… │ GH-40153: [C++][Python] Fix test_gdb failures on 32-bit (#40293) │ 41 | │ 3ba6d286caad328b8572a3b… │ GH-40059: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor (#40064) │ 42 | │ 4ce9a5edd2710fb8bf0c642… │ GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) │ 43 | │ 2445975162905bd8d9a42ff… │ GH-40334: [C++][Gandiva] Add missing OpenSSL dependency to encrypt_utils_test.cc (#40338) │ 44 | ├───────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┤ 45 | │ 10 rows 2 columns │ 46 | └───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ 47 | ``` 48 | 49 | In the remainder of this section, we cover the supported parameters and usages for our IPC readers/writers. 50 | 51 | ### IPC Files 52 | 53 | #### Write 54 | Writing an Arrow IPC file is done using the COPY statement Below is a simple example of how you can use DuckDB to create such a file. 55 | 56 | ```sql 57 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test.arrows"; 58 | ``` 59 | 60 | Both `.arrows` and `.arrow` will be automatically recognized by DuckDB as Arrow IPC streams. 61 | However, if you wish to use a different extension, you can manually specify the format using: 62 | 63 | ```sql 64 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test.ipc" (FORMAT ARROWS); 65 | ``` 66 | 67 | The Copy function of the Copy To Arrow File operation accepts the following parameters: 68 | * `row_group_size`: The size of a row group. By default, the value is 122,880. A lower value may reduce performance but can be beneficial for streaming. It is important to note that this value is not exact, a slightly higher value divisible by 2,048 (DuckDB's standard vector size) may be used as the actual row group size. 69 | * `chunk_size`: An alias for the `row_group_size` parameter. 70 | * `row_group_size_bytes`: The size of row groups in bytes. 71 | * `row_groups_per_file`: The maximum number of row groups per file. If this option is set, multiple files can be generated in a single `COPY` call. This means the specified path will create a directory, and the `row_group_size` parameter will also be used to determine the partition sizes. 
72 | * `kv_metadata`: Key-value metadata to be added to the file schema. 73 | 74 | If `row_group_size_bytes` and either `chunk_size` or `row_group_size` are used, the row groups will be defined by the smallest of these parameters. 75 | 76 | #### Read 77 | You can consume the file using the `read_arrow` scanner. For example, to read the file we just created, you could run: 78 | ```sql 79 | FROM read_arrow('test.arrows'); 80 | ``` 81 | 82 | Similar to the copy function, the extension also registers `.arrows` and `.arrow` as valid extensions for the Arrow IPC format. This means that a replacement scan can be applied if that is the file extension, so the following would also be a valid query: 83 | ```sql 84 | FROM 'test.arrows'; 85 | ``` 86 | 87 | Besides single-file reading, our extension also fully supports multi-file reading, including all valid multi-file options. 88 | 89 | If we were to create a second test file using: 90 | ```sql 91 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test_2.arrows" (FORMAT ARROWS); 92 | ``` 93 | 94 | We can then run a query that reads both files using a glob pattern or a list of file paths: 95 | 96 | ```sql 97 | -- Glob 98 | FROM read_arrow('*.arrows') 99 | 100 | -- List 101 | FROM read_arrow(['test.arrows','test_2.arrows']) 102 | ``` 103 | 104 | When reading multiple files, the following parameters are also supported: 105 | * `union_by_name`: If the schemas of the files differ, setting `union_by_name` allows DuckDB to construct the schema by aligning columns with the same name. 106 | * `filename`: If set to `True`, this will add a column with the name of the file that generated each row. 107 | * `hive_partitioning`: Enables reading data from a Hive-partitioned dataset and applies partition filtering. 108 | > [!NOTE] 109 | > [Arrow IPC files (.arrow)](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) and [Arrow IPC streams (.arrows)](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format) are distinct but related formats. This extension can read both but only writes Arrow IPC Streams. 110 | ### IPC Stream Buffers 111 | Similar to the old core Arrow extension, this extension also allows direct production and consumption of the Arrow IPC streaming format from in-memory buffers in both Python and Node.js. 112 | In this section, we will demonstrate how to use the Python API, but you can find many tests that serve as examples for both [Node.js](https://github.com/paleolimbot/duckdb-nanoarrow/tree/main/test/nodejs) and [Python](https://github.com/paleolimbot/duckdb-nanoarrow/tree/main/test/python). 113 | 114 | Our extension can create Arrow IPC buffers using the `to_arrow_ipc` function. This function returns two columns: one containing the serialized data as a `BLOB`, and another `BOOL` column indicating which tuples contain the header information of the messages. 
For example, consider the following table in our DuckDB database: 115 | ```python 116 | import pyarrow as pa 117 | import duckdb 118 | import pyarrow.ipc as ipc 119 | 120 | connection = duckdb.connect() 121 | connection.execute("CREATE TABLE T (f0 integer, f1 varchar, f2 bool )") 122 | connection.execute("INSERT INTO T values (1, 'foo', true),(2, 'bar', NULL), (3, 'baz', false), (4, NULL, true) ") 123 | ``` 124 | 125 | We can then obtain our buffers by simply issuing a `to_arrow_ipc` call, like this: 126 | 127 | ```python 128 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 129 | ``` 130 | In this case, our buffers will contain two tuples: the first is the header of our message, and the second is the data. To convert this into an Arrow table, we simply concatenate the tuples and use the `ipc.RecordBatchStreamReader`. For example, you can read them as follows: 131 | 132 | 133 | ```python 134 | batches = [] 135 | with pa.BufferReader(pa.py_buffer(buffers[0][0] + buffers[1][0])) as reader: 136 | stream_reader = ipc.RecordBatchStreamReader(reader) 137 | schema = stream_reader.schema 138 | batches.extend(stream_reader) 139 | arrow_table = pa.Table.from_batches(batches, schema=schema) 140 | ``` 141 | 142 | To read buffers with DuckDB, you must use the Python function `from_arrow`. Continuing from our example, we would first need to convert our Arrow table into the Arrow IPC format. 143 | ```python 144 | batch = arrow_table.to_batches()[0] 145 | sink = pa.BufferOutputStream() 146 | with pa.ipc.new_stream(sink, batch.schema) as writer: 147 | writer.write_batch(batch) 148 | buffer = sink.getvalue() 149 | buf_reader = pa.BufferReader(buffer) 150 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 151 | ``` 152 | 153 | After this, the following query will return a DuckDB relation with the deserialized Arrow IPC: 154 | 155 | ```python 156 | connection.from_arrow(msg_reader) 157 | ``` 158 | 159 | ## Building 160 | 161 | To build the extension, clone the repository with submodules: 162 | 163 | ``` shell 164 | git clone --recurse-submodules https://github.com/paleolimbot/duckdb-nanoarrow.git 165 | ``` 166 | 167 | ...or if you forget to clone the submodules/you're using VSCode to do your checkout, you can run: 168 | 169 | ``` shell 170 | git submodule init 171 | git submodule update --checkout 172 | ``` 173 | 174 | A quick-and-dirty way to get your build up and running is to run `make`: 175 | 176 | ```sh 177 | make 178 | 179 | ``` 180 | The main binaries that will be built are: 181 | 182 | ```sh 183 | ./build/release/duckdb 184 | ./build/release/test/unittest 185 | ./build/release/extension/nanoarrow/nanoarrow.duckdb_extension 186 | ``` 187 | 188 | - `duckdb` is the binary for the duckdb shell with the extension code automatically loaded. 189 | - `unittest` is the test runner of duckdb. Again, the extension is already linked into the binary. 190 | - `nanoarrow.duckdb_extension` is the loadable binary as it would be distributed. 191 | 192 | If you'd like to use VSCode with the integration provided by the CMake/clangd extension, you 193 | can run: 194 | 195 | ``` shell 196 | cp CMakeUserPresets.json duckdb/ 197 | ``` 198 | 199 | ...and ensure that `.vscode/settings.json` contains: 200 | 201 | ``` json 202 | { 203 | "cmake.sourceDirectory": "${workspaceFolder}/duckdb" 204 | } 205 | ``` 206 | 207 | Then choose *Developer: Reload window* from the command palette and choose the 208 | *Extension (Debug build)* preset. 
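If you prefer the command line to VSCode, the same presets can be driven directly with CMake (a sketch assuming CMake ≥ 3.21 and Ninja are installed; the preset names come from the `CMakeUserPresets.json` shown earlier):

``` shell
# Configure from inside the duckdb/ submodule, where the preset file was copied
cd duckdb
cmake --preset extension_debug

# The presets put the build tree in ../build relative to duckdb/
cmake --build ../build
```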
209 | 210 | ## Running the extension 211 | 212 | To run the extension code, simply start the shell with `./build/release/duckdb` 213 | (if you're using `make` to build) or `./build/duckdb` (if you're using CMake 214 | via VSCode). 215 | 216 | Now we can use the features from the extension directly in DuckDB. 217 | 218 | ## Running the tests 219 | 220 | Different tests can be created for DuckDB extensions. Tests are written in 221 | SQL `./test/sql`. These SQL tests can be run using `make test` (if using 222 | make) or `./test_local.sh` (if using CMake via VSCode). 223 | 224 | ## Debugging 225 | 226 | You can debug an interactive SQL session by launching it with `gdb` or `lldb`: 227 | 228 | ``` shell 229 | lldb build/duckdb 230 | ``` 231 | 232 | ...or you can use the CodeLLDB extension (Command Palette: *LLDB: Attach to process*) 233 | to launch a VSCode interactive debugger launched in a terminal. 234 | -------------------------------------------------------------------------------- /benchmark/lineitem.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import duckdb 3 | import time 4 | import statistics 5 | import sys 6 | from decimal import Decimal 7 | import concurrent.futures 8 | 9 | 10 | def measure_execution_time(con, query, result = None, lineitem_arrow = None): 11 | times = [] 12 | for _ in range(5): 13 | start = time.perf_counter() 14 | res = con.execute(query) 15 | end = time.perf_counter() 16 | times.append(end - start) 17 | if result: 18 | assert res.fetchall() == result 19 | 20 | return statistics.median(times) 21 | 22 | def get_queries(table_name): 23 | return [ 24 | f"""SELECT 25 | sum(l_extendedprice * l_discount) AS revenue 26 | FROM 27 | {table_name} 28 | WHERE 29 | l_shipdate >= CAST('1994-01-01' AS date) 30 | AND l_shipdate < CAST('1995-01-01' AS date) 31 | AND l_discount BETWEEN 0.05 32 | AND 0.07 33 | AND l_quantity < 24;""" 34 | ] 35 | 36 | def run_duckdb_native(con): 37 | # Lets see how long it takes to run the queries in DuckDB 38 | print ("Read DuckDB - Native") 39 | queries = get_queries("lineitem") 40 | for query in queries: 41 | print(measure_execution_time(con,query,[(Decimal('123141078.2283'),)])) 42 | 43 | 44 | def run_duckdb_arrow_array_stream(con): 45 | # Lets see how long it takes to run the queries in DuckDB 46 | print ("Generate PyArrow - Arrow Stream") 47 | times = [] 48 | for _ in range(5): 49 | start = time.perf_counter() 50 | results = con.execute("FROM lineitem").fetch_record_batch() 51 | while True: 52 | try: 53 | # Process chunks 54 | results.read_next_batch() 55 | except StopIteration: 56 | break 57 | end = time.perf_counter() 58 | times.append(end - start) 59 | print(statistics.median(times)) 60 | 61 | print ("Read PyArrow - Arrow Stream") 62 | queries = get_queries("record_batch_reader") 63 | arrow_table = con.execute("FROM lineitem").arrow() 64 | batches = arrow_table.to_batches(2048*120) 65 | for query in queries: 66 | times = [] 67 | for _ in range(5): 68 | record_batch_reader = pa.RecordBatchReader.from_batches(arrow_table.schema, batches) 69 | start = time.perf_counter() 70 | res = con.execute(query) 71 | end = time.perf_counter() 72 | times.append(end - start) 73 | assert res.fetchall() == [(Decimal('123141078.2283'),)] 74 | print(statistics.median(times)) 75 | 76 | def run_arrow_ipc(con): 77 | print ("Read IPC Buffers") 78 | 79 | queries = get_queries("lineitem_arrow") 80 | times = [] 81 | for _ in range(5): 82 | batches = con.execute("FROM 
lineitem").arrow().to_batches(2048*120) 83 | sink = pa.BufferOutputStream() 84 | 85 | with pa.ipc.new_stream(sink, batches[0].schema) as writer: 86 | for batch in batches: 87 | writer.write_batch(batch) 88 | 89 | buffer = sink.getvalue() 90 | with pa.BufferReader(buffer) as buf_reader: 91 | for query in queries: 92 | msg_reader = pa.ipc.MessageReader.open_stream(buf_reader) 93 | start = time.perf_counter() 94 | lineitem_arrow = con.from_arrow(msg_reader) 95 | res = con.execute(query) 96 | end = time.perf_counter() 97 | times.append(end - start) 98 | assert res.fetchall() == [(Decimal('123141078.2283'),)] 99 | print(statistics.median(times)) 100 | 101 | print("Generate IPC Buffers") 102 | print(measure_execution_time(con,"FROM to_arrow_ipc((FROM lineitem))")) 103 | 104 | def run_pyarrow(con): 105 | # Lets see how long it takes to run the queries in DuckDB 106 | print ("DuckDB - Native") 107 | queries = get_queries("lineitem") 108 | for query in queries: 109 | print(query) 110 | print(measure_execution_time(con,query)) 111 | 112 | def run_parquet(con): 113 | print("Generate Parquet") 114 | print(measure_execution_time(con, "COPY lineitem TO 'lineitem.parquet'")) 115 | 116 | queries = get_queries("lineitem.parquet") 117 | print("Read From Parquet") 118 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 119 | 120 | def run_arrow_file(con): 121 | print("Generate ArrowIPC File") 122 | print(measure_execution_time(con, "COPY lineitem TO 'lineitem.arrows' (FORMAT arrows)")) 123 | 124 | queries = get_queries("lineitem.arrows") 125 | print("Read From ArrowIPC File") 126 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 127 | 128 | # Use Arrow IPC to generate an ipc file 129 | table = con.execute("FROM lineitem").arrow() 130 | 131 | # Write the table to an IPC file (Arrow file format) 132 | print("Generate ArrowIPC File Pure") 133 | times = [] 134 | for _ in range(5): 135 | options = pa.ipc.IpcWriteOptions(compression = 'zstd') 136 | with open("lineitem_ipc.arrow", "wb") as f: 137 | start = time.perf_counter() 138 | writer = pa.ipc.RecordBatchFileWriter(f, table.schema, options=options) 139 | writer.write_table(table) 140 | writer.close() 141 | end = time.perf_counter() 142 | times.append(end - start) 143 | print(statistics.median(times)) 144 | queries = get_queries("lineitem_ipc.arrow") 145 | print("Read From ArrowIPC - Pure File") 146 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 147 | 148 | 149 | def create_con(path,sf): 150 | con = duckdb.connect(config={"allow_unsigned_extensions":"true"}) 151 | con.execute(f"load '{path}'") 152 | con.execute(f"CALL dbgen(sf={sf});") 153 | return con 154 | 155 | if len(sys.argv) < 2: 156 | print("Usage: lineitem.py ") 157 | sys.exit(1) 158 | 159 | path = sys.argv[1] 160 | 161 | def run_buffer_benchmark(): 162 | con = create_con(path,1) 163 | run_duckdb_native(con) 164 | run_duckdb_arrow_array_stream(con) 165 | run_arrow_ipc(con) 166 | 167 | def run_file_benchmark(): 168 | con = create_con(path,1) 169 | run_parquet(con) 170 | run_arrow_file(con) 171 | 172 | if __name__ == "__main__": 173 | run_file_benchmark() 174 | -------------------------------------------------------------------------------- /data/fruit.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/fruit.arrow 
-------------------------------------------------------------------------------- /data/multifile/different_order.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_order.arrows -------------------------------------------------------------------------------- /data/multifile/different_type.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type.arrows -------------------------------------------------------------------------------- /data/multifile/different_type_int.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type_int.arrows -------------------------------------------------------------------------------- /data/multifile/different_type_order.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type_order.arrows -------------------------------------------------------------------------------- /data/multifile/fruit_extra.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/fruit_extra.arrows -------------------------------------------------------------------------------- /data/multifile/glob/f1.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f1.arrow -------------------------------------------------------------------------------- /data/multifile/glob/f2.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f2.arrow -------------------------------------------------------------------------------- /data/multifile/glob/f3.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f3.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=a/f1.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=a/f1.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=a/f2.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=a/f2.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=b/f1.arrow: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=b/f1.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=b/f3.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=b/f3.arrow -------------------------------------------------------------------------------- /data/parquet-testing/lineitem_sf0_01.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/parquet-testing/lineitem_sf0_01.parquet -------------------------------------------------------------------------------- /data/test.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/test.arrow -------------------------------------------------------------------------------- /data/test.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/test.arrows -------------------------------------------------------------------------------- /docs/UPDATING.md: -------------------------------------------------------------------------------- 1 | # Extension updating 2 | When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there 3 | will inevitably come a time when a new DuckDB is released and the extension repository needs updating. This process goes 4 | as follows: 5 | 6 | - Bump submodules 7 | - `./duckdb` should be set to the latest tagged release 8 | - `./extension-ci-tools` should be set to the branch corresponding to the latest DuckDB release. So if you're building for DuckDB `v1.1.0`, there will be a branch in `extension-ci-tools` named `v1.1.0`, which you should check out. 9 | - Bump versions in `./.github/workflows` 10 | - the `duckdb_version` input of the `duckdb-stable-build` job in `MainDistributionPipeline.yml` should be set to the latest tagged release 11 | - the `duckdb_version` input of the `duckdb-stable-deploy` job in `MainDistributionPipeline.yml` should be set to the latest tagged release 12 | - the reusable workflow `duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml` for the `duckdb-stable-build` job should be set to the latest tagged release 13 | 14 | # API changes 15 | DuckDB extensions built with this extension template are built against the internal C++ API of DuckDB. This API is not guaranteed to be stable. 16 | What this means for extension development is that when you update your extension's DuckDB target version using the steps above, your extension may no longer build properly. 17 | 18 | Currently, DuckDB does not (yet) provide a specific change log for these API changes, but it is generally not too hard to figure out what has changed.
19 | 20 | For figuring out how and why the C++ API changed, we recommend using the following resources: 21 | - DuckDB's [Release Notes](https://github.com/duckdb/duckdb/releases) 22 | - DuckDB's history of [Core extension patches](https://github.com/duckdb/duckdb/commits/main/.github/patches/extensions) 23 | - The git history of the relevant C++ Header file of the API that has changed 24 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(nanoarrow 5 | SOURCE_DIR 6 | ${CMAKE_CURRENT_LIST_DIR} 7 | LOAD_TESTS 8 | LINKED_LIBS 9 | "../../_deps/nanoarrow-build/lib*.a") 10 | 11 | # Any extra extensions that should be built 12 | # e.g.: duckdb_extension_load(json) 13 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.." 
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /scripts/setup-custom-toolchain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script 4 | # if no additional toolchains are required 5 | 6 | # To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow 7 | # `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools` 8 | 9 | # note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms 10 | echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the nanoarrow extension." 11 | -------------------------------------------------------------------------------- /src/file_scanner/arrow_file_scan.cpp: -------------------------------------------------------------------------------- 1 | #include "file_scanner/arrow_file_scan.hpp" 2 | 3 | #include "file_scanner/arrow_multi_file_info.hpp" 4 | 5 | namespace duckdb { 6 | namespace ext_nanoarrow { 7 | struct ArrowFileLocalState; 8 | 9 | ArrowFileScan::ArrowFileScan(ClientContext& context, const string& file_name) 10 | : BaseFileReader(file_name) { 11 | factory = make_uniq(context, file_name); 12 | 13 | factory->InitReader(); 14 | factory->GetFileSchema(schema_root); 15 | DBConfig& config = DatabaseInstance::GetDatabase(context).config; 16 | ArrowTableFunction::PopulateArrowTableType(config, arrow_table_type, schema_root, names, 17 | types); 18 | QueryResult::DeduplicateColumns(names); 19 | if (types.empty()) { 20 | throw InvalidInputException("Provided table/dataframe must have at least one column"); 21 | } 22 | columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, types); 23 | } 24 | 25 | string ArrowFileScan::GetReaderType() const { return "ARROW"; } 26 | 27 | const vector& ArrowFileScan::GetNames() { return names; } 28 | const vector& ArrowFileScan::GetTypes() { return types; } 29 | 30 | bool ArrowFileScan::TryInitializeScan(ClientContext& context, 31 | GlobalTableFunctionState& gstate_p, 32 | LocalTableFunctionState& lstate_p) { 33 | auto& gstate = gstate_p.Cast(); 34 | auto& lstate = lstate_p.Cast(); 35 | if (gstate.files.find(file_list_idx.GetIndex()) != gstate.files.end()) { 36 | // Return false because we don't currently support more than one thread 37 | // scanning a file. In the future we may be able to support this by (e.g.) 38 | // reading the Arrow file footer or sending a thread to read ahead to scan 39 | // for RecordBatch messages. 
40 | return false; 41 | } 42 | gstate.files.insert(file_list_idx.GetIndex()); 43 | 44 | // lstate.file_scan = shared_ptr_cast(this); 45 | lstate.local_arrow_function_data = make_uniq( 46 | &FileIPCStreamFactory::Produce, reinterpret_cast(factory.get())); 47 | lstate.local_arrow_function_data->schema_root = schema_root; 48 | lstate.local_arrow_function_data->arrow_table = arrow_table_type; 49 | if (!column_indexes.empty()) { 50 | lstate.init_input = make_uniq( 51 | *lstate.local_arrow_function_data, column_indexes, 52 | gstate.global_state.projection_ids, filters); 53 | } else { 54 | lstate.init_input = make_uniq( 55 | *lstate.local_arrow_function_data, gstate.global_state.column_indexes, 56 | gstate.global_state.projection_ids, filters); 57 | } 58 | lstate.local_arrow_global_state = 59 | ArrowTableFunction::ArrowScanInitGlobal(context, *lstate.init_input); 60 | lstate.local_arrow_local_state = 61 | ArrowTableFunction::ArrowScanInitLocal(lstate.execution_context, *lstate.init_input, 62 | lstate.local_arrow_global_state.get()); 63 | lstate.table_function_input = make_uniq( 64 | lstate.local_arrow_function_data.get(), lstate.local_arrow_local_state.get(), 65 | lstate.local_arrow_global_state.get()); 66 | return true; 67 | } 68 | void ArrowFileScan::Scan(ClientContext& context, GlobalTableFunctionState& global_state, 69 | LocalTableFunctionState& local_state, DataChunk& chunk) { 70 | auto& lstate = local_state.Cast(); 71 | ArrowTableFunction::ArrowScanFunction(context, *lstate.table_function_input, chunk); 72 | } 73 | 74 | shared_ptr ArrowFileScan::GetUnionData(idx_t file_idx) { 75 | auto data = make_shared_ptr(GetFileName()); 76 | data->names = GetNames(); 77 | data->types = GetTypes(); 78 | return data; 79 | } 80 | 81 | } // namespace ext_nanoarrow 82 | } // namespace duckdb 83 | -------------------------------------------------------------------------------- /src/file_scanner/arrow_multi_file_info.cpp: -------------------------------------------------------------------------------- 1 | #include "file_scanner/arrow_multi_file_info.hpp" 2 | 3 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 4 | 5 | #include "duckdb/common/bind_helpers.hpp" 6 | #include "file_scanner/arrow_file_scan.hpp" 7 | #include "ipc/stream_factory.hpp" 8 | 9 | namespace duckdb { 10 | namespace ext_nanoarrow { 11 | 12 | unique_ptr ArrowMultiFileInfo::InitializeOptions( 13 | ClientContext& context, optional_ptr info) { 14 | return make_uniq(); 15 | } 16 | 17 | bool ArrowMultiFileInfo::ParseCopyOption(ClientContext& context, const string& key, 18 | const vector& values, 19 | BaseFileReaderOptions& options_p, 20 | vector& expected_names, 21 | vector& expected_types) { 22 | // We currently do not have any options for the scanner, so we always return false 23 | return false; 24 | } 25 | 26 | unique_ptr ArrowMultiFileInfo::InitializeInterface( 27 | ClientContext& context, MultiFileReader& reader, MultiFileList& file_list) { 28 | return make_uniq(); 29 | } 30 | 31 | bool ArrowMultiFileInfo::ParseOption(ClientContext& context, const string& key, 32 | const Value& val, MultiFileOptions& file_options, 33 | BaseFileReaderOptions& options) { 34 | // We currently do not have any options for the scanner, so we always return false 35 | return false; 36 | } 37 | 38 | void ArrowMultiFileInfo::FinalizeCopyBind(ClientContext& context, 39 | BaseFileReaderOptions& options_p, 40 | const vector& expected_names, 41 | const vector& expected_types) {} 42 | 43 | struct ArrowMultiFileData final : public TableFunctionData { 44 | 
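// Table function data used by the Arrow multi-file scan; it owns the ArrowFileScan for the file currently being read.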
ArrowMultiFileData() = default; 45 | 46 | unique_ptr file_scan; 47 | }; 48 | 49 | unique_ptr ArrowMultiFileInfo::InitializeBindData( 50 | MultiFileBindData& multi_file_data, unique_ptr options_p) { 51 | return make_uniq(); 52 | } 53 | 54 | void ArrowMultiFileInfo::BindReader(ClientContext& context, 55 | vector& return_types, 56 | vector& names, MultiFileBindData& bind_data) { 57 | ArrowFileReaderOptions options; 58 | auto& multi_file_list = *bind_data.file_list; 59 | if (!bind_data.file_options.union_by_name) { 60 | bind_data.reader_bind = bind_data.multi_file_reader->BindReader( 61 | context, return_types, names, *bind_data.file_list, bind_data, options, 62 | bind_data.file_options); 63 | 64 | } else { 65 | bind_data.reader_bind = bind_data.multi_file_reader->BindUnionReader( 66 | context, return_types, names, multi_file_list, bind_data, options, 67 | bind_data.file_options); 68 | } 69 | D_ASSERT(names.size() == return_types.size()); 70 | } 71 | 72 | void ArrowMultiFileInfo::FinalizeBindData(MultiFileBindData& multi_file_data) {} 73 | 74 | void ArrowMultiFileInfo::GetBindInfo(const TableFunctionData& bind_data, BindInfo& info) { 75 | } 76 | 77 | optional_idx ArrowMultiFileInfo::MaxThreads(const MultiFileBindData& bind_data_p, 78 | const MultiFileGlobalState& global_state, 79 | FileExpandResult expand_result) { 80 | if (expand_result == FileExpandResult::MULTIPLE_FILES) { 81 | // always launch max threads if we are reading multiple files 82 | return {}; 83 | } 84 | // Otherwise, only one thread 85 | return 1; 86 | } 87 | 88 | unique_ptr ArrowMultiFileInfo::InitializeGlobalState( 89 | ClientContext& context, MultiFileBindData& bind_data, 90 | MultiFileGlobalState& global_state) { 91 | return make_uniq( 92 | context, bind_data.file_list->GetTotalFileCount(), bind_data, global_state); 93 | } 94 | 95 | unique_ptr ArrowMultiFileInfo::InitializeLocalState( 96 | ExecutionContext& context, GlobalTableFunctionState& function_state) { 97 | return make_uniq(context); 98 | } 99 | 100 | shared_ptr ArrowMultiFileInfo::CreateReader( 101 | ClientContext& context, GlobalTableFunctionState& gstate_p, BaseUnionData& union_data, 102 | const MultiFileBindData& bind_data) { 103 | return make_shared_ptr(context, union_data.GetFileName()); 104 | } 105 | 106 | shared_ptr ArrowMultiFileInfo::CreateReader( 107 | ClientContext& context, GlobalTableFunctionState& gstate_p, 108 | const OpenFileInfo& file_info, idx_t file_idx, const MultiFileBindData& bind_data) { 109 | return make_shared_ptr(context, file_info.path); 110 | } 111 | 112 | shared_ptr ArrowMultiFileInfo::CreateReader( 113 | ClientContext& context, const OpenFileInfo& file, BaseFileReaderOptions& options, 114 | const MultiFileOptions& file_options) { 115 | return make_shared_ptr(context, file.path); 116 | } 117 | 118 | void ArrowMultiFileInfo::FinalizeReader(ClientContext& context, BaseFileReader& reader, 119 | GlobalTableFunctionState&) {} 120 | 121 | void ArrowMultiFileInfo::FinishFile(ClientContext& context, 122 | GlobalTableFunctionState& global_state, 123 | BaseFileReader& reader) {} 124 | 125 | void ArrowMultiFileInfo::FinishReading(ClientContext& context, 126 | GlobalTableFunctionState& global_state, 127 | LocalTableFunctionState& local_state) {} 128 | 129 | unique_ptr ArrowMultiFileInfo::GetCardinality( 130 | const MultiFileBindData& bind_data, idx_t file_count) { 131 | // TODO: Here is where we might set statistics, for optimizations if we have them 132 | // e.g., cardinality from the file footer 133 | return make_uniq(); 134 | } 135 | 136 | 
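// Returning nullptr below means no per-column statistics are exposed to the
// optimizer; combined with the empty NodeStatistics above, Arrow IPC scans
// currently run without cardinality- or statistics-based optimizations.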
unique_ptr ArrowMultiFileInfo::GetStatistics(ClientContext& context, 137 | BaseFileReader& reader, 138 | const string& name) { 139 | return nullptr; 140 | } 141 | 142 | double ArrowMultiFileInfo::GetProgressInFile(ClientContext& context, 143 | const BaseFileReader& reader) { 144 | auto& file_scan = reader.Cast(); 145 | if (!file_scan.factory->reader) { 146 | // We are done with this file 147 | return 100; 148 | } 149 | auto file_reader = 150 | reinterpret_cast(file_scan.factory->reader.get()); 151 | return file_reader->GetProgress(); 152 | } 153 | 154 | void ArrowMultiFileInfo::GetVirtualColumns(ClientContext&, MultiFileBindData&, 155 | virtual_column_map_t& result) { 156 | if (result.find(COLUMN_IDENTIFIER_EMPTY) != result.end()) { 157 | result.erase(COLUMN_IDENTIFIER_EMPTY); 158 | } 159 | } 160 | 161 | } // namespace ext_nanoarrow 162 | } // namespace duckdb 163 | -------------------------------------------------------------------------------- /src/include/file_scanner/arrow_file_scan.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // file_scanner/arrow_file_scan.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_factory.hpp" 12 | 13 | #include "duckdb/common/multi_file/base_file_reader.hpp" 14 | 15 | namespace duckdb { 16 | namespace ext_nanoarrow { 17 | 18 | //! This class refers to an Arrow File Scan 19 | class ArrowFileScan : public BaseFileReader { 20 | public: 21 | explicit ArrowFileScan(ClientContext& context, const string& file_name); 22 | ~ArrowFileScan() override { 23 | // Release is done by the arrow scanner 24 | schema_root.arrow_schema.release = nullptr; 25 | }; 26 | 27 | //! Factory of this stream 28 | unique_ptr factory; 29 | 30 | string GetReaderType() const override; 31 | 32 | const vector& GetNames(); 33 | const vector& GetTypes(); 34 | ArrowSchemaWrapper schema_root; 35 | ArrowTableType arrow_table_type; 36 | 37 | bool TryInitializeScan(ClientContext& context, GlobalTableFunctionState& gstate, 38 | LocalTableFunctionState& lstate) override; 39 | void Scan(ClientContext& context, GlobalTableFunctionState& global_state, 40 | LocalTableFunctionState& local_state, DataChunk& chunk) override; 41 | 42 | shared_ptr GetUnionData(idx_t file_idx) override; 43 | 44 | private: 45 | vector names; 46 | vector types; 47 | }; 48 | } // namespace ext_nanoarrow 49 | } // namespace duckdb 50 | -------------------------------------------------------------------------------- /src/include/file_scanner/arrow_multi_file_info.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // file_scanner/arrow_multi_file_info.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/common/multi_file/multi_file_function.hpp" 12 | #include "duckdb/function/table/arrow.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | 17 | //! We might have arrow specific options one day 18 | class ArrowFileReaderOptions : public BaseFileReaderOptions {}; 19 | 20 | class ArrowFileScan; 21 | 22 | //! The Arrow Local File State, basically refers to the Scan of one Arrow File 23 | //! 
This is done by calling the Arrow Scan directly on one file. 24 | struct ArrowFileLocalState : public LocalTableFunctionState { 25 | public: 26 | explicit ArrowFileLocalState(ExecutionContext& execution_context) 27 | : execution_context(execution_context) {}; 28 | //! Factory Pointer 29 | shared_ptr file_scan; 30 | 31 | ExecutionContext& execution_context; 32 | 33 | //! Each local state refers to an Arrow Scan on a local file 34 | unique_ptr local_arrow_function_data; 35 | unique_ptr init_input; 36 | unique_ptr local_arrow_global_state; 37 | unique_ptr local_arrow_local_state; 38 | unique_ptr table_function_input; 39 | }; 40 | 41 | struct ArrowFileGlobalState : public GlobalTableFunctionState { 42 | public: 43 | ArrowFileGlobalState(ClientContext& context_p, idx_t total_file_count, 44 | const MultiFileBindData& bind_data, 45 | MultiFileGlobalState& global_state) 46 | : global_state(global_state), context(context_p) {}; 47 | 48 | ~ArrowFileGlobalState() override = default; 49 | 50 | const MultiFileGlobalState& global_state; 51 | ClientContext& context; 52 | set files; 53 | }; 54 | 55 | struct ArrowMultiFileInfo : MultiFileReaderInterface { 56 | unique_ptr InitializeOptions( 57 | ClientContext& context, optional_ptr info) override; 58 | 59 | static unique_ptr InitializeInterface( 60 | ClientContext& context, MultiFileReader& reader, MultiFileList& file_list); 61 | 62 | bool ParseCopyOption(ClientContext& context, const string& key, 63 | const vector& values, BaseFileReaderOptions& options, 64 | vector& expected_names, 65 | vector& expected_types) override; 66 | 67 | bool ParseOption(ClientContext& context, const string& key, const Value& val, 68 | MultiFileOptions& file_options, 69 | BaseFileReaderOptions& options) override; 70 | 71 | void FinalizeCopyBind(ClientContext& context, BaseFileReaderOptions& options, 72 | const vector& expected_names, 73 | const vector& expected_types) override; 74 | 75 | unique_ptr InitializeBindData( 76 | MultiFileBindData& multi_file_data, 77 | unique_ptr options) override; 78 | 79 | //! This is where the actual binding must happen, so in this function we either: 80 | //! 1. union_by_name = False. We set the schema/name depending on the first file 81 | //! 2. union_by_name = True. 
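  //!    Each file is bound on its own and the schemas are unified by name
  //!    (see the BindUnionReader call in arrow_multi_file_info.cpp).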
82 | void BindReader(ClientContext& context, vector& return_types, 83 | vector& names, MultiFileBindData& bind_data) override; 84 | 85 | void FinalizeBindData(MultiFileBindData& multi_file_data) override; 86 | 87 | void GetBindInfo(const TableFunctionData& bind_data, BindInfo& info) override; 88 | 89 | optional_idx MaxThreads(const MultiFileBindData& bind_data_p, 90 | const MultiFileGlobalState& global_state, 91 | FileExpandResult expand_result) override; 92 | 93 | unique_ptr InitializeGlobalState( 94 | ClientContext& context, MultiFileBindData& bind_data, 95 | MultiFileGlobalState& global_state) override; 96 | 97 | unique_ptr InitializeLocalState( 98 | ExecutionContext& context, GlobalTableFunctionState& function_state) override; 99 | 100 | shared_ptr CreateReader(ClientContext& context, 101 | GlobalTableFunctionState& gstate, 102 | BaseUnionData& union_data, 103 | const MultiFileBindData& bind_data_p) override; 104 | 105 | shared_ptr CreateReader(ClientContext& context, 106 | GlobalTableFunctionState& gstate, 107 | const OpenFileInfo& file_info, idx_t file_idx, 108 | const MultiFileBindData& bind_data) override; 109 | 110 | shared_ptr CreateReader(ClientContext& context, 111 | const OpenFileInfo& file, 112 | BaseFileReaderOptions& options, 113 | const MultiFileOptions& file_options) override; 114 | 115 | static void FinalizeReader(ClientContext& context, BaseFileReader& reader, 116 | GlobalTableFunctionState&); 117 | 118 | static void FinishFile(ClientContext& context, GlobalTableFunctionState& global_state, 119 | BaseFileReader& reader); 120 | 121 | void FinishReading(ClientContext& context, GlobalTableFunctionState& global_state, 122 | LocalTableFunctionState& local_state) override; 123 | 124 | unique_ptr GetCardinality(const MultiFileBindData& bind_data, 125 | idx_t file_count) override; 126 | 127 | static unique_ptr GetStatistics(ClientContext& context, 128 | BaseFileReader& reader, 129 | const string& name); 130 | 131 | static double GetProgressInFile(ClientContext& context, const BaseFileReader& reader); 132 | 133 | void GetVirtualColumns(ClientContext& context, MultiFileBindData& bind_data, 134 | virtual_column_map_t& result) override; 135 | }; 136 | 137 | } // namespace ext_nanoarrow 138 | } // namespace duckdb 139 | -------------------------------------------------------------------------------- /src/include/ipc/array_stream.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/array_stream.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | class IpcArrayStream { 16 | public: 17 | explicit IpcArrayStream(unique_ptr reader); 18 | 19 | IPCStreamReader& Reader() const; 20 | 21 | void ToArrayStream(ArrowArrayStream* stream); 22 | 23 | int GetSchema(ArrowSchema* schema); 24 | 25 | int GetNext(ArrowArray* array); 26 | 27 | const char* GetLastError() const; 28 | 29 | template 30 | int Wrap(Func&& func) { 31 | try { 32 | func(); 33 | return NANOARROW_OK; 34 | } catch (IOException& e) { 35 | last_msg = std::string("IOException: ") + e.what(); 36 | return EIO; 37 | } catch (InternalException& e) { 38 | last_msg = std::string("InternalException: ") + e.what(); 39 | return EINVAL; 40 | } catch (nanoarrow::Exception& e) { 41 | last_msg = 
std::string("nanoarrow::Exception: ") + e.what(); 42 | // Could probably find a way to pass on this code, usually ENOMEM 43 | return ENOMEM; 44 | } catch (std::exception& e) { 45 | last_msg = e.what(); 46 | return EINVAL; 47 | } 48 | } 49 | 50 | private: 51 | unique_ptr reader; 52 | string last_msg; 53 | }; 54 | } // namespace ext_nanoarrow 55 | } // namespace duckdb 56 | -------------------------------------------------------------------------------- /src/include/ipc/stream_factory.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/ipc_stream_factory.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/array_stream.hpp" 12 | 13 | #include "duckdb/common/arrow/arrow_wrapper.hpp" 14 | #include "duckdb/function/table/arrow.hpp" 15 | #include "table_function/scan_arrow_ipc.hpp" 16 | 17 | namespace duckdb { 18 | namespace ext_nanoarrow { 19 | 20 | class ArrowStreamFactory { 21 | ArrowStreamFactory() {}; 22 | }; 23 | //! This Factory is a type invented by DuckDB. Notably, the Produce() 24 | //! function pointer is passed to the constructor of the ArrowScanFunctionData 25 | //! constructor (which we wrap). 26 | class ArrowIPCStreamFactory { 27 | public: 28 | virtual ~ArrowIPCStreamFactory() = default; 29 | explicit ArrowIPCStreamFactory(Allocator& allocator); 30 | 31 | //! Called once when initializing Scan States 32 | static unique_ptr Produce(uintptr_t factory_ptr, 33 | ArrowStreamParameters& parameters); 34 | 35 | //! Get the schema of the arrow object 36 | void GetFileSchema(ArrowSchemaWrapper& schema) const; 37 | 38 | //! Opens the file, wraps it in the ArrowIpcInputStream, and wraps it in 39 | //! the ArrowArrayStream reader. 
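  //!
  //! A typical lifetime, as driven by ArrowFileScan (minimal sketch; error
  //! handling and projection pushdown omitted):
  //!
  //!   FileIPCStreamFactory factory(context, "data/test.arrows");
  //!   factory.InitReader();            // open the file and build the reader
  //!   ArrowSchemaWrapper schema;
  //!   factory.GetFileSchema(schema);   // deep copy of the stream schema
  //!   // Later, the Arrow scan calls Produce((uintptr_t)&factory, params),
  //!   // which moves `reader` into an ArrowArrayStream wrapper.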
40 | virtual void InitReader() { 41 | throw NotImplementedException("ArrowIPCStreamFactory::InitReader not implemented"); 42 | } 43 | 44 | Allocator& allocator; 45 | unique_ptr reader; 46 | ArrowError error{}; 47 | }; 48 | 49 | class BufferIPCStreamFactory final : public ArrowIPCStreamFactory { 50 | public: 51 | explicit BufferIPCStreamFactory(ClientContext& context, 52 | const vector& buffers); 53 | void InitReader() override; 54 | 55 | vector buffers; 56 | }; 57 | 58 | class FileIPCStreamFactory final : public ArrowIPCStreamFactory { 59 | public: 60 | explicit FileIPCStreamFactory(ClientContext& context, string src_string); 61 | void InitReader() override; 62 | 63 | FileSystem& fs; 64 | string src_string; 65 | }; 66 | } // namespace ext_nanoarrow 67 | } // namespace duckdb 68 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/base_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/base_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "nanoarrow/nanoarrow.hpp" 12 | #include "nanoarrow/nanoarrow_ipc.hpp" 13 | 14 | #include "duckdb/common/allocator.hpp" 15 | #include "duckdb/common/file_system.hpp" 16 | #include "duckdb/common/radix.hpp" 17 | #include "duckdb/common/serializer/buffered_file_reader.hpp" 18 | #include "nanoarrow_errors.hpp" 19 | 20 | #include "table_function/scan_arrow_ipc.hpp" 21 | 22 | namespace duckdb { 23 | namespace ext_nanoarrow { 24 | 25 | //! Missing in nanoarrow_ipc.hpp 26 | struct UniqueSharedBuffer { 27 | struct ArrowIpcSharedBuffer data{}; 28 | 29 | ~UniqueSharedBuffer() { 30 | if (data.private_src.allocator.free != nullptr) { 31 | ArrowIpcSharedBufferReset(&data); 32 | } 33 | } 34 | }; 35 | 36 | struct ArrowIpcMessagePrefix { 37 | uint32_t continuation_token; 38 | int32_t metadata_size; 39 | }; 40 | 41 | //! Base IPC Reader 42 | class IPCStreamReader { 43 | public: 44 | virtual ~IPCStreamReader() = default; 45 | explicit IPCStreamReader(Allocator& allocator) 46 | : decoder(NewDuckDBArrowDecoder()), allocator(allocator) {}; 47 | //! Gets the output schema, which is the file schema with projection pushdown being 48 | //! considered 49 | const ArrowSchema* GetOutputSchema(); 50 | //! Gets the next batch 51 | bool GetNextBatch(ArrowArray* out); 52 | //! Gets the unique buffer to get the next batch 53 | virtual nanoarrow::UniqueBuffer GetUniqueBuffer() { 54 | throw InternalException("IPCStreamReader::GetUniqueBuffer not implemented"); 55 | }; 56 | 57 | //! Sets the projection pushdown for this reader 58 | void SetColumnProjection(const vector& column_names); 59 | //! Gets the base schema with no projection pushdown 60 | const ArrowSchema* GetBaseSchema(); 61 | 62 | ArrowIpcMessageType ReadNextMessage(vector expected_types, 63 | bool end_of_stream_ok = true); 64 | virtual ArrowIpcMessageType ReadNextMessage() { 65 | throw InternalException("IPCStreamReader::ReadNextMessage not implemented"); 66 | } 67 | 68 | protected: 69 | virtual data_ptr_t ReadData(data_ptr_t ptr, idx_t size) { 70 | throw InternalException("IPCStreamReader::ReadData not implemented"); 71 | } 72 | //! Decode Message is composed of 3 steps 73 | ArrowIpcMessageType DecodeMessage(); 74 | //! 1. 
We decode the message metadata, and return the message_header_size 75 | idx_t DecodeMetadata() const; 76 | //! 2. We decode the message head, if message is finished we return true 77 | virtual bool DecodeHeader(idx_t message_header_size) { 78 | throw InternalException("IPCStreamReader::DecodeHead not implemented"); 79 | } 80 | //! 3. We decode the message body 81 | virtual void DecodeBody() { 82 | throw InternalException("IPCStreamReader::DecodeBody not implemented"); 83 | } 84 | 85 | bool HasProjection() const; 86 | static nanoarrow::ipc::UniqueDecoder NewDuckDBArrowDecoder(); 87 | 88 | static ArrowBufferView AllocatedDataView(const_data_ptr_t data, int64_t size); 89 | static nanoarrow::UniqueBuffer AllocatedDataToOwningBuffer( 90 | const shared_ptr& data); 91 | 92 | static const char* MessageTypeString(ArrowIpcMessageType message_type); 93 | 94 | static int64_t CountFields(const ArrowSchema* schema); 95 | 96 | ArrowError error{}; 97 | nanoarrow::ipc::UniqueDecoder decoder{}; 98 | vector projected_fields; 99 | nanoarrow::UniqueSchema projected_schema; 100 | //! Schema without projection applied to it 101 | nanoarrow::UniqueSchema base_schema; 102 | 103 | //! Information on current buffer 104 | data_ptr_t cur_ptr{}; 105 | int64_t cur_size{}; 106 | 107 | //! Allocator used to allocate buffers with decoded arrow information 108 | Allocator& allocator; 109 | 110 | bool finished{false}; 111 | 112 | ArrowIpcMessagePrefix message_prefix{}; 113 | static constexpr uint32_t kContinuationToken = 0xFFFFFFFF; 114 | }; 115 | 116 | } // namespace ext_nanoarrow 117 | } // namespace duckdb 118 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/ipc_buffer_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/ipc_buffer_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | struct IPCBuffer { 17 | idx_t pos = 0; 18 | data_ptr_t ptr = nullptr; 19 | int64_t size = 0; 20 | }; 21 | 22 | //! 
Buffer Stream 23 | class IPCBufferStreamReader final : public IPCStreamReader { 24 | public: 25 | IPCBufferStreamReader(vector buffers, Allocator& allocator); 26 | 27 | ArrowIpcMessageType ReadNextMessage() override; 28 | 29 | private: 30 | data_ptr_t ReadData(data_ptr_t ptr, idx_t size) override; 31 | bool DecodeHeader(idx_t message_header_size) override; 32 | void DecodeBody() override; 33 | nanoarrow::UniqueBuffer GetUniqueBuffer() override; 34 | vector buffers; 35 | idx_t cur_idx = 0; 36 | IPCBuffer header; 37 | IPCBuffer body; 38 | IPCBuffer cur_buffer; 39 | bool initialized = false; 40 | }; 41 | 42 | } // namespace ext_nanoarrow 43 | } // namespace duckdb 44 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/ipc_file_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/ipc_file_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | //! IPC File 17 | class IPCFileStreamReader final : public IPCStreamReader { 18 | public: 19 | IPCFileStreamReader(FileSystem& fs, unique_ptr handle, 20 | Allocator& allocator); 21 | 22 | ArrowIpcMessageType ReadNextMessage() override; 23 | 24 | double GetProgress(); 25 | 26 | private: 27 | BufferedFileReader file_reader; 28 | AllocatedData message_header; 29 | shared_ptr message_body; 30 | 31 | void EnsureInputStreamAligned(); 32 | 33 | data_ptr_t ReadData(data_ptr_t ptr, idx_t size) override; 34 | static void DecodeArray(nanoarrow::ipc::UniqueDecoder& decoder, ArrowArray* out, 35 | ArrowBufferView& body_view, ArrowError* error); 36 | bool DecodeHeader(idx_t message_header_size) override; 37 | void DecodeBody() override; 38 | nanoarrow::UniqueBuffer GetUniqueBuffer() override; 39 | void PopulateNames(vector& names); 40 | }; 41 | 42 | } // namespace ext_nanoarrow 43 | } // namespace duckdb 44 | -------------------------------------------------------------------------------- /src/include/nanoarrow_errors.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // nanoarrow_errors.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #define _DUCKDB_NANOARROW_THROW_NOT_OK_IMPL(NAME, ExceptionCls, ERROR_PTR, EXPR, \ 11 | EXPR_STR) \ 12 | do { \ 13 | const int NAME = (EXPR); \ 14 | if (NAME) { \ 15 | throw ExceptionCls(std::string(EXPR_STR) + std::string(" failed with errno ") + \ 16 | std::to_string(NAME) + std::string(": ") + \ 17 | std::string((ERROR_PTR)->message)); \ 18 | } \ 19 | } while (0) 20 | 21 | #define THROW_NOT_OK(ExceptionCls, ERROR_PTR, EXPR) \ 22 | _DUCKDB_NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), \ 23 | ExceptionCls, ERROR_PTR, EXPR, #EXPR) 24 | -------------------------------------------------------------------------------- /src/include/nanoarrow_extension.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 
| // DuckDB - nanoarrow 3 | // 4 | // nanoarrow_extension.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/main/database.hpp" 12 | 13 | namespace duckdb { 14 | 15 | class NanoarrowExtension : public Extension { 16 | public: 17 | void Load(DuckDB& db) override; 18 | std::string Name() override; 19 | std::string Version() const override; 20 | }; 21 | 22 | } // namespace duckdb 23 | -------------------------------------------------------------------------------- /src/include/table_function/arrow_ipc_function_data.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/arrow_ipc_function_data.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table/arrow.hpp" 12 | #include "ipc/stream_factory.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | //! Our FunctionData is the same as the ArrowScanFunctionData except we extend it 17 | //! to keep the ArrowIpcArrowArrayStreamFactory alive. 18 | struct ArrowIPCFunctionData : public ArrowScanFunctionData { 19 | explicit ArrowIPCFunctionData(std::unique_ptr factory) 20 | : ArrowScanFunctionData(ArrowIPCStreamFactory::Produce, 21 | reinterpret_cast(factory.get())), 22 | factory(std::move(factory)) {} 23 | std::unique_ptr factory; 24 | }; 25 | } // namespace ext_nanoarrow 26 | } // namespace duckdb 27 | -------------------------------------------------------------------------------- /src/include/table_function/read_arrow.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/read_arrow.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table_function.hpp" 12 | #include "duckdb/parser/parsed_data/copy_info.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | 17 | TableFunction ReadArrowStreamFunction(); 18 | 19 | void RegisterReadArrowStream(DatabaseInstance& db); 20 | 21 | } // namespace ext_nanoarrow 22 | } // namespace duckdb 23 | -------------------------------------------------------------------------------- /src/include/table_function/scan_arrow_ipc.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/scan_arrow_ipc.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table/arrow.hpp" 12 | 13 | #include "duckdb.hpp" 14 | 15 | namespace duckdb { 16 | namespace ext_nanoarrow { 17 | 18 | //! Arrow IPC Buffer, basically a pointer to the buffer and its size 19 | struct ArrowIPCBuffer { 20 | ArrowIPCBuffer(const uint64_t ptr, const uint64_t size) : ptr(ptr), size(size) {}; 21 | uint64_t ptr; 22 | uint64_t size; 23 | }; 24 | 25 | //! IPC Table scan is identical to ArrowTableFunction arrow scan except instead 26 | //! of CDataInterface header pointers, it takes a bunch of pointers pointing to 27 | //! 
buffers containing data in Arrow IPC format 28 | struct ScanArrowIPC { 29 | static void RegisterReadArrowStream(DatabaseInstance& db); 30 | }; 31 | } // namespace ext_nanoarrow 32 | } // namespace duckdb 33 | -------------------------------------------------------------------------------- /src/include/write_arrow_stream.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // write_arrow_stream.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #include "duckdb/function/copy_function.hpp" 11 | 12 | namespace duckdb { 13 | namespace ext_nanoarrow { 14 | 15 | void RegisterArrowStreamCopyFunction(DatabaseInstance& db); 16 | 17 | } // namespace ext_nanoarrow 18 | } // namespace duckdb 19 | -------------------------------------------------------------------------------- /src/include/writer/arrow_stream_writer.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/arrow_stream_writer.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #include "duckdb/main/client_context.hpp" 11 | #include "writer/column_data_collection_serializer.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | struct ArrowStreamWriter { 17 | ArrowStreamWriter(ClientContext& context, FileSystem& fs, const string& file_path, 18 | const vector& logical_types, 19 | const vector& column_names, 20 | const vector>& metadata); 21 | 22 | void InitSchema(const vector& logical_types, 23 | const vector& column_names, 24 | const vector>& metadata); 25 | 26 | void InitOutputFile(FileSystem& fs, const string& file_path); 27 | 28 | void WriteSchema(); 29 | 30 | unique_ptr NewSerializer(); 31 | 32 | void Flush(ColumnDataCollection& buffer); 33 | 34 | void Flush(ColumnDataCollectionSerializer& serializer); 35 | 36 | void Finalize() const; 37 | 38 | idx_t NumberOfRowGroups() const; 39 | 40 | idx_t FileSize() const; 41 | 42 | private: 43 | ClientProperties options; 44 | Allocator& allocator; 45 | ColumnDataCollectionSerializer serializer; 46 | string file_name; 47 | vector logical_types; 48 | unique_ptr writer; 49 | idx_t row_group_count{0}; 50 | nanoarrow::UniqueSchema schema; 51 | }; 52 | 53 | } // namespace ext_nanoarrow 54 | } // namespace duckdb 55 | -------------------------------------------------------------------------------- /src/include/writer/column_data_collection_serializer.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/column_data_collection_serializer.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #pragma once 9 | 10 | #include "duckdb/common/arrow/arrow_converter.hpp" 11 | #include "duckdb/common/exception.hpp" 12 | #include "duckdb/common/serializer/buffered_file_writer.hpp" 13 | #include "duckdb/common/types/column/column_data_collection.hpp" 14 | #include "duckdb/function/table/arrow/arrow_duck_schema.hpp" 15 | #include "duckdb/main/client_properties.hpp" 16 | #include "nanoarrow/nanoarrow_ipc.hpp" 17 | #include 
"nanoarrow_errors.hpp" 18 | 19 | namespace duckdb { 20 | namespace ext_nanoarrow { 21 | 22 | class ColumnDataCollectionSerializer { 23 | public: 24 | ColumnDataCollectionSerializer(ClientProperties options, Allocator& allocator); 25 | 26 | void Init(const ArrowSchema* schema_p, const vector& logical_types); 27 | 28 | void SerializeSchema(); 29 | 30 | idx_t Serialize(ArrowArray& array); 31 | idx_t Serialize(DataChunk& chunk); 32 | 33 | idx_t Serialize(const ColumnDataCollection& buffer); 34 | 35 | void Flush(BufferedFileWriter& writer); 36 | 37 | nanoarrow::UniqueBuffer GetHeader(); 38 | 39 | nanoarrow::UniqueBuffer GetBody(); 40 | 41 | private: 42 | ClientProperties options; 43 | Allocator& allocator; 44 | const ArrowSchema* schema{}; 45 | unordered_map> extension_types; 46 | nanoarrow::ipc::UniqueEncoder encoder; 47 | nanoarrow::UniqueArrayView chunk_view; 48 | nanoarrow::UniqueArray chunk_arrow; 49 | nanoarrow::UniqueBuffer header; 50 | nanoarrow::UniqueBuffer body; 51 | ArrowError error{}; 52 | }; 53 | 54 | } // namespace ext_nanoarrow 55 | } // namespace duckdb 56 | -------------------------------------------------------------------------------- /src/include/writer/to_arrow_ipc.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/to_arrow_ipc.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #pragma once 9 | #include "duckdb/function/table_function.hpp" 10 | 11 | #include "nanoarrow/hpp/unique.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | class ArrowStringVectorBuffer : public VectorBuffer { 17 | public: 18 | explicit ArrowStringVectorBuffer(nanoarrow::UniqueBuffer buffer_p) 19 | : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) {} 20 | 21 | private: 22 | nanoarrow::UniqueBuffer buffer; 23 | }; 24 | 25 | class ToArrowIPCFunction { 26 | public: 27 | //! 
note: this is the number of vectors per chunk 28 | static constexpr idx_t DEFAULT_CHUNK_SIZE = 120; 29 | 30 | static TableFunction GetFunction(); 31 | static void RegisterToIPCFunction(DatabaseInstance& db); 32 | 33 | private: 34 | static unique_ptr InitLocal( 35 | ExecutionContext& context, TableFunctionInitInput& input, 36 | GlobalTableFunctionState* global_state); 37 | static unique_ptr InitGlobal(ClientContext& context, 38 | TableFunctionInitInput& input); 39 | static unique_ptr Bind(ClientContext& context, 40 | TableFunctionBindInput& input, 41 | vector& return_types, 42 | vector& names); 43 | static OperatorResultType Function(ExecutionContext& context, 44 | TableFunctionInput& data_p, DataChunk& input, 45 | DataChunk& output); 46 | static OperatorFinalizeResultType FunctionFinal(ExecutionContext& context, 47 | TableFunctionInput& data_p, 48 | DataChunk& output); 49 | }; 50 | 51 | } // namespace ext_nanoarrow 52 | } // namespace duckdb 53 | -------------------------------------------------------------------------------- /src/ipc/array_stream.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/array_stream.hpp" 2 | 3 | namespace duckdb { 4 | namespace ext_nanoarrow { 5 | 6 | IpcArrayStream::IpcArrayStream(unique_ptr reader) 7 | : reader(std::move(reader)) {} 8 | 9 | IPCStreamReader& IpcArrayStream::Reader() const { return *reader; } 10 | 11 | void IpcArrayStream::ToArrayStream(ArrowArrayStream* stream) { 12 | nanoarrow::ArrayStreamFactory::InitArrayStream( 13 | new IpcArrayStream(std::move(reader)), stream); 14 | } 15 | 16 | int IpcArrayStream::GetSchema(ArrowSchema* schema) { 17 | return Wrap([&]() { 18 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(reader->GetOutputSchema(), schema)); 19 | }); 20 | } 21 | 22 | int IpcArrayStream::GetNext(ArrowArray* array) { 23 | return Wrap([&]() { reader->GetNextBatch(array); }); 24 | } 25 | 26 | const char* IpcArrayStream::GetLastError() const { return last_msg.c_str(); } 27 | 28 | } // namespace ext_nanoarrow 29 | } // namespace duckdb 30 | -------------------------------------------------------------------------------- /src/ipc/stream_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_factory.hpp" 2 | 3 | #include 4 | #include 5 | 6 | #include "ipc/stream_reader/ipc_buffer_stream_reader.hpp" 7 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 8 | 9 | namespace duckdb { 10 | namespace ext_nanoarrow { 11 | ArrowIPCStreamFactory::ArrowIPCStreamFactory(Allocator& allocator_p) 12 | : allocator(allocator_p) {} 13 | 14 | unique_ptr ArrowIPCStreamFactory::Produce( 15 | uintptr_t factory_ptr, ArrowStreamParameters& parameters) { 16 | auto factory = 17 | static_cast(reinterpret_cast(factory_ptr)); 18 | 19 | if (!factory->reader) { 20 | throw InternalException("IpcStreamReader was not initialized or was already moved"); 21 | } 22 | 23 | if (!parameters.projected_columns.columns.empty()) { 24 | factory->reader->SetColumnProjection(parameters.projected_columns.columns); 25 | } 26 | 27 | auto out = make_uniq(); 28 | IpcArrayStream(std::move(factory->reader)).ToArrayStream(&out->arrow_array_stream); 29 | return out; 30 | } 31 | 32 | void ArrowIPCStreamFactory::GetFileSchema(ArrowSchemaWrapper& schema) const { 33 | if (!reader) { 34 | throw InternalException("IpcStreamReader is no longer valid"); 35 | } 36 | 37 | NANOARROW_THROW_NOT_OK( 38 | ArrowSchemaDeepCopy(reader->GetBaseSchema(), &schema.arrow_schema)); 39 | } 40 | 41 | 
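// The two concrete factories below differ only in where the IPC bytes come
// from: BufferIPCStreamFactory wraps buffers that already sit in memory (the
// scan_arrow_ipc path), while FileIPCStreamFactory opens a file through
// DuckDB's FileSystem (the read_arrow path).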
BufferIPCStreamFactory::BufferIPCStreamFactory(ClientContext& context, 42 | const vector& buffers_p) 43 | : ArrowIPCStreamFactory(BufferAllocator::Get(context)), buffers(buffers_p) {} 44 | 45 | void BufferIPCStreamFactory::InitReader() { 46 | if (reader) { 47 | throw InternalException("ArrowArrayStream or IpcStreamReader already initialized"); 48 | } 49 | reader = make_uniq(buffers, allocator); 50 | } 51 | 52 | FileIPCStreamFactory::FileIPCStreamFactory(ClientContext& context, string src_string) 53 | : ArrowIPCStreamFactory(BufferAllocator::Get(context)), 54 | fs(FileSystem::GetFileSystem(context)), 55 | src_string(std::move(src_string)) {} 56 | 57 | void FileIPCStreamFactory::InitReader() { 58 | if (reader) { 59 | throw InternalException("ArrowArrayStream or IpcStreamReader already initialized"); 60 | } 61 | unique_ptr handle = fs.OpenFile(src_string, FileOpenFlags::FILE_FLAGS_READ); 62 | reader = make_uniq(fs, std::move(handle), allocator); 63 | } 64 | 65 | } // namespace ext_nanoarrow 66 | } // namespace duckdb 67 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/base_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/base_stream_reader.hpp" 2 | #include 3 | #include 4 | #include "zstd.h" 5 | 6 | namespace duckdb { 7 | namespace ext_nanoarrow { 8 | 9 | // A version of ArrowDecompressZstd that uses DuckDB's C++ name-specified 10 | // zstd.h header that doesn't work with a C compiler 11 | static ArrowErrorCode DuckDBDecompressZstd(struct ArrowBufferView src, uint8_t* dst, 12 | int64_t dst_size, struct ArrowError* error) { 13 | size_t code = duckdb_zstd::ZSTD_decompress((void*)dst, (size_t)dst_size, src.data.data, 14 | src.size_bytes); 15 | if (duckdb_zstd::ZSTD_isError(code)) { 16 | ArrowErrorSet(error, 17 | "ZSTD_decompress([buffer with %" PRId64 18 | " bytes] -> [buffer with %" PRId64 " bytes]) failed with error '%s'", 19 | src.size_bytes, dst_size, duckdb_zstd::ZSTD_getErrorName(code)); 20 | return EIO; 21 | } 22 | 23 | if (dst_size != static_cast(code)) { 24 | ArrowErrorSet(error, 25 | "Expected decompressed size of %" PRId64 " bytes but got %" PRId64 26 | " bytes", 27 | dst_size, static_cast(code)); 28 | return EIO; 29 | } 30 | 31 | return NANOARROW_OK; 32 | } 33 | 34 | // Create an ArrowIpcDecoder() with the appropriate decompressor set. 35 | // We could also define a decompressor that uses threads to parellelize 36 | // decompression for batches with many columns. 37 | nanoarrow::ipc::UniqueDecoder IPCStreamReader::NewDuckDBArrowDecoder() { 38 | nanoarrow::ipc::UniqueDecompressor decompressor; 39 | NANOARROW_THROW_NOT_OK(ArrowIpcSerialDecompressor(decompressor.get())); 40 | NANOARROW_THROW_NOT_OK(ArrowIpcSerialDecompressorSetFunction( 41 | decompressor.get(), NANOARROW_IPC_COMPRESSION_TYPE_ZSTD, DuckDBDecompressZstd)); 42 | 43 | nanoarrow::ipc::UniqueDecoder decoder; 44 | NANOARROW_THROW_NOT_OK(ArrowIpcDecoderInit(decoder.get())); 45 | NANOARROW_THROW_NOT_OK( 46 | ArrowIpcDecoderSetDecompressor(decoder.get(), decompressor.get())); 47 | // Bug in nanoarrow! 
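// (assumed rationale: after ArrowIpcDecoderSetDecompressor, the decoder and the
// local UniqueDecompressor reference the same state, so we null the local
// release callback to avoid releasing it twice when `decompressor` goes out of
// scope)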
48 | decompressor->release = nullptr; 49 | return decoder; 50 | } 51 | 52 | const ArrowSchema* IPCStreamReader::GetBaseSchema() { 53 | if (base_schema->release) { 54 | return base_schema.get(); 55 | } 56 | 57 | ReadNextMessage({NANOARROW_IPC_MESSAGE_TYPE_SCHEMA}, /*end_of_stream_ok*/ false); 58 | 59 | if (decoder->feature_flags & NANOARROW_IPC_FEATURE_DICTIONARY_REPLACEMENT) { 60 | throw IOException("This stream uses unsupported feature DICTIONARY_REPLACEMENT"); 61 | } 62 | 63 | // Decode the schema 64 | THROW_NOT_OK(IOException, &error, 65 | ArrowIpcDecoderDecodeSchema(decoder.get(), base_schema.get(), &error)); 66 | 67 | // Set up the decoder to decode batches 68 | THROW_NOT_OK(InternalException, &error, 69 | ArrowIpcDecoderSetEndianness(decoder.get(), decoder->endianness)); 70 | THROW_NOT_OK(InternalException, &error, 71 | ArrowIpcDecoderSetSchema(decoder.get(), base_schema.get(), &error)); 72 | 73 | return base_schema.get(); 74 | } 75 | 76 | bool IPCStreamReader::HasProjection() const { return !projected_fields.empty(); } 77 | 78 | const ArrowSchema* IPCStreamReader::GetOutputSchema() { 79 | if (HasProjection()) { 80 | return projected_schema.get(); 81 | } else { 82 | return GetBaseSchema(); 83 | } 84 | } 85 | 86 | bool IPCStreamReader::GetNextBatch(ArrowArray* out) { 87 | // When nanoarrow supports dictionary batches, we'd accept either a 88 | // RecordBatch or DictionaryBatch message, recording the dictionary batch 89 | // (or possibly ignoring it if it is for a field that we don't care about), 90 | // but looping until we end up with a RecordBatch in the decoder. 91 | ArrowIpcMessageType message_type = 92 | ReadNextMessage({NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH}); 93 | if (message_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 94 | out->release = nullptr; 95 | return false; 96 | } 97 | 98 | // Use the ArrowIpcSharedBuffer if we have thread safety (i.e., if this was 99 | // compiled with a compiler that supports C11 atomics, i.e., not gcc 4.8 or 100 | // MSVC) 101 | bool thread_safe_shared = ArrowIpcSharedBufferIsThreadSafe(); 102 | struct ArrowBufferView body_view = AllocatedDataView(cur_ptr, cur_size); 103 | nanoarrow::UniqueBuffer body_shared = GetUniqueBuffer(); 104 | UniqueSharedBuffer shared; 105 | NANOARROW_THROW_NOT_OK(ArrowIpcSharedBufferInit(&shared.data, body_shared.get())); 106 | nanoarrow::UniqueArray array; 107 | if (HasProjection()) { 108 | NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(array.get(), NANOARROW_TYPE_STRUCT)); 109 | NANOARROW_THROW_NOT_OK( 110 | ArrowArrayAllocateChildren(array.get(), GetOutputSchema()->n_children)); 111 | 112 | if (thread_safe_shared) { 113 | for (int64_t i = 0; i < array->n_children; i++) { 114 | THROW_NOT_OK(InternalException, &error, 115 | ArrowIpcDecoderDecodeArrayFromShared( 116 | decoder.get(), &shared.data, projected_fields[i], 117 | array->children[i], NANOARROW_VALIDATION_LEVEL_FULL, &error)); 118 | } 119 | } else { 120 | for (int64_t i = 0; i < array->n_children; i++) { 121 | THROW_NOT_OK(InternalException, &error, 122 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, 123 | projected_fields[i], array->children[i], 124 | NANOARROW_VALIDATION_LEVEL_FULL, &error)); 125 | } 126 | } 127 | 128 | D_ASSERT(array->n_children > 0); 129 | array->length = array->children[0]->length; 130 | array->null_count = 0; 131 | } else if (thread_safe_shared) { 132 | THROW_NOT_OK( 133 | InternalException, &error, 134 | ArrowIpcDecoderDecodeArrayFromShared(decoder.get(), &shared.data, -1, array.get(), 135 | NANOARROW_VALIDATION_LEVEL_FULL, 
&error)); 136 | } else { 137 | THROW_NOT_OK(InternalException, &error, 138 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, array.get(), 139 | NANOARROW_VALIDATION_LEVEL_FULL, &error)); 140 | } 141 | 142 | ArrowArrayMove(array.get(), out); 143 | return true; 144 | } 145 | 146 | void IPCStreamReader::SetColumnProjection(const vector& column_names) { 147 | if (column_names.empty()) { 148 | throw InternalException("Can't request zero fields projected from IpcStreamReader"); 149 | } 150 | 151 | // Ensure we have a file schema to work with 152 | GetBaseSchema(); 153 | 154 | nanoarrow::UniqueSchema schema; 155 | ArrowSchemaInit(schema.get()); 156 | NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct( 157 | schema.get(), UnsafeNumericCast(column_names.size()))); 158 | 159 | // The ArrowArray builder needs the flattened field index, which we need to 160 | // keep track of. 161 | unordered_map> name_to_flat_field_map; 162 | 163 | // Duplicate column names are in theory fine as long as they are not queried, 164 | // so we need to make a list of them to check. 165 | unordered_set duplicate_column_names; 166 | 167 | vector names; 168 | // Let's check if we need to deduplicate projection column names 169 | for (idx_t col_idx = 0; col_idx < static_cast(base_schema->n_children); 170 | col_idx++) { 171 | if (base_schema->children[col_idx]->name) { 172 | names.push_back(base_schema->children[col_idx]->name); 173 | } else { 174 | names.push_back(""); 175 | } 176 | } 177 | QueryResult::DeduplicateColumns(names); 178 | // Loop over columns to build the field map 179 | int64_t field_count = 0; 180 | for (int64_t i = 0; i < base_schema->n_children; i++) { 181 | if (name_to_flat_field_map.find(names[i]) != name_to_flat_field_map.end()) { 182 | duplicate_column_names.insert(names[i]); 183 | } 184 | name_to_flat_field_map.insert({names[i], {field_count, base_schema->children[i]}}); 185 | field_count += CountFields(base_schema->children[i]); 186 | } 187 | 188 | // Loop over projected column names to build the projection information 189 | int64_t output_column_index = 0; 190 | for (const auto& column_name : column_names) { 191 | if (duplicate_column_names.find(column_name) != duplicate_column_names.end()) { 192 | throw InternalException(string("Field '") + column_name + 193 | "' refers to a duplicate column name in IPC file schema"); 194 | } 195 | 196 | auto field_id_item = name_to_flat_field_map.find(column_name); 197 | if (field_id_item == name_to_flat_field_map.end()) { 198 | throw InternalException(string("Field '") + column_name + 199 | "' does not exist in IPC file schema"); 200 | } 201 | 202 | // Record the flat field index for this column 203 | projected_fields.push_back(field_id_item->second.first); 204 | 205 | // Record the Schema for this column 206 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(field_id_item->second.second, 207 | schema->children[output_column_index])); 208 | 209 | ++output_column_index; 210 | } 211 | projected_schema = std::move(schema); 212 | } 213 | 214 | idx_t IPCStreamReader::DecodeMetadata() const { 215 | idx_t metadata_size; 216 | if (!Radix::IsLittleEndian()) { 217 | metadata_size = static_cast(BSWAP32(message_prefix.metadata_size)); 218 | } else { 219 | metadata_size = message_prefix.metadata_size; 220 | } 221 | 222 | if (metadata_size < 0) { 223 | throw IOException(std::string("Expected metadata size >= 0 but got " + 224 | std::to_string(metadata_size))); 225 | } 226 | return metadata_size + sizeof(message_prefix); 227 | } 228 | 229 | ArrowIpcMessageType 
IPCStreamReader::DecodeMessage() { 230 | auto message_header_size = DecodeMetadata(); 231 | if (DecodeHeader(message_header_size)) { 232 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 233 | } 234 | DecodeBody(); 235 | return decoder->message_type; 236 | } 237 | 238 | ArrowIpcMessageType IPCStreamReader::ReadNextMessage( 239 | vector expected_types, bool end_of_stream_ok) { 240 | ArrowIpcMessageType actual_type = ReadNextMessage(); 241 | if (end_of_stream_ok && actual_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 242 | return actual_type; 243 | } 244 | 245 | for (const auto expected_type : expected_types) { 246 | if (expected_type == actual_type) { 247 | return actual_type; 248 | } 249 | } 250 | 251 | std::stringstream expected_types_label; 252 | for (size_t i = 0; i < expected_types.size(); i++) { 253 | if (i > 0) { 254 | expected_types_label << " or "; 255 | } 256 | 257 | expected_types_label << MessageTypeString(expected_types[i]); 258 | } 259 | 260 | string actual_type_label; 261 | if (actual_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 262 | actual_type_label = "end of stream"; 263 | } else { 264 | actual_type_label = MessageTypeString(actual_type); 265 | } 266 | 267 | throw IOException(string("Expected ") + expected_types_label.str() + 268 | " Arrow IPC message but got " + actual_type_label); 269 | } 270 | 271 | int64_t IPCStreamReader::CountFields(const ArrowSchema* schema) { 272 | int64_t n_fields = 1; 273 | for (int64_t i = 0; i < schema->n_children; i++) { 274 | n_fields += CountFields(schema->children[i]); 275 | } 276 | return n_fields; 277 | } 278 | 279 | ArrowBufferView IPCStreamReader::AllocatedDataView(const_data_ptr_t data, int64_t size) { 280 | ArrowBufferView view{}; 281 | view.data.data = data; 282 | view.size_bytes = size; 283 | return view; 284 | } 285 | 286 | nanoarrow::UniqueBuffer IPCStreamReader::AllocatedDataToOwningBuffer( 287 | const shared_ptr& data) { 288 | nanoarrow::UniqueBuffer out; 289 | if (data) { 290 | nanoarrow::BufferInitWrapped(out.get(), data, data->get(), 291 | UnsafeNumericCast(data->GetSize())); 292 | } 293 | return out; 294 | } 295 | 296 | const char* IPCStreamReader::MessageTypeString(ArrowIpcMessageType message_type) { 297 | switch (message_type) { 298 | case NANOARROW_IPC_MESSAGE_TYPE_SCHEMA: 299 | return "Schema"; 300 | case NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH: 301 | return "RecordBatch"; 302 | case NANOARROW_IPC_MESSAGE_TYPE_DICTIONARY_BATCH: 303 | return "DictionaryBatch"; 304 | case NANOARROW_IPC_MESSAGE_TYPE_TENSOR: 305 | return "Tensor"; 306 | case NANOARROW_IPC_MESSAGE_TYPE_SPARSE_TENSOR: 307 | return "SparseTensor"; 308 | case NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED: 309 | return "Uninitialized"; 310 | default: 311 | return ""; 312 | } 313 | } 314 | 315 | } // namespace ext_nanoarrow 316 | } // namespace duckdb 317 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/ipc_buffer_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/ipc_buffer_stream_reader.hpp" 2 | 3 | #include 4 | 5 | namespace duckdb { 6 | namespace ext_nanoarrow { 7 | 8 | IPCBufferStreamReader::IPCBufferStreamReader(vector buffers, 9 | Allocator& allocator) 10 | : IPCStreamReader(allocator), buffers(std::move(buffers)) {} 11 | 12 | ArrowIpcMessageType IPCBufferStreamReader::ReadNextMessage() { 13 | if ((!initialized && cur_idx == buffers.size()) || finished) { 14 | finished = true; 15 | return 
NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 16 | } 17 | if (!initialized || cur_buffer.pos >= buffers[cur_idx].size) { 18 | if (initialized) { 19 | cur_idx++; 20 | } 21 | if (cur_idx >= buffers.size()) { 22 | finished = true; 23 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 24 | } 25 | cur_buffer.ptr = reinterpret_cast(buffers[cur_idx].ptr); 26 | cur_buffer.size = static_cast(buffers[cur_idx].size); 27 | cur_buffer.pos = 0; 28 | initialized = true; 29 | } 30 | auto* message_prefix_ptr = reinterpret_cast( 31 | ReadData(reinterpret_cast(&message_prefix), sizeof(message_prefix))); 32 | message_prefix = *message_prefix_ptr; 33 | return DecodeMessage(); 34 | } 35 | 36 | data_ptr_t IPCBufferStreamReader::ReadData(data_ptr_t ptr, idx_t size) { 37 | D_ASSERT(size + cur_buffer.pos < cur_buffer.size); 38 | data_ptr_t cur_ptr = cur_buffer.ptr + cur_buffer.pos; 39 | cur_buffer.pos += size; 40 | return cur_ptr; 41 | } 42 | 43 | bool IPCBufferStreamReader::DecodeHeader(idx_t message_header_size) { 44 | // Our Header must contain the message prefix 45 | header.ptr = 46 | ReadData(header.ptr, message_prefix.metadata_size) - sizeof(message_prefix); 47 | header.size = message_header_size; 48 | const ArrowErrorCode decode_header_status = ArrowIpcDecoderDecodeHeader( 49 | decoder.get(), AllocatedDataView(header.ptr, header.size), &error); 50 | if (decode_header_status == ENODATA) { 51 | finished = true; 52 | return true; 53 | } 54 | THROW_NOT_OK(IOException, &error, decode_header_status); 55 | return false; 56 | } 57 | 58 | void IPCBufferStreamReader::DecodeBody() { 59 | if (decoder->body_size_bytes > 0) { 60 | body.ptr = ReadData(body.ptr, decoder->body_size_bytes); 61 | } 62 | if (body.ptr) { 63 | cur_ptr = body.ptr; 64 | cur_size = body.size; 65 | } else { 66 | cur_ptr = nullptr; 67 | cur_size = 0; 68 | } 69 | } 70 | 71 | nanoarrow::UniqueBuffer IPCBufferStreamReader::GetUniqueBuffer() { 72 | nanoarrow::UniqueBuffer out; 73 | nanoarrow::BufferInitWrapped(out.get(), body, body.ptr, body.size); 74 | return out; 75 | } 76 | 77 | } // namespace ext_nanoarrow 78 | } // namespace duckdb 79 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/ipc_file_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 2 | #include "duckdb/common/file_system.hpp" 3 | 4 | namespace duckdb { 5 | namespace ext_nanoarrow { 6 | IPCFileStreamReader::IPCFileStreamReader(FileSystem& fs, unique_ptr handle, 7 | Allocator& allocator) 8 | : IPCStreamReader(allocator), file_reader(fs, std::move(handle)) {} 9 | 10 | void IPCFileStreamReader::PopulateNames(vector& names) { 11 | GetBaseSchema(); 12 | for (int64_t i = 0; i < base_schema->n_children; i++) { 13 | const ArrowSchema* column = base_schema->children[i]; 14 | if (!column->name) { 15 | names.push_back(""); 16 | } else { 17 | names.push_back(column->name); 18 | } 19 | } 20 | } 21 | 22 | double IPCFileStreamReader::GetProgress() { 23 | idx_t file_size = file_reader.FileSize(); 24 | if (file_size == 0) { 25 | return 100; 26 | } 27 | auto current_offset = static_cast(file_reader.CurrentOffset()); 28 | return (current_offset / static_cast(file_size)) * 100; 29 | } 30 | 31 | void IPCFileStreamReader::DecodeArray(nanoarrow::ipc::UniqueDecoder& decoder, 32 | ArrowArray* out, ArrowBufferView& body_view, 33 | ArrowError* error) { 34 | // Use the ArrowIpcSharedBuffer if we have thread safety (i.e., if this was 35 | // compiled with a 
compiler that supports C11 atomics, i.e., not gcc 4.8 or 36 | // MSVC) 37 | nanoarrow::UniqueArray array; 38 | THROW_NOT_OK(InternalException, error, 39 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, array.get(), 40 | NANOARROW_VALIDATION_LEVEL_FULL, error)); 41 | ArrowArrayMove(array.get(), out); 42 | } 43 | 44 | nanoarrow::UniqueBuffer IPCFileStreamReader::GetUniqueBuffer() { 45 | return AllocatedDataToOwningBuffer(message_body); 46 | } 47 | bool IPCFileStreamReader::DecodeHeader(const idx_t message_header_size) { 48 | if (message_header.GetSize() < message_header_size) { 49 | message_header = allocator.Allocate(message_header_size); 50 | } 51 | // Read the message header. I believe the fact that this loops and calls 52 | // the file handle's Read() method with relatively small chunks will ensure that 53 | // an attempt to read a very large message_header_size can be cancelled. If this 54 | // is not the case, we might want to implement our own buffering. 55 | std::memcpy(message_header.get(), &message_prefix, sizeof(message_prefix)); 56 | ReadData(message_header.get() + sizeof(message_prefix), message_prefix.metadata_size); 57 | 58 | ArrowErrorCode decode_header_status = ArrowIpcDecoderDecodeHeader( 59 | decoder.get(), 60 | AllocatedDataView(message_header.get(), 61 | static_cast(message_header.GetSize())), 62 | &error); 63 | if (decode_header_status == ENODATA) { 64 | finished = true; 65 | return true; 66 | } 67 | THROW_NOT_OK(IOException, &error, decode_header_status); 68 | return false; 69 | } 70 | 71 | void IPCFileStreamReader::DecodeBody() { 72 | if (decoder->body_size_bytes > 0) { 73 | EnsureInputStreamAligned(); 74 | message_body = 75 | make_shared_ptr(allocator.Allocate(decoder->body_size_bytes)); 76 | 77 | // Again, this is possibly a long running Read() call for a large body. 78 | // We could possibly be smarter about how we do this, particularly if we 79 | // are reading a small portion of the input from a seekable file. 80 | ReadData(message_body->get(), decoder->body_size_bytes); 81 | } 82 | if (message_body) { 83 | cur_ptr = message_body->get(); 84 | cur_size = static_cast(message_body->GetSize()); 85 | } else { 86 | cur_ptr = nullptr; 87 | cur_size = 0; 88 | } 89 | } 90 | 91 | data_ptr_t IPCFileStreamReader::ReadData(data_ptr_t ptr, idx_t size) { 92 | file_reader.ReadData(ptr, size); 93 | return ptr; 94 | } 95 | 96 | ArrowIpcMessageType IPCFileStreamReader::ReadNextMessage() { 97 | if (finished) { 98 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 99 | } 100 | 101 | // If there is no more data to be read, we're done! 102 | try { 103 | EnsureInputStreamAligned(); 104 | file_reader.ReadData(reinterpret_cast(&message_prefix), 105 | sizeof(message_prefix)); 106 | 107 | // If we're at the beginning of the read, and we see the Arrow file format 108 | // header bytes, skip them and try to read the stream anyway. This works because 109 | // there's a full stream within an Arrow file (including the EOS indicator, which 110 | // is key to success. This EOS indicator is unfortunately missing in Rust releases 111 | // prior to ~September 2024). 112 | // 113 | // When we support dictionary encoding we will possibly need to seek to the footer 114 | // here, parse that, and maybe lazily seek and read dictionaries for if/when they are 115 | // required. 
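    // "ARROW1\0\0" (the 6-byte magic plus 2 padding bytes) is how the Arrow IPC
    // *file* format begins; the streaming format has no such prefix, so if the
    // first 8 bytes match it we were handed a file rather than a bare stream
    // and simply skip them.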
116 | if (file_reader.CurrentOffset() == 8 && 117 | std::memcmp("ARROW1\0\0", &message_prefix, 8) == 0) { 118 | return ReadNextMessage(); 119 | } 120 | 121 | if (message_prefix.continuation_token != kContinuationToken) { 122 | throw IOException(std::string("Expected continuation token (0xFFFFFFFF) but got " + 123 | std::to_string(message_prefix.continuation_token))); 124 | } 125 | 126 | } catch (SerializationException& e) { 127 | finished = true; 128 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 129 | } 130 | 131 | return DecodeMessage(); 132 | } 133 | 134 | void IPCFileStreamReader::EnsureInputStreamAligned() { 135 | uint8_t padding[8]; 136 | int padding_bytes = 8 - (file_reader.CurrentOffset() % 8); 137 | if (padding_bytes != 8) { 138 | file_reader.ReadData(padding, padding_bytes); 139 | } 140 | D_ASSERT((file_reader.CurrentOffset() % 8) == 0); 141 | } 142 | 143 | } // namespace ext_nanoarrow 144 | } // namespace duckdb 145 | -------------------------------------------------------------------------------- /src/nanoarrow_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "nanoarrow_extension.hpp" 4 | 5 | #include 6 | #include "writer/to_arrow_ipc.hpp" 7 | 8 | #include "duckdb/function/scalar_function.hpp" 9 | #include "duckdb/main/extension_util.hpp" 10 | 11 | #include "nanoarrow/nanoarrow.hpp" 12 | 13 | #include "table_function/read_arrow.hpp" 14 | #include "table_function/scan_arrow_ipc.hpp" 15 | #include "write_arrow_stream.hpp" 16 | 17 | namespace duckdb { 18 | 19 | namespace { 20 | 21 | struct NanoarrowVersion { 22 | static void Register(DatabaseInstance& db) { 23 | auto fn = ScalarFunction("nanoarrow_version", {}, LogicalType::VARCHAR, ExecuteFn); 24 | ExtensionUtil::RegisterFunction(db, fn); 25 | } 26 | 27 | static void ExecuteFn(DataChunk& args, ExpressionState& state, Vector& result) { 28 | result.SetValue(0, StringVector::AddString(result, ArrowNanoarrowVersion())); 29 | result.SetVectorType(VectorType::CONSTANT_VECTOR); 30 | } 31 | }; 32 | 33 | void LoadInternal(DatabaseInstance& db) { 34 | NanoarrowVersion::Register(db); 35 | ext_nanoarrow::RegisterReadArrowStream(db); 36 | ext_nanoarrow::RegisterArrowStreamCopyFunction(db); 37 | 38 | ext_nanoarrow::ScanArrowIPC::RegisterReadArrowStream(db); 39 | ext_nanoarrow::ToArrowIPCFunction::RegisterToIPCFunction(db); 40 | } 41 | 42 | } // namespace 43 | 44 | void NanoarrowExtension::Load(DuckDB& db) { LoadInternal(*db.instance); } 45 | std::string NanoarrowExtension::Name() { return "nanoarrow"; } 46 | 47 | std::string NanoarrowExtension::Version() const { 48 | #ifdef EXT_VERSION_NANOARROW 49 | return EXT_VERSION_NANOARROW; 50 | #else 51 | return ""; 52 | #endif 53 | } 54 | 55 | } // namespace duckdb 56 | 57 | extern "C" { 58 | 59 | DUCKDB_EXTENSION_API void nanoarrow_init(duckdb::DatabaseInstance& db) { 60 | duckdb::DuckDB db_wrapper(db); 61 | db_wrapper.LoadExtension(); 62 | } 63 | 64 | DUCKDB_EXTENSION_API const char* nanoarrow_version() { 65 | return duckdb::DuckDB::LibraryVersion(); 66 | } 67 | } 68 | 69 | #ifndef DUCKDB_EXTENSION_MAIN 70 | #error DUCKDB_EXTENSION_MAIN not defined 71 | #endif 72 | -------------------------------------------------------------------------------- /src/scanner/read_arrow.cpp: -------------------------------------------------------------------------------- 1 | #include "table_function/read_arrow.hpp" 2 | 3 | #include 4 | 5 | #include "file_scanner/arrow_multi_file_info.hpp" 6 | #include "zstd.h" 7 | 8 | 
#include "duckdb/common/radix.hpp" 9 | #include "duckdb/common/serializer/buffered_file_reader.hpp" 10 | #include "duckdb/function/table/arrow.hpp" 11 | #include "duckdb/function/table_function.hpp" 12 | #include "duckdb/main/database.hpp" 13 | #include "duckdb/main/extension_util.hpp" 14 | #include "duckdb/parser/expression/constant_expression.hpp" 15 | #include "duckdb/parser/expression/function_expression.hpp" 16 | #include "duckdb/parser/tableref/table_function_ref.hpp" 17 | 18 | #include "nanoarrow/nanoarrow.hpp" 19 | #include "nanoarrow/nanoarrow_ipc.hpp" 20 | 21 | #include "ipc/stream_factory.hpp" 22 | #include "ipc/stream_reader/base_stream_reader.hpp" 23 | #include "nanoarrow_errors.hpp" 24 | #include "table_function/arrow_ipc_function_data.hpp" 25 | 26 | // read_arrow() implementation 27 | // 28 | // This version uses the ArrowIpcDecoder directly. instead of nanoarrow's 29 | // ArrowArrayStream wrapper. This lets it use DuckDB's allocator at the 30 | // expense of a bit more verbosity. Because we can apply the projection 31 | // it reduces some of the verbosity of the actual DuckDB part (although the 32 | // ArrayStreamReader from nanoarrow could support a projection, which 33 | // would handle that too). 34 | // 35 | // I like this version better than the simpler one, and there are more parts 36 | // that could get optimized here (whereas with the array stream version you 37 | // don't have much control). 38 | 39 | namespace duckdb { 40 | 41 | namespace ext_nanoarrow { 42 | 43 | struct ReadArrowStream : ArrowTableFunction { 44 | static TableFunction Function() { 45 | MultiFileFunction read_arrow("read_arrow"); 46 | read_arrow.projection_pushdown = true; 47 | read_arrow.filter_pushdown = false; 48 | read_arrow.filter_prune = false; 49 | return static_cast(read_arrow); 50 | } 51 | 52 | static unique_ptr ScanReplacement(ClientContext& context, 53 | ReplacementScanInput& input, 54 | optional_ptr data) { 55 | auto table_name = ReplacementScan::GetFullPath(input); 56 | if (!ReplacementScan::CanReplace(table_name, {"arrows", "arrow"})) { 57 | return nullptr; 58 | } 59 | 60 | auto table_function = make_uniq(); 61 | vector> children; 62 | auto table_name_expr = make_uniq(Value(table_name)); 63 | children.push_back(std::move(table_name_expr)); 64 | auto function_expr = make_uniq("read_arrow", std::move(children)); 65 | table_function->function = std::move(function_expr); 66 | 67 | if (!FileSystem::HasGlob(table_name)) { 68 | auto& fs = FileSystem::GetFileSystem(context); 69 | table_function->alias = fs.ExtractBaseName(table_name); 70 | } 71 | 72 | return std::move(table_function); 73 | } 74 | }; 75 | 76 | TableFunction ReadArrowStreamFunction() { return ReadArrowStream::Function(); } 77 | 78 | void RegisterReadArrowStream(DatabaseInstance& db) { 79 | auto function = ReadArrowStream::Function(); 80 | ExtensionUtil::RegisterFunction(db, function); 81 | // So we can accept a list of paths as well e.g., ['file_1.arrow','file_2.arrow'] 82 | function.arguments = {LogicalType::LIST(LogicalType::VARCHAR)}; 83 | ExtensionUtil::RegisterFunction(db, function); 84 | auto& config = DBConfig::GetConfig(db); 85 | config.replacement_scans.emplace_back(ReadArrowStream::ScanReplacement); 86 | } 87 | 88 | } // namespace ext_nanoarrow 89 | } // namespace duckdb 90 | -------------------------------------------------------------------------------- /src/scanner/scan_arrow_ipc.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 
"table_function/scan_arrow_ipc.hpp" 3 | #include "duckdb/main/extension_util.hpp" 4 | #include "ipc/stream_factory.hpp" 5 | #include "table_function/arrow_ipc_function_data.hpp" 6 | 7 | #include "duckdb/function/table/arrow.hpp" 8 | 9 | #include "ipc/stream_reader/base_stream_reader.hpp" 10 | 11 | #include "duckdb/function/function.hpp" 12 | #include "duckdb/function/table/arrow/arrow_duck_schema.hpp" 13 | #include "duckdb/function/table_function.hpp" 14 | #include "duckdb/main/config.hpp" 15 | namespace duckdb { 16 | 17 | namespace ext_nanoarrow { 18 | 19 | struct ScanArrowIPCFunction : ArrowTableFunction { 20 | static unique_ptr ScanArrowIPCBind(ClientContext& context, 21 | TableFunctionBindInput& input, 22 | vector& return_types, 23 | vector& names) { 24 | // Create a vector with all the buffers and their sizes 25 | vector buffers; 26 | const auto buffer_ptr_list = ListValue::GetChildren(input.inputs[0]); 27 | for (auto& buffer_ptr_struct : buffer_ptr_list) { 28 | auto unpacked = StructValue::GetChildren(buffer_ptr_struct); 29 | buffers.emplace_back(unpacked[0].GetPointer(), unpacked[1].GetValue()); 30 | } 31 | 32 | auto stream_factory = make_uniq(context, buffers); 33 | auto res = make_uniq(std::move(stream_factory)); 34 | res->factory->InitReader(); 35 | res->factory->GetFileSchema(res->schema_root); 36 | 37 | DBConfig& config = DatabaseInstance::GetDatabase(context).config; 38 | PopulateArrowTableType(config, res->arrow_table, res->schema_root, names, 39 | return_types); 40 | QueryResult::DeduplicateColumns(names); 41 | res->all_types = return_types; 42 | if (return_types.empty()) { 43 | throw InvalidInputException( 44 | "Provided table/dataframe must have at least one column"); 45 | } 46 | 47 | return std::move(res); 48 | } 49 | 50 | static TableFunction Function() { 51 | child_list_t make_buffer_struct_children{{"ptr", LogicalType::POINTER}, 52 | {"size", LogicalType::UBIGINT}}; 53 | TableFunction scan_arrow_ipc_func( 54 | "scan_arrow_ipc", 55 | {LogicalType::LIST(LogicalType::STRUCT(make_buffer_struct_children))}, 56 | ArrowScanFunction, ScanArrowIPCBind, ArrowScanInitGlobal, ArrowScanInitLocal); 57 | 58 | scan_arrow_ipc_func.cardinality = ArrowScanCardinality; 59 | scan_arrow_ipc_func.projection_pushdown = true; 60 | scan_arrow_ipc_func.filter_pushdown = false; 61 | scan_arrow_ipc_func.filter_prune = false; 62 | 63 | return scan_arrow_ipc_func; 64 | } 65 | }; 66 | 67 | void ScanArrowIPC::RegisterReadArrowStream(DatabaseInstance& db) { 68 | auto function = ScanArrowIPCFunction::Function(); 69 | ExtensionUtil::RegisterFunction(db, function); 70 | } 71 | 72 | } // namespace ext_nanoarrow 73 | } // namespace duckdb 74 | -------------------------------------------------------------------------------- /src/writer/arrow_stream_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/arrow_stream_writer.hpp" 2 | namespace duckdb { 3 | 4 | namespace ext_nanoarrow { 5 | 6 | ArrowStreamWriter::ArrowStreamWriter(ClientContext& context, FileSystem& fs, 7 | const string& file_path, 8 | const vector& logical_types, 9 | const vector& column_names, 10 | const vector>& metadata) 11 | : options(context.GetClientProperties()), 12 | allocator(BufferAllocator::Get(context)), 13 | serializer(options, allocator), 14 | file_name(file_path), 15 | logical_types(logical_types) { 16 | InitSchema(logical_types, column_names, metadata); 17 | InitOutputFile(fs, file_path); 18 | } 19 | 20 | void ArrowStreamWriter::InitSchema(const vector& logical_types, 21 
| const vector& column_names, 22 | const vector>& metadata) { 23 | nanoarrow::UniqueSchema tmp_schema; 24 | ArrowConverter::ToArrowSchema(tmp_schema.get(), logical_types, column_names, options); 25 | 26 | if (metadata.empty()) { 27 | ArrowSchemaMove(tmp_schema.get(), schema.get()); 28 | } else { 29 | nanoarrow::UniqueBuffer metadata_packed; 30 | NANOARROW_THROW_NOT_OK( 31 | ArrowMetadataBuilderInit(metadata_packed.get(), tmp_schema->metadata)); 32 | ArrowStringView key{}; 33 | ArrowStringView value{}; 34 | for (const auto& item : metadata) { 35 | key = {item.first.data(), static_cast(item.first.size())}; 36 | value = {item.second.data(), static_cast(item.second.size())}; 37 | NANOARROW_THROW_NOT_OK( 38 | ArrowMetadataBuilderAppend(metadata_packed.get(), key, value)); 39 | } 40 | 41 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(tmp_schema.get(), schema.get())); 42 | NANOARROW_THROW_NOT_OK(ArrowSchemaSetMetadata( 43 | schema.get(), reinterpret_cast(metadata_packed->data))); 44 | } 45 | 46 | serializer.Init(schema.get(), logical_types); 47 | } 48 | 49 | void ArrowStreamWriter::InitOutputFile(FileSystem& fs, const string& file_path) { 50 | writer = make_uniq( 51 | fs, file_path.c_str(), 52 | FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE_NEW); 53 | } 54 | 55 | void ArrowStreamWriter::WriteSchema() { 56 | serializer.SerializeSchema(); 57 | serializer.Flush(*writer); 58 | } 59 | 60 | unique_ptr ArrowStreamWriter::NewSerializer() { 61 | auto serializer = make_uniq(options, allocator); 62 | serializer->Init(schema.get(), logical_types); 63 | return serializer; 64 | } 65 | 66 | void ArrowStreamWriter::Flush(ColumnDataCollection& buffer) { 67 | serializer.Serialize(buffer); 68 | buffer.Reset(); 69 | serializer.Flush(*writer); 70 | ++row_group_count; 71 | } 72 | 73 | void ArrowStreamWriter::Flush(ColumnDataCollectionSerializer& serializer) { 74 | serializer.Flush(*writer); 75 | ++row_group_count; 76 | } 77 | 78 | void ArrowStreamWriter::Finalize() const { 79 | uint8_t end_of_stream[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}; 80 | writer->WriteData(end_of_stream, sizeof(end_of_stream)); 81 | writer->Close(); 82 | } 83 | 84 | idx_t ArrowStreamWriter::NumberOfRowGroups() const { return row_group_count; } 85 | 86 | idx_t ArrowStreamWriter::FileSize() const { return writer->GetTotalWritten(); } 87 | 88 | } // namespace ext_nanoarrow 89 | } // namespace duckdb 90 | -------------------------------------------------------------------------------- /src/writer/column_data_collection_serializer.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/column_data_collection_serializer.hpp" 2 | 3 | #include 4 | namespace duckdb { 5 | 6 | namespace ext_nanoarrow { 7 | 8 | // Initialize buffer whose realloc operations go through DuckDB's memory 9 | // accounting. Note that the Allocator must outlive the buffer (true for 10 | // the case of this writer, but maybe not true for generic production of 11 | // ArrowArrays whose lifetime might outlive the connection/database). 
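// Concretely, the function below points nanoarrow's ArrowBufferAllocator
// callbacks (reallocate/free) at the DuckDB Allocator passed via private_data,
// so every buffer resize and free performed by the IPC encoder is routed
// through, and accounted for by, DuckDB's allocator.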
12 | inline void InitArrowDuckBuffer(ArrowBuffer* buffer, Allocator& duck_allocator) { 13 | ArrowBufferInit(buffer); 14 | 15 | buffer->allocator.reallocate = [](ArrowBufferAllocator* allocator, uint8_t* ptr, 16 | int64_t old_size, int64_t new_size) -> uint8_t* { 17 | NANOARROW_DCHECK(allocator->private_data != nullptr); 18 | auto duck_allocator = static_cast(allocator->private_data); 19 | if (ptr == nullptr && new_size > 0) { 20 | return duck_allocator->AllocateData(new_size); 21 | } else if (new_size == 0) { 22 | duck_allocator->FreeData(ptr, old_size); 23 | return nullptr; 24 | } else { 25 | return duck_allocator->ReallocateData(ptr, old_size, new_size); 26 | } 27 | }; 28 | 29 | buffer->allocator.free = [](ArrowBufferAllocator* allocator, uint8_t* ptr, 30 | int64_t old_size) { 31 | NANOARROW_DCHECK(allocator->private_data != nullptr); 32 | auto duck_allocator = static_cast(allocator->private_data); 33 | duck_allocator->FreeData(ptr, old_size); 34 | }; 35 | 36 | buffer->allocator.private_data = &duck_allocator; 37 | } 38 | 39 | ColumnDataCollectionSerializer::ColumnDataCollectionSerializer(ClientProperties options, 40 | Allocator& allocator) 41 | : options(std::move(options)), allocator(allocator) {} 42 | 43 | void ColumnDataCollectionSerializer::Init(const ArrowSchema* schema_p, 44 | const vector& logical_types) { 45 | InitArrowDuckBuffer(header.get(), allocator); 46 | InitArrowDuckBuffer(body.get(), allocator); 47 | NANOARROW_THROW_NOT_OK(ArrowIpcEncoderInit(encoder.get())); 48 | THROW_NOT_OK(InternalException, &error, 49 | ArrowArrayViewInitFromSchema(chunk_view.get(), schema_p, &error)); 50 | 51 | schema = schema_p; 52 | 53 | extension_types = 54 | ArrowTypeExtensionData::GetExtensionTypes(*options.client_context, logical_types); 55 | } 56 | 57 | void ColumnDataCollectionSerializer::SerializeSchema() { 58 | header->size_bytes = 0; 59 | body->size_bytes = 0; 60 | THROW_NOT_OK(InternalException, &error, 61 | ArrowIpcEncoderEncodeSchema(encoder.get(), schema, &error)); 62 | NANOARROW_THROW_NOT_OK( 63 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 64 | } 65 | 66 | idx_t ColumnDataCollectionSerializer::Serialize(ArrowArray& array) { 67 | header->size_bytes = 0; 68 | body->size_bytes = 0; 69 | 70 | THROW_NOT_OK(duckdb::InternalException, &error, 71 | ArrowArrayViewSetArray(chunk_view.get(), &array, &error)); 72 | THROW_NOT_OK(InternalException, &error, 73 | ArrowIpcEncoderEncodeSimpleRecordBatch(encoder.get(), chunk_view.get(), 74 | body.get(), &error)); 75 | NANOARROW_THROW_NOT_OK( 76 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 77 | 78 | return 1; 79 | } 80 | idx_t ColumnDataCollectionSerializer::Serialize(DataChunk& chunk) { 81 | header->size_bytes = 0; 82 | body->size_bytes = 0; 83 | chunk_arrow.reset(); 84 | 85 | ArrowConverter::ToArrowArray(chunk, chunk_arrow.get(), options, extension_types); 86 | THROW_NOT_OK(duckdb::InternalException, &error, 87 | ArrowArrayViewSetArray(chunk_view.get(), chunk_arrow.get(), &error)); 88 | THROW_NOT_OK(InternalException, &error, 89 | ArrowIpcEncoderEncodeSimpleRecordBatch(encoder.get(), chunk_view.get(), 90 | body.get(), &error)); 91 | NANOARROW_THROW_NOT_OK( 92 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 93 | 94 | return 1; 95 | } 96 | 97 | idx_t ColumnDataCollectionSerializer::Serialize(const ColumnDataCollection& buffer) { 98 | header->size_bytes = 0; 99 | body->size_bytes = 0; 100 | if (buffer.Count() == 0) { 101 | return 0; 102 | } 103 | // The ArrowConverter requires all of 
this to be in one big DataChunk. 104 | // It would be better to append these one at a time using other DuckDB 105 | // internals like the ArrowAppender. (Possibly better would be to skip the 106 | // owning ArrowArray entirely and just expose an ArrowArrayView of the 107 | // chunk. keeping track of any owning elements that had to be allocated, 108 | // since that's all that is strictly required to write). 109 | DataChunk chunk; 110 | chunk.Initialize(allocator, buffer.Types(), buffer.Count()); 111 | for (const auto& item : buffer.Chunks()) { 112 | chunk.Append(item, true); 113 | } 114 | return Serialize(chunk); 115 | } 116 | 117 | void ColumnDataCollectionSerializer::Flush(BufferedFileWriter& writer) { 118 | writer.WriteData(header->data, header->size_bytes); 119 | writer.WriteData(body->data, body->size_bytes); 120 | } 121 | nanoarrow::UniqueBuffer ColumnDataCollectionSerializer::GetHeader() { 122 | auto result_header = std::move(header); 123 | InitArrowDuckBuffer(header.get(), allocator); 124 | return result_header; 125 | } 126 | nanoarrow::UniqueBuffer ColumnDataCollectionSerializer::GetBody() { 127 | auto result_body = std::move(body); 128 | InitArrowDuckBuffer(body.get(), allocator); 129 | return result_body; 130 | } 131 | } // namespace ext_nanoarrow 132 | } // namespace duckdb 133 | -------------------------------------------------------------------------------- /src/writer/to_arrow_ipc.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/to_arrow_ipc.hpp" 2 | 3 | #include "duckdb/main/extension_util.hpp" 4 | 5 | #include "writer/column_data_collection_serializer.hpp" 6 | 7 | #include "duckdb/common/arrow/arrow_appender.hpp" 8 | #include "duckdb/function/function.hpp" 9 | #include "duckdb/function/table_function.hpp" 10 | 11 | #include "duckdb/main/client_context.hpp" 12 | 13 | namespace duckdb { 14 | 15 | namespace ext_nanoarrow { 16 | 17 | struct ToArrowIpcFunctionData : public TableFunctionData { 18 | ToArrowIpcFunctionData() = default; 19 | ArrowSchema schema{}; 20 | vector logical_types; 21 | const idx_t chunk_size = ToArrowIPCFunction::DEFAULT_CHUNK_SIZE * STANDARD_VECTOR_SIZE; 22 | }; 23 | 24 | struct ToArrowIpcGlobalState : public GlobalTableFunctionState { 25 | ToArrowIpcGlobalState() : sent_schema(false) {} 26 | atomic sent_schema; 27 | mutex lock; 28 | }; 29 | 30 | struct ToArrowIpcLocalState : public LocalTableFunctionState { 31 | unique_ptr appender; 32 | unique_ptr serializer; 33 | idx_t current_count = 0; 34 | bool checked_schema = false; 35 | }; 36 | 37 | unique_ptr ToArrowIPCFunction::InitLocal( 38 | ExecutionContext& context, TableFunctionInitInput& input, 39 | GlobalTableFunctionState* global_state) { 40 | auto local_state = make_uniq(); 41 | auto properties = context.client.GetClientProperties(); 42 | local_state->serializer = make_uniq( 43 | properties, BufferAllocator::Get(context.client)); 44 | return local_state; 45 | } 46 | 47 | unique_ptr ToArrowIPCFunction::InitGlobal( 48 | ClientContext& context, TableFunctionInitInput& input) { 49 | return make_uniq(); 50 | } 51 | 52 | unique_ptr ToArrowIPCFunction::Bind(ClientContext& context, 53 | TableFunctionBindInput& input, 54 | vector& return_types, 55 | vector& names) { 56 | auto result = make_uniq(); 57 | 58 | // Set return schema 59 | return_types.emplace_back(LogicalType::BLOB); 60 | names.emplace_back("ipc"); 61 | return_types.emplace_back(LogicalType::BOOLEAN); 62 | names.emplace_back("header"); 63 | 64 | // Create the Arrow schema 65 | auto properties = 
context.GetClientProperties(); 66 | result->logical_types = input.input_table_types; 67 | ArrowConverter::ToArrowSchema(&result->schema, input.input_table_types, 68 | input.input_table_names, properties); 69 | return std::move(result); 70 | } 71 | 72 | void SerializeArray(const ToArrowIpcLocalState& local_state, 73 | nanoarrow::UniqueBuffer& arrow_serialized_ipc_buffer) { 74 | ArrowArray arr = local_state.appender->Finalize(); 75 | local_state.serializer->Serialize(arr); 76 | arrow_serialized_ipc_buffer = local_state.serializer->GetHeader(); 77 | auto body = local_state.serializer->GetBody(); 78 | idx_t ipc_buffer_size = arrow_serialized_ipc_buffer->size_bytes; 79 | arrow_serialized_ipc_buffer->data = arrow_serialized_ipc_buffer->allocator.reallocate( 80 | &arrow_serialized_ipc_buffer->allocator, arrow_serialized_ipc_buffer->data, 81 | static_cast(ipc_buffer_size), 82 | static_cast(ipc_buffer_size + body->size_bytes)); 83 | arrow_serialized_ipc_buffer->size_bytes += body->size_bytes; 84 | arrow_serialized_ipc_buffer->capacity_bytes += body->size_bytes; 85 | memcpy(arrow_serialized_ipc_buffer->data + ipc_buffer_size, body->data, 86 | body->size_bytes); 87 | } 88 | 89 | void InsertMessageToChunk(nanoarrow::UniqueBuffer& arrow_serialized_ipc_buffer, 90 | DataChunk& output) { 91 | const auto ptr = reinterpret_cast(arrow_serialized_ipc_buffer->data); 92 | const auto len = arrow_serialized_ipc_buffer->size_bytes; 93 | const auto wrapped_buffer = 94 | make_buffer(std::move(arrow_serialized_ipc_buffer)); 95 | auto& vector = output.data[0]; 96 | StringVector::AddBuffer(vector, wrapped_buffer); 97 | const auto data_ptr = reinterpret_cast(vector.GetData()); 98 | *data_ptr = string_t(ptr, len); 99 | output.SetCardinality(1); 100 | output.Verify(); 101 | } 102 | 103 | OperatorResultType ToArrowIPCFunction::Function(ExecutionContext& context, 104 | TableFunctionInput& data_p, 105 | DataChunk& input, DataChunk& output) { 106 | nanoarrow::UniqueBuffer arrow_serialized_ipc_buffer; 107 | auto& data = data_p.bind_data->Cast(); 108 | auto& local_state = data_p.local_state->Cast(); 109 | auto& global_state = data_p.global_state->Cast(); 110 | 111 | bool sending_schema = false; 112 | 113 | bool caching_disabled = !PhysicalOperator::OperatorCachingAllowed(context); 114 | local_state.serializer->Init(&data.schema, data.logical_types); 115 | 116 | if (!local_state.checked_schema) { 117 | if (!global_state.sent_schema) { 118 | lock_guard init_lock(global_state.lock); 119 | if (!global_state.sent_schema) { 120 | // This run will send the schema, other threads can just send the 121 | // buffers 122 | global_state.sent_schema = true; 123 | sending_schema = true; 124 | } 125 | } 126 | local_state.checked_schema = true; 127 | } 128 | 129 | if (sending_schema) { 130 | local_state.serializer->SerializeSchema(); 131 | arrow_serialized_ipc_buffer = local_state.serializer->GetHeader(); 132 | output.data[1].SetValue(0, Value::BOOLEAN(true)); 133 | } else { 134 | if (!local_state.appender) { 135 | local_state.appender = make_uniq( 136 | input.GetTypes(), data.chunk_size, context.client.GetClientProperties(), 137 | ArrowTypeExtensionData::GetExtensionTypes(context.client, input.GetTypes())); 138 | } 139 | 140 | // Append input chunk 141 | local_state.appender->Append(input, 0, input.size(), input.size()); 142 | local_state.current_count += input.size(); 143 | 144 | // If chunk size is reached, we can flush to IPC blob 145 | if (caching_disabled || local_state.current_count >= data.chunk_size) { 146 | SerializeArray(local_state, 
arrow_serialized_ipc_buffer); 147 | // Reset appender 148 | local_state.appender.reset(); 149 | local_state.current_count = 0; 150 | 151 | // This is a data message, hence we set the second column to false 152 | output.data[1].SetValue(0, Value::BOOLEAN(false)); 153 | } else { 154 | return OperatorResultType::NEED_MORE_INPUT; 155 | } 156 | } 157 | InsertMessageToChunk(arrow_serialized_ipc_buffer, output); 158 | if (sending_schema) { 159 | return OperatorResultType::HAVE_MORE_OUTPUT; 160 | } else { 161 | return OperatorResultType::NEED_MORE_INPUT; 162 | } 163 | } 164 | 165 | OperatorFinalizeResultType ToArrowIPCFunction::FunctionFinal(ExecutionContext& context, 166 | TableFunctionInput& data_p, 167 | DataChunk& output) { 168 | auto& local_state = data_p.local_state->Cast(); 169 | 170 | if (local_state.appender) { 171 | // If we have an appender, we serialize the array into a message and insert it to the 172 | // chunk 173 | nanoarrow::UniqueBuffer arrow_serialized_ipc_buffer; 174 | SerializeArray(local_state, arrow_serialized_ipc_buffer); 175 | InsertMessageToChunk(arrow_serialized_ipc_buffer, output); 176 | 177 | // This is always a data message, so we set the second column to false. 178 | output.data[1].SetValue(0, Value::BOOLEAN(false)); 179 | } 180 | 181 | return OperatorFinalizeResultType::FINISHED; 182 | } 183 | 184 | TableFunction ToArrowIPCFunction::GetFunction() { 185 | TableFunction fun("to_arrow_ipc", {LogicalType::TABLE}, nullptr, Bind, InitGlobal, 186 | InitLocal); 187 | fun.in_out_function = Function; 188 | fun.in_out_function_final = FunctionFinal; 189 | return fun; 190 | } 191 | 192 | void ToArrowIPCFunction::RegisterToIPCFunction(DatabaseInstance& db) { 193 | const auto function = GetFunction(); 194 | ExtensionUtil::RegisterFunction(db, function); 195 | } 196 | } // namespace ext_nanoarrow 197 | } // namespace duckdb 198 | -------------------------------------------------------------------------------- /src/writer/write_arrow_stream.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "write_arrow_stream.hpp" 3 | 4 | #include "duckdb/common/multi_file/multi_file_function.hpp" 5 | #include "file_scanner/arrow_multi_file_info.hpp" 6 | 7 | #include "duckdb/common/arrow/arrow_converter.hpp" 8 | #include "duckdb/common/serializer/buffered_file_writer.hpp" 9 | #include "duckdb/function/copy_function.hpp" 10 | #include "duckdb/main/extension_util.hpp" 11 | 12 | #include "nanoarrow/nanoarrow_ipc.hpp" 13 | 14 | #include "nanoarrow_errors.hpp" 15 | #include "table_function/read_arrow.hpp" 16 | #include "writer/arrow_stream_writer.hpp" 17 | 18 | namespace duckdb { 19 | 20 | namespace ext_nanoarrow { 21 | 22 | namespace { 23 | 24 | struct ArrowWriteBindData : public TableFunctionData { 25 | vector sql_types; 26 | vector column_names; 27 | vector> kv_metadata; 28 | // Storage::ROW_GROUP_SIZE (122880), which seems to be the default 29 | // for Parquet, is higher than the usual number used in IPC writers (65536). 30 | // Using a value of 65536 results in fairly bad performance for the use 31 | // case of "write it all then read it all" (at the expense of not being as 32 | // useful for streaming). 
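  // The default below can be overridden per COPY through the options parsed in
  // ArrowWriteBind further down; an illustrative (assumed) invocation:
  //   COPY tbl TO 'out.arrows' (FORMAT arrows, ROW_GROUP_SIZE 65536);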
33 | idx_t row_group_size = 122880; 34 | bool row_group_size_set = false; 35 | optional_idx row_groups_per_file; 36 | static constexpr const idx_t BYTES_PER_ROW = 1024; 37 | idx_t row_group_size_bytes{}; 38 | }; 39 | 40 | struct ArrowWriteGlobalState : public GlobalFunctionData { 41 | unique_ptr writer; 42 | }; 43 | 44 | struct ArrowWriteLocalState : public LocalFunctionData { 45 | explicit ArrowWriteLocalState(ClientContext& context, const vector& types) 46 | : buffer(context, types, ColumnDataAllocatorType::HYBRID) { 47 | buffer.InitializeAppend(append_state); 48 | } 49 | 50 | ColumnDataCollection buffer; 51 | ColumnDataAppendState append_state; 52 | }; 53 | 54 | unique_ptr ArrowWriteBind(ClientContext& context, 55 | CopyFunctionBindInput& input, 56 | const vector& names, 57 | const vector& sql_types) { 58 | D_ASSERT(names.size() == sql_types.size()); 59 | auto bind_data = make_uniq(); 60 | bool row_group_size_bytes_set = false; 61 | 62 | for (auto& option : input.info.options) { 63 | const auto loption = StringUtil::Lower(option.first); 64 | if (option.second.size() != 1) { 65 | // All Arrow write options require exactly one argument 66 | throw BinderException("%s requires exactly one argument", 67 | StringUtil::Upper(loption)); 68 | } 69 | 70 | if (loption == "row_group_size" || loption == "chunk_size") { 71 | if (bind_data->row_group_size_set) { 72 | throw BinderException( 73 | "ROW_GROUP_SIZE and ROW_GROUP_SIZE_BYTES are mutually exclusive"); 74 | } 75 | bind_data->row_group_size = option.second[0].GetValue(); 76 | bind_data->row_group_size_set = true; 77 | } else if (loption == "row_group_size_bytes") { 78 | auto roption = option.second[0]; 79 | if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) { 80 | bind_data->row_group_size_bytes = DBConfig::ParseMemoryLimit(roption.ToString()); 81 | } else { 82 | bind_data->row_group_size_bytes = option.second[0].GetValue(); 83 | } 84 | row_group_size_bytes_set = true; 85 | } else if (loption == "row_groups_per_file") { 86 | bind_data->row_groups_per_file = option.second[0].GetValue(); 87 | } else if (loption == "kv_metadata") { 88 | auto& kv_struct = option.second[0]; 89 | auto& kv_struct_type = kv_struct.type(); 90 | if (kv_struct_type.id() != LogicalTypeId::STRUCT) { 91 | throw BinderException("Expected kv_metadata argument to be a STRUCT"); 92 | } 93 | auto values = StructValue::GetChildren(kv_struct); 94 | for (idx_t i = 0; i < values.size(); i++) { 95 | const auto& value = values[i]; 96 | auto key = StructType::GetChildName(kv_struct_type, i); 97 | // If the value is a blob, write the raw blob bytes 98 | // otherwise, cast to string 99 | if (value.type().id() == LogicalTypeId::BLOB) { 100 | bind_data->kv_metadata.emplace_back(key, StringValue::Get(value)); 101 | } else { 102 | bind_data->kv_metadata.emplace_back(key, value.ToString()); 103 | } 104 | } 105 | } 106 | } 107 | 108 | if (row_group_size_bytes_set) { 109 | if (DBConfig::GetConfig(context).options.preserve_insertion_order) { 110 | throw BinderException( 111 | "ROW_GROUP_SIZE_BYTES does not work while preserving insertion order. 
Use " 112 | "\"SET preserve_insertion_order=false;\" to disable preserving insertion " 113 | "order."); 114 | } 115 | } else { 116 | // We always set a max row group size bytes so we don't use too much memory 117 | bind_data->row_group_size_bytes = 118 | bind_data->row_group_size * ArrowWriteBindData::BYTES_PER_ROW; 119 | } 120 | 121 | bind_data->sql_types = sql_types; 122 | bind_data->column_names = names; 123 | 124 | return std::move(bind_data); 125 | } 126 | 127 | unique_ptr ArrowWriteInitializeGlobal(ClientContext& context, 128 | FunctionData& bind_data, 129 | const string& file_path) { 130 | auto global_state = make_uniq(); 131 | auto& arrow_bind = bind_data.Cast(); 132 | 133 | auto& fs = FileSystem::GetFileSystem(context); 134 | global_state->writer = 135 | make_uniq(context, fs, file_path, arrow_bind.sql_types, 136 | arrow_bind.column_names, arrow_bind.kv_metadata); 137 | global_state->writer->WriteSchema(); 138 | return std::move(global_state); 139 | } 140 | 141 | void ArrowWriteSink(ExecutionContext& context, FunctionData& bind_data_p, 142 | GlobalFunctionData& gstate, LocalFunctionData& lstate, 143 | DataChunk& input) { 144 | auto& bind_data = bind_data_p.Cast(); 145 | auto& global_state = gstate.Cast(); 146 | auto& local_state = lstate.Cast(); 147 | 148 | // append data to the local (buffered) chunk collection 149 | local_state.buffer.Append(local_state.append_state, input); 150 | 151 | if (local_state.buffer.Count() >= bind_data.row_group_size || 152 | local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes) { 153 | // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the 154 | // Arrow file 155 | local_state.append_state.current_chunk_state.handles.clear(); 156 | global_state.writer->Flush(local_state.buffer); 157 | local_state.buffer.InitializeAppend(local_state.append_state); 158 | } 159 | } 160 | 161 | void ArrowWriteCombine(ExecutionContext& context, FunctionData& bind_data, 162 | GlobalFunctionData& gstate, LocalFunctionData& lstate) { 163 | auto& global_state = gstate.Cast(); 164 | auto& local_state = lstate.Cast(); 165 | // flush any data left in the local state to the file 166 | global_state.writer->Flush(local_state.buffer); 167 | } 168 | 169 | void ArrowWriteFinalize(ClientContext& context, FunctionData& bind_data, 170 | GlobalFunctionData& gstate) { 171 | auto& global_state = gstate.Cast(); 172 | // finalize: write any additional metadata to the file here 173 | global_state.writer->Finalize(); 174 | } 175 | 176 | unique_ptr ArrowWriteInitializeLocal(ExecutionContext& context, 177 | FunctionData& bind_data_p) { 178 | auto& bind_data = bind_data_p.Cast(); 179 | return make_uniq(context.client, bind_data.sql_types); 180 | } 181 | 182 | CopyFunctionExecutionMode ArrowWriteExecutionMode(bool preserve_insertion_order, 183 | bool supports_batch_index) { 184 | if (!preserve_insertion_order) { 185 | return CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE; 186 | } 187 | if (supports_batch_index) { 188 | return CopyFunctionExecutionMode::BATCH_COPY_TO_FILE; 189 | } 190 | return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE; 191 | } 192 | 193 | idx_t ArrowWriteDesiredBatchSize(ClientContext& context, FunctionData& bind_data_p) { 194 | auto& bind_data = bind_data_p.Cast(); 195 | return bind_data.row_group_size; 196 | } 197 | 198 | bool ArrowWriteRotateFiles(FunctionData& bind_data_p, 199 | const optional_idx& file_size_bytes) { 200 | auto& bind_data = bind_data_p.Cast(); 201 | return file_size_bytes.IsValid() || 
bind_data.row_groups_per_file.IsValid(); 202 | } 203 | 204 | bool ArrowWriteRotateNextFile(GlobalFunctionData& gstate, FunctionData& bind_data_p, 205 | const optional_idx& file_size_bytes) { 206 | auto& global_state = gstate.Cast(); 207 | auto& bind_data = bind_data_p.Cast(); 208 | if (file_size_bytes.IsValid() && 209 | global_state.writer->FileSize() > file_size_bytes.GetIndex()) { 210 | return true; 211 | } 212 | 213 | if (bind_data.row_groups_per_file.IsValid() && 214 | global_state.writer->NumberOfRowGroups() >= 215 | bind_data.row_groups_per_file.GetIndex()) { 216 | return true; 217 | } 218 | return false; 219 | } 220 | 221 | struct ArrowWriteBatchData : public PreparedBatchData { 222 | unique_ptr serializer; 223 | }; 224 | 225 | // This is called concurrently for large writes so it can't interact with the 226 | // writer except to read information needed to initialize. 227 | unique_ptr ArrowWritePrepareBatch( 228 | ClientContext& context, FunctionData& bind_data, GlobalFunctionData& gstate, 229 | unique_ptr collection) { 230 | auto& global_state = gstate.Cast(); 231 | 232 | auto batch = make_uniq(); 233 | batch->serializer = global_state.writer->NewSerializer(); 234 | batch->serializer->Serialize(*collection); 235 | collection->Reset(); 236 | 237 | return std::move(batch); 238 | } 239 | 240 | void ArrowWriteFlushBatch(ClientContext& context, FunctionData& bind_data, 241 | GlobalFunctionData& gstate, PreparedBatchData& batch_p) { 242 | auto& global_state = gstate.Cast(); 243 | auto& batch = batch_p.Cast(); 244 | global_state.writer->Flush(*batch.serializer); 245 | } 246 | 247 | } // namespace 248 | 249 | void RegisterArrowStreamCopyFunction(DatabaseInstance& db) { 250 | CopyFunction function("arrows"); 251 | function.copy_to_bind = ArrowWriteBind; 252 | function.copy_to_initialize_global = ArrowWriteInitializeGlobal; 253 | function.copy_to_initialize_local = ArrowWriteInitializeLocal; 254 | function.copy_to_sink = ArrowWriteSink; 255 | function.copy_to_combine = ArrowWriteCombine; 256 | function.copy_to_finalize = ArrowWriteFinalize; 257 | function.execution_mode = ArrowWriteExecutionMode; 258 | function.copy_from_bind = MultiFileFunction::MultiFileBindCopy; 259 | function.copy_from_function = ReadArrowStreamFunction(); 260 | function.prepare_batch = ArrowWritePrepareBatch; 261 | function.flush_batch = ArrowWriteFlushBatch; 262 | function.desired_batch_size = ArrowWriteDesiredBatchSize; 263 | function.rotate_files = ArrowWriteRotateFiles; 264 | function.rotate_next_file = ArrowWriteRotateNextFile; 265 | 266 | function.extension = "arrows"; 267 | ExtensionUtil::RegisterFunction(db, function); 268 | 269 | function.name = "arrow"; 270 | function.extension = "arrow"; 271 | ExtensionUtil::RegisterFunction(db, function); 272 | } 273 | 274 | } // namespace ext_nanoarrow 275 | } // namespace duckdb 276 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | 3 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). 4 | 5 | The root makefile contains targets to build and run all of these tests. 
To run the SQLLogicTests: 6 | 7 | ```bash 8 | make test 9 | # or make test_debug 10 | ``` 11 | 12 | If you're using CMake + VSCode, you can run 13 | 14 | ``` shell 15 | ./test_local.sh 16 | ``` 17 | 18 | The test data is generated with: 19 | 20 | ```python 21 | import nanoarrow as na 22 | from nanoarrow import ipc 23 | 24 | url = "https://github.com/apache/arrow-experiments/raw/refs/heads/main/data/arrow-commits/arrow-commits.arrows" 25 | with ipc.StreamWriter.from_path("data/test.arrows") as writer: 26 | writer.write_stream(na.ArrayStream.from_url(url)) 27 | ``` 28 | -------------------------------------------------------------------------------- /test/nodejs/arrow_test.js: -------------------------------------------------------------------------------- 1 | var arrow = require('apache-arrow') 2 | var duckdb = require('duckdb'); 3 | var assert = require('assert'); 4 | // import { RecordBatchReader } from "apache-arrow"; 5 | 6 | 7 | const parquet_file_path = "data/parquet-testing/lineitem_sf0_01.parquet"; 8 | 9 | // Wrapper for tests, materializes whole stream 10 | const arrow_ipc_stream = async (conn, sql) => { 11 | const result_stream = await conn.arrowIPCStream(sql); 12 | return await result_stream.toArray(); 13 | } 14 | 15 | // Wrapper for tests 16 | const arrow_ipc_materialized = async (conn, sql) => { 17 | return await new Promise((resolve, reject) => { 18 | conn.arrowIPCAll(sql, function (err, result) { 19 | if (err) { 20 | reject(err) 21 | } 22 | 23 | resolve(result); 24 | }) 25 | }); 26 | } 27 | 28 | const to_ipc_functions = { 29 | 'streaming': arrow_ipc_stream, 30 | 'materialized': arrow_ipc_materialized, 31 | } 32 | 33 | function getDatabase() { 34 | return new duckdb.Database(':memory:', {"allow_unsigned_extensions":"true"}); 35 | } 36 | 37 | 38 | // Stream results by getting an arrowIPCStream, then iterating with an arrow RecordBatchReader 39 | const streamResults = async (con, sql) => { 40 | const results = []; 41 | for await (const batch of await arrow.RecordBatchReader.from( 42 | await con.arrowIPCStream(sql) 43 | )) { 44 | for (const row of batch) { 45 | const result = {}; 46 | for (const [field, val] of row) { 47 | result[field] = val; 48 | } 49 | results.push(result); 50 | } 51 | } 52 | return results; 53 | }; 54 | 55 | function getConnection(db, done) { 56 | let conn = new duckdb.Connection(db); 57 | // Makes CI life a bit easier 58 | conn.exec(`SET allow_extensions_metadata_mismatch=true;`, function (err) { 59 | if (err) throw err; 60 | }); 61 | conn.exec(`LOAD '${process.env.ARROW_EXTENSION_BINARY_PATH}';`, function (err) { 62 | if (err) throw err; 63 | done(); 64 | }); 65 | return conn 66 | } 67 | 68 | describe(`Arrow IPC`, () => { 69 | let db; 70 | let conn; 71 | before((done) => { 72 | db = getDatabase(); 73 | conn = getConnection(db, () => done()) 74 | }); 75 | 76 | it(`Basic examples`, async () => { 77 | const range_size = 130000; 78 | const query = `SELECT * FROM range(0,${range_size}) tbl(i)`; 79 | const arrow_table_expected = new arrow.Table({ 80 | i: new arrow.Vector([arrow.makeData({ type: new arrow.Int32, data: Array.from(new Array(range_size), (x, i) => i) })]), 81 | }); 82 | 83 | // Can use Arrow to read from stream directly 84 | const result_stream = await db.arrowIPCStream(query); 85 | const reader = await arrow.RecordBatchReader.from(result_stream); 86 | const table = await arrow.tableFromIPC(reader); 87 | const array_from_arrow = table.toArray(); 88 | assert.deepEqual(array_from_arrow, arrow_table_expected.toArray()); 89 | 90 | // Can also fully 
materialize stream first, then pass to Arrow 91 | const result_stream2 = await db.arrowIPCStream(query); 92 | const reader2 = await arrow.RecordBatchReader.from(result_stream2.toArray()); 93 | const table2 = await arrow.tableFromIPC(reader2); 94 | const array_from_arrow2 = table2.toArray(); 95 | assert.deepEqual(array_from_arrow2, arrow_table_expected.toArray()); 96 | 97 | // Can also fully materialize in DuckDB first (allowing parallel execution) 98 | const result_materialized = await new Promise((resolve, reject) => { 99 | db.arrowIPCAll(query, function (err, result) { 100 | if (err) { 101 | reject(err) 102 | } 103 | 104 | resolve(result); 105 | }) 106 | }); 107 | 108 | const reader3 = await arrow.RecordBatchReader.from(result_materialized); 109 | const table3 = await arrow.tableFromIPC(reader3); 110 | const array_from_arrow3 = table3.toArray(); 111 | assert.deepEqual(array_from_arrow3, arrow_table_expected.toArray()); 112 | 113 | // Scanning materialized IPC buffers from DuckDB 114 | db.register_buffer("ipc_table", result_materialized, true); 115 | await new Promise((resolve, reject) => { 116 | db.arrowIPCAll(`SELECT * FROM ipc_table`, function (err, result) { 117 | if (err) { 118 | reject(err); 119 | } 120 | 121 | assert.deepEqual(result, result_materialized); 122 | resolve() 123 | }); 124 | }); 125 | }); 126 | 127 | // Ensure we handle empty result properly 128 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 129 | it(`Empty results (${name})`, async () => { 130 | const range_size = 130000; 131 | const query = `SELECT * FROM range(0,${range_size}) tbl(i) where i > ${range_size}`; 132 | 133 | let ipc_buffers = await fun(conn, query); 134 | const reader = await arrow.RecordBatchReader.from(ipc_buffers); 135 | const table = await arrow.tableFromIPC(reader); 136 | const arr = table.toArray(); 137 | assert.deepEqual(arr, []); 138 | }); 139 | } 140 | }) 141 | 142 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 143 | describe(`DuckDB <-> Arrow IPC (${name})`, () => { 144 | const total = 1000; 145 | 146 | let db; 147 | let conn; 148 | before((done) => { 149 | db = getDatabase(); 150 | conn = getConnection(db, () => done()) 151 | }); 152 | 153 | it(`Buffers are not garbage collected`, async () => { 154 | let ipc_buffers = await fun(conn, 'SELECT * FROM range(1001, 2001) tbl(i)'); 155 | 156 | // Now to scan the buffer, we first need to register it 157 | conn.register_buffer(`ipc_table_${name}`, ipc_buffers, true); 158 | 159 | // Delete JS reference to arrays 160 | ipc_buffers = 0; 161 | 162 | // Run GC to ensure file is deleted 163 | if (global.gc) { 164 | global.gc(); 165 | } else { 166 | throw "should run with --expose-gc"; 167 | } 168 | 169 | // Spray memory overwriting hopefully old buffer 170 | let spray_results = []; 171 | for (let i = 0; i < 3000; i++) { 172 | spray_results.push(await fun(db, 'SELECT * FROM range(2001, 3001) tbl(i)')); 173 | } 174 | 175 | // Now we can query the ipc buffer using DuckDB by providing an object with an alias and the materialized ipc buffers 176 | await new Promise((resolve, reject) => { 177 | conn.all(`SELECT avg(i) as average, count(1) as total 178 | FROM ipc_table_${name};`, function (err, result) { 179 | if (err) { 180 | reject(err); 181 | } 182 | assert.deepEqual(result, [{average: 1500.5, total: 1000}]); 183 | resolve(); 184 | }); 185 | }); 186 | }); 187 | 188 | it(`Round-trip int column`, async () => { 189 | // Now we fetch the ipc stream object and construct the RecordBatchReader 190 | const ipc_buffers = await fun(db, 
'SELECT * FROM range(1001, 2001) tbl(i)'); 191 | 192 | // Now to scan the buffer, we first need to register it 193 | conn.register_buffer("ipc_table", ipc_buffers, true, (err) => { 194 | assert(!err); 195 | }); 196 | 197 | // Now we can query the ipc buffer using DuckDB by providing an object with an alias and the materialized ipc buffers 198 | await new Promise((resolve, reject) => { 199 | conn.all(`SELECT avg(i) as average, count(1) as total 200 | FROM ipc_table;`, function (err, result) { 201 | if (err) { 202 | reject(err) 203 | } 204 | assert.deepEqual(result, [{average: 1500.5, total: 1000}]); 205 | resolve(); 206 | }); 207 | }); 208 | }); 209 | 210 | 211 | it(`Joining 2 IPC buffers in DuckDB`, async () => { 212 | // Insert first table 213 | const ipc_buffers1 = await fun(db, 'SELECT * FROM range(1, 3) tbl(i)'); 214 | 215 | // Insert second table 216 | const ipc_buffers2 = await fun(db, 'SELECT * FROM range(2, 4) tbl(i)'); 217 | 218 | // Register buffers for scanning from DuckDB 219 | conn.register_buffer("table1", ipc_buffers1, true, (err) => { 220 | assert(!err); 221 | }); 222 | conn.register_buffer("table2", ipc_buffers2, true, (err) => { 223 | assert(!err); 224 | }); 225 | 226 | await new Promise((resolve, reject) => { 227 | conn.all(`SELECT * 228 | FROM table1 229 | JOIN table2 ON table1.i = table2.i;`, function (err, result) { 230 | if (err) { 231 | reject(err); 232 | } 233 | assert.deepEqual(result, [{i: 2}]); 234 | resolve() 235 | }); 236 | }); 237 | }); 238 | }) 239 | } 240 | 241 | describe('[Benchmark] Arrow IPC Single Int Column (50M tuples)',() => { 242 | // Config 243 | const column_size = 50*1000*1000; 244 | 245 | let db; 246 | let conn; 247 | 248 | before((done) => { 249 | db = getDatabase(); 250 | conn = getConnection(db, () => { 251 | conn.run("CREATE OR REPLACE TABLE test AS select * FROM range(0,?) 
tbl(i);", column_size, (err) => { 252 | if (err) throw err; 253 | done() 254 | }); 255 | }) 256 | }); 257 | 258 | it('DuckDB table -> DuckDB table', (done) => { 259 | conn.run('CREATE OR REPLACE TABLE copy_table AS SELECT * FROM test', (err) => { 260 | assert(!err); 261 | done(); 262 | }); 263 | }); 264 | 265 | it('DuckDB table -> Stream IPC buffer', async () => { 266 | const result = await conn.arrowIPCStream('SELECT * FROM test'); 267 | const ipc_buffers = await result.toArray(); 268 | const reader = await arrow.RecordBatchReader.from(ipc_buffers); 269 | const table = arrow.tableFromIPC(reader); 270 | assert.equal(table.numRows, column_size); 271 | }); 272 | 273 | it('DuckDB table -> Materialized IPC buffer', (done) => { 274 | conn.arrowIPCAll('SELECT * FROM test', (err,res) => { 275 | done(); 276 | }); 277 | }); 278 | }); 279 | 280 | describe('Buffer registration',() => { 281 | let db; 282 | let conn1; 283 | let conn2; 284 | 285 | before((done) => { 286 | db = new duckdb.Database(':memory:', {"allow_unsigned_extensions":"true"}); 287 | conn1 = new duckdb.Connection(db); 288 | conn2 = new duckdb.Connection(db); 289 | done(); 290 | }); 291 | 292 | before((done) => { 293 | db = getDatabase(); 294 | conn1 = getConnection(db, () => { 295 | conn2 = getConnection(db, () => done()); 296 | }) 297 | }); 298 | 299 | it('Buffers can only be overwritten with force flag', async () => { 300 | const arrow_buffer = await arrow_ipc_materialized(conn1, "SELECT 1337 as a"); 301 | 302 | conn1.register_buffer('arrow_buffer', arrow_buffer, true, (err) => { 303 | assert(!err); 304 | }) 305 | 306 | await new Promise((resolve, reject) => { 307 | try { 308 | conn1.register_buffer('arrow_buffer', arrow_buffer, false); 309 | reject("Expected query to fail"); 310 | } catch (err) { 311 | assert(err.message.includes('Buffer with this name already exists and force_register is not enabled')); 312 | resolve(); 313 | } 314 | }); 315 | }); 316 | 317 | it('Existing tables are silently shadowed by registered buffers', async () => { 318 | // Unregister, in case other test has registered this 319 | conn1.unregister_buffer('arrow_buffer', (err) => { 320 | assert(!err); 321 | }); 322 | 323 | conn1.run('CREATE OR REPLACE TABLE arrow_buffer AS SELECT 7 as a;', (err) => { 324 | assert(!err); 325 | }); 326 | 327 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 328 | assert(!err); 329 | assert.deepEqual(result, [{'a': 7}]); 330 | }); 331 | 332 | const arrow_buffer = await arrow_ipc_materialized(conn1, "SELECT 1337 as b"); 333 | 334 | conn1.register_buffer('arrow_buffer', arrow_buffer, true, (err) => { 335 | assert(!err); 336 | }) 337 | 338 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 339 | assert(!err); 340 | assert.deepEqual(result, [{'b': 1337}]); 341 | }); 342 | 343 | conn1.unregister_buffer('arrow_buffer', (err) => { 344 | assert(!err); 345 | }); 346 | 347 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 348 | assert(!err); 349 | assert.deepEqual(result, [{'a': 7}]); 350 | }); 351 | 352 | await new Promise((resolve, reject) => { 353 | // Cleanup 354 | conn1.run('DROP TABLE arrow_buffer;', (err) => { 355 | if (err) reject(err); 356 | resolve(); 357 | }); 358 | 359 | }); 360 | }); 361 | 362 | it('Registering buffers should only be visible within current connection', async () => { 363 | const arrow_buffer1 = await arrow_ipc_materialized(conn1, "SELECT 1337 as a"); 364 | const arrow_buffer2 = await arrow_ipc_materialized(conn2, "SELECT 42 as b"); 365 | 366 | 
conn1.register_buffer('arrow_buffer', arrow_buffer1, true, (err) => { 367 | assert(!err); 368 | }) 369 | conn2.register_buffer('arrow_buffer', arrow_buffer2, true, (err) => { 370 | assert(!err); 371 | }) 372 | 373 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 374 | assert(!err); 375 | assert.deepEqual(result, [{'a': 1337}]); 376 | }); 377 | 378 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 379 | assert(!err); 380 | assert.deepEqual(result, [{'b': 42}]); 381 | }); 382 | 383 | conn1 = 0; 384 | 385 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 386 | assert(!err); 387 | assert.deepEqual(result, [{'b': 42}]); 388 | }); 389 | 390 | conn2.unregister_buffer('arrow_buffer', (err) => { 391 | assert(!err); 392 | }) 393 | 394 | await new Promise((resolve, reject) => { 395 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 396 | if (!err) { 397 | reject("Expected error"); 398 | } 399 | assert(err.message.includes('Catalog Error: Table with name arrow_buffer does not exist!')); 400 | resolve(); 401 | }); 402 | }); 403 | }); 404 | }); 405 | 406 | describe(`Single Value IPC`, () => { 407 | let db; 408 | let conn; 409 | before((done) => { 410 | db = getDatabase(); 411 | conn = getConnection(db, () => done()) 412 | }); 413 | 414 | it('Try to read from query returtning one value', async () => { 415 | const sql = "select now() as t"; 416 | const result = await streamResults(conn, sql) 417 | assert.strictEqual(result.length, 1, "Expected exactly one row"); 418 | assert.strictEqual(Object.keys(result[0]).length, 1, "Expected exactly one field"); 419 | }); 420 | }); 421 | 422 | describe('[Benchmark] Arrow IPC TPC-H lineitem.parquet', () => { 423 | const sql = "SELECT sum(l_extendedprice * l_discount) AS revenue FROM lineitem WHERE l_shipdate >= CAST('1994-01-01' AS date) AND l_shipdate < CAST('1995-01-01' AS date) AND l_discount BETWEEN 0.05 AND 0.07 AND l_quantity < 24" 424 | const answer = [{revenue: 1193053.2253}]; 425 | 426 | let db; 427 | let conn; 428 | 429 | before((done) => { 430 | db = getDatabase(); 431 | conn = getConnection(db, () => done()) 432 | }); 433 | 434 | it('Parquet -> DuckDB Streaming-> Arrow IPC -> DuckDB Query', async () => { 435 | const ipc_buffers = await arrow_ipc_stream(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 436 | 437 | const query = sql.replace("lineitem", "my_arrow_ipc_stream"); 438 | conn.register_buffer("my_arrow_ipc_stream", ipc_buffers, true, (err) => { 439 | assert(!err); 440 | }); 441 | 442 | await new Promise((resolve, reject) => { 443 | conn.all(query, function (err, result) { 444 | if (err) { 445 | reject(err) 446 | } 447 | 448 | assert.deepEqual(result, answer); 449 | resolve(); 450 | }) 451 | }); 452 | }); 453 | 454 | it('Parquet -> DuckDB Materialized -> Arrow IPC -> DuckDB' , async () => { 455 | const ipc_buffers = await arrow_ipc_materialized(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 456 | 457 | const query = sql.replace("lineitem", "my_arrow_ipc_stream_2"); 458 | conn.register_buffer("my_arrow_ipc_stream_2", ipc_buffers, true, (err) => { 459 | assert(!err); 460 | }); 461 | 462 | await new Promise((resolve, reject) => { 463 | conn.all(query, function (err, result) { 464 | if (err) { 465 | reject(err) 466 | } else { 467 | assert.deepEqual(result, answer); 468 | resolve(); 469 | } 470 | }) 471 | }); 472 | }); 473 | 474 | it('Parquet -> DuckDB', async () => { 475 | await new Promise((resolve, reject) => { 476 | conn.run('CREATE OR REPLACE TABLE load_parquet_directly AS SELECT * FROM "' + 
parquet_file_path + '";', (err) => { 477 | if (err) { 478 | reject(err) 479 | } 480 | resolve() 481 | }); 482 | }); 483 | 484 | const query = sql.replace("lineitem", "load_parquet_directly"); 485 | 486 | const result = await new Promise((resolve, reject) => { 487 | conn.all(query, function (err, result) { 488 | if (err) { 489 | reject(err); 490 | } 491 | resolve(result) 492 | }); 493 | }); 494 | 495 | assert.deepEqual(result, answer); 496 | }); 497 | }); 498 | 499 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 500 | describe(`Arrow IPC TPC-H lineitem SF0.01 (${name})`, () => { 501 | // `table_name` in these queries will be replaced by either the parquet file directly, or the ipc buffer 502 | const queries = [ 503 | "select count(*) from table_name LIMIT 10", 504 | "select sum(l_orderkey) as sum_orderkey FROM table_name", 505 | "select * from table_name", 506 | "select l_orderkey from table_name WHERE l_orderkey=2 LIMIT 2", 507 | "select l_extendedprice from table_name", 508 | "select l_extendedprice from table_name WHERE l_extendedprice > 53468 and l_extendedprice < 53469 LIMIT 2", 509 | "select count(l_orderkey) from table_name where l_commitdate > '1996-10-28'", 510 | "SELECT sum(l_extendedprice * l_discount) AS revenue FROM table_name WHERE l_shipdate >= CAST('1994-01-01' AS date) AND l_shipdate < CAST('1995-01-01' AS date) AND l_discount BETWEEN 0.05 AND 0.07 AND l_quantity < 24" 511 | ]; 512 | 513 | let db; 514 | let conn; 515 | before((done) => { 516 | db = getDatabase(); 517 | conn = getConnection(db, () => done()) 518 | }); 519 | 520 | for (const query of queries) { 521 | it(` ${query}`, async () => { 522 | // First do query directly on parquet file 523 | const expected_value = await new Promise((resolve, reject) => { 524 | conn.all(query.replace("table_name", `'${parquet_file_path}'`), function (err, result) { 525 | if (err) { 526 | reject(err); 527 | } 528 | 529 | resolve(result); 530 | }); 531 | }); 532 | 533 | // Copy parquet file completely into Arrow IPC format 534 | const ipc_buffers = await fun(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 535 | 536 | // Register the ipc buffers as table in duckdb, using force to override the previously registered buffers 537 | conn.register_buffer("table_name", ipc_buffers, true, (err) => { 538 | assert(!err); 539 | }); 540 | 541 | await new Promise((resolve, reject) => { 542 | conn.all(query, function (err, result) { 543 | if (err) { 544 | reject(err) 545 | } 546 | 547 | assert.deepEqual(result, expected_value, `Query failed: ${query}`); 548 | resolve(); 549 | }) 550 | }); 551 | }); 552 | } 553 | }) 554 | } 555 | -------------------------------------------------------------------------------- /test/python/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import duckdb 4 | from typing import Union, Optional 5 | from duckdb import DuckDBPyConnection 6 | 7 | dir = os.path.dirname(os.path.abspath(__file__)) 8 | build_type = "release" 9 | 10 | @pytest.fixture(scope="function") 11 | def duckdb_empty_cursor(request): 12 | connection = duckdb.connect('') 13 | cursor = connection.cursor() 14 | return cursor 15 | 16 | def add_extension(extension_name, conn: Union[str, DuckDBPyConnection] = '') -> DuckDBPyConnection: 17 | if (isinstance(conn, str)): 18 | config = { 19 | 'allow_unsigned_extensions' : 'true' 20 | } 21 | conn = duckdb.connect(conn or '', config=config) 22 | file_path = 
f"'{dir}/../../build/{build_type}/extension/{extension_name}/{extension_name}.duckdb_extension'" 23 | conn.execute(f"LOAD {file_path}") 24 | return conn 25 | 26 | @pytest.fixture(scope="function") 27 | def require(): 28 | def _require(extension_name, db_name=''): 29 | conn = add_extension(extension_name, db_name) 30 | conn.execute("SET allow_extensions_metadata_mismatch=true;") 31 | return conn 32 | 33 | return _require 34 | 35 | @pytest.fixture(scope='function') 36 | def connection(): 37 | return add_extension('nanoarrow') 38 | -------------------------------------------------------------------------------- /test/python/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pyarrow 3 | -------------------------------------------------------------------------------- /test/python/test_arrow_ipc_scan.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | 6 | 7 | def get_record_batch(): 8 | data = [ 9 | pa.array([1, 2, 3, 4]), 10 | pa.array(['foo', 'bar', 'baz', None]), 11 | pa.array([True, None, False, True]) 12 | ] 13 | 14 | return pa.record_batch(data, names=['f0', 'f1', 'f2']) 15 | 16 | def tables_match(result): 17 | assert result == [(1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True)] 18 | 19 | class TestArrowIPCBufferRead(object): 20 | def test_single_buffer(self, connection): 21 | batch = get_record_batch() 22 | sink = pa.BufferOutputStream() 23 | with pa.ipc.new_stream(sink, batch.schema) as writer: 24 | for i in range(5): 25 | writer.write_batch(batch) 26 | buffer = sink.getvalue() 27 | with pa.BufferReader(buffer) as buf_reader: 28 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 29 | tables_match(connection.from_arrow(msg_reader).fetchall()) 30 | 31 | def test_multi_buffers(self, connection): 32 | batch = get_record_batch() 33 | sink = pa.BufferOutputStream() 34 | 35 | with pa.ipc.new_stream(sink, batch.schema) as writer: 36 | for _ in range(5): # Write 5 batches into one stream 37 | writer.write_batch(batch) 38 | 39 | buffer = sink.getvalue() 40 | 41 | with pa.BufferReader(buffer) as buf_reader: 42 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 43 | tables_match(connection.from_arrow(msg_reader).fetchall()) 44 | 45 | def test_replacement_scan(self, connection): 46 | 47 | batch = get_record_batch() 48 | sink = pa.BufferOutputStream() 49 | 50 | with pa.ipc.new_stream(sink, batch.schema) as writer: 51 | writer.write_batch(batch) 52 | 53 | buffer = sink.getvalue() 54 | 55 | with pa.BufferReader(buffer) as buf_reader: 56 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 57 | with pytest.raises(duckdb.InvalidInputException, 58 | match="not suitable for replacement scans",): 59 | connection.execute("FROM msg_reader") 60 | -------------------------------------------------------------------------------- /test/python/test_arrow_ipc_writer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | 6 | def create_table(connection): 7 | connection.execute("CREATE 
TABLE T (f0 integer, f1 varchar, f2 bool )") 8 | connection.execute("INSERT INTO T values (1, 'foo', true),(2, 'bar', NULL), (3, 'baz', false), (4, NULL, true) ") 9 | 10 | def tables_match(result): 11 | print(result) 12 | assert result == [(1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True)] 13 | 14 | class TestArrowIPCBufferWriter(object): 15 | def test_round_trip(self, connection): 16 | create_table(connection) 17 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 18 | buffer = pa.py_buffer(buffers[0][0] + buffers[1][0]) 19 | with pa.BufferReader(buffer) as buf_reader: 20 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 21 | tables_match(connection.from_arrow(msg_reader).fetchall()) 22 | 23 | def test_arrow_read_duck_buffers(self, connection): 24 | create_table(connection) 25 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 26 | arrow_buffers = [] 27 | # We have to concatenate the schema to the data 28 | arrow_buffers.append(pa.py_buffer(buffers[0][0] + buffers[1][0])) 29 | assert buffers[0][1] == True 30 | assert buffers[1][1] == False 31 | batches = [] 32 | with pa.BufferReader(arrow_buffers[0]) as reader: 33 | stream_reader = ipc.RecordBatchStreamReader(reader) 34 | schema = stream_reader.schema 35 | batches.extend(stream_reader) 36 | arrow_table = pa.Table.from_batches(batches, schema=schema) 37 | tables_match(connection.execute("FROM arrow_table").fetchall()) 38 | -------------------------------------------------------------------------------- /test/python/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | from pyarrow.ipc import MessageReader as mr 6 | import os 7 | import sys 8 | import tempfile 9 | 10 | # duckdb.duckdb.NotImplementedException: Not implemented Error: Unsupported Internal Arrow Type for Decimal d:37,5,256 11 | # "generated_decimal256.stream", 12 | 13 | # duckdb.duckdb.ConversionException: Conversion Error: Could not convert Interval to Microsecond 14 | # "generated_interval.stream" 15 | 16 | # Not implemented Error: Unsupported Internal Arrow Type: "d" Union 17 | # "generated_union.stream" 18 | 19 | little_big_integration_files = ["generated_null_trivial.stream", "generated_primitive_large_offsets.stream","generated_custom_metadata.stream","generated_datetime.stream","generated_decimal.stream","generated_map_non_canonical.stream","generated_map.stream","generated_nested_large_offsets.stream","generated_nested.stream","generated_null.stream","generated_primitive_no_batches.stream","generated_primitive_zerolength.stream","generated_primitive.stream","generated_recursive_nested.stream"] 20 | 21 | compression_2_0_0 = ["generated_uncompressible_zstd.stream", "generated_zstd.stream"] 22 | 23 | script_path = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | test_folder = os.path.join(script_path,'..','..','arrow-testing','data','arrow-ipc-stream','integration') 26 | 27 | # All Test Folders: 28 | big_endian_folder = os.path.join(test_folder,'1.0.0-bigendian') 29 | little_endian_folder = os.path.join(test_folder,'1.0.0-littleendian') 30 | compression_folder = os.path.join(test_folder,'2.0.0-compression') 31 | 32 | def compare_result(arrow_result,duckdb_result, con): 33 | return con.execute(""" 34 | SELECT COUNT(*) = 0 35 | FROM ( 36 | (SELECT * FROM arrow_result EXCEPT SELECT * FROM duckdb_result) 37 | UNION 38 | (SELECT * FROM duckdb_result EXCEPT SELECT 
* FROM arrow_result) 39 | ) """).fetchone()[0] 40 | 41 | # 1. Compare result from reading the IPC file in Arrow and in DuckDB 42 | def compare_ipc_file_reader(con, file): 43 | arrow_result = ipc.open_stream(file).read_all() 44 | duckdb_file_result = con.sql(f"FROM read_arrow('{file}')").arrow() 45 | assert compare_result(arrow_result, duckdb_file_result, con) 46 | 47 | # 2. Now test the writer: write to a file from DuckDB, read the written file back with Arrow and compare 48 | def compare_ipc_file_writer(con, file): 49 | arrow_result = ipc.open_stream(file).read_all() 50 | with tempfile.TemporaryDirectory() as temp_dir: 51 | file_path = os.path.join(temp_dir, "arrow_duck.arrows") 52 | con.execute(f"COPY (FROM read_arrow('{file}')) TO '{file_path}'") 53 | duckdb_file_result = con.sql(f"FROM read_arrow('{file_path}')").arrow() 54 | assert compare_result(arrow_result, duckdb_file_result, con) 55 | 56 | # 3. Compare result from reading the IPC stream as a buffer (via MessageReader) in Arrow and in DuckDB 57 | def compare_ipc_buffer_reader(con, file): 58 | arrow_result = ipc.open_stream(file).read_all() 59 | reader = mr.open_stream(file) 60 | duckdb_struct_result = con.from_arrow(reader).arrow() 61 | assert compare_result(arrow_result, duckdb_struct_result, con) 62 | 63 | # 4. Now test the DuckDB buffer writer, by reading it back with Arrow and comparing 64 | def compare_ipc_buffer_writer(con, file): 65 | arrow_result = ipc.open_stream(file).read_all() 66 | buffers = con.execute(f"FROM to_arrow_ipc((FROM read_arrow('{file}')))").fetchall() 67 | if not buffers: 68 | return 69 | arrow_buffers = [] 70 | for i in range(1, len(buffers)): 71 | # We have to concatenate the schema to the data 72 | arrow_buffers.append(pa.py_buffer(buffers[0][0] + buffers[i][0])) 73 | 74 | batches = [] 75 | for buffer in arrow_buffers: 76 | with pa.BufferReader(buffer) as reader: 77 | stream_reader = ipc.RecordBatchStreamReader(reader) 78 | schema = stream_reader.schema 79 | batches.extend(stream_reader) 80 | 81 | duckdb_struct_result = pa.Table.from_batches(batches, schema=schema) 82 | assert compare_result(arrow_result, duckdb_struct_result, con) 83 | 84 | 85 | class TestArrowIntegrationTests(object): 86 | def test_read_ipc_file(self, connection): 87 | for file in little_big_integration_files: 88 | compare_ipc_file_reader(connection,os.path.join(big_endian_folder,file)) 89 | compare_ipc_file_reader(connection,os.path.join(little_endian_folder,file)) 90 | for file in compression_2_0_0: 91 | compare_ipc_file_reader(connection,os.path.join(compression_folder,file)) 92 | 93 | def test_write_ipc_file(self, connection): 94 | for file in little_big_integration_files: 95 | compare_ipc_file_writer(connection,os.path.join(big_endian_folder,file)) 96 | compare_ipc_file_writer(connection,os.path.join(little_endian_folder,file)) 97 | for file in compression_2_0_0: 98 | compare_ipc_file_writer(connection,os.path.join(compression_folder,file)) 99 | 100 | def test_read_ipc_buffer(self, connection): 101 | for file in little_big_integration_files: 102 | compare_ipc_buffer_reader(connection,os.path.join(big_endian_folder,file)) 103 | compare_ipc_buffer_reader(connection,os.path.join(little_endian_folder,file)) 104 | for file in compression_2_0_0: 105 | compare_ipc_buffer_reader(connection,os.path.join(compression_folder,file)) 106 | 107 | def test_write_ipc_buffer(self, connection): 108 | for file in little_big_integration_files: 109 | compare_ipc_buffer_writer(connection,os.path.join(big_endian_folder,file)) 110 | 
compare_ipc_buffer_writer(connection,os.path.join(little_endian_folder,file)) 111 | for file in compression_2_0_0: 112 | compare_ipc_buffer_writer(connection,os.path.join(compression_folder,file)) 113 | -------------------------------------------------------------------------------- /test/sql/arrow_testing.test: -------------------------------------------------------------------------------- 1 | # name: 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | SET VARIABLE test_files = '__WORKING_DIRECTORY__/arrow-testing/data/arrow-ipc-stream/integration/'; 10 | 11 | # We can do more sophisticated things here (read the arrow_file or .json.gz 12 | # versions of the files, etc.) 13 | statement ok 14 | CREATE MACRO check_arrow_testing_file(test_file) AS TABLE 15 | FROM read_arrow(getvariable('test_files') || test_file || '.stream'); 16 | 17 | statement ok 18 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive') 19 | 20 | statement ok 21 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_datetime') 22 | 23 | statement ok 24 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_decimal') 25 | 26 | # This test will fail,because the struct created in this arrow file does not have names on their children 27 | # This is not supported by DuckDB. 28 | statement error 29 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_duplicate_fieldnames') 30 | ---- 31 | Struct remap can only remap named structs 32 | 33 | statement ok 34 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_map_non_canonical') 35 | 36 | statement ok 37 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_map') 38 | 39 | statement ok 40 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_nested_large_offsets') 41 | 42 | statement ok 43 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_nested') 44 | 45 | statement ok 46 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_null_trivial') 47 | 48 | statement ok 49 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_large_offsets') 50 | 51 | statement ok 52 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_no_batches') 53 | 54 | statement ok 55 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_zerolength') 56 | 57 | statement ok 58 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_recursive_nested') 59 | 60 | statement ok 61 | FROM check_arrow_testing_file('2.0.0-compression/generated_uncompressible_zstd') 62 | 63 | statement ok 64 | FROM check_arrow_testing_file('2.0.0-compression/generated_zstd') 65 | 66 | 67 | # Following tests are failing but are unrelated to the extension: 68 | # Could not convert interval to microsecond? 69 | # statement ok 70 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_interval') 71 | 72 | # Dense unions not supported? 
73 | # statement ok 74 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_union') 75 | 76 | # Fails because of missing extension registration 77 | # statement ok 78 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_custom_metadata') 79 | 80 | # Decimal256 apparently not supported 81 | # statement ok 82 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_decimal256') 83 | -------------------------------------------------------------------------------- /test/sql/multifile_reading.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/multifile_reading.test 2 | # description: Test read_arrow over multiple files. 3 | # group: [nanoarrow] 4 | 5 | require nanoarrow 6 | 7 | # Test File List Works 8 | statement ok 9 | CREATE TABLE T as FROM read_arrow(['data/test.arrows', 'data/test.arrows']) 10 | 11 | query I 12 | SELECT count(*) from T; 13 | ---- 14 | 30974 15 | 16 | # Test GLOB 17 | query III 18 | FROM read_arrow('data/multifile/glob/*.arrow') 19 | ---- 20 | apple gala 134.2 21 | orange navel 142.1 22 | apple honeycrisp 158.6 23 | orange valencia 96.7 24 | apple fuji NULL 25 | orange cara cara NULL 26 | 27 | # Test projections 28 | query II 29 | SELECT weight, variety FROM read_arrow('data/multifile/glob/*.arrow') 30 | ---- 31 | 134.2 gala 32 | 142.1 navel 33 | 158.6 honeycrisp 34 | 96.7 valencia 35 | NULL fuji 36 | NULL cara cara 37 | 38 | query II 39 | SELECT count(*), fruit FROM read_arrow('data/multifile/glob/*.arrow') group by fruit order by all 40 | ---- 41 | 3 apple 42 | 3 orange 43 | 44 | # Test mismatching schemas 45 | statement error 46 | FROM read_arrow(['data/test.arrows', 'data/multifile/glob/f1.arrow']) 47 | ---- 48 | If you are trying to read files with different schemas, try setting union_by_name=True 49 | 50 | statement error 51 | FROM read_arrow(['data/multifile/fruit_extra.arrows', 'data/multifile/glob/f1.arrow']) 52 | ---- 53 | If you are trying to read files with different schemas, try setting union_by_name=True 54 | 55 | 56 | # Test UNION BY NAME 57 | query IIII 58 | FROM read_arrow(['data/multifile/fruit_extra.arrows', 'data/multifile/glob/f1.arrow'], union_by_name=True) 59 | ---- 60 | apple pink lady 2.2 10.0 61 | orange jiha NULL NULL 62 | apple gala 134.2 NULL 63 | orange navel 142.1 NULL 64 | 65 | # Test different column order 66 | query III 67 | FROM read_arrow(['data/multifile/different_order.arrows', 'data/multifile/glob/f1.arrow']) order by all 68 | ---- 69 | apple 2.2 pink lady 70 | apple 134.2 gala 71 | orange 142.1 navel 72 | orange NULL jiha 73 | 74 | # Test different types 75 | query III 76 | FROM read_arrow(['data/multifile/different_type.arrows', 'data/multifile/glob/f1.arrow']) order by all 77 | ---- 78 | apple gala 134.2 79 | apple pink lady 2.2 80 | orange jiha NULL 81 | orange navel 142.1 82 | 83 | query III 84 | FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) order by all 85 | ---- 86 | apple gala 134.2 87 | apple pink lady 2.2 88 | orange jiha NULL 89 | orange navel 142.1 90 | 91 | query I 92 | select typeof(#3) FROM read_arrow(['data/multifile/different_type.arrows', 'data/multifile/glob/f1.arrow']) limit 1 93 | ---- 94 | VARCHAR 95 | 96 | query I 97 | select typeof(#3) FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) limit 1 98 | ---- 99 | DOUBLE 100 | 101 | query I 102 | select typeof(weight) FROM 
read_arrow(['data/multifile/different_type_int.arrows','data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows'], union_by_name = true) limit 1; 103 | ---- 104 | VARCHAR 105 | 106 | query I 107 | select typeof(weight) FROM read_arrow(['data/multifile/different_type_int.arrows','data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) limit 1; 108 | ---- 109 | BIGINT 110 | 111 | query III 112 | FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type_order.arrows']) order by all 113 | ---- 114 | apple gala 134.2 115 | apple pink lady 2.2 116 | orange jiha NULL 117 | orange navel 142.1 118 | 119 | # Test filename option 120 | query IIII 121 | SELECT fruit, variety, weight, replace(filename, '\', '/') FROM read_arrow('data/multifile/glob/*.arrow', filename = true) 122 | ---- 123 | apple gala 134.2 data/multifile/glob/f1.arrow 124 | orange navel 142.1 data/multifile/glob/f1.arrow 125 | apple honeycrisp 158.6 data/multifile/glob/f2.arrow 126 | orange valencia 96.7 data/multifile/glob/f2.arrow 127 | apple fuji NULL data/multifile/glob/f3.arrow 128 | orange cara cara NULL data/multifile/glob/f3.arrow 129 | 130 | # test hive_partitioning option 131 | query IIII 132 | FROM read_arrow('data/multifile/hive/*/*.arrow', hive_partitioning = true) 133 | ---- 134 | apple gala 134.2 a 135 | orange navel 142.1 a 136 | apple honeycrisp 158.6 a 137 | orange valencia 96.7 a 138 | apple gala 134.2 b 139 | orange navel 142.1 b 140 | apple fuji NULL b 141 | orange cara cara NULL b 142 | 143 | # Multifile reader works with replacement scans 144 | query III 145 | FROM 'data/multifile/glob/*.arrow' ORDER BY ALL 146 | ---- 147 | apple fuji NULL 148 | apple gala 134.2 149 | apple honeycrisp 158.6 150 | orange cara cara NULL 151 | orange navel 142.1 152 | orange valencia 96.7 153 | 154 | statement ok 155 | CREATE TABLE T_2 (fruit varchar, variety varchar, weight double); 156 | 157 | statement error 158 | COPY T_2 FROM 'data/multifile/glob/*.arrow' (FORMAT arrows, Made_up_option FALSE) 159 | ---- 160 | Unsupported option for COPY 161 | 162 | statement ok 163 | COPY T_2 FROM 'data/multifile/glob/*.arrow' (FORMAT arrows) 164 | 165 | query III 166 | FROM T_2 167 | ---- 168 | apple gala 134.2 169 | orange navel 142.1 170 | apple honeycrisp 158.6 171 | orange valencia 96.7 172 | apple fuji NULL 173 | orange cara cara NULL 174 | -------------------------------------------------------------------------------- /test/sql/nanoarrow.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/nanoarrow.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT nanoarrow_version(); 8 | ---- 9 | Catalog Error: Scalar Function with name nanoarrow_version does not exist! 
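# Outside the sqllogictest harness the locally built, unsigned extension has to be loaded
# by hand; the Python fixtures in test/python/conftest.py do roughly the following, where
# the build path is illustrative and depends on the build type:
#
#   con = duckdb.connect(config={'allow_unsigned_extensions': 'true'})
#   con.execute("LOAD 'build/release/extension/nanoarrow/nanoarrow.duckdb_extension'")
#   con.execute("SELECT nanoarrow_version()").fetchone()
#
# Once the LOAD succeeds, the version query behaves as in the blocks below.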
10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require nanoarrow 13 | 14 | # Confirm the extension works 15 | query I 16 | SELECT nanoarrow_version(); 17 | ---- 18 | 0.7.0-SNAPSHOT 19 | -------------------------------------------------------------------------------- /test/sql/read_arrow.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/read_arrow.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | # Check a basic roundtrip 9 | statement ok 10 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "__TEST_DIR__/test.arrows" (FORMAT ARROWS); 11 | 12 | query II 13 | SELECT * FROM read_arrow('__TEST_DIR__/test.arrows'); 14 | ---- 15 | 42 string 16 | 17 | # Check that the replacement scan works 18 | query II 19 | SELECT * FROM "__TEST_DIR__/test.arrows"; 20 | ---- 21 | 42 string 22 | 23 | # Make sure these project correctly 24 | query I 25 | SELECT foofy FROM read_arrow('__TEST_DIR__/test.arrows') 26 | ---- 27 | 42 28 | 29 | query I 30 | SELECT stringy FROM read_arrow('__TEST_DIR__/test.arrows') 31 | ---- 32 | string 33 | 34 | # Check our more realistic test table 35 | query I 36 | SELECT count(*) FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 37 | ---- 38 | 15487 39 | 40 | statement error 41 | SELECT count(*) FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows', made_up_option = false); 42 | ---- 43 | Invalid named parameter "made_up_option" for function read_arrow 44 | 45 | # Check with a filter and projection 46 | query I 47 | SELECT message FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows') WHERE "commit" = 'fa5f0299f046c46e1b2f671e5e3b4f1956522711'; 48 | ---- 49 | ARROW-1: Initial Arrow Code Commit 50 | 51 | # Check a filter that has to select from multiple batches 52 | query I 53 | SELECT count(*) from "data/test.arrows" WHERE dayname(time::TIMESTAMP) = 'Wednesday'; 54 | ---- 55 | 2927 56 | -------------------------------------------------------------------------------- /test/sql/read_arrow_file.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/read_arrow_file.test 2 | # description: test nanoarrow extension when reading arrow file (with footer) 3 | # group: [nanoarrow] 4 | 5 | # The files here should be generated with "with pa.ipc.new_file(file_path, table.schema) as writer:" 6 | # Require statement will ensure this test is run with this extension loaded 7 | require nanoarrow 8 | 9 | query III 10 | FROM 'data/fruit.arrow' 11 | ---- 12 | apple gala 134.2 13 | apple honeycrisp 158.6 14 | apple fuji NULL 15 | orange navel 142.1 16 | orange valencia 96.7 17 | orange cara cara NULL 18 | -------------------------------------------------------------------------------- /test/sql/test_copy_to.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/test_copy_to.test 2 | # description: test copy to functionality and options 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | CREATE TABLE test AS SELECT * FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 10 | 11 | # Try extensions without the format options 12 | statement ok 13 | COPY test TO '__TEST_DIR__/test.arrows' 14 | 15 | statement ok 16 | COPY test TO 
'__TEST_DIR__/test.arrow' 17 | 18 | query I 19 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrows'); 20 | ---- 21 | 15487 22 | 23 | query I 24 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrow'); 25 | ---- 26 | 15487 27 | 28 | # Let's test the writing options 29 | # row_group_size: The size of a row group. By default, the value is 122,880. A lower value may reduce performance but can be beneficial for streaming. 30 | 31 | statement ok 32 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size 10) 33 | 34 | query I 35 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 36 | ---- 37 | 15487 38 | 39 | statement ok 40 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (chunk_size 10) 41 | 42 | query I 43 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 44 | ---- 45 | 15487 46 | 47 | statement error 48 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size 100, chunk_size 10) 49 | ---- 50 | ROW_GROUP_SIZE and ROW_GROUP_SIZE_BYTES are mutually exclusive 51 | 52 | statement error 53 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size_bytes 100) 54 | ---- 55 | ROW_GROUP_SIZE_BYTES does not work while preserving insertion order. Use "SET preserve_insertion_order=false;" to disable preserving insertion order. 56 | 57 | statement ok 58 | SET preserve_insertion_order=false; 59 | 60 | statement ok 61 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size_bytes 100) 62 | 63 | query I 64 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 65 | ---- 66 | 15487 67 | 68 | # This actually has a "minimum" of 2048 69 | statement ok 70 | COPY test TO '__TEST_DIR__/test_row_group_folder' (chunk_size 10, row_groups_per_file 1, FORMAT ARROW) 71 | 72 | query I 73 | select count(file) from glob('__TEST_DIR__/test_row_group_folder/*'); 74 | ---- 75 | 9 76 | 77 | query I 78 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group_folder/*'); 79 | ---- 80 | 15487 81 | 82 | statement ok 83 | COPY test TO '__TEST_DIR__/data_kv.arrow' (kv_metadata {'test':'works'}) 84 | 85 | query I 86 | SELECT count(*) FROM read_arrow('__TEST_DIR__/data_kv.arrow'); 87 | ---- 88 | 15487 89 | -------------------------------------------------------------------------------- /test/sql/to_arrow_ipc.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/to_arrow_ipc.test 2 | # description: round trip arrow serialization 3 | # group: [nanoarrow] 4 | 5 | # NOTE: for now there's not much we can test here, since we cannot really pass pointers from the 6 | # serialized blobs to the scan_arrow_ipc function in SQL. Therefore tests of these features 7 | # currently live in the NodeJS client tests. 
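# The buffer round trip is also exercised from Python (test/python/test_arrow_ipc_writer.py
# and test/python/test_integration.py): to_arrow_ipc() returns one row per IPC message as
# (blob, is_header), and the header/schema blob must be prepended to each data blob before
# Arrow can read it. A rough sketch of that pattern, assuming pyarrow is available:
#
#   buffers = con.execute("FROM to_arrow_ipc((FROM T))").fetchall()
#   assert buffers[0][1]                                  # first row carries the schema message
#   stream = pa.py_buffer(buffers[0][0] + buffers[1][0])  # schema + first data batch
#   table = pa.ipc.RecordBatchStreamReader(pa.BufferReader(stream)).read_all()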
8 | 9 | require nanoarrow 10 | 11 | statement ok 12 | SET disabled_optimizers='column_lifetime' 13 | 14 | statement ok 15 | SELECT * FROM to_arrow_ipc((SELECT 'Its working!')); 16 | 17 | # Test operator caching behaviour is sane 18 | statement ok 19 | create table data as select * from range(0,2000) tbl(col) 20 | 21 | statement ok 22 | WITH data_union AS ( 23 | SELECT * FROM data 24 | UNION ALL 25 | SELECT * FROM data 26 | ) 27 | FROM to_arrow_ipc((SELECT * FROM data_union ORDER BY col)) 28 | -------------------------------------------------------------------------------- /test/sql/write_arrow_stream.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/write_arrow_stream.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | CREATE TABLE test AS SELECT * FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 10 | 11 | statement ok 12 | COPY test TO '__TEST_DIR__/test.arrows' (FORMAT ARROWS, BATCH_SIZE 100) 13 | 14 | statement ok 15 | CREATE OR REPLACE TABLE written AS SELECT * FROM read_arrow('__TEST_DIR__/test.arrows'); 16 | 17 | query I 18 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrows'); 19 | ---- 20 | 15487 21 | 22 | query I 23 | SELECT sum((test.time = written.time)::INTEGER) FROM test INNER JOIN written ON test.commit = written.commit; 24 | ---- 25 | 15487 26 | -------------------------------------------------------------------------------- /test_local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Cheap version of make test that works with cmake 4 | SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | SOURCE_DIR_NAME="$(basename "${SOURCE_DIR}")" 6 | 7 | "$SOURCE_DIR/build/test/unittest" "*/${SOURCE_DIR_NAME}/*" 8 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [] 3 | } 4 | --------------------------------------------------------------------------------
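Taken together, the SQL, Python, and NodeJS tests above exercise the same flow: write query results out as an Arrow IPC stream and read them back, either from a file or from in-memory buffers. The sketch below strings that flow together outside the test harness. It is illustrative only: it assumes a local release build of the extension (the path mirrors test/python/conftest.py), the duckdb and pyarrow Python packages, and a writable working directory; the table and file names are made up.

import duckdb
import pyarrow.ipc as ipc

# Connect with unsigned extensions allowed and load the locally built extension.
con = duckdb.connect(config={'allow_unsigned_extensions': 'true'})
con.execute("LOAD 'build/release/extension/nanoarrow/nanoarrow.duckdb_extension'")

# Write a small table to an Arrow IPC stream file via COPY ... (FORMAT ARROWS).
con.execute("CREATE TABLE t AS SELECT range AS i, range * 2 AS j FROM range(5)")
con.execute("COPY t TO 'example.arrows' (FORMAT ARROWS)")

# Read it back with the extension's read_arrow table function ...
print(con.execute("SELECT count(*) FROM read_arrow('example.arrows')").fetchone())

# ... and independently with pyarrow, to confirm the output is a valid IPC stream.
print(ipc.open_stream('example.arrows').read_all().schema)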