├── .clang-format ├── .cmake-format ├── .editorconfig ├── .github └── workflows │ ├── MainDistributionPipeline.yml │ ├── NodeJS.yml │ ├── Python.yml │ ├── _extension_deploy.yml │ └── dev.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CMakeUserPresets.json ├── LICENSE ├── Makefile ├── README.md ├── benchmark └── lineitem.py ├── data ├── fruit.arrow ├── multifile │ ├── different_order.arrows │ ├── different_type.arrows │ ├── different_type_int.arrows │ ├── different_type_order.arrows │ ├── fruit_extra.arrows │ ├── glob │ │ ├── f1.arrow │ │ ├── f2.arrow │ │ └── f3.arrow │ └── hive │ │ ├── part=a │ │ ├── f1.arrow │ │ └── f2.arrow │ │ └── part=b │ │ ├── f1.arrow │ │ └── f3.arrow ├── parquet-testing │ └── lineitem_sf0_01.parquet ├── test.arrow └── test.arrows ├── docs └── UPDATING.md ├── extension_config.cmake ├── scripts ├── extension-upload.sh └── setup-custom-toolchain.sh ├── src ├── file_scanner │ ├── arrow_file_scan.cpp │ └── arrow_multi_file_info.cpp ├── include │ ├── file_scanner │ │ ├── arrow_file_scan.hpp │ │ └── arrow_multi_file_info.hpp │ ├── ipc │ │ ├── array_stream.hpp │ │ ├── stream_factory.hpp │ │ └── stream_reader │ │ │ ├── base_stream_reader.hpp │ │ │ ├── ipc_buffer_stream_reader.hpp │ │ │ └── ipc_file_stream_reader.hpp │ ├── nanoarrow_errors.hpp │ ├── nanoarrow_extension.hpp │ ├── table_function │ │ ├── arrow_ipc_function_data.hpp │ │ ├── read_arrow.hpp │ │ └── scan_arrow_ipc.hpp │ ├── write_arrow_stream.hpp │ └── writer │ │ ├── arrow_stream_writer.hpp │ │ ├── column_data_collection_serializer.hpp │ │ └── to_arrow_ipc.hpp ├── ipc │ ├── array_stream.cpp │ ├── stream_factory.cpp │ └── stream_reader │ │ ├── base_stream_reader.cpp │ │ ├── ipc_buffer_stream_reader.cpp │ │ └── ipc_file_stream_reader.cpp ├── nanoarrow_extension.cpp ├── scanner │ ├── read_arrow.cpp │ └── scan_arrow_ipc.cpp └── writer │ ├── arrow_stream_writer.cpp │ ├── column_data_collection_serializer.cpp │ ├── to_arrow_ipc.cpp │ └── write_arrow_stream.cpp ├── test ├── README.md ├── nodejs │ └── arrow_test.js ├── python │ ├── conftest.py │ ├── requirements-dev.txt │ ├── test_arrow_ipc_scan.py │ ├── test_arrow_ipc_writer.py │ └── test_integration.py └── sql │ ├── arrow_testing.test │ ├── multifile_reading.test │ ├── nanoarrow.test │ ├── read_arrow.test │ ├── read_arrow_file.test │ ├── test_copy_to.test │ ├── to_arrow_ipc.test │ └── write_arrow_stream.test ├── test_local.sh └── vcpkg.json /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | ColumnLimit: 90 4 | DerivePointerAlignment: false 5 | IncludeBlocks: Preserve 6 | -------------------------------------------------------------------------------- /.cmake-format: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # cmake-format configuration file 19 | # Use `archery lint --cmake-format --fix` to reformat all cmake files in the 20 | # source tree 21 | 22 | # ----------------------------- 23 | # Options affecting formatting. 24 | # ----------------------------- 25 | with section("format"): 26 | # How wide to allow formatted cmake files 27 | line_width = 90 28 | 29 | # How many spaces to tab for indent 30 | tab_size = 2 31 | 32 | # If a positional argument group contains more than this many arguments, 33 | # then force it to a vertical layout. 34 | max_pargs_hwrap = 4 35 | 36 | # If the statement spelling length (including space and parenthesis) is 37 | # smaller than this amount, then force reject nested layouts. 38 | # This value only comes into play when considering whether or not to nest 39 | # arguments below their parent. If the number of characters in the parent 40 | # is less than this value, we will not nest. 41 | min_prefix_chars = 32 42 | 43 | # If true, separate flow control names from their parentheses with a space 44 | separate_ctrl_name_with_space = False 45 | 46 | # If true, separate function names from parentheses with a space 47 | separate_fn_name_with_space = False 48 | 49 | # If a statement is wrapped to more than one line, than dangle the closing 50 | # parenthesis on it's own line 51 | dangle_parens = False 52 | 53 | # What style line endings to use in the output. 54 | line_ending = 'unix' 55 | 56 | # Format command names consistently as 'lower' or 'upper' case 57 | command_case = 'lower' 58 | 59 | # Format keywords consistently as 'lower' or 'upper' case 60 | keyword_case = 'unchanged' 61 | 62 | # ------------------------------------------------ 63 | # Options affecting comment reflow and formatting. 64 | # ------------------------------------------------ 65 | with section("markup"): 66 | # enable comment markup parsing and reflow 67 | enable_markup = False 68 | 69 | # If comment markup is enabled, don't reflow the first comment block in 70 | # eachlistfile. Use this to preserve formatting of your 71 | # copyright/licensestatements. 72 | first_comment_is_literal = True 73 | 74 | # If comment markup is enabled, don't reflow any comment block which 75 | # matchesthis (regex) pattern. Default is `None` (disabled). 
76 | literal_comment_pattern = None 77 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | duckdb-next-build: 16 | name: Build extension binaries 17 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 18 | with: 19 | duckdb_version: main 20 | ci_tools_version: main 21 | extension_name: nanoarrow 22 | 23 | duckdb-stable-build: 24 | name: Build extension binaries 25 | if: github.ref == 'refs/heads/stable' 26 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.2.1 27 | with: 28 | duckdb_version: v1.2.1 29 | ci_tools_version: v1.2.1 30 | extension_name: nanoarrow 31 | -------------------------------------------------------------------------------- /.github/workflows/NodeJS.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request,repository_dispatch,workflow_dispatch] 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | 7 | jobs: 8 | nodejs: 9 | if: github.ref == 'refs/heads/stable' 10 | name: NodeJS 11 | runs-on: macos-latest 12 | env: 13 | GEN: ninja 14 | 15 | steps: 16 | - name: Install Ninja 17 | run: brew install ninja 18 | 19 | - uses: actions/checkout@v2 20 | with: 21 | fetch-depth: 0 22 | submodules: 'true' 23 | 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: '3.9' 27 | 28 | - uses: actions/setup-node@v4 29 | with: 30 | node-version: '20' 31 | 32 | - name: Install required node packages 33 | run: | 34 | sudo npm i duckdb@1.2.1 35 | sudo npm install -g apache-arrow mocha 36 | sudo npm install apache-arrow mocha 37 | npm -v 38 | node -v 39 | 40 | - name: Build duckdb 41 | run: | 42 | cd duckdb 43 | git checkout 8e52ec43959ab363643d63cb78ee214577111da4 #v1.2.1 44 | cd .. 45 | make 46 | 47 | - name: Run JS tests 48 | run: | 49 | make test_release_js 50 | -------------------------------------------------------------------------------- /.github/workflows/Python.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request,repository_dispatch] 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | 7 | jobs: 8 | python: 9 | name: Python 10 | runs-on: macos-latest 11 | env: 12 | GEN: ninja 13 | 14 | steps: 15 | - name: Install Ninja 16 | run: brew install ninja 17 | 18 | - uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 21 | submodules: true 22 | 23 | - uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.11' 26 | 27 | - name: Build DuckDB (Python) 28 | run: | 29 | cd duckdb 30 | git checkout main 31 | cd tools/pythonpkg 32 | python3 -m pip install . 
33 | 34 | - name: Build Arrow Extension 35 | run: make release 36 | 37 | - name: Install Python Dependencies 38 | shell: bash 39 | run: | 40 | pip install -r test/python/requirements-dev.txt 41 | 42 | - name: Test Python 43 | run: | 44 | (cd test/python && python -m pytest) 45 | -------------------------------------------------------------------------------- /.github/workflows/_extension_deploy.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml 3 | # 4 | # note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the 5 | # deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as 6 | # this workflow can be configured to use a custom deploy script. 7 | 8 | 9 | name: Extension Deployment 10 | on: 11 | workflow_call: 12 | inputs: 13 | # The name of the extension 14 | extension_name: 15 | required: true 16 | type: string 17 | # DuckDB version to build against 18 | duckdb_version: 19 | required: true 20 | type: string 21 | # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' 22 | exclude_archs: 23 | required: false 24 | type: string 25 | default: "" 26 | # Whether to upload this deployment as the latest. This may overwrite a previous deployment. 27 | deploy_latest: 28 | required: false 29 | type: boolean 30 | default: false 31 | # Whether to upload this deployment under a versioned path. These will not be deleted automatically 32 | deploy_versioned: 33 | required: false 34 | type: boolean 35 | default: false 36 | # Postfix added to artifact names. 
Can be used to guarantee unique names when this workflow is called multiple times 37 | artifact_postfix: 38 | required: false 39 | type: string 40 | default: "" 41 | # Override the default deploy script with a custom script 42 | deploy_script: 43 | required: false 44 | type: string 45 | default: "./scripts/extension-upload.sh" 46 | # Override the default matrix parse script with a custom script 47 | matrix_parse_script: 48 | required: false 49 | type: string 50 | default: "./duckdb/scripts/modify_distribution_matrix.py" 51 | 52 | jobs: 53 | generate_matrix: 54 | name: Generate matrix 55 | runs-on: ubuntu-latest 56 | outputs: 57 | deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} 58 | steps: 59 | - uses: actions/checkout@v3 60 | with: 61 | fetch-depth: 0 62 | submodules: 'true' 63 | 64 | - name: Checkout DuckDB to version 65 | run: | 66 | cd duckdb 67 | git checkout ${{ inputs.duckdb_version }} 68 | 69 | - id: parse-matrices 70 | run: | 71 | python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty 72 | deploy_matrix="`cat deploy_matrix.json`" 73 | echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT 74 | echo `cat $GITHUB_OUTPUT` 75 | 76 | deploy: 77 | name: Deploy 78 | runs-on: ubuntu-latest 79 | needs: generate_matrix 80 | if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} 81 | strategy: 82 | matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} 83 | 84 | steps: 85 | - uses: actions/checkout@v3 86 | with: 87 | fetch-depth: 0 88 | submodules: 'true' 89 | 90 | - name: Checkout DuckDB to version 91 | run: | 92 | cd duckdb 93 | git checkout ${{ inputs.duckdb_version }} 94 | 95 | - uses: actions/download-artifact@v3 96 | with: 97 | name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} 98 | path: | 99 | /tmp/extension 100 | 101 | - name: Deploy 102 | shell: bash 103 | env: 104 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 105 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 106 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 107 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 108 | DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} 109 | run: | 110 | pwd 111 | python3 -m pip install pip awscli 112 | git config --global --add safe.directory '*' 113 | cd duckdb 114 | git fetch --tags 115 | export DUCKDB_VERSION=`git tag --points-at HEAD` 116 | export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} 117 | cd .. 
118 | git fetch --tags 119 | export EXT_VERSION=`git tag --points-at HEAD` 120 | export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} 121 | ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} 122 | -------------------------------------------------------------------------------- /.github/workflows/dev.yaml: -------------------------------------------------------------------------------- 1 | name: dev 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | persist-credentials: false 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.x' 25 | - name: pre-commit (cache) 26 | uses: actions/cache@v4 27 | with: 28 | path: ~/.cache/pre-commit 29 | key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} 30 | - name: pre-commit (--all-files) 31 | run: | 32 | python -m pip install pre-commit 33 | pre-commit run --show-diff-on-failure --color=always --all-files 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | .vscode/settings.json 10 | .cache 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main 9 | [submodule "arrow-testing"] 10 | path = arrow-testing 11 | url = https://github.com/apache/arrow-testing.git 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/pre-commit/mirrors-clang-format 9 | rev: v19.1.4 10 | hooks: 11 | - id: clang-format 12 | types_or: [c, c++] 13 | - repo: https://github.com/cheshirekow/cmake-format-precommit 14 | rev: v0.6.13 15 | hooks: 16 | - id: cmake-format 17 | args: [--in-place] 18 | - repo: https://github.com/codespell-project/codespell 19 | rev: v2.2.5 20 | hooks: 21 | - id: codespell 22 | types_or: [rst, markdown, c, c++] 23 | additional_dependencies: [tomli] 24 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | include(FetchContent) 3 | 4 | # Set extension name here 5 | set(TARGET_NAME nanoarrow) 6 | 7 | set(NANOARROW_IPC ON) 8 | set(NANOARROW_NAMESPACE "DuckDBExt${TARGET_NAME}") 9 | fetchcontent_declare(nanoarrow 10 | URL 
"https://github.com/apache/arrow-nanoarrow/archive/4bf5a9322626e95e3717e43de7616c0a256179eb.zip" 11 | URL_HASH SHA256=49d588ee758a2a1d099ed4525c583a04adf71ce40405011e0190aa1e75e61b59 12 | ) 13 | fetchcontent_makeavailable(nanoarrow) 14 | 15 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 16 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 17 | 18 | project(${TARGET_NAME}) 19 | include_directories(src/include) 20 | 21 | set(EXTENSION_SOURCES 22 | src/file_scanner/arrow_file_scan.cpp 23 | src/file_scanner/arrow_multi_file_info.cpp 24 | src/ipc/array_stream.cpp 25 | src/ipc/stream_factory.cpp 26 | src/ipc/stream_reader/base_stream_reader.cpp 27 | src/ipc/stream_reader/ipc_file_stream_reader.cpp 28 | src/ipc/stream_reader/ipc_buffer_stream_reader.cpp 29 | src/scanner/read_arrow.cpp 30 | src/scanner/scan_arrow_ipc.cpp 31 | src/nanoarrow_extension.cpp 32 | src/writer/arrow_stream_writer.cpp 33 | src/writer/column_data_collection_serializer.cpp 34 | src/writer/write_arrow_stream.cpp 35 | src/writer/to_arrow_ipc.cpp) 36 | 37 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 38 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 39 | 40 | # Link nanoarrow in both the static library as the loadable extension 41 | target_link_libraries(${EXTENSION_NAME} nanoarrow::nanoarrow nanoarrow::nanoarrow_ipc) 42 | target_link_libraries(${LOADABLE_EXTENSION_NAME} nanoarrow::nanoarrow 43 | nanoarrow::nanoarrow_ipc) 44 | 45 | install(TARGETS ${EXTENSION_NAME} 46 | EXPORT "${DUCKDB_EXPORT_SET}" 47 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 48 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 49 | -------------------------------------------------------------------------------- /CMakeUserPresets.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "cmakeMinimumRequired": { 4 | "major": 3, 5 | "minor": 21, 6 | "patch": 0 7 | }, 8 | "configurePresets": [ 9 | { 10 | "name": "extension", 11 | "displayName": "Extension", 12 | "generator": "Ninja", 13 | "binaryDir": "${sourceDir}/../build", 14 | "cacheVariables": { 15 | "EXTENSION_STATIC_BUILD": "1", 16 | "DUCKDB_EXTENSION_CONFIGS": "${sourceDir}/../extension_config.cmake" 17 | } 18 | }, 19 | { 20 | "name": "extension_debug", 21 | "displayName": "Extension (Debug build)", 22 | "inherits": ["extension"], 23 | "cacheVariables": { 24 | "CMAKE_BUILD_TYPE": "Debug" 25 | } 26 | }, 27 | { 28 | "name": "extension_vcpkg_config", 29 | "hidden": true, 30 | "cacheVariables": { 31 | "CMAKE_TOOLCHAIN_FILE": "/$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake", 32 | "VCPKG_MANIFEST_DIR": "${sourceDir}/..", 33 | "VCPKG_BUILD": "1" 34 | } 35 | }, 36 | { 37 | "name": "extension_vcpkg", 38 | "displayName": "Extension (using vcpkg)", 39 | "inherits": ["extension", "extension_vcpkg_config"] 40 | }, 41 | { 42 | "name": "extension_vcpkg_debug", 43 | "displayName": "Extension (Debug build using vcpkg)", 44 | "inherits": ["extension_debug", "extension_vcpkg_config"] 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2024 Stichting DuckDB Foundation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=nanoarrow 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile 9 | 10 | # Client tests 11 | DEBUG_EXT_PATH='$(PROJ_DIR)build/debug/extension/nanoarrow/nanoarrow.duckdb_extension' 12 | RELDEBUG_EXT_PATH='$(PROJ_DIR)build/release/extension/nanoarrow/nanoarrow.duckdb_extension' 13 | RELEASE_EXT_PATH='$(PROJ_DIR)build/release/extension/nanoarrow/nanoarrow.duckdb_extension' 14 | 15 | test_js: 16 | test_debug_js: 17 | ARROW_EXTENSION_BINARY_PATH=$(DEBUG_EXT_PATH) mocha -R spec --timeout 480000 -n expose-gc --exclude 'test/*.ts' -- "test/nodejs/**/*.js" 18 | test_release_js: 19 | ARROW_EXTENSION_BINARY_PATH=$(RELEASE_EXT_PATH) mocha -R spec --timeout 480000 -n expose-gc --exclude 'test/*.ts' -- "test/nodejs/**/*.js" 20 | 21 | run_benchmark: 22 | python3 benchmark/lineitem.py $(RELEASE_EXT_PATH) 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nanoarrow for DuckDB 2 | 3 | This extension, nanoarrow, allows you to read Arrow IPC streams and files. It serves a similar purpose as the now-deprecated [Arrow DuckDB core extension](https://github.com/duckdb/arrow). 4 | However, it comes with the added functionality to query Arrow IPC files and is much better tested. This extension is released as a DuckDB Community Extension. 5 | For compatibility reasons with the previous Arrow core extension, this extension is also aliased as `arrow`. 6 | 7 | You can install and load it as: 8 | 9 | ```sql 10 | -- arrow would also be a suitable name 11 | INSTALL nanoarrow FROM community; 12 | LOAD nanoarrow; 13 | ``` 14 | 15 | ## Usage 16 | Below is a complete example of how to use our extension to read an Arrow IPC file. 17 | In addition to our extension, you will also need the `httpfs` extension installed and loaded to fetch the data directly from GitHub. 
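If `httpfs` is not already available in your DuckDB installation, you can typically install it once before loading it (a minimal setup sketch; the example below assumes both extensions are installed):

```sql
INSTALL httpfs;
```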
18 | 19 | ```sql 20 | LOAD httpfs; 21 | LOAD nanoarrow; 22 | SELECT 23 | commit, message 24 | FROM 25 | 'https://github.com/apache/arrow-experiments/raw/refs/heads/main/data/arrow-commits/arrow-commits.arrows' 26 | LIMIT 10; 27 | ``` 28 | 29 | ``` 30 | ┌───────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────┐ 31 | │ commit │ message │ 32 | │ varchar │ varchar │ 33 | ├───────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────┤ 34 | │ 49cdb0fe4e98fda19031c86… │ GH-40370: [C++] Define ARROW_FORCE_INLINE for non-MSVC builds (#40372) │ 35 | │ 1d966e98e41ce817d1f8c51… │ GH-40386: [Python] Fix except clauses (#40387) │ 36 | │ 96f26a89bd73997f7532643… │ GH-40227: [R] ensure executable files in `create_package_with_all_dependencies` (#40232) │ 37 | │ ee1a8c39a55f3543a82fed9… │ GH-40366: [C++] Remove const qualifier from Buffer::mutable_span_as (#40367) │ 38 | │ 3d467ac7bfae03cf2db0980… │ GH-20127: [Python][CI] Remove legacy hdfs tests from hdfs and hypothesis setup (#40363) │ 39 | │ ef6ea6beed071ed070daf03… │ GH-40345: [FlightRPC][C++][Java][Go] Add URI scheme to reuse connection (#40084) │ 40 | │ 53e0c745ad491af98a5bf18… │ GH-40153: [C++][Python] Fix test_gdb failures on 32-bit (#40293) │ 41 | │ 3ba6d286caad328b8572a3b… │ GH-40059: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor (#40064) │ 42 | │ 4ce9a5edd2710fb8bf0c642… │ GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) │ 43 | │ 2445975162905bd8d9a42ff… │ GH-40334: [C++][Gandiva] Add missing OpenSSL dependency to encrypt_utils_test.cc (#40338) │ 44 | ├───────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┤ 45 | │ 10 rows 2 columns │ 46 | └───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ 47 | ``` 48 | 49 | In the remainder of this section, we cover the supported parameters and usages for our IPC readers/writers. 50 | 51 | ### IPC Files 52 | 53 | #### Write 54 | Writing an Arrow IPC file is done using the COPY statement Below is a simple example of how you can use DuckDB to create such a file. 55 | 56 | ```sql 57 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test.arrows"; 58 | ``` 59 | 60 | Both `.arrows` and `.arrow` will be automatically recognized by DuckDB as Arrow IPC streams. 61 | However, if you wish to use a different extension, you can manually specify the format using: 62 | 63 | ```sql 64 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test.ipc" (FORMAT ARROWS); 65 | ``` 66 | 67 | The Copy function of the Copy To Arrow File operation accepts the following parameters: 68 | * `row_group_size`: The size of a row group. By default, the value is 122,880. A lower value may reduce performance but can be beneficial for streaming. It is important to note that this value is not exact, a slightly higher value divisible by 2,048 (DuckDB's standard vector size) may be used as the actual row group size. 69 | * `chunk_size`: An alias for the `row_group_size` parameter. 70 | * `row_group_size_bytes`: The size of row groups in bytes. 71 | * `row_groups_per_file`: The maximum number of row groups per file. If this option is set, multiple files can be generated in a single `COPY` call. This means the specified path will create a directory, and the `row_group_size` parameter will also be used to determine the partition sizes. 
72 | * `kv_metadata`: Key-value metadata to be added to the file schema. 73 | 74 | If `row_group_size_bytes` and either `chunk_size` or `row_group_size` are used, the row groups will be defined by the smallest of these parameters. 75 | 76 | #### Read 77 | You can consume the file using the `read_arrow` scanner. For example, to read the file we just created, you could run: 78 | ```sql 79 | FROM read_arrow('test.arrows'); 80 | ``` 81 | 82 | Similar to the copy function, the extension also registers `.arrows` and `.arrow` as valid extensions for the Arrow IPC format. This means that a replacement scan can be applied if that is the file extension, so the following would also be a valid query: 83 | ```sql 84 | FROM 'test.arrows'; 85 | ``` 86 | 87 | Besides single-file reading, our extension also fully supports multi-file reading, including all valid multi-file options. 88 | 89 | If we were to create a second test file using: 90 | ```sql 91 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "test_2.arrows" (FORMAT ARROWS); 92 | ``` 93 | 94 | We can then run a query that reads both files using a glob pattern or a list of file paths: 95 | 96 | ```sql 97 | -- Glob 98 | FROM read_arrow('*.arrows') 99 | 100 | -- List 101 | FROM read_arrow(['test.arrows','test_2.arrows']) 102 | ``` 103 | 104 | When reading multiple files, the following parameters are also supported: 105 | * `union_by_name`: If the schemas of the files differ, setting `union_by_name` allows DuckDB to construct the schema by aligning columns with the same name. 106 | * `filename`: If set to `True`, this will add a column with the name of the file that generated each row. 107 | * `hive_partitioning`: Enables reading data from a Hive-partitioned dataset and applies partition filtering. 108 | > [!NOTE] 109 | > [Arrow IPC files (.arrow)](https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format) and [Arrow IPC streams (.arrows)](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format) are distinct but related formats. This extension can read both but only writes Arrow IPC Streams. 110 | ### IPC Stream Buffers 111 | Similar to the old core Arrow extension, this extension also allows direct production and consumption of the Arrow IPC streaming format from in-memory buffers in both Python and Node.js. 112 | In this section, we will demonstrate how to use the Python API, but you can find many tests that serve as examples for both [Node.js](https://github.com/paleolimbot/duckdb-nanoarrow/tree/main/test/nodejs) and [Python](https://github.com/paleolimbot/duckdb-nanoarrow/tree/main/test/python). 113 | 114 | Our extension can create Arrow IPC buffers using the `to_arrow_ipc` function. This function returns two columns: one containing the serialized data as a `BLOB`, and another `BOOL` column indicating which tuples contain the header information of the messages. 
For example, consider the following table in our DuckDB database: 115 | ```python 116 | import pyarrow as pa 117 | import duckdb 118 | import pyarrow.ipc as ipc 119 | 120 | connection = duckdb.connect() 121 | connection.execute("CREATE TABLE T (f0 integer, f1 varchar, f2 bool )") 122 | connection.execute("INSERT INTO T values (1, 'foo', true),(2, 'bar', NULL), (3, 'baz', false), (4, NULL, true) ") 123 | ``` 124 | 125 | We can then obtain our buffers by simply issuing a `to_arrow_ipc` call, like this: 126 | 127 | ```python 128 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 129 | ``` 130 | In this case, our buffers will contain two tuples: the first is the header of our message, and the second is the data. To convert this into an Arrow table, we simply concatenate the tuples and use the `ipc.RecordBatchStreamReader`. For example, you can read them as follows: 131 | 132 | 133 | ```python 134 | batches = [] 135 | with pa.BufferReader(pa.py_buffer(buffers[0][0] + buffers[1][0])) as reader: 136 | stream_reader = ipc.RecordBatchStreamReader(reader) 137 | schema = stream_reader.schema 138 | batches.extend(stream_reader) 139 | arrow_table = pa.Table.from_batches(batches, schema=schema) 140 | ``` 141 | 142 | To read buffers with DuckDB, you must use the Python function `from_arrow`. Continuing from our example, we would first need to convert our Arrow table into the Arrow IPC format. 143 | ```python 144 | batch = arrow_table.to_batches()[0] 145 | sink = pa.BufferOutputStream() 146 | with pa.ipc.new_stream(sink, batch.schema) as writer: 147 | writer.write_batch(batch) 148 | buffer = sink.getvalue() 149 | buf_reader = pa.BufferReader(buffer) 150 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 151 | ``` 152 | 153 | After this, the following query will return a DuckDB relation with the deserialized Arrow IPC: 154 | 155 | ```python 156 | connection.from_arrow(msg_reader) 157 | ``` 158 | 159 | ## Building 160 | 161 | To build the extension, clone the repository with submodules: 162 | 163 | ``` shell 164 | git clone --recurse-submodules https://github.com/paleolimbot/duckdb-nanoarrow.git 165 | ``` 166 | 167 | ...or if you forget to clone the submodules/you're using VSCode to do your checkout, you can run: 168 | 169 | ``` shell 170 | git submodule init 171 | git submodule update --checkout 172 | ``` 173 | 174 | A quick-and-dirty way to get your build up and running is to run `make`: 175 | 176 | ```sh 177 | make 178 | 179 | ``` 180 | The main binaries that will be built are: 181 | 182 | ```sh 183 | ./build/release/duckdb 184 | ./build/release/test/unittest 185 | ./build/release/extension/nanoarrow/nanoarrow.duckdb_extension 186 | ``` 187 | 188 | - `duckdb` is the binary for the duckdb shell with the extension code automatically loaded. 189 | - `unittest` is the test runner of duckdb. Again, the extension is already linked into the binary. 190 | - `nanoarrow.duckdb_extension` is the loadable binary as it would be distributed. 191 | 192 | If you'd like to use VSCode with the integration provided by the CMake/clangd extension, you 193 | can run: 194 | 195 | ``` shell 196 | cp CMakeUserPresets.json duckdb/ 197 | ``` 198 | 199 | ...and ensure that `.vscode/settings.json` contains: 200 | 201 | ``` json 202 | { 203 | "cmake.sourceDirectory": "${workspaceFolder}/duckdb" 204 | } 205 | ``` 206 | 207 | Then choose *Developer: Reload window* from the command palette and choose the 208 | *Extension (Debug build)* preset. 
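If you prefer the command line to VSCode, the same presets can be driven directly with CMake (a sketch assuming CMake ≥ 3.21 and Ninja are installed; the preset names come from the `CMakeUserPresets.json` shown earlier):

``` shell
# Configure from inside the duckdb/ submodule, where the preset file was copied
cd duckdb
cmake --preset extension_debug

# The presets put the build tree in ../build relative to duckdb/
cmake --build ../build
```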
209 | 210 | ## Running the extension 211 | 212 | To run the extension code, simply start the shell with `./build/release/duckdb` 213 | (if you're using `make` to build) or `./build/duckdb` (if you're using CMake 214 | via VSCode). 215 | 216 | Now we can use the features from the extension directly in DuckDB. 217 | 218 | ## Running the tests 219 | 220 | Different tests can be created for DuckDB extensions. Tests are written in 221 | SQL `./test/sql`. These SQL tests can be run using `make test` (if using 222 | make) or `./test_local.sh` (if using CMake via VSCode). 223 | 224 | ## Debugging 225 | 226 | You can debug an interactive SQL session by launching it with `gdb` or `lldb`: 227 | 228 | ``` shell 229 | lldb build/duckdb 230 | ``` 231 | 232 | ...or you can use the CodeLLDB extension (Command Palette: *LLDB: Attach to process*) 233 | to launch a VSCode interactive debugger launched in a terminal. 234 | -------------------------------------------------------------------------------- /benchmark/lineitem.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import duckdb 3 | import time 4 | import statistics 5 | import sys 6 | from decimal import Decimal 7 | import concurrent.futures 8 | 9 | 10 | def measure_execution_time(con, query, result = None, lineitem_arrow = None): 11 | times = [] 12 | for _ in range(5): 13 | start = time.perf_counter() 14 | res = con.execute(query) 15 | end = time.perf_counter() 16 | times.append(end - start) 17 | if result: 18 | assert res.fetchall() == result 19 | 20 | return statistics.median(times) 21 | 22 | def get_queries(table_name): 23 | return [ 24 | f"""SELECT 25 | sum(l_extendedprice * l_discount) AS revenue 26 | FROM 27 | {table_name} 28 | WHERE 29 | l_shipdate >= CAST('1994-01-01' AS date) 30 | AND l_shipdate < CAST('1995-01-01' AS date) 31 | AND l_discount BETWEEN 0.05 32 | AND 0.07 33 | AND l_quantity < 24;""" 34 | ] 35 | 36 | def run_duckdb_native(con): 37 | # Lets see how long it takes to run the queries in DuckDB 38 | print ("Read DuckDB - Native") 39 | queries = get_queries("lineitem") 40 | for query in queries: 41 | print(measure_execution_time(con,query,[(Decimal('123141078.2283'),)])) 42 | 43 | 44 | def run_duckdb_arrow_array_stream(con): 45 | # Lets see how long it takes to run the queries in DuckDB 46 | print ("Generate PyArrow - Arrow Stream") 47 | times = [] 48 | for _ in range(5): 49 | start = time.perf_counter() 50 | results = con.execute("FROM lineitem").fetch_record_batch() 51 | while True: 52 | try: 53 | # Process chunks 54 | results.read_next_batch() 55 | except StopIteration: 56 | break 57 | end = time.perf_counter() 58 | times.append(end - start) 59 | print(statistics.median(times)) 60 | 61 | print ("Read PyArrow - Arrow Stream") 62 | queries = get_queries("record_batch_reader") 63 | arrow_table = con.execute("FROM lineitem").arrow() 64 | batches = arrow_table.to_batches(2048*120) 65 | for query in queries: 66 | times = [] 67 | for _ in range(5): 68 | record_batch_reader = pa.RecordBatchReader.from_batches(arrow_table.schema, batches) 69 | start = time.perf_counter() 70 | res = con.execute(query) 71 | end = time.perf_counter() 72 | times.append(end - start) 73 | assert res.fetchall() == [(Decimal('123141078.2283'),)] 74 | print(statistics.median(times)) 75 | 76 | def run_arrow_ipc(con): 77 | print ("Read IPC Buffers") 78 | 79 | queries = get_queries("lineitem_arrow") 80 | times = [] 81 | for _ in range(5): 82 | batches = con.execute("FROM 
lineitem").arrow().to_batches(2048*120) 83 | sink = pa.BufferOutputStream() 84 | 85 | with pa.ipc.new_stream(sink, batches[0].schema) as writer: 86 | for batch in batches: 87 | writer.write_batch(batch) 88 | 89 | buffer = sink.getvalue() 90 | with pa.BufferReader(buffer) as buf_reader: 91 | for query in queries: 92 | msg_reader = pa.ipc.MessageReader.open_stream(buf_reader) 93 | start = time.perf_counter() 94 | lineitem_arrow = con.from_arrow(msg_reader) 95 | res = con.execute(query) 96 | end = time.perf_counter() 97 | times.append(end - start) 98 | assert res.fetchall() == [(Decimal('123141078.2283'),)] 99 | print(statistics.median(times)) 100 | 101 | print("Generate IPC Buffers") 102 | print(measure_execution_time(con,"FROM to_arrow_ipc((FROM lineitem))")) 103 | 104 | def run_pyarrow(con): 105 | # Lets see how long it takes to run the queries in DuckDB 106 | print ("DuckDB - Native") 107 | queries = get_queries("lineitem") 108 | for query in queries: 109 | print(query) 110 | print(measure_execution_time(con,query)) 111 | 112 | def run_parquet(con): 113 | print("Generate Parquet") 114 | print(measure_execution_time(con, "COPY lineitem TO 'lineitem.parquet'")) 115 | 116 | queries = get_queries("lineitem.parquet") 117 | print("Read From Parquet") 118 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 119 | 120 | def run_arrow_file(con): 121 | print("Generate ArrowIPC File") 122 | print(measure_execution_time(con, "COPY lineitem TO 'lineitem.arrows' (FORMAT arrows)")) 123 | 124 | queries = get_queries("lineitem.arrows") 125 | print("Read From ArrowIPC File") 126 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 127 | 128 | # Use Arrow IPC to generate an ipc file 129 | table = con.execute("FROM lineitem").arrow() 130 | 131 | # Write the table to an IPC file (Arrow file format) 132 | print("Generate ArrowIPC File Pure") 133 | times = [] 134 | for _ in range(5): 135 | options = pa.ipc.IpcWriteOptions(compression = 'zstd') 136 | with open("lineitem_ipc.arrow", "wb") as f: 137 | start = time.perf_counter() 138 | writer = pa.ipc.RecordBatchFileWriter(f, table.schema, options=options) 139 | writer.write_table(table) 140 | writer.close() 141 | end = time.perf_counter() 142 | times.append(end - start) 143 | print(statistics.median(times)) 144 | queries = get_queries("lineitem_ipc.arrow") 145 | print("Read From ArrowIPC - Pure File") 146 | print(measure_execution_time(con, queries[0], [(Decimal('123141078.2283'),)])) 147 | 148 | 149 | def create_con(path,sf): 150 | con = duckdb.connect(config={"allow_unsigned_extensions":"true"}) 151 | con.execute(f"load '{path}'") 152 | con.execute(f"CALL dbgen(sf={sf});") 153 | return con 154 | 155 | if len(sys.argv) < 2: 156 | print("Usage: lineitem.py ") 157 | sys.exit(1) 158 | 159 | path = sys.argv[1] 160 | 161 | def run_buffer_benchmark(): 162 | con = create_con(path,1) 163 | run_duckdb_native(con) 164 | run_duckdb_arrow_array_stream(con) 165 | run_arrow_ipc(con) 166 | 167 | def run_file_benchmark(): 168 | con = create_con(path,1) 169 | run_parquet(con) 170 | run_arrow_file(con) 171 | 172 | if __name__ == "__main__": 173 | run_file_benchmark() 174 | -------------------------------------------------------------------------------- /data/fruit.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/fruit.arrow 
-------------------------------------------------------------------------------- /data/multifile/different_order.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_order.arrows -------------------------------------------------------------------------------- /data/multifile/different_type.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type.arrows -------------------------------------------------------------------------------- /data/multifile/different_type_int.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type_int.arrows -------------------------------------------------------------------------------- /data/multifile/different_type_order.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/different_type_order.arrows -------------------------------------------------------------------------------- /data/multifile/fruit_extra.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/fruit_extra.arrows -------------------------------------------------------------------------------- /data/multifile/glob/f1.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f1.arrow -------------------------------------------------------------------------------- /data/multifile/glob/f2.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f2.arrow -------------------------------------------------------------------------------- /data/multifile/glob/f3.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/glob/f3.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=a/f1.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=a/f1.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=a/f2.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=a/f2.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=b/f1.arrow: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=b/f1.arrow -------------------------------------------------------------------------------- /data/multifile/hive/part=b/f3.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/multifile/hive/part=b/f3.arrow -------------------------------------------------------------------------------- /data/parquet-testing/lineitem_sf0_01.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/parquet-testing/lineitem_sf0_01.parquet -------------------------------------------------------------------------------- /data/test.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/test.arrow -------------------------------------------------------------------------------- /data/test.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paleolimbot/duckdb-nanoarrow/8d76db91a3d375262e0d425a8a85fd8645b31ae9/data/test.arrows -------------------------------------------------------------------------------- /docs/UPDATING.md: -------------------------------------------------------------------------------- 1 | # Extension updating 2 | When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there 3 | will inevitably come a time when a new DuckDB is released and the extension repository needs updating. This process goes 4 | as follows: 5 | 6 | - Bump submodules 7 | - `./duckdb` should be set to the latest tagged release 8 | - `./extension-ci-tools` should be set to the branch corresponding to the latest DuckDB release. So if you're building for DuckDB `v1.1.0`, there will be a branch in `extension-ci-tools` named `v1.1.0`, which you should check out. 9 | - Bump versions in `./.github/workflows` 10 | - the `duckdb_version` input of the `duckdb-stable-build` job in `MainDistributionPipeline.yml` should be set to the latest tagged release 11 | - the `duckdb_version` input of the `duckdb-stable-deploy` job in `MainDistributionPipeline.yml` should be set to the latest tagged release 12 | - the reusable workflow `duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml` for the `duckdb-stable-build` job should be set to the latest tagged release 13 | 14 | # API changes 15 | DuckDB extensions built with this extension template are built against the internal C++ API of DuckDB. This API is not guaranteed to be stable. 16 | What this means for extension development is that when you update your extension's DuckDB target version using the steps above, your extension may no longer build properly. 17 | 18 | Currently, DuckDB does not (yet) provide a specific change log for these API changes, but it is generally not too hard to figure out what has changed.
19 | 20 | For figuring out how and why the C++ API changed, we recommend using the following resources: 21 | - DuckDB's [Release Notes](https://github.com/duckdb/duckdb/releases) 22 | - DuckDB's history of [Core extension patches](https://github.com/duckdb/duckdb/commits/main/.github/patches/extensions) 23 | - The git history of the relevant C++ Header file of the API that has changed 24 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(nanoarrow 5 | SOURCE_DIR 6 | ${CMAKE_CURRENT_LIST_DIR} 7 | LOAD_TESTS 8 | LINKED_LIBS 9 | "../../_deps/nanoarrow-build/lib*.a") 10 | 11 | # Any extra extensions that should be built 12 | # e.g.: duckdb_extension_load(json) 13 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.." 
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /scripts/setup-custom-toolchain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script 4 | # if no additional toolchains are required 5 | 6 | # To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow 7 | # `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools` 8 | 9 | # note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms 10 | echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the nanoarrow extension." 11 | -------------------------------------------------------------------------------- /src/file_scanner/arrow_file_scan.cpp: -------------------------------------------------------------------------------- 1 | #include "file_scanner/arrow_file_scan.hpp" 2 | 3 | #include "file_scanner/arrow_multi_file_info.hpp" 4 | 5 | namespace duckdb { 6 | namespace ext_nanoarrow { 7 | struct ArrowFileLocalState; 8 | 9 | ArrowFileScan::ArrowFileScan(ClientContext& context, const string& file_name) 10 | : BaseFileReader(file_name) { 11 | factory = make_uniq(context, file_name); 12 | 13 | factory->InitReader(); 14 | factory->GetFileSchema(schema_root); 15 | DBConfig& config = DatabaseInstance::GetDatabase(context).config; 16 | ArrowTableFunction::PopulateArrowTableType(config, arrow_table_type, schema_root, names, 17 | types); 18 | QueryResult::DeduplicateColumns(names); 19 | if (types.empty()) { 20 | throw InvalidInputException("Provided table/dataframe must have at least one column"); 21 | } 22 | columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, types); 23 | } 24 | 25 | string ArrowFileScan::GetReaderType() const { return "ARROW"; } 26 | 27 | const vector& ArrowFileScan::GetNames() { return names; } 28 | const vector& ArrowFileScan::GetTypes() { return types; } 29 | 30 | bool ArrowFileScan::TryInitializeScan(ClientContext& context, 31 | GlobalTableFunctionState& gstate_p, 32 | LocalTableFunctionState& lstate_p) { 33 | auto& gstate = gstate_p.Cast(); 34 | auto& lstate = lstate_p.Cast(); 35 | if (gstate.files.find(file_list_idx.GetIndex()) != gstate.files.end()) { 36 | // Return false because we don't currently support more than one thread 37 | // scanning a file. In the future we may be able to support this by (e.g.) 38 | // reading the Arrow file footer or sending a thread to read ahead to scan 39 | // for RecordBatch messages. 
40 | return false; 41 | } 42 | gstate.files.insert(file_list_idx.GetIndex()); 43 | 44 | // lstate.file_scan = shared_ptr_cast(this); 45 | lstate.local_arrow_function_data = make_uniq( 46 | &FileIPCStreamFactory::Produce, reinterpret_cast(factory.get())); 47 | lstate.local_arrow_function_data->schema_root = schema_root; 48 | lstate.local_arrow_function_data->arrow_table = arrow_table_type; 49 | if (!column_indexes.empty()) { 50 | lstate.init_input = make_uniq( 51 | *lstate.local_arrow_function_data, column_indexes, 52 | gstate.global_state.projection_ids, filters); 53 | } else { 54 | lstate.init_input = make_uniq( 55 | *lstate.local_arrow_function_data, gstate.global_state.column_indexes, 56 | gstate.global_state.projection_ids, filters); 57 | } 58 | lstate.local_arrow_global_state = 59 | ArrowTableFunction::ArrowScanInitGlobal(context, *lstate.init_input); 60 | lstate.local_arrow_local_state = 61 | ArrowTableFunction::ArrowScanInitLocal(lstate.execution_context, *lstate.init_input, 62 | lstate.local_arrow_global_state.get()); 63 | lstate.table_function_input = make_uniq( 64 | lstate.local_arrow_function_data.get(), lstate.local_arrow_local_state.get(), 65 | lstate.local_arrow_global_state.get()); 66 | return true; 67 | } 68 | void ArrowFileScan::Scan(ClientContext& context, GlobalTableFunctionState& global_state, 69 | LocalTableFunctionState& local_state, DataChunk& chunk) { 70 | auto& lstate = local_state.Cast(); 71 | ArrowTableFunction::ArrowScanFunction(context, *lstate.table_function_input, chunk); 72 | } 73 | 74 | shared_ptr ArrowFileScan::GetUnionData(idx_t file_idx) { 75 | auto data = make_shared_ptr(GetFileName()); 76 | data->names = GetNames(); 77 | data->types = GetTypes(); 78 | return data; 79 | } 80 | 81 | } // namespace ext_nanoarrow 82 | } // namespace duckdb 83 | -------------------------------------------------------------------------------- /src/file_scanner/arrow_multi_file_info.cpp: -------------------------------------------------------------------------------- 1 | #include "file_scanner/arrow_multi_file_info.hpp" 2 | 3 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 4 | 5 | #include "duckdb/common/bind_helpers.hpp" 6 | #include "file_scanner/arrow_file_scan.hpp" 7 | #include "ipc/stream_factory.hpp" 8 | 9 | namespace duckdb { 10 | namespace ext_nanoarrow { 11 | 12 | unique_ptr ArrowMultiFileInfo::InitializeOptions( 13 | ClientContext& context, optional_ptr info) { 14 | return make_uniq(); 15 | } 16 | 17 | bool ArrowMultiFileInfo::ParseCopyOption(ClientContext& context, const string& key, 18 | const vector& values, 19 | BaseFileReaderOptions& options_p, 20 | vector& expected_names, 21 | vector& expected_types) { 22 | // We currently do not have any options for the scanner, so we always return false 23 | return false; 24 | } 25 | 26 | unique_ptr ArrowMultiFileInfo::InitializeInterface( 27 | ClientContext& context, MultiFileReader& reader, MultiFileList& file_list) { 28 | return make_uniq(); 29 | } 30 | 31 | bool ArrowMultiFileInfo::ParseOption(ClientContext& context, const string& key, 32 | const Value& val, MultiFileOptions& file_options, 33 | BaseFileReaderOptions& options) { 34 | // We currently do not have any options for the scanner, so we always return false 35 | return false; 36 | } 37 | 38 | void ArrowMultiFileInfo::FinalizeCopyBind(ClientContext& context, 39 | BaseFileReaderOptions& options_p, 40 | const vector& expected_names, 41 | const vector& expected_types) {} 42 | 43 | struct ArrowMultiFileData final : public TableFunctionData { 44 | 
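// Table function data used by the Arrow multi-file scan; it owns the ArrowFileScan for the file currently being read.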
ArrowMultiFileData() = default; 45 | 46 | unique_ptr file_scan; 47 | }; 48 | 49 | unique_ptr ArrowMultiFileInfo::InitializeBindData( 50 | MultiFileBindData& multi_file_data, unique_ptr options_p) { 51 | return make_uniq(); 52 | } 53 | 54 | void ArrowMultiFileInfo::BindReader(ClientContext& context, 55 | vector& return_types, 56 | vector& names, MultiFileBindData& bind_data) { 57 | ArrowFileReaderOptions options; 58 | auto& multi_file_list = *bind_data.file_list; 59 | if (!bind_data.file_options.union_by_name) { 60 | bind_data.reader_bind = bind_data.multi_file_reader->BindReader( 61 | context, return_types, names, *bind_data.file_list, bind_data, options, 62 | bind_data.file_options); 63 | 64 | } else { 65 | bind_data.reader_bind = bind_data.multi_file_reader->BindUnionReader( 66 | context, return_types, names, multi_file_list, bind_data, options, 67 | bind_data.file_options); 68 | } 69 | D_ASSERT(names.size() == return_types.size()); 70 | } 71 | 72 | void ArrowMultiFileInfo::FinalizeBindData(MultiFileBindData& multi_file_data) {} 73 | 74 | void ArrowMultiFileInfo::GetBindInfo(const TableFunctionData& bind_data, BindInfo& info) { 75 | } 76 | 77 | optional_idx ArrowMultiFileInfo::MaxThreads(const MultiFileBindData& bind_data_p, 78 | const MultiFileGlobalState& global_state, 79 | FileExpandResult expand_result) { 80 | if (expand_result == FileExpandResult::MULTIPLE_FILES) { 81 | // always launch max threads if we are reading multiple files 82 | return {}; 83 | } 84 | // Otherwise, only one thread 85 | return 1; 86 | } 87 | 88 | unique_ptr ArrowMultiFileInfo::InitializeGlobalState( 89 | ClientContext& context, MultiFileBindData& bind_data, 90 | MultiFileGlobalState& global_state) { 91 | return make_uniq( 92 | context, bind_data.file_list->GetTotalFileCount(), bind_data, global_state); 93 | } 94 | 95 | unique_ptr ArrowMultiFileInfo::InitializeLocalState( 96 | ExecutionContext& context, GlobalTableFunctionState& function_state) { 97 | return make_uniq(context); 98 | } 99 | 100 | shared_ptr ArrowMultiFileInfo::CreateReader( 101 | ClientContext& context, GlobalTableFunctionState& gstate_p, BaseUnionData& union_data, 102 | const MultiFileBindData& bind_data) { 103 | return make_shared_ptr(context, union_data.GetFileName()); 104 | } 105 | 106 | shared_ptr ArrowMultiFileInfo::CreateReader( 107 | ClientContext& context, GlobalTableFunctionState& gstate_p, 108 | const OpenFileInfo& file_info, idx_t file_idx, const MultiFileBindData& bind_data) { 109 | return make_shared_ptr(context, file_info.path); 110 | } 111 | 112 | shared_ptr ArrowMultiFileInfo::CreateReader( 113 | ClientContext& context, const OpenFileInfo& file, BaseFileReaderOptions& options, 114 | const MultiFileOptions& file_options) { 115 | return make_shared_ptr(context, file.path); 116 | } 117 | 118 | void ArrowMultiFileInfo::FinalizeReader(ClientContext& context, BaseFileReader& reader, 119 | GlobalTableFunctionState&) {} 120 | 121 | void ArrowMultiFileInfo::FinishFile(ClientContext& context, 122 | GlobalTableFunctionState& global_state, 123 | BaseFileReader& reader) {} 124 | 125 | void ArrowMultiFileInfo::FinishReading(ClientContext& context, 126 | GlobalTableFunctionState& global_state, 127 | LocalTableFunctionState& local_state) {} 128 | 129 | unique_ptr ArrowMultiFileInfo::GetCardinality( 130 | const MultiFileBindData& bind_data, idx_t file_count) { 131 | // TODO: Here is where we might set statistics, for optimizations if we have them 132 | // e.g., cardinality from the file footer 133 | return make_uniq(); 134 | } 135 | 136 | 
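// Returning nullptr below means no per-column statistics are exposed to the
// optimizer; combined with the empty NodeStatistics above, Arrow IPC scans
// currently run without cardinality- or statistics-based optimizations.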
unique_ptr ArrowMultiFileInfo::GetStatistics(ClientContext& context, 137 | BaseFileReader& reader, 138 | const string& name) { 139 | return nullptr; 140 | } 141 | 142 | double ArrowMultiFileInfo::GetProgressInFile(ClientContext& context, 143 | const BaseFileReader& reader) { 144 | auto& file_scan = reader.Cast(); 145 | if (!file_scan.factory->reader) { 146 | // We are done with this file 147 | return 100; 148 | } 149 | auto file_reader = 150 | reinterpret_cast(file_scan.factory->reader.get()); 151 | return file_reader->GetProgress(); 152 | } 153 | 154 | void ArrowMultiFileInfo::GetVirtualColumns(ClientContext&, MultiFileBindData&, 155 | virtual_column_map_t& result) { 156 | if (result.find(COLUMN_IDENTIFIER_EMPTY) != result.end()) { 157 | result.erase(COLUMN_IDENTIFIER_EMPTY); 158 | } 159 | } 160 | 161 | } // namespace ext_nanoarrow 162 | } // namespace duckdb 163 | -------------------------------------------------------------------------------- /src/include/file_scanner/arrow_file_scan.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // file_scanner/arrow_file_scan.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_factory.hpp" 12 | 13 | #include "duckdb/common/multi_file/base_file_reader.hpp" 14 | 15 | namespace duckdb { 16 | namespace ext_nanoarrow { 17 | 18 | //! This class refers to an Arrow File Scan 19 | class ArrowFileScan : public BaseFileReader { 20 | public: 21 | explicit ArrowFileScan(ClientContext& context, const string& file_name); 22 | ~ArrowFileScan() override { 23 | // Release is done by the arrow scanner 24 | schema_root.arrow_schema.release = nullptr; 25 | }; 26 | 27 | //! Factory of this stream 28 | unique_ptr factory; 29 | 30 | string GetReaderType() const override; 31 | 32 | const vector& GetNames(); 33 | const vector& GetTypes(); 34 | ArrowSchemaWrapper schema_root; 35 | ArrowTableType arrow_table_type; 36 | 37 | bool TryInitializeScan(ClientContext& context, GlobalTableFunctionState& gstate, 38 | LocalTableFunctionState& lstate) override; 39 | void Scan(ClientContext& context, GlobalTableFunctionState& global_state, 40 | LocalTableFunctionState& local_state, DataChunk& chunk) override; 41 | 42 | shared_ptr GetUnionData(idx_t file_idx) override; 43 | 44 | private: 45 | vector names; 46 | vector types; 47 | }; 48 | } // namespace ext_nanoarrow 49 | } // namespace duckdb 50 | -------------------------------------------------------------------------------- /src/include/file_scanner/arrow_multi_file_info.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // file_scanner/arrow_multi_file_info.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/common/multi_file/multi_file_function.hpp" 12 | #include "duckdb/function/table/arrow.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | 17 | //! We might have arrow specific options one day 18 | class ArrowFileReaderOptions : public BaseFileReaderOptions {}; 19 | 20 | class ArrowFileScan; 21 | 22 | //! The Arrow Local File State, basically refers to the Scan of one Arrow File 23 | //! 
This is done by calling the Arrow Scan directly on one file. 24 | struct ArrowFileLocalState : public LocalTableFunctionState { 25 | public: 26 | explicit ArrowFileLocalState(ExecutionContext& execution_context) 27 | : execution_context(execution_context) {}; 28 | //! Factory Pointer 29 | shared_ptr file_scan; 30 | 31 | ExecutionContext& execution_context; 32 | 33 | //! Each local state refers to an Arrow Scan on a local file 34 | unique_ptr local_arrow_function_data; 35 | unique_ptr init_input; 36 | unique_ptr local_arrow_global_state; 37 | unique_ptr local_arrow_local_state; 38 | unique_ptr table_function_input; 39 | }; 40 | 41 | struct ArrowFileGlobalState : public GlobalTableFunctionState { 42 | public: 43 | ArrowFileGlobalState(ClientContext& context_p, idx_t total_file_count, 44 | const MultiFileBindData& bind_data, 45 | MultiFileGlobalState& global_state) 46 | : global_state(global_state), context(context_p) {}; 47 | 48 | ~ArrowFileGlobalState() override = default; 49 | 50 | const MultiFileGlobalState& global_state; 51 | ClientContext& context; 52 | set files; 53 | }; 54 | 55 | struct ArrowMultiFileInfo : MultiFileReaderInterface { 56 | unique_ptr InitializeOptions( 57 | ClientContext& context, optional_ptr info) override; 58 | 59 | static unique_ptr InitializeInterface( 60 | ClientContext& context, MultiFileReader& reader, MultiFileList& file_list); 61 | 62 | bool ParseCopyOption(ClientContext& context, const string& key, 63 | const vector& values, BaseFileReaderOptions& options, 64 | vector& expected_names, 65 | vector& expected_types) override; 66 | 67 | bool ParseOption(ClientContext& context, const string& key, const Value& val, 68 | MultiFileOptions& file_options, 69 | BaseFileReaderOptions& options) override; 70 | 71 | void FinalizeCopyBind(ClientContext& context, BaseFileReaderOptions& options, 72 | const vector& expected_names, 73 | const vector& expected_types) override; 74 | 75 | unique_ptr InitializeBindData( 76 | MultiFileBindData& multi_file_data, 77 | unique_ptr options) override; 78 | 79 | //! This is where the actual binding must happen, so in this function we either: 80 | //! 1. union_by_name = False. We set the schema/name depending on the first file 81 | //! 2. union_by_name = True. 
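  //!    Each file is bound on its own and the schemas are unified by name
  //!    (see the BindUnionReader call in arrow_multi_file_info.cpp).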
82 | void BindReader(ClientContext& context, vector& return_types, 83 | vector& names, MultiFileBindData& bind_data) override; 84 | 85 | void FinalizeBindData(MultiFileBindData& multi_file_data) override; 86 | 87 | void GetBindInfo(const TableFunctionData& bind_data, BindInfo& info) override; 88 | 89 | optional_idx MaxThreads(const MultiFileBindData& bind_data_p, 90 | const MultiFileGlobalState& global_state, 91 | FileExpandResult expand_result) override; 92 | 93 | unique_ptr InitializeGlobalState( 94 | ClientContext& context, MultiFileBindData& bind_data, 95 | MultiFileGlobalState& global_state) override; 96 | 97 | unique_ptr InitializeLocalState( 98 | ExecutionContext& context, GlobalTableFunctionState& function_state) override; 99 | 100 | shared_ptr CreateReader(ClientContext& context, 101 | GlobalTableFunctionState& gstate, 102 | BaseUnionData& union_data, 103 | const MultiFileBindData& bind_data_p) override; 104 | 105 | shared_ptr CreateReader(ClientContext& context, 106 | GlobalTableFunctionState& gstate, 107 | const OpenFileInfo& file_info, idx_t file_idx, 108 | const MultiFileBindData& bind_data) override; 109 | 110 | shared_ptr CreateReader(ClientContext& context, 111 | const OpenFileInfo& file, 112 | BaseFileReaderOptions& options, 113 | const MultiFileOptions& file_options) override; 114 | 115 | static void FinalizeReader(ClientContext& context, BaseFileReader& reader, 116 | GlobalTableFunctionState&); 117 | 118 | static void FinishFile(ClientContext& context, GlobalTableFunctionState& global_state, 119 | BaseFileReader& reader); 120 | 121 | void FinishReading(ClientContext& context, GlobalTableFunctionState& global_state, 122 | LocalTableFunctionState& local_state) override; 123 | 124 | unique_ptr GetCardinality(const MultiFileBindData& bind_data, 125 | idx_t file_count) override; 126 | 127 | static unique_ptr GetStatistics(ClientContext& context, 128 | BaseFileReader& reader, 129 | const string& name); 130 | 131 | static double GetProgressInFile(ClientContext& context, const BaseFileReader& reader); 132 | 133 | void GetVirtualColumns(ClientContext& context, MultiFileBindData& bind_data, 134 | virtual_column_map_t& result) override; 135 | }; 136 | 137 | } // namespace ext_nanoarrow 138 | } // namespace duckdb 139 | -------------------------------------------------------------------------------- /src/include/ipc/array_stream.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/array_stream.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | class IpcArrayStream { 16 | public: 17 | explicit IpcArrayStream(unique_ptr reader); 18 | 19 | IPCStreamReader& Reader() const; 20 | 21 | void ToArrayStream(ArrowArrayStream* stream); 22 | 23 | int GetSchema(ArrowSchema* schema); 24 | 25 | int GetNext(ArrowArray* array); 26 | 27 | const char* GetLastError() const; 28 | 29 | template 30 | int Wrap(Func&& func) { 31 | try { 32 | func(); 33 | return NANOARROW_OK; 34 | } catch (IOException& e) { 35 | last_msg = std::string("IOException: ") + e.what(); 36 | return EIO; 37 | } catch (InternalException& e) { 38 | last_msg = std::string("InternalException: ") + e.what(); 39 | return EINVAL; 40 | } catch (nanoarrow::Exception& e) { 41 | last_msg = 
std::string("nanoarrow::Exception: ") + e.what(); 42 | // Could probably find a way to pass on this code, usually ENOMEM 43 | return ENOMEM; 44 | } catch (std::exception& e) { 45 | last_msg = e.what(); 46 | return EINVAL; 47 | } 48 | } 49 | 50 | private: 51 | unique_ptr reader; 52 | string last_msg; 53 | }; 54 | } // namespace ext_nanoarrow 55 | } // namespace duckdb 56 | -------------------------------------------------------------------------------- /src/include/ipc/stream_factory.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/ipc_stream_factory.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/array_stream.hpp" 12 | 13 | #include "duckdb/common/arrow/arrow_wrapper.hpp" 14 | #include "duckdb/function/table/arrow.hpp" 15 | #include "table_function/scan_arrow_ipc.hpp" 16 | 17 | namespace duckdb { 18 | namespace ext_nanoarrow { 19 | 20 | class ArrowStreamFactory { 21 | ArrowStreamFactory() {}; 22 | }; 23 | //! This Factory is a type invented by DuckDB. Notably, the Produce() 24 | //! function pointer is passed to the constructor of the ArrowScanFunctionData 25 | //! constructor (which we wrap). 26 | class ArrowIPCStreamFactory { 27 | public: 28 | virtual ~ArrowIPCStreamFactory() = default; 29 | explicit ArrowIPCStreamFactory(Allocator& allocator); 30 | 31 | //! Called once when initializing Scan States 32 | static unique_ptr Produce(uintptr_t factory_ptr, 33 | ArrowStreamParameters& parameters); 34 | 35 | //! Get the schema of the arrow object 36 | void GetFileSchema(ArrowSchemaWrapper& schema) const; 37 | 38 | //! Opens the file, wraps it in the ArrowIpcInputStream, and wraps it in 39 | //! the ArrowArrayStream reader. 
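  //!
  //! A typical lifetime, as driven by ArrowFileScan (minimal sketch; error
  //! handling and projection pushdown omitted):
  //!
  //!   FileIPCStreamFactory factory(context, "data/test.arrows");
  //!   factory.InitReader();            // open the file and build the reader
  //!   ArrowSchemaWrapper schema;
  //!   factory.GetFileSchema(schema);   // deep copy of the stream schema
  //!   // Later, the Arrow scan calls Produce((uintptr_t)&factory, params),
  //!   // which moves `reader` into an ArrowArrayStream wrapper.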
40 | virtual void InitReader() { 41 | throw NotImplementedException("ArrowIPCStreamFactory::InitReader not implemented"); 42 | } 43 | 44 | Allocator& allocator; 45 | unique_ptr reader; 46 | ArrowError error{}; 47 | }; 48 | 49 | class BufferIPCStreamFactory final : public ArrowIPCStreamFactory { 50 | public: 51 | explicit BufferIPCStreamFactory(ClientContext& context, 52 | const vector& buffers); 53 | void InitReader() override; 54 | 55 | vector buffers; 56 | }; 57 | 58 | class FileIPCStreamFactory final : public ArrowIPCStreamFactory { 59 | public: 60 | explicit FileIPCStreamFactory(ClientContext& context, string src_string); 61 | void InitReader() override; 62 | 63 | FileSystem& fs; 64 | string src_string; 65 | }; 66 | } // namespace ext_nanoarrow 67 | } // namespace duckdb 68 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/base_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/base_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "nanoarrow/nanoarrow.hpp" 12 | #include "nanoarrow/nanoarrow_ipc.hpp" 13 | 14 | #include "duckdb/common/allocator.hpp" 15 | #include "duckdb/common/file_system.hpp" 16 | #include "duckdb/common/radix.hpp" 17 | #include "duckdb/common/serializer/buffered_file_reader.hpp" 18 | #include "nanoarrow_errors.hpp" 19 | 20 | #include "table_function/scan_arrow_ipc.hpp" 21 | 22 | namespace duckdb { 23 | namespace ext_nanoarrow { 24 | 25 | //! Missing in nanoarrow_ipc.hpp 26 | struct UniqueSharedBuffer { 27 | struct ArrowIpcSharedBuffer data{}; 28 | 29 | ~UniqueSharedBuffer() { 30 | if (data.private_src.allocator.free != nullptr) { 31 | ArrowIpcSharedBufferReset(&data); 32 | } 33 | } 34 | }; 35 | 36 | struct ArrowIpcMessagePrefix { 37 | uint32_t continuation_token; 38 | int32_t metadata_size; 39 | }; 40 | 41 | //! Base IPC Reader 42 | class IPCStreamReader { 43 | public: 44 | virtual ~IPCStreamReader() = default; 45 | explicit IPCStreamReader(Allocator& allocator) 46 | : decoder(NewDuckDBArrowDecoder()), allocator(allocator) {}; 47 | //! Gets the output schema, which is the file schema with projection pushdown being 48 | //! considered 49 | const ArrowSchema* GetOutputSchema(); 50 | //! Gets the next batch 51 | bool GetNextBatch(ArrowArray* out); 52 | //! Gets the unique buffer to get the next batch 53 | virtual nanoarrow::UniqueBuffer GetUniqueBuffer() { 54 | throw InternalException("IPCStreamReader::GetUniqueBuffer not implemented"); 55 | }; 56 | 57 | //! Sets the projection pushdown for this reader 58 | void SetColumnProjection(const vector& column_names); 59 | //! Gets the base schema with no projection pushdown 60 | const ArrowSchema* GetBaseSchema(); 61 | 62 | ArrowIpcMessageType ReadNextMessage(vector expected_types, 63 | bool end_of_stream_ok = true); 64 | virtual ArrowIpcMessageType ReadNextMessage() { 65 | throw InternalException("IPCStreamReader::ReadNextMessage not implemented"); 66 | } 67 | 68 | protected: 69 | virtual data_ptr_t ReadData(data_ptr_t ptr, idx_t size) { 70 | throw InternalException("IPCStreamReader::ReadData not implemented"); 71 | } 72 | //! Decode Message is composed of 3 steps 73 | ArrowIpcMessageType DecodeMessage(); 74 | //! 1. 
We decode the message metadata, and return the message_header_size 75 | idx_t DecodeMetadata() const; 76 | //! 2. We decode the message head, if message is finished we return true 77 | virtual bool DecodeHeader(idx_t message_header_size) { 78 | throw InternalException("IPCStreamReader::DecodeHead not implemented"); 79 | } 80 | //! 3. We decode the message body 81 | virtual void DecodeBody() { 82 | throw InternalException("IPCStreamReader::DecodeBody not implemented"); 83 | } 84 | 85 | bool HasProjection() const; 86 | static nanoarrow::ipc::UniqueDecoder NewDuckDBArrowDecoder(); 87 | 88 | static ArrowBufferView AllocatedDataView(const_data_ptr_t data, int64_t size); 89 | static nanoarrow::UniqueBuffer AllocatedDataToOwningBuffer( 90 | const shared_ptr& data); 91 | 92 | static const char* MessageTypeString(ArrowIpcMessageType message_type); 93 | 94 | static int64_t CountFields(const ArrowSchema* schema); 95 | 96 | ArrowError error{}; 97 | nanoarrow::ipc::UniqueDecoder decoder{}; 98 | vector projected_fields; 99 | nanoarrow::UniqueSchema projected_schema; 100 | //! Schema without projection applied to it 101 | nanoarrow::UniqueSchema base_schema; 102 | 103 | //! Information on current buffer 104 | data_ptr_t cur_ptr{}; 105 | int64_t cur_size{}; 106 | 107 | //! Allocator used to allocate buffers with decoded arrow information 108 | Allocator& allocator; 109 | 110 | bool finished{false}; 111 | 112 | ArrowIpcMessagePrefix message_prefix{}; 113 | static constexpr uint32_t kContinuationToken = 0xFFFFFFFF; 114 | }; 115 | 116 | } // namespace ext_nanoarrow 117 | } // namespace duckdb 118 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/ipc_buffer_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/ipc_buffer_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | struct IPCBuffer { 17 | idx_t pos = 0; 18 | data_ptr_t ptr = nullptr; 19 | int64_t size = 0; 20 | }; 21 | 22 | //! 
Buffer Stream 23 | class IPCBufferStreamReader final : public IPCStreamReader { 24 | public: 25 | IPCBufferStreamReader(vector buffers, Allocator& allocator); 26 | 27 | ArrowIpcMessageType ReadNextMessage() override; 28 | 29 | private: 30 | data_ptr_t ReadData(data_ptr_t ptr, idx_t size) override; 31 | bool DecodeHeader(idx_t message_header_size) override; 32 | void DecodeBody() override; 33 | nanoarrow::UniqueBuffer GetUniqueBuffer() override; 34 | vector buffers; 35 | idx_t cur_idx = 0; 36 | IPCBuffer header; 37 | IPCBuffer body; 38 | IPCBuffer cur_buffer; 39 | bool initialized = false; 40 | }; 41 | 42 | } // namespace ext_nanoarrow 43 | } // namespace duckdb 44 | -------------------------------------------------------------------------------- /src/include/ipc/stream_reader/ipc_file_stream_reader.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // ipc/stream_reader/ipc_file_stream_reader.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "ipc/stream_reader/base_stream_reader.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | //! IPC File 17 | class IPCFileStreamReader final : public IPCStreamReader { 18 | public: 19 | IPCFileStreamReader(FileSystem& fs, unique_ptr handle, 20 | Allocator& allocator); 21 | 22 | ArrowIpcMessageType ReadNextMessage() override; 23 | 24 | double GetProgress(); 25 | 26 | private: 27 | BufferedFileReader file_reader; 28 | AllocatedData message_header; 29 | shared_ptr message_body; 30 | 31 | void EnsureInputStreamAligned(); 32 | 33 | data_ptr_t ReadData(data_ptr_t ptr, idx_t size) override; 34 | static void DecodeArray(nanoarrow::ipc::UniqueDecoder& decoder, ArrowArray* out, 35 | ArrowBufferView& body_view, ArrowError* error); 36 | bool DecodeHeader(idx_t message_header_size) override; 37 | void DecodeBody() override; 38 | nanoarrow::UniqueBuffer GetUniqueBuffer() override; 39 | void PopulateNames(vector& names); 40 | }; 41 | 42 | } // namespace ext_nanoarrow 43 | } // namespace duckdb 44 | -------------------------------------------------------------------------------- /src/include/nanoarrow_errors.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // nanoarrow_errors.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #define _DUCKDB_NANOARROW_THROW_NOT_OK_IMPL(NAME, ExceptionCls, ERROR_PTR, EXPR, \ 11 | EXPR_STR) \ 12 | do { \ 13 | const int NAME = (EXPR); \ 14 | if (NAME) { \ 15 | throw ExceptionCls(std::string(EXPR_STR) + std::string(" failed with errno ") + \ 16 | std::to_string(NAME) + std::string(": ") + \ 17 | std::string((ERROR_PTR)->message)); \ 18 | } \ 19 | } while (0) 20 | 21 | #define THROW_NOT_OK(ExceptionCls, ERROR_PTR, EXPR) \ 22 | _DUCKDB_NANOARROW_THROW_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), \ 23 | ExceptionCls, ERROR_PTR, EXPR, #EXPR) 24 | -------------------------------------------------------------------------------- /src/include/nanoarrow_extension.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 
| // DuckDB - nanoarrow 3 | // 4 | // nanoarrow_extension.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/main/database.hpp" 12 | 13 | namespace duckdb { 14 | 15 | class NanoarrowExtension : public Extension { 16 | public: 17 | void Load(DuckDB& db) override; 18 | std::string Name() override; 19 | std::string Version() const override; 20 | }; 21 | 22 | } // namespace duckdb 23 | -------------------------------------------------------------------------------- /src/include/table_function/arrow_ipc_function_data.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/arrow_ipc_function_data.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table/arrow.hpp" 12 | #include "ipc/stream_factory.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | //! Our FunctionData is the same as the ArrowScanFunctionData except we extend it 17 | //! to keep the ArrowIpcArrowArrayStreamFactory alive. 18 | struct ArrowIPCFunctionData : public ArrowScanFunctionData { 19 | explicit ArrowIPCFunctionData(std::unique_ptr factory) 20 | : ArrowScanFunctionData(ArrowIPCStreamFactory::Produce, 21 | reinterpret_cast(factory.get())), 22 | factory(std::move(factory)) {} 23 | std::unique_ptr factory; 24 | }; 25 | } // namespace ext_nanoarrow 26 | } // namespace duckdb 27 | -------------------------------------------------------------------------------- /src/include/table_function/read_arrow.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/read_arrow.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table_function.hpp" 12 | #include "duckdb/parser/parsed_data/copy_info.hpp" 13 | 14 | namespace duckdb { 15 | namespace ext_nanoarrow { 16 | 17 | TableFunction ReadArrowStreamFunction(); 18 | 19 | void RegisterReadArrowStream(DatabaseInstance& db); 20 | 21 | } // namespace ext_nanoarrow 22 | } // namespace duckdb 23 | -------------------------------------------------------------------------------- /src/include/table_function/scan_arrow_ipc.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // table_function/scan_arrow_ipc.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | 11 | #include "duckdb/function/table/arrow.hpp" 12 | 13 | #include "duckdb.hpp" 14 | 15 | namespace duckdb { 16 | namespace ext_nanoarrow { 17 | 18 | //! Arrow IPC Buffer, basically a pointer to the buffer and its size 19 | struct ArrowIPCBuffer { 20 | ArrowIPCBuffer(const uint64_t ptr, const uint64_t size) : ptr(ptr), size(size) {}; 21 | uint64_t ptr; 22 | uint64_t size; 23 | }; 24 | 25 | //! IPC Table scan is identical to ArrowTableFunction arrow scan except instead 26 | //! of CDataInterface header pointers, it takes a bunch of pointers pointing to 27 | //! 
buffers containing data in Arrow IPC format 28 | struct ScanArrowIPC { 29 | static void RegisterReadArrowStream(DatabaseInstance& db); 30 | }; 31 | } // namespace ext_nanoarrow 32 | } // namespace duckdb 33 | -------------------------------------------------------------------------------- /src/include/write_arrow_stream.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // write_arrow_stream.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #include "duckdb/function/copy_function.hpp" 11 | 12 | namespace duckdb { 13 | namespace ext_nanoarrow { 14 | 15 | void RegisterArrowStreamCopyFunction(DatabaseInstance& db); 16 | 17 | } // namespace ext_nanoarrow 18 | } // namespace duckdb 19 | -------------------------------------------------------------------------------- /src/include/writer/arrow_stream_writer.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/arrow_stream_writer.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #pragma once 10 | #include "duckdb/main/client_context.hpp" 11 | #include "writer/column_data_collection_serializer.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | struct ArrowStreamWriter { 17 | ArrowStreamWriter(ClientContext& context, FileSystem& fs, const string& file_path, 18 | const vector& logical_types, 19 | const vector& column_names, 20 | const vector>& metadata); 21 | 22 | void InitSchema(const vector& logical_types, 23 | const vector& column_names, 24 | const vector>& metadata); 25 | 26 | void InitOutputFile(FileSystem& fs, const string& file_path); 27 | 28 | void WriteSchema(); 29 | 30 | unique_ptr NewSerializer(); 31 | 32 | void Flush(ColumnDataCollection& buffer); 33 | 34 | void Flush(ColumnDataCollectionSerializer& serializer); 35 | 36 | void Finalize() const; 37 | 38 | idx_t NumberOfRowGroups() const; 39 | 40 | idx_t FileSize() const; 41 | 42 | private: 43 | ClientProperties options; 44 | Allocator& allocator; 45 | ColumnDataCollectionSerializer serializer; 46 | string file_name; 47 | vector logical_types; 48 | unique_ptr writer; 49 | idx_t row_group_count{0}; 50 | nanoarrow::UniqueSchema schema; 51 | }; 52 | 53 | } // namespace ext_nanoarrow 54 | } // namespace duckdb 55 | -------------------------------------------------------------------------------- /src/include/writer/column_data_collection_serializer.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/column_data_collection_serializer.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #pragma once 9 | 10 | #include "duckdb/common/arrow/arrow_converter.hpp" 11 | #include "duckdb/common/exception.hpp" 12 | #include "duckdb/common/serializer/buffered_file_writer.hpp" 13 | #include "duckdb/common/types/column/column_data_collection.hpp" 14 | #include "duckdb/function/table/arrow/arrow_duck_schema.hpp" 15 | #include "duckdb/main/client_properties.hpp" 16 | #include "nanoarrow/nanoarrow_ipc.hpp" 17 | #include 
"nanoarrow_errors.hpp" 18 | 19 | namespace duckdb { 20 | namespace ext_nanoarrow { 21 | 22 | class ColumnDataCollectionSerializer { 23 | public: 24 | ColumnDataCollectionSerializer(ClientProperties options, Allocator& allocator); 25 | 26 | void Init(const ArrowSchema* schema_p, const vector& logical_types); 27 | 28 | void SerializeSchema(); 29 | 30 | idx_t Serialize(ArrowArray& array); 31 | idx_t Serialize(DataChunk& chunk); 32 | 33 | idx_t Serialize(const ColumnDataCollection& buffer); 34 | 35 | void Flush(BufferedFileWriter& writer); 36 | 37 | nanoarrow::UniqueBuffer GetHeader(); 38 | 39 | nanoarrow::UniqueBuffer GetBody(); 40 | 41 | private: 42 | ClientProperties options; 43 | Allocator& allocator; 44 | const ArrowSchema* schema{}; 45 | unordered_map> extension_types; 46 | nanoarrow::ipc::UniqueEncoder encoder; 47 | nanoarrow::UniqueArrayView chunk_view; 48 | nanoarrow::UniqueArray chunk_arrow; 49 | nanoarrow::UniqueBuffer header; 50 | nanoarrow::UniqueBuffer body; 51 | ArrowError error{}; 52 | }; 53 | 54 | } // namespace ext_nanoarrow 55 | } // namespace duckdb 56 | -------------------------------------------------------------------------------- /src/include/writer/to_arrow_ipc.hpp: -------------------------------------------------------------------------------- 1 | //===----------------------------------------------------------------------===// 2 | // DuckDB - nanoarrow 3 | // 4 | // writer/to_arrow_ipc.hpp 5 | // 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #pragma once 9 | #include "duckdb/function/table_function.hpp" 10 | 11 | #include "nanoarrow/hpp/unique.hpp" 12 | 13 | namespace duckdb { 14 | namespace ext_nanoarrow { 15 | 16 | class ArrowStringVectorBuffer : public VectorBuffer { 17 | public: 18 | explicit ArrowStringVectorBuffer(nanoarrow::UniqueBuffer buffer_p) 19 | : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) {} 20 | 21 | private: 22 | nanoarrow::UniqueBuffer buffer; 23 | }; 24 | 25 | class ToArrowIPCFunction { 26 | public: 27 | //! 
note: this is the number of vectors per chunk 28 | static constexpr idx_t DEFAULT_CHUNK_SIZE = 120; 29 | 30 | static TableFunction GetFunction(); 31 | static void RegisterToIPCFunction(DatabaseInstance& db); 32 | 33 | private: 34 | static unique_ptr InitLocal( 35 | ExecutionContext& context, TableFunctionInitInput& input, 36 | GlobalTableFunctionState* global_state); 37 | static unique_ptr InitGlobal(ClientContext& context, 38 | TableFunctionInitInput& input); 39 | static unique_ptr Bind(ClientContext& context, 40 | TableFunctionBindInput& input, 41 | vector& return_types, 42 | vector& names); 43 | static OperatorResultType Function(ExecutionContext& context, 44 | TableFunctionInput& data_p, DataChunk& input, 45 | DataChunk& output); 46 | static OperatorFinalizeResultType FunctionFinal(ExecutionContext& context, 47 | TableFunctionInput& data_p, 48 | DataChunk& output); 49 | }; 50 | 51 | } // namespace ext_nanoarrow 52 | } // namespace duckdb 53 | -------------------------------------------------------------------------------- /src/ipc/array_stream.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/array_stream.hpp" 2 | 3 | namespace duckdb { 4 | namespace ext_nanoarrow { 5 | 6 | IpcArrayStream::IpcArrayStream(unique_ptr reader) 7 | : reader(std::move(reader)) {} 8 | 9 | IPCStreamReader& IpcArrayStream::Reader() const { return *reader; } 10 | 11 | void IpcArrayStream::ToArrayStream(ArrowArrayStream* stream) { 12 | nanoarrow::ArrayStreamFactory::InitArrayStream( 13 | new IpcArrayStream(std::move(reader)), stream); 14 | } 15 | 16 | int IpcArrayStream::GetSchema(ArrowSchema* schema) { 17 | return Wrap([&]() { 18 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(reader->GetOutputSchema(), schema)); 19 | }); 20 | } 21 | 22 | int IpcArrayStream::GetNext(ArrowArray* array) { 23 | return Wrap([&]() { reader->GetNextBatch(array); }); 24 | } 25 | 26 | const char* IpcArrayStream::GetLastError() const { return last_msg.c_str(); } 27 | 28 | } // namespace ext_nanoarrow 29 | } // namespace duckdb 30 | -------------------------------------------------------------------------------- /src/ipc/stream_factory.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_factory.hpp" 2 | 3 | #include 4 | #include 5 | 6 | #include "ipc/stream_reader/ipc_buffer_stream_reader.hpp" 7 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 8 | 9 | namespace duckdb { 10 | namespace ext_nanoarrow { 11 | ArrowIPCStreamFactory::ArrowIPCStreamFactory(Allocator& allocator_p) 12 | : allocator(allocator_p) {} 13 | 14 | unique_ptr ArrowIPCStreamFactory::Produce( 15 | uintptr_t factory_ptr, ArrowStreamParameters& parameters) { 16 | auto factory = 17 | static_cast(reinterpret_cast(factory_ptr)); 18 | 19 | if (!factory->reader) { 20 | throw InternalException("IpcStreamReader was not initialized or was already moved"); 21 | } 22 | 23 | if (!parameters.projected_columns.columns.empty()) { 24 | factory->reader->SetColumnProjection(parameters.projected_columns.columns); 25 | } 26 | 27 | auto out = make_uniq(); 28 | IpcArrayStream(std::move(factory->reader)).ToArrayStream(&out->arrow_array_stream); 29 | return out; 30 | } 31 | 32 | void ArrowIPCStreamFactory::GetFileSchema(ArrowSchemaWrapper& schema) const { 33 | if (!reader) { 34 | throw InternalException("IpcStreamReader is no longer valid"); 35 | } 36 | 37 | NANOARROW_THROW_NOT_OK( 38 | ArrowSchemaDeepCopy(reader->GetBaseSchema(), &schema.arrow_schema)); 39 | } 40 | 41 | 
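// The two concrete factories below differ only in where the IPC bytes come
// from: BufferIPCStreamFactory wraps buffers that already sit in memory (the
// scan_arrow_ipc path), while FileIPCStreamFactory opens a file through
// DuckDB's FileSystem (the read_arrow path).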
BufferIPCStreamFactory::BufferIPCStreamFactory(ClientContext& context, 42 | const vector& buffers_p) 43 | : ArrowIPCStreamFactory(BufferAllocator::Get(context)), buffers(buffers_p) {} 44 | 45 | void BufferIPCStreamFactory::InitReader() { 46 | if (reader) { 47 | throw InternalException("ArrowArrayStream or IpcStreamReader already initialized"); 48 | } 49 | reader = make_uniq(buffers, allocator); 50 | } 51 | 52 | FileIPCStreamFactory::FileIPCStreamFactory(ClientContext& context, string src_string) 53 | : ArrowIPCStreamFactory(BufferAllocator::Get(context)), 54 | fs(FileSystem::GetFileSystem(context)), 55 | src_string(std::move(src_string)) {} 56 | 57 | void FileIPCStreamFactory::InitReader() { 58 | if (reader) { 59 | throw InternalException("ArrowArrayStream or IpcStreamReader already initialized"); 60 | } 61 | unique_ptr handle = fs.OpenFile(src_string, FileOpenFlags::FILE_FLAGS_READ); 62 | reader = make_uniq(fs, std::move(handle), allocator); 63 | } 64 | 65 | } // namespace ext_nanoarrow 66 | } // namespace duckdb 67 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/base_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/base_stream_reader.hpp" 2 | #include 3 | #include 4 | #include "zstd.h" 5 | 6 | namespace duckdb { 7 | namespace ext_nanoarrow { 8 | 9 | // A version of ArrowDecompressZstd that uses DuckDB's C++ name-specified 10 | // zstd.h header that doesn't work with a C compiler 11 | static ArrowErrorCode DuckDBDecompressZstd(struct ArrowBufferView src, uint8_t* dst, 12 | int64_t dst_size, struct ArrowError* error) { 13 | size_t code = duckdb_zstd::ZSTD_decompress((void*)dst, (size_t)dst_size, src.data.data, 14 | src.size_bytes); 15 | if (duckdb_zstd::ZSTD_isError(code)) { 16 | ArrowErrorSet(error, 17 | "ZSTD_decompress([buffer with %" PRId64 18 | " bytes] -> [buffer with %" PRId64 " bytes]) failed with error '%s'", 19 | src.size_bytes, dst_size, duckdb_zstd::ZSTD_getErrorName(code)); 20 | return EIO; 21 | } 22 | 23 | if (dst_size != static_cast(code)) { 24 | ArrowErrorSet(error, 25 | "Expected decompressed size of %" PRId64 " bytes but got %" PRId64 26 | " bytes", 27 | dst_size, static_cast(code)); 28 | return EIO; 29 | } 30 | 31 | return NANOARROW_OK; 32 | } 33 | 34 | // Create an ArrowIpcDecoder() with the appropriate decompressor set. 35 | // We could also define a decompressor that uses threads to parellelize 36 | // decompression for batches with many columns. 37 | nanoarrow::ipc::UniqueDecoder IPCStreamReader::NewDuckDBArrowDecoder() { 38 | nanoarrow::ipc::UniqueDecompressor decompressor; 39 | NANOARROW_THROW_NOT_OK(ArrowIpcSerialDecompressor(decompressor.get())); 40 | NANOARROW_THROW_NOT_OK(ArrowIpcSerialDecompressorSetFunction( 41 | decompressor.get(), NANOARROW_IPC_COMPRESSION_TYPE_ZSTD, DuckDBDecompressZstd)); 42 | 43 | nanoarrow::ipc::UniqueDecoder decoder; 44 | NANOARROW_THROW_NOT_OK(ArrowIpcDecoderInit(decoder.get())); 45 | NANOARROW_THROW_NOT_OK( 46 | ArrowIpcDecoderSetDecompressor(decoder.get(), decompressor.get())); 47 | // Bug in nanoarrow! 
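// (assumed rationale: after ArrowIpcDecoderSetDecompressor, the decoder and the
// local UniqueDecompressor reference the same state, so we null the local
// release callback to avoid releasing it twice when `decompressor` goes out of
// scope)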
48 | decompressor->release = nullptr; 49 | return decoder; 50 | } 51 | 52 | const ArrowSchema* IPCStreamReader::GetBaseSchema() { 53 | if (base_schema->release) { 54 | return base_schema.get(); 55 | } 56 | 57 | ReadNextMessage({NANOARROW_IPC_MESSAGE_TYPE_SCHEMA}, /*end_of_stream_ok*/ false); 58 | 59 | if (decoder->feature_flags & NANOARROW_IPC_FEATURE_DICTIONARY_REPLACEMENT) { 60 | throw IOException("This stream uses unsupported feature DICTIONARY_REPLACEMENT"); 61 | } 62 | 63 | // Decode the schema 64 | THROW_NOT_OK(IOException, &error, 65 | ArrowIpcDecoderDecodeSchema(decoder.get(), base_schema.get(), &error)); 66 | 67 | // Set up the decoder to decode batches 68 | THROW_NOT_OK(InternalException, &error, 69 | ArrowIpcDecoderSetEndianness(decoder.get(), decoder->endianness)); 70 | THROW_NOT_OK(InternalException, &error, 71 | ArrowIpcDecoderSetSchema(decoder.get(), base_schema.get(), &error)); 72 | 73 | return base_schema.get(); 74 | } 75 | 76 | bool IPCStreamReader::HasProjection() const { return !projected_fields.empty(); } 77 | 78 | const ArrowSchema* IPCStreamReader::GetOutputSchema() { 79 | if (HasProjection()) { 80 | return projected_schema.get(); 81 | } else { 82 | return GetBaseSchema(); 83 | } 84 | } 85 | 86 | bool IPCStreamReader::GetNextBatch(ArrowArray* out) { 87 | // When nanoarrow supports dictionary batches, we'd accept either a 88 | // RecordBatch or DictionaryBatch message, recording the dictionary batch 89 | // (or possibly ignoring it if it is for a field that we don't care about), 90 | // but looping until we end up with a RecordBatch in the decoder. 91 | ArrowIpcMessageType message_type = 92 | ReadNextMessage({NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH}); 93 | if (message_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 94 | out->release = nullptr; 95 | return false; 96 | } 97 | 98 | // Use the ArrowIpcSharedBuffer if we have thread safety (i.e., if this was 99 | // compiled with a compiler that supports C11 atomics, i.e., not gcc 4.8 or 100 | // MSVC) 101 | bool thread_safe_shared = ArrowIpcSharedBufferIsThreadSafe(); 102 | struct ArrowBufferView body_view = AllocatedDataView(cur_ptr, cur_size); 103 | nanoarrow::UniqueBuffer body_shared = GetUniqueBuffer(); 104 | UniqueSharedBuffer shared; 105 | NANOARROW_THROW_NOT_OK(ArrowIpcSharedBufferInit(&shared.data, body_shared.get())); 106 | nanoarrow::UniqueArray array; 107 | if (HasProjection()) { 108 | NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(array.get(), NANOARROW_TYPE_STRUCT)); 109 | NANOARROW_THROW_NOT_OK( 110 | ArrowArrayAllocateChildren(array.get(), GetOutputSchema()->n_children)); 111 | 112 | if (thread_safe_shared) { 113 | for (int64_t i = 0; i < array->n_children; i++) { 114 | THROW_NOT_OK(InternalException, &error, 115 | ArrowIpcDecoderDecodeArrayFromShared( 116 | decoder.get(), &shared.data, projected_fields[i], 117 | array->children[i], NANOARROW_VALIDATION_LEVEL_FULL, &error)); 118 | } 119 | } else { 120 | for (int64_t i = 0; i < array->n_children; i++) { 121 | THROW_NOT_OK(InternalException, &error, 122 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, 123 | projected_fields[i], array->children[i], 124 | NANOARROW_VALIDATION_LEVEL_FULL, &error)); 125 | } 126 | } 127 | 128 | D_ASSERT(array->n_children > 0); 129 | array->length = array->children[0]->length; 130 | array->null_count = 0; 131 | } else if (thread_safe_shared) { 132 | THROW_NOT_OK( 133 | InternalException, &error, 134 | ArrowIpcDecoderDecodeArrayFromShared(decoder.get(), &shared.data, -1, array.get(), 135 | NANOARROW_VALIDATION_LEVEL_FULL, 
&error)); 136 | } else { 137 | THROW_NOT_OK(InternalException, &error, 138 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, array.get(), 139 | NANOARROW_VALIDATION_LEVEL_FULL, &error)); 140 | } 141 | 142 | ArrowArrayMove(array.get(), out); 143 | return true; 144 | } 145 | 146 | void IPCStreamReader::SetColumnProjection(const vector& column_names) { 147 | if (column_names.empty()) { 148 | throw InternalException("Can't request zero fields projected from IpcStreamReader"); 149 | } 150 | 151 | // Ensure we have a file schema to work with 152 | GetBaseSchema(); 153 | 154 | nanoarrow::UniqueSchema schema; 155 | ArrowSchemaInit(schema.get()); 156 | NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct( 157 | schema.get(), UnsafeNumericCast(column_names.size()))); 158 | 159 | // The ArrowArray builder needs the flattened field index, which we need to 160 | // keep track of. 161 | unordered_map> name_to_flat_field_map; 162 | 163 | // Duplicate column names are in theory fine as long as they are not queried, 164 | // so we need to make a list of them to check. 165 | unordered_set duplicate_column_names; 166 | 167 | vector names; 168 | // Let's check if we need to deduplicate projection column names 169 | for (idx_t col_idx = 0; col_idx < static_cast(base_schema->n_children); 170 | col_idx++) { 171 | if (base_schema->children[col_idx]->name) { 172 | names.push_back(base_schema->children[col_idx]->name); 173 | } else { 174 | names.push_back(""); 175 | } 176 | } 177 | QueryResult::DeduplicateColumns(names); 178 | // Loop over columns to build the field map 179 | int64_t field_count = 0; 180 | for (int64_t i = 0; i < base_schema->n_children; i++) { 181 | if (name_to_flat_field_map.find(names[i]) != name_to_flat_field_map.end()) { 182 | duplicate_column_names.insert(names[i]); 183 | } 184 | name_to_flat_field_map.insert({names[i], {field_count, base_schema->children[i]}}); 185 | field_count += CountFields(base_schema->children[i]); 186 | } 187 | 188 | // Loop over projected column names to build the projection information 189 | int64_t output_column_index = 0; 190 | for (const auto& column_name : column_names) { 191 | if (duplicate_column_names.find(column_name) != duplicate_column_names.end()) { 192 | throw InternalException(string("Field '") + column_name + 193 | "' refers to a duplicate column name in IPC file schema"); 194 | } 195 | 196 | auto field_id_item = name_to_flat_field_map.find(column_name); 197 | if (field_id_item == name_to_flat_field_map.end()) { 198 | throw InternalException(string("Field '") + column_name + 199 | "' does not exist in IPC file schema"); 200 | } 201 | 202 | // Record the flat field index for this column 203 | projected_fields.push_back(field_id_item->second.first); 204 | 205 | // Record the Schema for this column 206 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(field_id_item->second.second, 207 | schema->children[output_column_index])); 208 | 209 | ++output_column_index; 210 | } 211 | projected_schema = std::move(schema); 212 | } 213 | 214 | idx_t IPCStreamReader::DecodeMetadata() const { 215 | idx_t metadata_size; 216 | if (!Radix::IsLittleEndian()) { 217 | metadata_size = static_cast(BSWAP32(message_prefix.metadata_size)); 218 | } else { 219 | metadata_size = message_prefix.metadata_size; 220 | } 221 | 222 | if (metadata_size < 0) { 223 | throw IOException(std::string("Expected metadata size >= 0 but got " + 224 | std::to_string(metadata_size))); 225 | } 226 | return metadata_size + sizeof(message_prefix); 227 | } 228 | 229 | ArrowIpcMessageType 
IPCStreamReader::DecodeMessage() { 230 | auto message_header_size = DecodeMetadata(); 231 | if (DecodeHeader(message_header_size)) { 232 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 233 | } 234 | DecodeBody(); 235 | return decoder->message_type; 236 | } 237 | 238 | ArrowIpcMessageType IPCStreamReader::ReadNextMessage( 239 | vector expected_types, bool end_of_stream_ok) { 240 | ArrowIpcMessageType actual_type = ReadNextMessage(); 241 | if (end_of_stream_ok && actual_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 242 | return actual_type; 243 | } 244 | 245 | for (const auto expected_type : expected_types) { 246 | if (expected_type == actual_type) { 247 | return actual_type; 248 | } 249 | } 250 | 251 | std::stringstream expected_types_label; 252 | for (size_t i = 0; i < expected_types.size(); i++) { 253 | if (i > 0) { 254 | expected_types_label << " or "; 255 | } 256 | 257 | expected_types_label << MessageTypeString(expected_types[i]); 258 | } 259 | 260 | string actual_type_label; 261 | if (actual_type == NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED) { 262 | actual_type_label = "end of stream"; 263 | } else { 264 | actual_type_label = MessageTypeString(actual_type); 265 | } 266 | 267 | throw IOException(string("Expected ") + expected_types_label.str() + 268 | " Arrow IPC message but got " + actual_type_label); 269 | } 270 | 271 | int64_t IPCStreamReader::CountFields(const ArrowSchema* schema) { 272 | int64_t n_fields = 1; 273 | for (int64_t i = 0; i < schema->n_children; i++) { 274 | n_fields += CountFields(schema->children[i]); 275 | } 276 | return n_fields; 277 | } 278 | 279 | ArrowBufferView IPCStreamReader::AllocatedDataView(const_data_ptr_t data, int64_t size) { 280 | ArrowBufferView view{}; 281 | view.data.data = data; 282 | view.size_bytes = size; 283 | return view; 284 | } 285 | 286 | nanoarrow::UniqueBuffer IPCStreamReader::AllocatedDataToOwningBuffer( 287 | const shared_ptr& data) { 288 | nanoarrow::UniqueBuffer out; 289 | if (data) { 290 | nanoarrow::BufferInitWrapped(out.get(), data, data->get(), 291 | UnsafeNumericCast(data->GetSize())); 292 | } 293 | return out; 294 | } 295 | 296 | const char* IPCStreamReader::MessageTypeString(ArrowIpcMessageType message_type) { 297 | switch (message_type) { 298 | case NANOARROW_IPC_MESSAGE_TYPE_SCHEMA: 299 | return "Schema"; 300 | case NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH: 301 | return "RecordBatch"; 302 | case NANOARROW_IPC_MESSAGE_TYPE_DICTIONARY_BATCH: 303 | return "DictionaryBatch"; 304 | case NANOARROW_IPC_MESSAGE_TYPE_TENSOR: 305 | return "Tensor"; 306 | case NANOARROW_IPC_MESSAGE_TYPE_SPARSE_TENSOR: 307 | return "SparseTensor"; 308 | case NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED: 309 | return "Uninitialized"; 310 | default: 311 | return ""; 312 | } 313 | } 314 | 315 | } // namespace ext_nanoarrow 316 | } // namespace duckdb 317 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/ipc_buffer_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/ipc_buffer_stream_reader.hpp" 2 | 3 | #include 4 | 5 | namespace duckdb { 6 | namespace ext_nanoarrow { 7 | 8 | IPCBufferStreamReader::IPCBufferStreamReader(vector buffers, 9 | Allocator& allocator) 10 | : IPCStreamReader(allocator), buffers(std::move(buffers)) {} 11 | 12 | ArrowIpcMessageType IPCBufferStreamReader::ReadNextMessage() { 13 | if ((!initialized && cur_idx == buffers.size()) || finished) { 14 | finished = true; 15 | return 
NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 16 | } 17 | if (!initialized || cur_buffer.pos >= buffers[cur_idx].size) { 18 | if (initialized) { 19 | cur_idx++; 20 | } 21 | if (cur_idx >= buffers.size()) { 22 | finished = true; 23 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 24 | } 25 | cur_buffer.ptr = reinterpret_cast(buffers[cur_idx].ptr); 26 | cur_buffer.size = static_cast(buffers[cur_idx].size); 27 | cur_buffer.pos = 0; 28 | initialized = true; 29 | } 30 | auto* message_prefix_ptr = reinterpret_cast( 31 | ReadData(reinterpret_cast(&message_prefix), sizeof(message_prefix))); 32 | message_prefix = *message_prefix_ptr; 33 | return DecodeMessage(); 34 | } 35 | 36 | data_ptr_t IPCBufferStreamReader::ReadData(data_ptr_t ptr, idx_t size) { 37 | D_ASSERT(size + cur_buffer.pos < cur_buffer.size); 38 | data_ptr_t cur_ptr = cur_buffer.ptr + cur_buffer.pos; 39 | cur_buffer.pos += size; 40 | return cur_ptr; 41 | } 42 | 43 | bool IPCBufferStreamReader::DecodeHeader(idx_t message_header_size) { 44 | // Our Header must contain the message prefix 45 | header.ptr = 46 | ReadData(header.ptr, message_prefix.metadata_size) - sizeof(message_prefix); 47 | header.size = message_header_size; 48 | const ArrowErrorCode decode_header_status = ArrowIpcDecoderDecodeHeader( 49 | decoder.get(), AllocatedDataView(header.ptr, header.size), &error); 50 | if (decode_header_status == ENODATA) { 51 | finished = true; 52 | return true; 53 | } 54 | THROW_NOT_OK(IOException, &error, decode_header_status); 55 | return false; 56 | } 57 | 58 | void IPCBufferStreamReader::DecodeBody() { 59 | if (decoder->body_size_bytes > 0) { 60 | body.ptr = ReadData(body.ptr, decoder->body_size_bytes); 61 | } 62 | if (body.ptr) { 63 | cur_ptr = body.ptr; 64 | cur_size = body.size; 65 | } else { 66 | cur_ptr = nullptr; 67 | cur_size = 0; 68 | } 69 | } 70 | 71 | nanoarrow::UniqueBuffer IPCBufferStreamReader::GetUniqueBuffer() { 72 | nanoarrow::UniqueBuffer out; 73 | nanoarrow::BufferInitWrapped(out.get(), body, body.ptr, body.size); 74 | return out; 75 | } 76 | 77 | } // namespace ext_nanoarrow 78 | } // namespace duckdb 79 | -------------------------------------------------------------------------------- /src/ipc/stream_reader/ipc_file_stream_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "ipc/stream_reader/ipc_file_stream_reader.hpp" 2 | #include "duckdb/common/file_system.hpp" 3 | 4 | namespace duckdb { 5 | namespace ext_nanoarrow { 6 | IPCFileStreamReader::IPCFileStreamReader(FileSystem& fs, unique_ptr handle, 7 | Allocator& allocator) 8 | : IPCStreamReader(allocator), file_reader(fs, std::move(handle)) {} 9 | 10 | void IPCFileStreamReader::PopulateNames(vector& names) { 11 | GetBaseSchema(); 12 | for (int64_t i = 0; i < base_schema->n_children; i++) { 13 | const ArrowSchema* column = base_schema->children[i]; 14 | if (!column->name) { 15 | names.push_back(""); 16 | } else { 17 | names.push_back(column->name); 18 | } 19 | } 20 | } 21 | 22 | double IPCFileStreamReader::GetProgress() { 23 | idx_t file_size = file_reader.FileSize(); 24 | if (file_size == 0) { 25 | return 100; 26 | } 27 | auto current_offset = static_cast(file_reader.CurrentOffset()); 28 | return (current_offset / static_cast(file_size)) * 100; 29 | } 30 | 31 | void IPCFileStreamReader::DecodeArray(nanoarrow::ipc::UniqueDecoder& decoder, 32 | ArrowArray* out, ArrowBufferView& body_view, 33 | ArrowError* error) { 34 | // Use the ArrowIpcSharedBuffer if we have thread safety (i.e., if this was 35 | // compiled with a 
compiler that supports C11 atomics, i.e., not gcc 4.8 or 36 | // MSVC) 37 | nanoarrow::UniqueArray array; 38 | THROW_NOT_OK(InternalException, error, 39 | ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, array.get(), 40 | NANOARROW_VALIDATION_LEVEL_FULL, error)); 41 | ArrowArrayMove(array.get(), out); 42 | } 43 | 44 | nanoarrow::UniqueBuffer IPCFileStreamReader::GetUniqueBuffer() { 45 | return AllocatedDataToOwningBuffer(message_body); 46 | } 47 | bool IPCFileStreamReader::DecodeHeader(const idx_t message_header_size) { 48 | if (message_header.GetSize() < message_header_size) { 49 | message_header = allocator.Allocate(message_header_size); 50 | } 51 | // Read the message header. I believe the fact that this loops and calls 52 | // the file handle's Read() method with relatively small chunks will ensure that 53 | // an attempt to read a very large message_header_size can be cancelled. If this 54 | // is not the case, we might want to implement our own buffering. 55 | std::memcpy(message_header.get(), &message_prefix, sizeof(message_prefix)); 56 | ReadData(message_header.get() + sizeof(message_prefix), message_prefix.metadata_size); 57 | 58 | ArrowErrorCode decode_header_status = ArrowIpcDecoderDecodeHeader( 59 | decoder.get(), 60 | AllocatedDataView(message_header.get(), 61 | static_cast(message_header.GetSize())), 62 | &error); 63 | if (decode_header_status == ENODATA) { 64 | finished = true; 65 | return true; 66 | } 67 | THROW_NOT_OK(IOException, &error, decode_header_status); 68 | return false; 69 | } 70 | 71 | void IPCFileStreamReader::DecodeBody() { 72 | if (decoder->body_size_bytes > 0) { 73 | EnsureInputStreamAligned(); 74 | message_body = 75 | make_shared_ptr(allocator.Allocate(decoder->body_size_bytes)); 76 | 77 | // Again, this is possibly a long running Read() call for a large body. 78 | // We could possibly be smarter about how we do this, particularly if we 79 | // are reading a small portion of the input from a seekable file. 80 | ReadData(message_body->get(), decoder->body_size_bytes); 81 | } 82 | if (message_body) { 83 | cur_ptr = message_body->get(); 84 | cur_size = static_cast(message_body->GetSize()); 85 | } else { 86 | cur_ptr = nullptr; 87 | cur_size = 0; 88 | } 89 | } 90 | 91 | data_ptr_t IPCFileStreamReader::ReadData(data_ptr_t ptr, idx_t size) { 92 | file_reader.ReadData(ptr, size); 93 | return ptr; 94 | } 95 | 96 | ArrowIpcMessageType IPCFileStreamReader::ReadNextMessage() { 97 | if (finished) { 98 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 99 | } 100 | 101 | // If there is no more data to be read, we're done! 102 | try { 103 | EnsureInputStreamAligned(); 104 | file_reader.ReadData(reinterpret_cast(&message_prefix), 105 | sizeof(message_prefix)); 106 | 107 | // If we're at the beginning of the read, and we see the Arrow file format 108 | // header bytes, skip them and try to read the stream anyway. This works because 109 | // there's a full stream within an Arrow file (including the EOS indicator, which 110 | // is key to success. This EOS indicator is unfortunately missing in Rust releases 111 | // prior to ~September 2024). 112 | // 113 | // When we support dictionary encoding we will possibly need to seek to the footer 114 | // here, parse that, and maybe lazily seek and read dictionaries for if/when they are 115 | // required. 
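    // "ARROW1\0\0" (the 6-byte magic plus 2 padding bytes) is how the Arrow IPC
    // *file* format begins; the streaming format has no such prefix, so if the
    // first 8 bytes match it we were handed a file rather than a bare stream
    // and simply skip them.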
116 | if (file_reader.CurrentOffset() == 8 && 117 | std::memcmp("ARROW1\0\0", &message_prefix, 8) == 0) { 118 | return ReadNextMessage(); 119 | } 120 | 121 | if (message_prefix.continuation_token != kContinuationToken) { 122 | throw IOException(std::string("Expected continuation token (0xFFFFFFFF) but got " + 123 | std::to_string(message_prefix.continuation_token))); 124 | } 125 | 126 | } catch (SerializationException& e) { 127 | finished = true; 128 | return NANOARROW_IPC_MESSAGE_TYPE_UNINITIALIZED; 129 | } 130 | 131 | return DecodeMessage(); 132 | } 133 | 134 | void IPCFileStreamReader::EnsureInputStreamAligned() { 135 | uint8_t padding[8]; 136 | int padding_bytes = 8 - (file_reader.CurrentOffset() % 8); 137 | if (padding_bytes != 8) { 138 | file_reader.ReadData(padding, padding_bytes); 139 | } 140 | D_ASSERT((file_reader.CurrentOffset() % 8) == 0); 141 | } 142 | 143 | } // namespace ext_nanoarrow 144 | } // namespace duckdb 145 | -------------------------------------------------------------------------------- /src/nanoarrow_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "nanoarrow_extension.hpp" 4 | 5 | #include 6 | #include "writer/to_arrow_ipc.hpp" 7 | 8 | #include "duckdb/function/scalar_function.hpp" 9 | #include "duckdb/main/extension_util.hpp" 10 | 11 | #include "nanoarrow/nanoarrow.hpp" 12 | 13 | #include "table_function/read_arrow.hpp" 14 | #include "table_function/scan_arrow_ipc.hpp" 15 | #include "write_arrow_stream.hpp" 16 | 17 | namespace duckdb { 18 | 19 | namespace { 20 | 21 | struct NanoarrowVersion { 22 | static void Register(DatabaseInstance& db) { 23 | auto fn = ScalarFunction("nanoarrow_version", {}, LogicalType::VARCHAR, ExecuteFn); 24 | ExtensionUtil::RegisterFunction(db, fn); 25 | } 26 | 27 | static void ExecuteFn(DataChunk& args, ExpressionState& state, Vector& result) { 28 | result.SetValue(0, StringVector::AddString(result, ArrowNanoarrowVersion())); 29 | result.SetVectorType(VectorType::CONSTANT_VECTOR); 30 | } 31 | }; 32 | 33 | void LoadInternal(DatabaseInstance& db) { 34 | NanoarrowVersion::Register(db); 35 | ext_nanoarrow::RegisterReadArrowStream(db); 36 | ext_nanoarrow::RegisterArrowStreamCopyFunction(db); 37 | 38 | ext_nanoarrow::ScanArrowIPC::RegisterReadArrowStream(db); 39 | ext_nanoarrow::ToArrowIPCFunction::RegisterToIPCFunction(db); 40 | } 41 | 42 | } // namespace 43 | 44 | void NanoarrowExtension::Load(DuckDB& db) { LoadInternal(*db.instance); } 45 | std::string NanoarrowExtension::Name() { return "nanoarrow"; } 46 | 47 | std::string NanoarrowExtension::Version() const { 48 | #ifdef EXT_VERSION_NANOARROW 49 | return EXT_VERSION_NANOARROW; 50 | #else 51 | return ""; 52 | #endif 53 | } 54 | 55 | } // namespace duckdb 56 | 57 | extern "C" { 58 | 59 | DUCKDB_EXTENSION_API void nanoarrow_init(duckdb::DatabaseInstance& db) { 60 | duckdb::DuckDB db_wrapper(db); 61 | db_wrapper.LoadExtension(); 62 | } 63 | 64 | DUCKDB_EXTENSION_API const char* nanoarrow_version() { 65 | return duckdb::DuckDB::LibraryVersion(); 66 | } 67 | } 68 | 69 | #ifndef DUCKDB_EXTENSION_MAIN 70 | #error DUCKDB_EXTENSION_MAIN not defined 71 | #endif 72 | -------------------------------------------------------------------------------- /src/scanner/read_arrow.cpp: -------------------------------------------------------------------------------- 1 | #include "table_function/read_arrow.hpp" 2 | 3 | #include 4 | 5 | #include "file_scanner/arrow_multi_file_info.hpp" 6 | #include "zstd.h" 7 | 8 | 
#include "duckdb/common/radix.hpp" 9 | #include "duckdb/common/serializer/buffered_file_reader.hpp" 10 | #include "duckdb/function/table/arrow.hpp" 11 | #include "duckdb/function/table_function.hpp" 12 | #include "duckdb/main/database.hpp" 13 | #include "duckdb/main/extension_util.hpp" 14 | #include "duckdb/parser/expression/constant_expression.hpp" 15 | #include "duckdb/parser/expression/function_expression.hpp" 16 | #include "duckdb/parser/tableref/table_function_ref.hpp" 17 | 18 | #include "nanoarrow/nanoarrow.hpp" 19 | #include "nanoarrow/nanoarrow_ipc.hpp" 20 | 21 | #include "ipc/stream_factory.hpp" 22 | #include "ipc/stream_reader/base_stream_reader.hpp" 23 | #include "nanoarrow_errors.hpp" 24 | #include "table_function/arrow_ipc_function_data.hpp" 25 | 26 | // read_arrow() implementation 27 | // 28 | // This version uses the ArrowIpcDecoder directly. instead of nanoarrow's 29 | // ArrowArrayStream wrapper. This lets it use DuckDB's allocator at the 30 | // expense of a bit more verbosity. Because we can apply the projection 31 | // it reduces some of the verbosity of the actual DuckDB part (although the 32 | // ArrayStreamReader from nanoarrow could support a projection, which 33 | // would handle that too). 34 | // 35 | // I like this version better than the simpler one, and there are more parts 36 | // that could get optimized here (whereas with the array stream version you 37 | // don't have much control). 38 | 39 | namespace duckdb { 40 | 41 | namespace ext_nanoarrow { 42 | 43 | struct ReadArrowStream : ArrowTableFunction { 44 | static TableFunction Function() { 45 | MultiFileFunction read_arrow("read_arrow"); 46 | read_arrow.projection_pushdown = true; 47 | read_arrow.filter_pushdown = false; 48 | read_arrow.filter_prune = false; 49 | return static_cast(read_arrow); 50 | } 51 | 52 | static unique_ptr ScanReplacement(ClientContext& context, 53 | ReplacementScanInput& input, 54 | optional_ptr data) { 55 | auto table_name = ReplacementScan::GetFullPath(input); 56 | if (!ReplacementScan::CanReplace(table_name, {"arrows", "arrow"})) { 57 | return nullptr; 58 | } 59 | 60 | auto table_function = make_uniq(); 61 | vector> children; 62 | auto table_name_expr = make_uniq(Value(table_name)); 63 | children.push_back(std::move(table_name_expr)); 64 | auto function_expr = make_uniq("read_arrow", std::move(children)); 65 | table_function->function = std::move(function_expr); 66 | 67 | if (!FileSystem::HasGlob(table_name)) { 68 | auto& fs = FileSystem::GetFileSystem(context); 69 | table_function->alias = fs.ExtractBaseName(table_name); 70 | } 71 | 72 | return std::move(table_function); 73 | } 74 | }; 75 | 76 | TableFunction ReadArrowStreamFunction() { return ReadArrowStream::Function(); } 77 | 78 | void RegisterReadArrowStream(DatabaseInstance& db) { 79 | auto function = ReadArrowStream::Function(); 80 | ExtensionUtil::RegisterFunction(db, function); 81 | // So we can accept a list of paths as well e.g., ['file_1.arrow','file_2.arrow'] 82 | function.arguments = {LogicalType::LIST(LogicalType::VARCHAR)}; 83 | ExtensionUtil::RegisterFunction(db, function); 84 | auto& config = DBConfig::GetConfig(db); 85 | config.replacement_scans.emplace_back(ReadArrowStream::ScanReplacement); 86 | } 87 | 88 | } // namespace ext_nanoarrow 89 | } // namespace duckdb 90 | -------------------------------------------------------------------------------- /src/scanner/scan_arrow_ipc.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 
"table_function/scan_arrow_ipc.hpp" 3 | #include "duckdb/main/extension_util.hpp" 4 | #include "ipc/stream_factory.hpp" 5 | #include "table_function/arrow_ipc_function_data.hpp" 6 | 7 | #include "duckdb/function/table/arrow.hpp" 8 | 9 | #include "ipc/stream_reader/base_stream_reader.hpp" 10 | 11 | #include "duckdb/function/function.hpp" 12 | #include "duckdb/function/table/arrow/arrow_duck_schema.hpp" 13 | #include "duckdb/function/table_function.hpp" 14 | #include "duckdb/main/config.hpp" 15 | namespace duckdb { 16 | 17 | namespace ext_nanoarrow { 18 | 19 | struct ScanArrowIPCFunction : ArrowTableFunction { 20 | static unique_ptr ScanArrowIPCBind(ClientContext& context, 21 | TableFunctionBindInput& input, 22 | vector& return_types, 23 | vector& names) { 24 | // Create a vector with all the buffers and their sizes 25 | vector buffers; 26 | const auto buffer_ptr_list = ListValue::GetChildren(input.inputs[0]); 27 | for (auto& buffer_ptr_struct : buffer_ptr_list) { 28 | auto unpacked = StructValue::GetChildren(buffer_ptr_struct); 29 | buffers.emplace_back(unpacked[0].GetPointer(), unpacked[1].GetValue()); 30 | } 31 | 32 | auto stream_factory = make_uniq(context, buffers); 33 | auto res = make_uniq(std::move(stream_factory)); 34 | res->factory->InitReader(); 35 | res->factory->GetFileSchema(res->schema_root); 36 | 37 | DBConfig& config = DatabaseInstance::GetDatabase(context).config; 38 | PopulateArrowTableType(config, res->arrow_table, res->schema_root, names, 39 | return_types); 40 | QueryResult::DeduplicateColumns(names); 41 | res->all_types = return_types; 42 | if (return_types.empty()) { 43 | throw InvalidInputException( 44 | "Provided table/dataframe must have at least one column"); 45 | } 46 | 47 | return std::move(res); 48 | } 49 | 50 | static TableFunction Function() { 51 | child_list_t make_buffer_struct_children{{"ptr", LogicalType::POINTER}, 52 | {"size", LogicalType::UBIGINT}}; 53 | TableFunction scan_arrow_ipc_func( 54 | "scan_arrow_ipc", 55 | {LogicalType::LIST(LogicalType::STRUCT(make_buffer_struct_children))}, 56 | ArrowScanFunction, ScanArrowIPCBind, ArrowScanInitGlobal, ArrowScanInitLocal); 57 | 58 | scan_arrow_ipc_func.cardinality = ArrowScanCardinality; 59 | scan_arrow_ipc_func.projection_pushdown = true; 60 | scan_arrow_ipc_func.filter_pushdown = false; 61 | scan_arrow_ipc_func.filter_prune = false; 62 | 63 | return scan_arrow_ipc_func; 64 | } 65 | }; 66 | 67 | void ScanArrowIPC::RegisterReadArrowStream(DatabaseInstance& db) { 68 | auto function = ScanArrowIPCFunction::Function(); 69 | ExtensionUtil::RegisterFunction(db, function); 70 | } 71 | 72 | } // namespace ext_nanoarrow 73 | } // namespace duckdb 74 | -------------------------------------------------------------------------------- /src/writer/arrow_stream_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/arrow_stream_writer.hpp" 2 | namespace duckdb { 3 | 4 | namespace ext_nanoarrow { 5 | 6 | ArrowStreamWriter::ArrowStreamWriter(ClientContext& context, FileSystem& fs, 7 | const string& file_path, 8 | const vector& logical_types, 9 | const vector& column_names, 10 | const vector>& metadata) 11 | : options(context.GetClientProperties()), 12 | allocator(BufferAllocator::Get(context)), 13 | serializer(options, allocator), 14 | file_name(file_path), 15 | logical_types(logical_types) { 16 | InitSchema(logical_types, column_names, metadata); 17 | InitOutputFile(fs, file_path); 18 | } 19 | 20 | void ArrowStreamWriter::InitSchema(const vector& logical_types, 21 
| const vector& column_names, 22 | const vector>& metadata) { 23 | nanoarrow::UniqueSchema tmp_schema; 24 | ArrowConverter::ToArrowSchema(tmp_schema.get(), logical_types, column_names, options); 25 | 26 | if (metadata.empty()) { 27 | ArrowSchemaMove(tmp_schema.get(), schema.get()); 28 | } else { 29 | nanoarrow::UniqueBuffer metadata_packed; 30 | NANOARROW_THROW_NOT_OK( 31 | ArrowMetadataBuilderInit(metadata_packed.get(), tmp_schema->metadata)); 32 | ArrowStringView key{}; 33 | ArrowStringView value{}; 34 | for (const auto& item : metadata) { 35 | key = {item.first.data(), static_cast(item.first.size())}; 36 | value = {item.second.data(), static_cast(item.second.size())}; 37 | NANOARROW_THROW_NOT_OK( 38 | ArrowMetadataBuilderAppend(metadata_packed.get(), key, value)); 39 | } 40 | 41 | NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(tmp_schema.get(), schema.get())); 42 | NANOARROW_THROW_NOT_OK(ArrowSchemaSetMetadata( 43 | schema.get(), reinterpret_cast(metadata_packed->data))); 44 | } 45 | 46 | serializer.Init(schema.get(), logical_types); 47 | } 48 | 49 | void ArrowStreamWriter::InitOutputFile(FileSystem& fs, const string& file_path) { 50 | writer = make_uniq( 51 | fs, file_path.c_str(), 52 | FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE_NEW); 53 | } 54 | 55 | void ArrowStreamWriter::WriteSchema() { 56 | serializer.SerializeSchema(); 57 | serializer.Flush(*writer); 58 | } 59 | 60 | unique_ptr ArrowStreamWriter::NewSerializer() { 61 | auto serializer = make_uniq(options, allocator); 62 | serializer->Init(schema.get(), logical_types); 63 | return serializer; 64 | } 65 | 66 | void ArrowStreamWriter::Flush(ColumnDataCollection& buffer) { 67 | serializer.Serialize(buffer); 68 | buffer.Reset(); 69 | serializer.Flush(*writer); 70 | ++row_group_count; 71 | } 72 | 73 | void ArrowStreamWriter::Flush(ColumnDataCollectionSerializer& serializer) { 74 | serializer.Flush(*writer); 75 | ++row_group_count; 76 | } 77 | 78 | void ArrowStreamWriter::Finalize() const { 79 | uint8_t end_of_stream[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}; 80 | writer->WriteData(end_of_stream, sizeof(end_of_stream)); 81 | writer->Close(); 82 | } 83 | 84 | idx_t ArrowStreamWriter::NumberOfRowGroups() const { return row_group_count; } 85 | 86 | idx_t ArrowStreamWriter::FileSize() const { return writer->GetTotalWritten(); } 87 | 88 | } // namespace ext_nanoarrow 89 | } // namespace duckdb 90 | -------------------------------------------------------------------------------- /src/writer/column_data_collection_serializer.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/column_data_collection_serializer.hpp" 2 | 3 | #include 4 | namespace duckdb { 5 | 6 | namespace ext_nanoarrow { 7 | 8 | // Initialize buffer whose realloc operations go through DuckDB's memory 9 | // accounting. Note that the Allocator must outlive the buffer (true for 10 | // the case of this writer, but maybe not true for generic production of 11 | // ArrowArrays whose lifetime might outlive the connection/database). 
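// Concretely, the function below points nanoarrow's ArrowBufferAllocator
// callbacks (reallocate/free) at the DuckDB Allocator passed via private_data,
// so every buffer resize and free performed by the IPC encoder is routed
// through, and accounted for by, DuckDB's allocator.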
12 | inline void InitArrowDuckBuffer(ArrowBuffer* buffer, Allocator& duck_allocator) { 13 | ArrowBufferInit(buffer); 14 | 15 | buffer->allocator.reallocate = [](ArrowBufferAllocator* allocator, uint8_t* ptr, 16 | int64_t old_size, int64_t new_size) -> uint8_t* { 17 | NANOARROW_DCHECK(allocator->private_data != nullptr); 18 | auto duck_allocator = static_cast(allocator->private_data); 19 | if (ptr == nullptr && new_size > 0) { 20 | return duck_allocator->AllocateData(new_size); 21 | } else if (new_size == 0) { 22 | duck_allocator->FreeData(ptr, old_size); 23 | return nullptr; 24 | } else { 25 | return duck_allocator->ReallocateData(ptr, old_size, new_size); 26 | } 27 | }; 28 | 29 | buffer->allocator.free = [](ArrowBufferAllocator* allocator, uint8_t* ptr, 30 | int64_t old_size) { 31 | NANOARROW_DCHECK(allocator->private_data != nullptr); 32 | auto duck_allocator = static_cast(allocator->private_data); 33 | duck_allocator->FreeData(ptr, old_size); 34 | }; 35 | 36 | buffer->allocator.private_data = &duck_allocator; 37 | } 38 | 39 | ColumnDataCollectionSerializer::ColumnDataCollectionSerializer(ClientProperties options, 40 | Allocator& allocator) 41 | : options(std::move(options)), allocator(allocator) {} 42 | 43 | void ColumnDataCollectionSerializer::Init(const ArrowSchema* schema_p, 44 | const vector& logical_types) { 45 | InitArrowDuckBuffer(header.get(), allocator); 46 | InitArrowDuckBuffer(body.get(), allocator); 47 | NANOARROW_THROW_NOT_OK(ArrowIpcEncoderInit(encoder.get())); 48 | THROW_NOT_OK(InternalException, &error, 49 | ArrowArrayViewInitFromSchema(chunk_view.get(), schema_p, &error)); 50 | 51 | schema = schema_p; 52 | 53 | extension_types = 54 | ArrowTypeExtensionData::GetExtensionTypes(*options.client_context, logical_types); 55 | } 56 | 57 | void ColumnDataCollectionSerializer::SerializeSchema() { 58 | header->size_bytes = 0; 59 | body->size_bytes = 0; 60 | THROW_NOT_OK(InternalException, &error, 61 | ArrowIpcEncoderEncodeSchema(encoder.get(), schema, &error)); 62 | NANOARROW_THROW_NOT_OK( 63 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 64 | } 65 | 66 | idx_t ColumnDataCollectionSerializer::Serialize(ArrowArray& array) { 67 | header->size_bytes = 0; 68 | body->size_bytes = 0; 69 | 70 | THROW_NOT_OK(duckdb::InternalException, &error, 71 | ArrowArrayViewSetArray(chunk_view.get(), &array, &error)); 72 | THROW_NOT_OK(InternalException, &error, 73 | ArrowIpcEncoderEncodeSimpleRecordBatch(encoder.get(), chunk_view.get(), 74 | body.get(), &error)); 75 | NANOARROW_THROW_NOT_OK( 76 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 77 | 78 | return 1; 79 | } 80 | idx_t ColumnDataCollectionSerializer::Serialize(DataChunk& chunk) { 81 | header->size_bytes = 0; 82 | body->size_bytes = 0; 83 | chunk_arrow.reset(); 84 | 85 | ArrowConverter::ToArrowArray(chunk, chunk_arrow.get(), options, extension_types); 86 | THROW_NOT_OK(duckdb::InternalException, &error, 87 | ArrowArrayViewSetArray(chunk_view.get(), chunk_arrow.get(), &error)); 88 | THROW_NOT_OK(InternalException, &error, 89 | ArrowIpcEncoderEncodeSimpleRecordBatch(encoder.get(), chunk_view.get(), 90 | body.get(), &error)); 91 | NANOARROW_THROW_NOT_OK( 92 | ArrowIpcEncoderFinalizeBuffer(encoder.get(), true, header.get())); 93 | 94 | return 1; 95 | } 96 | 97 | idx_t ColumnDataCollectionSerializer::Serialize(const ColumnDataCollection& buffer) { 98 | header->size_bytes = 0; 99 | body->size_bytes = 0; 100 | if (buffer.Count() == 0) { 101 | return 0; 102 | } 103 | // The ArrowConverter requires all of 
this to be in one big DataChunk. 104 | // It would be better to append these one at a time using other DuckDB 105 | // internals like the ArrowAppender. (Possibly better would be to skip the 106 | // owning ArrowArray entirely and just expose an ArrowArrayView of the 107 | // chunk. keeping track of any owning elements that had to be allocated, 108 | // since that's all that is strictly required to write). 109 | DataChunk chunk; 110 | chunk.Initialize(allocator, buffer.Types(), buffer.Count()); 111 | for (const auto& item : buffer.Chunks()) { 112 | chunk.Append(item, true); 113 | } 114 | return Serialize(chunk); 115 | } 116 | 117 | void ColumnDataCollectionSerializer::Flush(BufferedFileWriter& writer) { 118 | writer.WriteData(header->data, header->size_bytes); 119 | writer.WriteData(body->data, body->size_bytes); 120 | } 121 | nanoarrow::UniqueBuffer ColumnDataCollectionSerializer::GetHeader() { 122 | auto result_header = std::move(header); 123 | InitArrowDuckBuffer(header.get(), allocator); 124 | return result_header; 125 | } 126 | nanoarrow::UniqueBuffer ColumnDataCollectionSerializer::GetBody() { 127 | auto result_body = std::move(body); 128 | InitArrowDuckBuffer(body.get(), allocator); 129 | return result_body; 130 | } 131 | } // namespace ext_nanoarrow 132 | } // namespace duckdb 133 | -------------------------------------------------------------------------------- /src/writer/to_arrow_ipc.cpp: -------------------------------------------------------------------------------- 1 | #include "writer/to_arrow_ipc.hpp" 2 | 3 | #include "duckdb/main/extension_util.hpp" 4 | 5 | #include "writer/column_data_collection_serializer.hpp" 6 | 7 | #include "duckdb/common/arrow/arrow_appender.hpp" 8 | #include "duckdb/function/function.hpp" 9 | #include "duckdb/function/table_function.hpp" 10 | 11 | #include "duckdb/main/client_context.hpp" 12 | 13 | namespace duckdb { 14 | 15 | namespace ext_nanoarrow { 16 | 17 | struct ToArrowIpcFunctionData : public TableFunctionData { 18 | ToArrowIpcFunctionData() = default; 19 | ArrowSchema schema{}; 20 | vector logical_types; 21 | const idx_t chunk_size = ToArrowIPCFunction::DEFAULT_CHUNK_SIZE * STANDARD_VECTOR_SIZE; 22 | }; 23 | 24 | struct ToArrowIpcGlobalState : public GlobalTableFunctionState { 25 | ToArrowIpcGlobalState() : sent_schema(false) {} 26 | atomic sent_schema; 27 | mutex lock; 28 | }; 29 | 30 | struct ToArrowIpcLocalState : public LocalTableFunctionState { 31 | unique_ptr appender; 32 | unique_ptr serializer; 33 | idx_t current_count = 0; 34 | bool checked_schema = false; 35 | }; 36 | 37 | unique_ptr ToArrowIPCFunction::InitLocal( 38 | ExecutionContext& context, TableFunctionInitInput& input, 39 | GlobalTableFunctionState* global_state) { 40 | auto local_state = make_uniq(); 41 | auto properties = context.client.GetClientProperties(); 42 | local_state->serializer = make_uniq( 43 | properties, BufferAllocator::Get(context.client)); 44 | return local_state; 45 | } 46 | 47 | unique_ptr ToArrowIPCFunction::InitGlobal( 48 | ClientContext& context, TableFunctionInitInput& input) { 49 | return make_uniq(); 50 | } 51 | 52 | unique_ptr ToArrowIPCFunction::Bind(ClientContext& context, 53 | TableFunctionBindInput& input, 54 | vector& return_types, 55 | vector& names) { 56 | auto result = make_uniq(); 57 | 58 | // Set return schema 59 | return_types.emplace_back(LogicalType::BLOB); 60 | names.emplace_back("ipc"); 61 | return_types.emplace_back(LogicalType::BOOLEAN); 62 | names.emplace_back("header"); 63 | 64 | // Create the Arrow schema 65 | auto properties = 
context.GetClientProperties(); 66 | result->logical_types = input.input_table_types; 67 | ArrowConverter::ToArrowSchema(&result->schema, input.input_table_types, 68 | input.input_table_names, properties); 69 | return std::move(result); 70 | } 71 | 72 | void SerializeArray(const ToArrowIpcLocalState& local_state, 73 | nanoarrow::UniqueBuffer& arrow_serialized_ipc_buffer) { 74 | ArrowArray arr = local_state.appender->Finalize(); 75 | local_state.serializer->Serialize(arr); 76 | arrow_serialized_ipc_buffer = local_state.serializer->GetHeader(); 77 | auto body = local_state.serializer->GetBody(); 78 | idx_t ipc_buffer_size = arrow_serialized_ipc_buffer->size_bytes; 79 | arrow_serialized_ipc_buffer->data = arrow_serialized_ipc_buffer->allocator.reallocate( 80 | &arrow_serialized_ipc_buffer->allocator, arrow_serialized_ipc_buffer->data, 81 | static_cast(ipc_buffer_size), 82 | static_cast(ipc_buffer_size + body->size_bytes)); 83 | arrow_serialized_ipc_buffer->size_bytes += body->size_bytes; 84 | arrow_serialized_ipc_buffer->capacity_bytes += body->size_bytes; 85 | memcpy(arrow_serialized_ipc_buffer->data + ipc_buffer_size, body->data, 86 | body->size_bytes); 87 | } 88 | 89 | void InsertMessageToChunk(nanoarrow::UniqueBuffer& arrow_serialized_ipc_buffer, 90 | DataChunk& output) { 91 | const auto ptr = reinterpret_cast(arrow_serialized_ipc_buffer->data); 92 | const auto len = arrow_serialized_ipc_buffer->size_bytes; 93 | const auto wrapped_buffer = 94 | make_buffer(std::move(arrow_serialized_ipc_buffer)); 95 | auto& vector = output.data[0]; 96 | StringVector::AddBuffer(vector, wrapped_buffer); 97 | const auto data_ptr = reinterpret_cast(vector.GetData()); 98 | *data_ptr = string_t(ptr, len); 99 | output.SetCardinality(1); 100 | output.Verify(); 101 | } 102 | 103 | OperatorResultType ToArrowIPCFunction::Function(ExecutionContext& context, 104 | TableFunctionInput& data_p, 105 | DataChunk& input, DataChunk& output) { 106 | nanoarrow::UniqueBuffer arrow_serialized_ipc_buffer; 107 | auto& data = data_p.bind_data->Cast(); 108 | auto& local_state = data_p.local_state->Cast(); 109 | auto& global_state = data_p.global_state->Cast(); 110 | 111 | bool sending_schema = false; 112 | 113 | bool caching_disabled = !PhysicalOperator::OperatorCachingAllowed(context); 114 | local_state.serializer->Init(&data.schema, data.logical_types); 115 | 116 | if (!local_state.checked_schema) { 117 | if (!global_state.sent_schema) { 118 | lock_guard init_lock(global_state.lock); 119 | if (!global_state.sent_schema) { 120 | // This run will send the schema, other threads can just send the 121 | // buffers 122 | global_state.sent_schema = true; 123 | sending_schema = true; 124 | } 125 | } 126 | local_state.checked_schema = true; 127 | } 128 | 129 | if (sending_schema) { 130 | local_state.serializer->SerializeSchema(); 131 | arrow_serialized_ipc_buffer = local_state.serializer->GetHeader(); 132 | output.data[1].SetValue(0, Value::BOOLEAN(true)); 133 | } else { 134 | if (!local_state.appender) { 135 | local_state.appender = make_uniq( 136 | input.GetTypes(), data.chunk_size, context.client.GetClientProperties(), 137 | ArrowTypeExtensionData::GetExtensionTypes(context.client, input.GetTypes())); 138 | } 139 | 140 | // Append input chunk 141 | local_state.appender->Append(input, 0, input.size(), input.size()); 142 | local_state.current_count += input.size(); 143 | 144 | // If chunk size is reached, we can flush to IPC blob 145 | if (caching_disabled || local_state.current_count >= data.chunk_size) { 146 | SerializeArray(local_state, 
arrow_serialized_ipc_buffer); 147 | // Reset appender 148 | local_state.appender.reset(); 149 | local_state.current_count = 0; 150 | 151 | // This is a data message, hence we set the second column to false 152 | output.data[1].SetValue(0, Value::BOOLEAN(false)); 153 | } else { 154 | return OperatorResultType::NEED_MORE_INPUT; 155 | } 156 | } 157 | InsertMessageToChunk(arrow_serialized_ipc_buffer, output); 158 | if (sending_schema) { 159 | return OperatorResultType::HAVE_MORE_OUTPUT; 160 | } else { 161 | return OperatorResultType::NEED_MORE_INPUT; 162 | } 163 | } 164 | 165 | OperatorFinalizeResultType ToArrowIPCFunction::FunctionFinal(ExecutionContext& context, 166 | TableFunctionInput& data_p, 167 | DataChunk& output) { 168 | auto& local_state = data_p.local_state->Cast(); 169 | 170 | if (local_state.appender) { 171 | // If we have an appender, we serialize the array into a message and insert it to the 172 | // chunk 173 | nanoarrow::UniqueBuffer arrow_serialized_ipc_buffer; 174 | SerializeArray(local_state, arrow_serialized_ipc_buffer); 175 | InsertMessageToChunk(arrow_serialized_ipc_buffer, output); 176 | 177 | // This is always a data message, so we set the second column to false. 178 | output.data[1].SetValue(0, Value::BOOLEAN(false)); 179 | } 180 | 181 | return OperatorFinalizeResultType::FINISHED; 182 | } 183 | 184 | TableFunction ToArrowIPCFunction::GetFunction() { 185 | TableFunction fun("to_arrow_ipc", {LogicalType::TABLE}, nullptr, Bind, InitGlobal, 186 | InitLocal); 187 | fun.in_out_function = Function; 188 | fun.in_out_function_final = FunctionFinal; 189 | return fun; 190 | } 191 | 192 | void ToArrowIPCFunction::RegisterToIPCFunction(DatabaseInstance& db) { 193 | const auto function = GetFunction(); 194 | ExtensionUtil::RegisterFunction(db, function); 195 | } 196 | } // namespace ext_nanoarrow 197 | } // namespace duckdb 198 | -------------------------------------------------------------------------------- /src/writer/write_arrow_stream.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "write_arrow_stream.hpp" 3 | 4 | #include "duckdb/common/multi_file/multi_file_function.hpp" 5 | #include "file_scanner/arrow_multi_file_info.hpp" 6 | 7 | #include "duckdb/common/arrow/arrow_converter.hpp" 8 | #include "duckdb/common/serializer/buffered_file_writer.hpp" 9 | #include "duckdb/function/copy_function.hpp" 10 | #include "duckdb/main/extension_util.hpp" 11 | 12 | #include "nanoarrow/nanoarrow_ipc.hpp" 13 | 14 | #include "nanoarrow_errors.hpp" 15 | #include "table_function/read_arrow.hpp" 16 | #include "writer/arrow_stream_writer.hpp" 17 | 18 | namespace duckdb { 19 | 20 | namespace ext_nanoarrow { 21 | 22 | namespace { 23 | 24 | struct ArrowWriteBindData : public TableFunctionData { 25 | vector sql_types; 26 | vector column_names; 27 | vector> kv_metadata; 28 | // Storage::ROW_GROUP_SIZE (122880), which seems to be the default 29 | // for Parquet, is higher than the usual number used in IPC writers (65536). 30 | // Using a value of 65536 results in fairly bad performance for the use 31 | // case of "write it all then read it all" (at the expense of not being as 32 | // useful for streaming). 
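  // The default below can be overridden per COPY through the options parsed in
  // ArrowWriteBind further down; an illustrative (assumed) invocation:
  //   COPY tbl TO 'out.arrows' (FORMAT arrows, ROW_GROUP_SIZE 65536);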
33 | idx_t row_group_size = 122880; 34 | bool row_group_size_set = false; 35 | optional_idx row_groups_per_file; 36 | static constexpr const idx_t BYTES_PER_ROW = 1024; 37 | idx_t row_group_size_bytes{}; 38 | }; 39 | 40 | struct ArrowWriteGlobalState : public GlobalFunctionData { 41 | unique_ptr writer; 42 | }; 43 | 44 | struct ArrowWriteLocalState : public LocalFunctionData { 45 | explicit ArrowWriteLocalState(ClientContext& context, const vector& types) 46 | : buffer(context, types, ColumnDataAllocatorType::HYBRID) { 47 | buffer.InitializeAppend(append_state); 48 | } 49 | 50 | ColumnDataCollection buffer; 51 | ColumnDataAppendState append_state; 52 | }; 53 | 54 | unique_ptr ArrowWriteBind(ClientContext& context, 55 | CopyFunctionBindInput& input, 56 | const vector& names, 57 | const vector& sql_types) { 58 | D_ASSERT(names.size() == sql_types.size()); 59 | auto bind_data = make_uniq(); 60 | bool row_group_size_bytes_set = false; 61 | 62 | for (auto& option : input.info.options) { 63 | const auto loption = StringUtil::Lower(option.first); 64 | if (option.second.size() != 1) { 65 | // All Arrow write options require exactly one argument 66 | throw BinderException("%s requires exactly one argument", 67 | StringUtil::Upper(loption)); 68 | } 69 | 70 | if (loption == "row_group_size" || loption == "chunk_size") { 71 | if (bind_data->row_group_size_set) { 72 | throw BinderException( 73 | "ROW_GROUP_SIZE and ROW_GROUP_SIZE_BYTES are mutually exclusive"); 74 | } 75 | bind_data->row_group_size = option.second[0].GetValue(); 76 | bind_data->row_group_size_set = true; 77 | } else if (loption == "row_group_size_bytes") { 78 | auto roption = option.second[0]; 79 | if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) { 80 | bind_data->row_group_size_bytes = DBConfig::ParseMemoryLimit(roption.ToString()); 81 | } else { 82 | bind_data->row_group_size_bytes = option.second[0].GetValue(); 83 | } 84 | row_group_size_bytes_set = true; 85 | } else if (loption == "row_groups_per_file") { 86 | bind_data->row_groups_per_file = option.second[0].GetValue(); 87 | } else if (loption == "kv_metadata") { 88 | auto& kv_struct = option.second[0]; 89 | auto& kv_struct_type = kv_struct.type(); 90 | if (kv_struct_type.id() != LogicalTypeId::STRUCT) { 91 | throw BinderException("Expected kv_metadata argument to be a STRUCT"); 92 | } 93 | auto values = StructValue::GetChildren(kv_struct); 94 | for (idx_t i = 0; i < values.size(); i++) { 95 | const auto& value = values[i]; 96 | auto key = StructType::GetChildName(kv_struct_type, i); 97 | // If the value is a blob, write the raw blob bytes 98 | // otherwise, cast to string 99 | if (value.type().id() == LogicalTypeId::BLOB) { 100 | bind_data->kv_metadata.emplace_back(key, StringValue::Get(value)); 101 | } else { 102 | bind_data->kv_metadata.emplace_back(key, value.ToString()); 103 | } 104 | } 105 | } 106 | } 107 | 108 | if (row_group_size_bytes_set) { 109 | if (DBConfig::GetConfig(context).options.preserve_insertion_order) { 110 | throw BinderException( 111 | "ROW_GROUP_SIZE_BYTES does not work while preserving insertion order. 
Use " 112 | "\"SET preserve_insertion_order=false;\" to disable preserving insertion " 113 | "order."); 114 | } 115 | } else { 116 | // We always set a max row group size bytes so we don't use too much memory 117 | bind_data->row_group_size_bytes = 118 | bind_data->row_group_size * ArrowWriteBindData::BYTES_PER_ROW; 119 | } 120 | 121 | bind_data->sql_types = sql_types; 122 | bind_data->column_names = names; 123 | 124 | return std::move(bind_data); 125 | } 126 | 127 | unique_ptr ArrowWriteInitializeGlobal(ClientContext& context, 128 | FunctionData& bind_data, 129 | const string& file_path) { 130 | auto global_state = make_uniq(); 131 | auto& arrow_bind = bind_data.Cast(); 132 | 133 | auto& fs = FileSystem::GetFileSystem(context); 134 | global_state->writer = 135 | make_uniq(context, fs, file_path, arrow_bind.sql_types, 136 | arrow_bind.column_names, arrow_bind.kv_metadata); 137 | global_state->writer->WriteSchema(); 138 | return std::move(global_state); 139 | } 140 | 141 | void ArrowWriteSink(ExecutionContext& context, FunctionData& bind_data_p, 142 | GlobalFunctionData& gstate, LocalFunctionData& lstate, 143 | DataChunk& input) { 144 | auto& bind_data = bind_data_p.Cast(); 145 | auto& global_state = gstate.Cast(); 146 | auto& local_state = lstate.Cast(); 147 | 148 | // append data to the local (buffered) chunk collection 149 | local_state.buffer.Append(local_state.append_state, input); 150 | 151 | if (local_state.buffer.Count() >= bind_data.row_group_size || 152 | local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes) { 153 | // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the 154 | // Arrow file 155 | local_state.append_state.current_chunk_state.handles.clear(); 156 | global_state.writer->Flush(local_state.buffer); 157 | local_state.buffer.InitializeAppend(local_state.append_state); 158 | } 159 | } 160 | 161 | void ArrowWriteCombine(ExecutionContext& context, FunctionData& bind_data, 162 | GlobalFunctionData& gstate, LocalFunctionData& lstate) { 163 | auto& global_state = gstate.Cast(); 164 | auto& local_state = lstate.Cast(); 165 | // flush any data left in the local state to the file 166 | global_state.writer->Flush(local_state.buffer); 167 | } 168 | 169 | void ArrowWriteFinalize(ClientContext& context, FunctionData& bind_data, 170 | GlobalFunctionData& gstate) { 171 | auto& global_state = gstate.Cast(); 172 | // finalize: write any additional metadata to the file here 173 | global_state.writer->Finalize(); 174 | } 175 | 176 | unique_ptr ArrowWriteInitializeLocal(ExecutionContext& context, 177 | FunctionData& bind_data_p) { 178 | auto& bind_data = bind_data_p.Cast(); 179 | return make_uniq(context.client, bind_data.sql_types); 180 | } 181 | 182 | CopyFunctionExecutionMode ArrowWriteExecutionMode(bool preserve_insertion_order, 183 | bool supports_batch_index) { 184 | if (!preserve_insertion_order) { 185 | return CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE; 186 | } 187 | if (supports_batch_index) { 188 | return CopyFunctionExecutionMode::BATCH_COPY_TO_FILE; 189 | } 190 | return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE; 191 | } 192 | 193 | idx_t ArrowWriteDesiredBatchSize(ClientContext& context, FunctionData& bind_data_p) { 194 | auto& bind_data = bind_data_p.Cast(); 195 | return bind_data.row_group_size; 196 | } 197 | 198 | bool ArrowWriteRotateFiles(FunctionData& bind_data_p, 199 | const optional_idx& file_size_bytes) { 200 | auto& bind_data = bind_data_p.Cast(); 201 | return file_size_bytes.IsValid() || 
bind_data.row_groups_per_file.IsValid(); 202 | } 203 | 204 | bool ArrowWriteRotateNextFile(GlobalFunctionData& gstate, FunctionData& bind_data_p, 205 | const optional_idx& file_size_bytes) { 206 | auto& global_state = gstate.Cast(); 207 | auto& bind_data = bind_data_p.Cast(); 208 | if (file_size_bytes.IsValid() && 209 | global_state.writer->FileSize() > file_size_bytes.GetIndex()) { 210 | return true; 211 | } 212 | 213 | if (bind_data.row_groups_per_file.IsValid() && 214 | global_state.writer->NumberOfRowGroups() >= 215 | bind_data.row_groups_per_file.GetIndex()) { 216 | return true; 217 | } 218 | return false; 219 | } 220 | 221 | struct ArrowWriteBatchData : public PreparedBatchData { 222 | unique_ptr serializer; 223 | }; 224 | 225 | // This is called concurrently for large writes so it can't interact with the 226 | // writer except to read information needed to initialize. 227 | unique_ptr ArrowWritePrepareBatch( 228 | ClientContext& context, FunctionData& bind_data, GlobalFunctionData& gstate, 229 | unique_ptr collection) { 230 | auto& global_state = gstate.Cast(); 231 | 232 | auto batch = make_uniq(); 233 | batch->serializer = global_state.writer->NewSerializer(); 234 | batch->serializer->Serialize(*collection); 235 | collection->Reset(); 236 | 237 | return std::move(batch); 238 | } 239 | 240 | void ArrowWriteFlushBatch(ClientContext& context, FunctionData& bind_data, 241 | GlobalFunctionData& gstate, PreparedBatchData& batch_p) { 242 | auto& global_state = gstate.Cast(); 243 | auto& batch = batch_p.Cast(); 244 | global_state.writer->Flush(*batch.serializer); 245 | } 246 | 247 | } // namespace 248 | 249 | void RegisterArrowStreamCopyFunction(DatabaseInstance& db) { 250 | CopyFunction function("arrows"); 251 | function.copy_to_bind = ArrowWriteBind; 252 | function.copy_to_initialize_global = ArrowWriteInitializeGlobal; 253 | function.copy_to_initialize_local = ArrowWriteInitializeLocal; 254 | function.copy_to_sink = ArrowWriteSink; 255 | function.copy_to_combine = ArrowWriteCombine; 256 | function.copy_to_finalize = ArrowWriteFinalize; 257 | function.execution_mode = ArrowWriteExecutionMode; 258 | function.copy_from_bind = MultiFileFunction::MultiFileBindCopy; 259 | function.copy_from_function = ReadArrowStreamFunction(); 260 | function.prepare_batch = ArrowWritePrepareBatch; 261 | function.flush_batch = ArrowWriteFlushBatch; 262 | function.desired_batch_size = ArrowWriteDesiredBatchSize; 263 | function.rotate_files = ArrowWriteRotateFiles; 264 | function.rotate_next_file = ArrowWriteRotateNextFile; 265 | 266 | function.extension = "arrows"; 267 | ExtensionUtil::RegisterFunction(db, function); 268 | 269 | function.name = "arrow"; 270 | function.extension = "arrow"; 271 | ExtensionUtil::RegisterFunction(db, function); 272 | } 273 | 274 | } // namespace ext_nanoarrow 275 | } // namespace duckdb 276 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | 3 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). 4 | 5 | The root makefile contains targets to build and run all of these tests. 
To run the SQLLogicTests: 6 | 7 | ```bash 8 | make test 9 | # or make test_debug 10 | ``` 11 | 12 | If you're using CMake + VSCode, you can run 13 | 14 | ``` shell 15 | ./test_local.sh 16 | ``` 17 | 18 | The test data is generated with: 19 | 20 | ```python 21 | import nanoarrow as na 22 | from nanoarrow import ipc 23 | 24 | url = "https://github.com/apache/arrow-experiments/raw/refs/heads/main/data/arrow-commits/arrow-commits.arrows" 25 | with ipc.StreamWriter.from_path("data/test.arrows") as writer: 26 | writer.write_stream(na.ArrayStream.from_url(url)) 27 | ``` 28 | -------------------------------------------------------------------------------- /test/nodejs/arrow_test.js: -------------------------------------------------------------------------------- 1 | var arrow = require('apache-arrow') 2 | var duckdb = require('duckdb'); 3 | var assert = require('assert'); 4 | // import { RecordBatchReader } from "apache-arrow"; 5 | 6 | 7 | const parquet_file_path = "data/parquet-testing/lineitem_sf0_01.parquet"; 8 | 9 | // Wrapper for tests, materializes whole stream 10 | const arrow_ipc_stream = async (conn, sql) => { 11 | const result_stream = await conn.arrowIPCStream(sql); 12 | return await result_stream.toArray(); 13 | } 14 | 15 | // Wrapper for tests 16 | const arrow_ipc_materialized = async (conn, sql) => { 17 | return await new Promise((resolve, reject) => { 18 | conn.arrowIPCAll(sql, function (err, result) { 19 | if (err) { 20 | reject(err) 21 | } 22 | 23 | resolve(result); 24 | }) 25 | }); 26 | } 27 | 28 | const to_ipc_functions = { 29 | 'streaming': arrow_ipc_stream, 30 | 'materialized': arrow_ipc_materialized, 31 | } 32 | 33 | function getDatabase() { 34 | return new duckdb.Database(':memory:', {"allow_unsigned_extensions":"true"}); 35 | } 36 | 37 | 38 | // Stream results by getting an arrowIPCStream, then iterating with an arrow RecordBatchReader 39 | const streamResults = async (con, sql) => { 40 | const results = []; 41 | for await (const batch of await arrow.RecordBatchReader.from( 42 | await con.arrowIPCStream(sql) 43 | )) { 44 | for (const row of batch) { 45 | const result = {}; 46 | for (const [field, val] of row) { 47 | result[field] = val; 48 | } 49 | results.push(result); 50 | } 51 | } 52 | return results; 53 | }; 54 | 55 | function getConnection(db, done) { 56 | let conn = new duckdb.Connection(db); 57 | // Makes CI life a bit easier 58 | conn.exec(`SET allow_extensions_metadata_mismatch=true;`, function (err) { 59 | if (err) throw err; 60 | }); 61 | conn.exec(`LOAD '${process.env.ARROW_EXTENSION_BINARY_PATH}';`, function (err) { 62 | if (err) throw err; 63 | done(); 64 | }); 65 | return conn 66 | } 67 | 68 | describe(`Arrow IPC`, () => { 69 | let db; 70 | let conn; 71 | before((done) => { 72 | db = getDatabase(); 73 | conn = getConnection(db, () => done()) 74 | }); 75 | 76 | it(`Basic examples`, async () => { 77 | const range_size = 130000; 78 | const query = `SELECT * FROM range(0,${range_size}) tbl(i)`; 79 | const arrow_table_expected = new arrow.Table({ 80 | i: new arrow.Vector([arrow.makeData({ type: new arrow.Int32, data: Array.from(new Array(range_size), (x, i) => i) })]), 81 | }); 82 | 83 | // Can use Arrow to read from stream directly 84 | const result_stream = await db.arrowIPCStream(query); 85 | const reader = await arrow.RecordBatchReader.from(result_stream); 86 | const table = await arrow.tableFromIPC(reader); 87 | const array_from_arrow = table.toArray(); 88 | assert.deepEqual(array_from_arrow, arrow_table_expected.toArray()); 89 | 90 | // Can also fully 
materialize stream first, then pass to Arrow 91 | const result_stream2 = await db.arrowIPCStream(query); 92 | const reader2 = await arrow.RecordBatchReader.from(result_stream2.toArray()); 93 | const table2 = await arrow.tableFromIPC(reader2); 94 | const array_from_arrow2 = table2.toArray(); 95 | assert.deepEqual(array_from_arrow2, arrow_table_expected.toArray()); 96 | 97 | // Can also fully materialize in DuckDB first (allowing parallel execution) 98 | const result_materialized = await new Promise((resolve, reject) => { 99 | db.arrowIPCAll(query, function (err, result) { 100 | if (err) { 101 | reject(err) 102 | } 103 | 104 | resolve(result); 105 | }) 106 | }); 107 | 108 | const reader3 = await arrow.RecordBatchReader.from(result_materialized); 109 | const table3 = await arrow.tableFromIPC(reader3); 110 | const array_from_arrow3 = table3.toArray(); 111 | assert.deepEqual(array_from_arrow3, arrow_table_expected.toArray()); 112 | 113 | // Scanning materialized IPC buffers from DuckDB 114 | db.register_buffer("ipc_table", result_materialized, true); 115 | await new Promise((resolve, reject) => { 116 | db.arrowIPCAll(`SELECT * FROM ipc_table`, function (err, result) { 117 | if (err) { 118 | reject(err); 119 | } 120 | 121 | assert.deepEqual(result, result_materialized); 122 | resolve() 123 | }); 124 | }); 125 | }); 126 | 127 | // Ensure we handle empty result properly 128 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 129 | it(`Empty results (${name})`, async () => { 130 | const range_size = 130000; 131 | const query = `SELECT * FROM range(0,${range_size}) tbl(i) where i > ${range_size}`; 132 | 133 | let ipc_buffers = await fun(conn, query); 134 | const reader = await arrow.RecordBatchReader.from(ipc_buffers); 135 | const table = await arrow.tableFromIPC(reader); 136 | const arr = table.toArray(); 137 | assert.deepEqual(arr, []); 138 | }); 139 | } 140 | }) 141 | 142 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 143 | describe(`DuckDB <-> Arrow IPC (${name})`, () => { 144 | const total = 1000; 145 | 146 | let db; 147 | let conn; 148 | before((done) => { 149 | db = getDatabase(); 150 | conn = getConnection(db, () => done()) 151 | }); 152 | 153 | it(`Buffers are not garbage collected`, async () => { 154 | let ipc_buffers = await fun(conn, 'SELECT * FROM range(1001, 2001) tbl(i)'); 155 | 156 | // Now to scan the buffer, we first need to register it 157 | conn.register_buffer(`ipc_table_${name}`, ipc_buffers, true); 158 | 159 | // Delete JS reference to arrays 160 | ipc_buffers = 0; 161 | 162 | // Run GC to ensure file is deleted 163 | if (global.gc) { 164 | global.gc(); 165 | } else { 166 | throw "should run with --expose-gc"; 167 | } 168 | 169 | // Spray memory overwriting hopefully old buffer 170 | let spray_results = []; 171 | for (let i = 0; i < 3000; i++) { 172 | spray_results.push(await fun(db, 'SELECT * FROM range(2001, 3001) tbl(i)')); 173 | } 174 | 175 | // Now we can query the ipc buffer using DuckDB by providing an object with an alias and the materialized ipc buffers 176 | await new Promise((resolve, reject) => { 177 | conn.all(`SELECT avg(i) as average, count(1) as total 178 | FROM ipc_table_${name};`, function (err, result) { 179 | if (err) { 180 | reject(err); 181 | } 182 | assert.deepEqual(result, [{average: 1500.5, total: 1000}]); 183 | resolve(); 184 | }); 185 | }); 186 | }); 187 | 188 | it(`Round-trip int column`, async () => { 189 | // Now we fetch the ipc stream object and construct the RecordBatchReader 190 | const ipc_buffers = await fun(db, 
'SELECT * FROM range(1001, 2001) tbl(i)'); 191 | 192 | // Now to scan the buffer, we first need to register it 193 | conn.register_buffer("ipc_table", ipc_buffers, true, (err) => { 194 | assert(!err); 195 | }); 196 | 197 | // Now we can query the ipc buffer using DuckDB by providing an object with an alias and the materialized ipc buffers 198 | await new Promise((resolve, reject) => { 199 | conn.all(`SELECT avg(i) as average, count(1) as total 200 | FROM ipc_table;`, function (err, result) { 201 | if (err) { 202 | reject(err) 203 | } 204 | assert.deepEqual(result, [{average: 1500.5, total: 1000}]); 205 | resolve(); 206 | }); 207 | }); 208 | }); 209 | 210 | 211 | it(`Joining 2 IPC buffers in DuckDB`, async () => { 212 | // Insert first table 213 | const ipc_buffers1 = await fun(db, 'SELECT * FROM range(1, 3) tbl(i)'); 214 | 215 | // Insert second table 216 | const ipc_buffers2 = await fun(db, 'SELECT * FROM range(2, 4) tbl(i)'); 217 | 218 | // Register buffers for scanning from DuckDB 219 | conn.register_buffer("table1", ipc_buffers1, true, (err) => { 220 | assert(!err); 221 | }); 222 | conn.register_buffer("table2", ipc_buffers2, true, (err) => { 223 | assert(!err); 224 | }); 225 | 226 | await new Promise((resolve, reject) => { 227 | conn.all(`SELECT * 228 | FROM table1 229 | JOIN table2 ON table1.i = table2.i;`, function (err, result) { 230 | if (err) { 231 | reject(err); 232 | } 233 | assert.deepEqual(result, [{i: 2}]); 234 | resolve() 235 | }); 236 | }); 237 | }); 238 | }) 239 | } 240 | 241 | describe('[Benchmark] Arrow IPC Single Int Column (50M tuples)',() => { 242 | // Config 243 | const column_size = 50*1000*1000; 244 | 245 | let db; 246 | let conn; 247 | 248 | before((done) => { 249 | db = getDatabase(); 250 | conn = getConnection(db, () => { 251 | conn.run("CREATE OR REPLACE TABLE test AS select * FROM range(0,?) 
tbl(i);", column_size, (err) => { 252 | if (err) throw err; 253 | done() 254 | }); 255 | }) 256 | }); 257 | 258 | it('DuckDB table -> DuckDB table', (done) => { 259 | conn.run('CREATE OR REPLACE TABLE copy_table AS SELECT * FROM test', (err) => { 260 | assert(!err); 261 | done(); 262 | }); 263 | }); 264 | 265 | it('DuckDB table -> Stream IPC buffer', async () => { 266 | const result = await conn.arrowIPCStream('SELECT * FROM test'); 267 | const ipc_buffers = await result.toArray(); 268 | const reader = await arrow.RecordBatchReader.from(ipc_buffers); 269 | const table = arrow.tableFromIPC(reader); 270 | assert.equal(table.numRows, column_size); 271 | }); 272 | 273 | it('DuckDB table -> Materialized IPC buffer', (done) => { 274 | conn.arrowIPCAll('SELECT * FROM test', (err,res) => { 275 | done(); 276 | }); 277 | }); 278 | }); 279 | 280 | describe('Buffer registration',() => { 281 | let db; 282 | let conn1; 283 | let conn2; 284 | 285 | before((done) => { 286 | db = new duckdb.Database(':memory:', {"allow_unsigned_extensions":"true"}); 287 | conn1 = new duckdb.Connection(db); 288 | conn2 = new duckdb.Connection(db); 289 | done(); 290 | }); 291 | 292 | before((done) => { 293 | db = getDatabase(); 294 | conn1 = getConnection(db, () => { 295 | conn2 = getConnection(db, () => done()); 296 | }) 297 | }); 298 | 299 | it('Buffers can only be overwritten with force flag', async () => { 300 | const arrow_buffer = await arrow_ipc_materialized(conn1, "SELECT 1337 as a"); 301 | 302 | conn1.register_buffer('arrow_buffer', arrow_buffer, true, (err) => { 303 | assert(!err); 304 | }) 305 | 306 | await new Promise((resolve, reject) => { 307 | try { 308 | conn1.register_buffer('arrow_buffer', arrow_buffer, false); 309 | reject("Expected query to fail"); 310 | } catch (err) { 311 | assert(err.message.includes('Buffer with this name already exists and force_register is not enabled')); 312 | resolve(); 313 | } 314 | }); 315 | }); 316 | 317 | it('Existing tables are silently shadowed by registered buffers', async () => { 318 | // Unregister, in case other test has registered this 319 | conn1.unregister_buffer('arrow_buffer', (err) => { 320 | assert(!err); 321 | }); 322 | 323 | conn1.run('CREATE OR REPLACE TABLE arrow_buffer AS SELECT 7 as a;', (err) => { 324 | assert(!err); 325 | }); 326 | 327 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 328 | assert(!err); 329 | assert.deepEqual(result, [{'a': 7}]); 330 | }); 331 | 332 | const arrow_buffer = await arrow_ipc_materialized(conn1, "SELECT 1337 as b"); 333 | 334 | conn1.register_buffer('arrow_buffer', arrow_buffer, true, (err) => { 335 | assert(!err); 336 | }) 337 | 338 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 339 | assert(!err); 340 | assert.deepEqual(result, [{'b': 1337}]); 341 | }); 342 | 343 | conn1.unregister_buffer('arrow_buffer', (err) => { 344 | assert(!err); 345 | }); 346 | 347 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 348 | assert(!err); 349 | assert.deepEqual(result, [{'a': 7}]); 350 | }); 351 | 352 | await new Promise((resolve, reject) => { 353 | // Cleanup 354 | conn1.run('DROP TABLE arrow_buffer;', (err) => { 355 | if (err) reject(err); 356 | resolve(); 357 | }); 358 | 359 | }); 360 | }); 361 | 362 | it('Registering buffers should only be visible within current connection', async () => { 363 | const arrow_buffer1 = await arrow_ipc_materialized(conn1, "SELECT 1337 as a"); 364 | const arrow_buffer2 = await arrow_ipc_materialized(conn2, "SELECT 42 as b"); 365 | 366 | 
conn1.register_buffer('arrow_buffer', arrow_buffer1, true, (err) => { 367 | assert(!err); 368 | }) 369 | conn2.register_buffer('arrow_buffer', arrow_buffer2, true, (err) => { 370 | assert(!err); 371 | }) 372 | 373 | conn1.all('SELECT * FROM arrow_buffer;', (err, result) => { 374 | assert(!err); 375 | assert.deepEqual(result, [{'a': 1337}]); 376 | }); 377 | 378 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 379 | assert(!err); 380 | assert.deepEqual(result, [{'b': 42}]); 381 | }); 382 | 383 | conn1 = 0; 384 | 385 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 386 | assert(!err); 387 | assert.deepEqual(result, [{'b': 42}]); 388 | }); 389 | 390 | conn2.unregister_buffer('arrow_buffer', (err) => { 391 | assert(!err); 392 | }) 393 | 394 | await new Promise((resolve, reject) => { 395 | conn2.all('SELECT * FROM arrow_buffer;', (err, result) => { 396 | if (!err) { 397 | reject("Expected error"); 398 | } 399 | assert(err.message.includes('Catalog Error: Table with name arrow_buffer does not exist!')); 400 | resolve(); 401 | }); 402 | }); 403 | }); 404 | }); 405 | 406 | describe(`Single Value IPC`, () => { 407 | let db; 408 | let conn; 409 | before((done) => { 410 | db = getDatabase(); 411 | conn = getConnection(db, () => done()) 412 | }); 413 | 414 | it('Try to read from query returtning one value', async () => { 415 | const sql = "select now() as t"; 416 | const result = await streamResults(conn, sql) 417 | assert.strictEqual(result.length, 1, "Expected exactly one row"); 418 | assert.strictEqual(Object.keys(result[0]).length, 1, "Expected exactly one field"); 419 | }); 420 | }); 421 | 422 | describe('[Benchmark] Arrow IPC TPC-H lineitem.parquet', () => { 423 | const sql = "SELECT sum(l_extendedprice * l_discount) AS revenue FROM lineitem WHERE l_shipdate >= CAST('1994-01-01' AS date) AND l_shipdate < CAST('1995-01-01' AS date) AND l_discount BETWEEN 0.05 AND 0.07 AND l_quantity < 24" 424 | const answer = [{revenue: 1193053.2253}]; 425 | 426 | let db; 427 | let conn; 428 | 429 | before((done) => { 430 | db = getDatabase(); 431 | conn = getConnection(db, () => done()) 432 | }); 433 | 434 | it('Parquet -> DuckDB Streaming-> Arrow IPC -> DuckDB Query', async () => { 435 | const ipc_buffers = await arrow_ipc_stream(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 436 | 437 | const query = sql.replace("lineitem", "my_arrow_ipc_stream"); 438 | conn.register_buffer("my_arrow_ipc_stream", ipc_buffers, true, (err) => { 439 | assert(!err); 440 | }); 441 | 442 | await new Promise((resolve, reject) => { 443 | conn.all(query, function (err, result) { 444 | if (err) { 445 | reject(err) 446 | } 447 | 448 | assert.deepEqual(result, answer); 449 | resolve(); 450 | }) 451 | }); 452 | }); 453 | 454 | it('Parquet -> DuckDB Materialized -> Arrow IPC -> DuckDB' , async () => { 455 | const ipc_buffers = await arrow_ipc_materialized(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 456 | 457 | const query = sql.replace("lineitem", "my_arrow_ipc_stream_2"); 458 | conn.register_buffer("my_arrow_ipc_stream_2", ipc_buffers, true, (err) => { 459 | assert(!err); 460 | }); 461 | 462 | await new Promise((resolve, reject) => { 463 | conn.all(query, function (err, result) { 464 | if (err) { 465 | reject(err) 466 | } else { 467 | assert.deepEqual(result, answer); 468 | resolve(); 469 | } 470 | }) 471 | }); 472 | }); 473 | 474 | it('Parquet -> DuckDB', async () => { 475 | await new Promise((resolve, reject) => { 476 | conn.run('CREATE OR REPLACE TABLE load_parquet_directly AS SELECT * FROM "' + 
parquet_file_path + '";', (err) => { 477 | if (err) { 478 | reject(err) 479 | } 480 | resolve() 481 | }); 482 | }); 483 | 484 | const query = sql.replace("lineitem", "load_parquet_directly"); 485 | 486 | const result = await new Promise((resolve, reject) => { 487 | conn.all(query, function (err, result) { 488 | if (err) { 489 | reject(err); 490 | } 491 | resolve(result) 492 | }); 493 | }); 494 | 495 | assert.deepEqual(result, answer); 496 | }); 497 | }); 498 | 499 | for (const [name, fun] of Object.entries(to_ipc_functions)) { 500 | describe(`Arrow IPC TPC-H lineitem SF0.01 (${name})`, () => { 501 | // `table_name` in these queries will be replaced by either the parquet file directly, or the ipc buffer 502 | const queries = [ 503 | "select count(*) from table_name LIMIT 10", 504 | "select sum(l_orderkey) as sum_orderkey FROM table_name", 505 | "select * from table_name", 506 | "select l_orderkey from table_name WHERE l_orderkey=2 LIMIT 2", 507 | "select l_extendedprice from table_name", 508 | "select l_extendedprice from table_name WHERE l_extendedprice > 53468 and l_extendedprice < 53469 LIMIT 2", 509 | "select count(l_orderkey) from table_name where l_commitdate > '1996-10-28'", 510 | "SELECT sum(l_extendedprice * l_discount) AS revenue FROM table_name WHERE l_shipdate >= CAST('1994-01-01' AS date) AND l_shipdate < CAST('1995-01-01' AS date) AND l_discount BETWEEN 0.05 AND 0.07 AND l_quantity < 24" 511 | ]; 512 | 513 | let db; 514 | let conn; 515 | before((done) => { 516 | db = getDatabase(); 517 | conn = getConnection(db, () => done()) 518 | }); 519 | 520 | for (const query of queries) { 521 | it(` ${query}`, async () => { 522 | // First do query directly on parquet file 523 | const expected_value = await new Promise((resolve, reject) => { 524 | conn.all(query.replace("table_name", `'${parquet_file_path}'`), function (err, result) { 525 | if (err) { 526 | reject(err); 527 | } 528 | 529 | resolve(result); 530 | }); 531 | }); 532 | 533 | // Copy parquet file completely into Arrow IPC format 534 | const ipc_buffers = await fun(conn, 'SELECT * FROM "' + parquet_file_path + '"'); 535 | 536 | // Register the ipc buffers as table in duckdb, using force to override the previously registered buffers 537 | conn.register_buffer("table_name", ipc_buffers, true, (err) => { 538 | assert(!err); 539 | }); 540 | 541 | await new Promise((resolve, reject) => { 542 | conn.all(query, function (err, result) { 543 | if (err) { 544 | reject(err) 545 | } 546 | 547 | assert.deepEqual(result, expected_value, `Query failed: ${query}`); 548 | resolve(); 549 | }) 550 | }); 551 | }); 552 | } 553 | }) 554 | } 555 | -------------------------------------------------------------------------------- /test/python/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import duckdb 4 | from typing import Union, Optional 5 | from duckdb import DuckDBPyConnection 6 | 7 | dir = os.path.dirname(os.path.abspath(__file__)) 8 | build_type = "release" 9 | 10 | @pytest.fixture(scope="function") 11 | def duckdb_empty_cursor(request): 12 | connection = duckdb.connect('') 13 | cursor = connection.cursor() 14 | return cursor 15 | 16 | def add_extension(extension_name, conn: Union[str, DuckDBPyConnection] = '') -> DuckDBPyConnection: 17 | if (isinstance(conn, str)): 18 | config = { 19 | 'allow_unsigned_extensions' : 'true' 20 | } 21 | conn = duckdb.connect(conn or '', config=config) 22 | file_path = 
f"'{dir}/../../build/{build_type}/extension/{extension_name}/{extension_name}.duckdb_extension'" 23 | conn.execute(f"LOAD {file_path}") 24 | return conn 25 | 26 | @pytest.fixture(scope="function") 27 | def require(): 28 | def _require(extension_name, db_name=''): 29 | conn = add_extension(extension_name, db_name) 30 | conn.execute("SET allow_extensions_metadata_mismatch=true;") 31 | return conn 32 | 33 | return _require 34 | 35 | @pytest.fixture(scope='function') 36 | def connection(): 37 | return add_extension('nanoarrow') 38 | -------------------------------------------------------------------------------- /test/python/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pyarrow 3 | -------------------------------------------------------------------------------- /test/python/test_arrow_ipc_scan.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | 6 | 7 | def get_record_batch(): 8 | data = [ 9 | pa.array([1, 2, 3, 4]), 10 | pa.array(['foo', 'bar', 'baz', None]), 11 | pa.array([True, None, False, True]) 12 | ] 13 | 14 | return pa.record_batch(data, names=['f0', 'f1', 'f2']) 15 | 16 | def tables_match(result): 17 | assert result == [(1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True), (1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True)] 18 | 19 | class TestArrowIPCBufferRead(object): 20 | def test_single_buffer(self, connection): 21 | batch = get_record_batch() 22 | sink = pa.BufferOutputStream() 23 | with pa.ipc.new_stream(sink, batch.schema) as writer: 24 | for i in range(5): 25 | writer.write_batch(batch) 26 | buffer = sink.getvalue() 27 | with pa.BufferReader(buffer) as buf_reader: 28 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 29 | tables_match(connection.from_arrow(msg_reader).fetchall()) 30 | 31 | def test_multi_buffers(self, connection): 32 | batch = get_record_batch() 33 | sink = pa.BufferOutputStream() 34 | 35 | with pa.ipc.new_stream(sink, batch.schema) as writer: 36 | for _ in range(5): # Write 5 batches into one stream 37 | writer.write_batch(batch) 38 | 39 | buffer = sink.getvalue() 40 | 41 | with pa.BufferReader(buffer) as buf_reader: 42 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 43 | tables_match(connection.from_arrow(msg_reader).fetchall()) 44 | 45 | def test_replacement_scan(self, connection): 46 | 47 | batch = get_record_batch() 48 | sink = pa.BufferOutputStream() 49 | 50 | with pa.ipc.new_stream(sink, batch.schema) as writer: 51 | writer.write_batch(batch) 52 | 53 | buffer = sink.getvalue() 54 | 55 | with pa.BufferReader(buffer) as buf_reader: 56 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 57 | with pytest.raises(duckdb.InvalidInputException, 58 | match="not suitable for replacement scans",): 59 | connection.execute("FROM msg_reader") 60 | -------------------------------------------------------------------------------- /test/python/test_arrow_ipc_writer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | 6 | def create_table(connection): 7 | connection.execute("CREATE 
TABLE T (f0 integer, f1 varchar, f2 bool )") 8 | connection.execute("INSERT INTO T values (1, 'foo', true),(2, 'bar', NULL), (3, 'baz', false), (4, NULL, true) ") 9 | 10 | def tables_match(result): 11 | print(result) 12 | assert result == [(1, 'foo', True), (2, 'bar', None), (3, 'baz', False), (4, None, True)] 13 | 14 | class TestArrowIPCBufferWriter(object): 15 | def test_round_trip(self, connection): 16 | create_table(connection) 17 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 18 | buffer = pa.py_buffer(buffers[0][0] + buffers[1][0]) 19 | with pa.BufferReader(buffer) as buf_reader: 20 | msg_reader = ipc.MessageReader.open_stream(buf_reader) 21 | tables_match(connection.from_arrow(msg_reader).fetchall()) 22 | 23 | def test_arrow_read_duck_buffers(self, connection): 24 | create_table(connection) 25 | buffers = connection.execute("FROM to_arrow_ipc((FROM T))").fetchall() 26 | arrow_buffers = [] 27 | # We have to concatenate the schema to the data 28 | arrow_buffers.append(pa.py_buffer(buffers[0][0] + buffers[1][0])) 29 | assert buffers[0][1] == True 30 | assert buffers[1][1] == False 31 | batches = [] 32 | with pa.BufferReader(arrow_buffers[0]) as reader: 33 | stream_reader = ipc.RecordBatchStreamReader(reader) 34 | schema = stream_reader.schema 35 | batches.extend(stream_reader) 36 | arrow_table = pa.Table.from_batches(batches, schema=schema) 37 | tables_match(connection.execute("FROM arrow_table").fetchall()) 38 | -------------------------------------------------------------------------------- /test/python/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyarrow as pa 3 | import duckdb 4 | import pyarrow.ipc as ipc 5 | from pyarrow.ipc import MessageReader as mr 6 | import os 7 | import sys 8 | import tempfile 9 | 10 | # duckdb.duckdb.NotImplementedException: Not implemented Error: Unsupported Internal Arrow Type for Decimal d:37,5,256 11 | # "generated_decimal256.stream", 12 | 13 | # duckdb.duckdb.ConversionException: Conversion Error: Could not convert Interval to Microsecond 14 | # "generated_interval.stream" 15 | 16 | # Not implemented Error: Unsupported Internal Arrow Type: "d" Union 17 | # "generated_union.stream" 18 | 19 | little_big_integration_files = ["generated_null_trivial.stream", "generated_primitive_large_offsets.stream","generated_custom_metadata.stream","generated_datetime.stream","generated_decimal.stream","generated_map_non_canonical.stream","generated_map.stream","generated_nested_large_offsets.stream","generated_nested.stream","generated_null.stream","generated_primitive_no_batches.stream","generated_primitive_zerolength.stream","generated_primitive.stream","generated_recursive_nested.stream"] 20 | 21 | compression_2_0_0 = ["generated_uncompressible_zstd.stream", "generated_zstd.stream"] 22 | 23 | script_path = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | test_folder = os.path.join(script_path,'..','..','arrow-testing','data','arrow-ipc-stream','integration') 26 | 27 | # All Test Folders: 28 | big_endian_folder = os.path.join(test_folder,'1.0.0-bigendian') 29 | little_endian_folder = os.path.join(test_folder,'1.0.0-littleendian') 30 | compression_folder = os.path.join(test_folder,'2.0.0-compression') 31 | 32 | def compare_result(arrow_result,duckdb_result, con): 33 | return con.execute(""" 34 | SELECT COUNT(*) = 0 35 | FROM ( 36 | (SELECT * FROM arrow_result EXCEPT SELECT * FROM duckdb_result) 37 | UNION 38 | (SELECT * FROM duckdb_result EXCEPT SELECT 
* FROM arrow_result) 39 | ) """).fetchone()[0] 40 | 41 | # 1. Compare result from reading the IPC file in Arrow and in DuckDB 42 | def compare_ipc_file_reader(con, file): 43 | arrow_result = ipc.open_stream(file).read_all() 44 | duckdb_file_result = con.sql(f"FROM read_arrow('{file}')").arrow() 45 | assert compare_result(arrow_result, duckdb_file_result, con) 46 | 47 | # 2. Now test the writer: write to a file from DuckDB, read the written file back with Arrow and compare 48 | def compare_ipc_file_writer(con, file): 49 | arrow_result = ipc.open_stream(file).read_all() 50 | with tempfile.TemporaryDirectory() as temp_dir: 51 | file_path = os.path.join(temp_dir, "arrow_duck.arrows") 52 | con.execute(f"COPY (FROM read_arrow('{file}')) TO '{file_path}'") 53 | duckdb_file_result = con.sql(f"FROM read_arrow('{file_path}')").arrow() 54 | assert compare_result(arrow_result, duckdb_file_result, con) 55 | 56 | # 3. Compare result from reading the IPC stream as a buffer (via MessageReader) in Arrow and in DuckDB 57 | def compare_ipc_buffer_reader(con, file): 58 | arrow_result = ipc.open_stream(file).read_all() 59 | reader = mr.open_stream(file) 60 | duckdb_struct_result = con.from_arrow(reader).arrow() 61 | assert compare_result(arrow_result, duckdb_struct_result, con) 62 | 63 | # 4. Now test the DuckDB buffer writer, by reading it back with Arrow and comparing 64 | def compare_ipc_buffer_writer(con, file): 65 | arrow_result = ipc.open_stream(file).read_all() 66 | buffers = con.execute(f"FROM to_arrow_ipc((FROM read_arrow('{file}')))").fetchall() 67 | if not buffers: 68 | return 69 | arrow_buffers = [] 70 | for i in range(1, len(buffers)): 71 | # We have to concatenate the schema to the data 72 | arrow_buffers.append(pa.py_buffer(buffers[0][0] + buffers[i][0])) 73 | 74 | batches = [] 75 | for buffer in arrow_buffers: 76 | with pa.BufferReader(buffer) as reader: 77 | stream_reader = ipc.RecordBatchStreamReader(reader) 78 | schema = stream_reader.schema 79 | batches.extend(stream_reader) 80 | 81 | duckdb_struct_result = pa.Table.from_batches(batches, schema=schema) 82 | assert compare_result(arrow_result, duckdb_struct_result, con) 83 | 84 | 85 | class TestArrowIntegrationTests(object): 86 | def test_read_ipc_file(self, connection): 87 | for file in little_big_integration_files: 88 | compare_ipc_file_reader(connection,os.path.join(big_endian_folder,file)) 89 | compare_ipc_file_reader(connection,os.path.join(little_endian_folder,file)) 90 | for file in compression_2_0_0: 91 | compare_ipc_file_reader(connection,os.path.join(compression_folder,file)) 92 | 93 | def test_write_ipc_file(self, connection): 94 | for file in little_big_integration_files: 95 | compare_ipc_file_writer(connection,os.path.join(big_endian_folder,file)) 96 | compare_ipc_file_writer(connection,os.path.join(little_endian_folder,file)) 97 | for file in compression_2_0_0: 98 | compare_ipc_file_writer(connection,os.path.join(compression_folder,file)) 99 | 100 | def test_read_ipc_buffer(self, connection): 101 | for file in little_big_integration_files: 102 | compare_ipc_buffer_reader(connection,os.path.join(big_endian_folder,file)) 103 | compare_ipc_buffer_reader(connection,os.path.join(little_endian_folder,file)) 104 | for file in compression_2_0_0: 105 | compare_ipc_buffer_reader(connection,os.path.join(compression_folder,file)) 106 | 107 | def test_write_ipc_buffer(self, connection): 108 | for file in little_big_integration_files: 109 | compare_ipc_buffer_writer(connection,os.path.join(big_endian_folder,file)) 110 | 
compare_ipc_buffer_writer(connection,os.path.join(little_endian_folder,file)) 111 | for file in compression_2_0_0: 112 | compare_ipc_buffer_writer(connection,os.path.join(compression_folder,file)) 113 | -------------------------------------------------------------------------------- /test/sql/arrow_testing.test: -------------------------------------------------------------------------------- 1 | # name: 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | SET VARIABLE test_files = '__WORKING_DIRECTORY__/arrow-testing/data/arrow-ipc-stream/integration/'; 10 | 11 | # We can do more sophisticated things here (read the arrow_file or .json.gz 12 | # versions of the files, etc.) 13 | statement ok 14 | CREATE MACRO check_arrow_testing_file(test_file) AS TABLE 15 | FROM read_arrow(getvariable('test_files') || test_file || '.stream'); 16 | 17 | statement ok 18 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive') 19 | 20 | statement ok 21 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_datetime') 22 | 23 | statement ok 24 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_decimal') 25 | 26 | # This test will fail,because the struct created in this arrow file does not have names on their children 27 | # This is not supported by DuckDB. 28 | statement error 29 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_duplicate_fieldnames') 30 | ---- 31 | Struct remap can only remap named structs 32 | 33 | statement ok 34 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_map_non_canonical') 35 | 36 | statement ok 37 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_map') 38 | 39 | statement ok 40 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_nested_large_offsets') 41 | 42 | statement ok 43 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_nested') 44 | 45 | statement ok 46 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_null_trivial') 47 | 48 | statement ok 49 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_large_offsets') 50 | 51 | statement ok 52 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_no_batches') 53 | 54 | statement ok 55 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_primitive_zerolength') 56 | 57 | statement ok 58 | FROM check_arrow_testing_file('1.0.0-littleendian/generated_recursive_nested') 59 | 60 | statement ok 61 | FROM check_arrow_testing_file('2.0.0-compression/generated_uncompressible_zstd') 62 | 63 | statement ok 64 | FROM check_arrow_testing_file('2.0.0-compression/generated_zstd') 65 | 66 | 67 | # Following tests are failing but are unrelated to the extension: 68 | # Could not convert interval to microsecond? 69 | # statement ok 70 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_interval') 71 | 72 | # Dense unions not supported? 
73 | # statement ok 74 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_union') 75 | 76 | # Fails because of missing extension registration 77 | # statement ok 78 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_custom_metadata') 79 | 80 | # Decimal256 apparently not supported 81 | # statement ok 82 | # SELECT * FROM check_arrow_testing_file('1.0.0-littleendian/generated_decimal256') 83 | -------------------------------------------------------------------------------- /test/sql/multifile_reading.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/multifile_reading.test 2 | # description: Test read_arrow over multiple files. 3 | # group: [nanoarrow] 4 | 5 | require nanoarrow 6 | 7 | # Test File List Works 8 | statement ok 9 | CREATE TABLE T as FROM read_arrow(['data/test.arrows', 'data/test.arrows']) 10 | 11 | query I 12 | SELECT count(*) from T; 13 | ---- 14 | 30974 15 | 16 | # Test GLOB 17 | query III 18 | FROM read_arrow('data/multifile/glob/*.arrow') 19 | ---- 20 | apple gala 134.2 21 | orange navel 142.1 22 | apple honeycrisp 158.6 23 | orange valencia 96.7 24 | apple fuji NULL 25 | orange cara cara NULL 26 | 27 | # Test projections 28 | query II 29 | SELECT weight, variety FROM read_arrow('data/multifile/glob/*.arrow') 30 | ---- 31 | 134.2 gala 32 | 142.1 navel 33 | 158.6 honeycrisp 34 | 96.7 valencia 35 | NULL fuji 36 | NULL cara cara 37 | 38 | query II 39 | SELECT count(*), fruit FROM read_arrow('data/multifile/glob/*.arrow') group by fruit order by all 40 | ---- 41 | 3 apple 42 | 3 orange 43 | 44 | # Test mismatching schemas 45 | statement error 46 | FROM read_arrow(['data/test.arrows', 'data/multifile/glob/f1.arrow']) 47 | ---- 48 | If you are trying to read files with different schemas, try setting union_by_name=True 49 | 50 | statement error 51 | FROM read_arrow(['data/multifile/fruit_extra.arrows', 'data/multifile/glob/f1.arrow']) 52 | ---- 53 | If you are trying to read files with different schemas, try setting union_by_name=True 54 | 55 | 56 | # Test UNION BY NAME 57 | query IIII 58 | FROM read_arrow(['data/multifile/fruit_extra.arrows', 'data/multifile/glob/f1.arrow'], union_by_name=True) 59 | ---- 60 | apple pink lady 2.2 10.0 61 | orange jiha NULL NULL 62 | apple gala 134.2 NULL 63 | orange navel 142.1 NULL 64 | 65 | # Test different column order 66 | query III 67 | FROM read_arrow(['data/multifile/different_order.arrows', 'data/multifile/glob/f1.arrow']) order by all 68 | ---- 69 | apple 2.2 pink lady 70 | apple 134.2 gala 71 | orange 142.1 navel 72 | orange NULL jiha 73 | 74 | # Test different types 75 | query III 76 | FROM read_arrow(['data/multifile/different_type.arrows', 'data/multifile/glob/f1.arrow']) order by all 77 | ---- 78 | apple gala 134.2 79 | apple pink lady 2.2 80 | orange jiha NULL 81 | orange navel 142.1 82 | 83 | query III 84 | FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) order by all 85 | ---- 86 | apple gala 134.2 87 | apple pink lady 2.2 88 | orange jiha NULL 89 | orange navel 142.1 90 | 91 | query I 92 | select typeof(#3) FROM read_arrow(['data/multifile/different_type.arrows', 'data/multifile/glob/f1.arrow']) limit 1 93 | ---- 94 | VARCHAR 95 | 96 | query I 97 | select typeof(#3) FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) limit 1 98 | ---- 99 | DOUBLE 100 | 101 | query I 102 | select typeof(weight) FROM 
read_arrow(['data/multifile/different_type_int.arrows','data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows'], union_by_name = true) limit 1; 103 | ---- 104 | VARCHAR 105 | 106 | query I 107 | select typeof(weight) FROM read_arrow(['data/multifile/different_type_int.arrows','data/multifile/glob/f1.arrow', 'data/multifile/different_type.arrows']) limit 1; 108 | ---- 109 | BIGINT 110 | 111 | query III 112 | FROM read_arrow(['data/multifile/glob/f1.arrow', 'data/multifile/different_type_order.arrows']) order by all 113 | ---- 114 | apple gala 134.2 115 | apple pink lady 2.2 116 | orange jiha NULL 117 | orange navel 142.1 118 | 119 | # Test filename option 120 | query IIII 121 | SELECT fruit, variety, weight, replace(filename, '\', '/') FROM read_arrow('data/multifile/glob/*.arrow', filename = true) 122 | ---- 123 | apple gala 134.2 data/multifile/glob/f1.arrow 124 | orange navel 142.1 data/multifile/glob/f1.arrow 125 | apple honeycrisp 158.6 data/multifile/glob/f2.arrow 126 | orange valencia 96.7 data/multifile/glob/f2.arrow 127 | apple fuji NULL data/multifile/glob/f3.arrow 128 | orange cara cara NULL data/multifile/glob/f3.arrow 129 | 130 | # test hive_partitioning option 131 | query IIII 132 | FROM read_arrow('data/multifile/hive/*/*.arrow', hive_partitioning = true) 133 | ---- 134 | apple gala 134.2 a 135 | orange navel 142.1 a 136 | apple honeycrisp 158.6 a 137 | orange valencia 96.7 a 138 | apple gala 134.2 b 139 | orange navel 142.1 b 140 | apple fuji NULL b 141 | orange cara cara NULL b 142 | 143 | # Multifile reader works with replacement scans 144 | query III 145 | FROM 'data/multifile/glob/*.arrow' ORDER BY ALL 146 | ---- 147 | apple fuji NULL 148 | apple gala 134.2 149 | apple honeycrisp 158.6 150 | orange cara cara NULL 151 | orange navel 142.1 152 | orange valencia 96.7 153 | 154 | statement ok 155 | CREATE TABLE T_2 (fruit varchar, variety varchar, weight double); 156 | 157 | statement error 158 | COPY T_2 FROM 'data/multifile/glob/*.arrow' (FORMAT arrows, Made_up_option FALSE) 159 | ---- 160 | Unsupported option for COPY 161 | 162 | statement ok 163 | COPY T_2 FROM 'data/multifile/glob/*.arrow' (FORMAT arrows) 164 | 165 | query III 166 | FROM T_2 167 | ---- 168 | apple gala 134.2 169 | orange navel 142.1 170 | apple honeycrisp 158.6 171 | orange valencia 96.7 172 | apple fuji NULL 173 | orange cara cara NULL 174 | -------------------------------------------------------------------------------- /test/sql/nanoarrow.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/nanoarrow.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Before we load the extension, this will fail 6 | statement error 7 | SELECT nanoarrow_version(); 8 | ---- 9 | Catalog Error: Scalar Function with name nanoarrow_version does not exist! 
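# Outside the sqllogictest harness the locally built, unsigned extension has to be loaded
# by hand; the Python fixtures in test/python/conftest.py do roughly the following, where
# the build path is illustrative and depends on the build type:
#
#   con = duckdb.connect(config={'allow_unsigned_extensions': 'true'})
#   con.execute("LOAD 'build/release/extension/nanoarrow/nanoarrow.duckdb_extension'")
#   con.execute("SELECT nanoarrow_version()").fetchone()
#
# Once the LOAD succeeds, the version query behaves as in the blocks below.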
10 | 11 | # Require statement will ensure this test is run with this extension loaded 12 | require nanoarrow 13 | 14 | # Confirm the extension works 15 | query I 16 | SELECT nanoarrow_version(); 17 | ---- 18 | 0.7.0-SNAPSHOT 19 | -------------------------------------------------------------------------------- /test/sql/read_arrow.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/read_arrow.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | # Check a basic roundtrip 9 | statement ok 10 | COPY (SELECT 42 as foofy, 'string' as stringy) TO "__TEST_DIR__/test.arrows" (FORMAT ARROWS); 11 | 12 | query II 13 | SELECT * FROM read_arrow('__TEST_DIR__/test.arrows'); 14 | ---- 15 | 42 string 16 | 17 | # Check that the replacement scan works 18 | query II 19 | SELECT * FROM "__TEST_DIR__/test.arrows"; 20 | ---- 21 | 42 string 22 | 23 | # Make sure these project correctly 24 | query I 25 | SELECT foofy FROM read_arrow('__TEST_DIR__/test.arrows') 26 | ---- 27 | 42 28 | 29 | query I 30 | SELECT stringy FROM read_arrow('__TEST_DIR__/test.arrows') 31 | ---- 32 | string 33 | 34 | # Check our more realistic test table 35 | query I 36 | SELECT count(*) FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 37 | ---- 38 | 15487 39 | 40 | statement error 41 | SELECT count(*) FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows', made_up_option = false); 42 | ---- 43 | Invalid named parameter "made_up_option" for function read_arrow 44 | 45 | # Check with a filter and projection 46 | query I 47 | SELECT message FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows') WHERE "commit" = 'fa5f0299f046c46e1b2f671e5e3b4f1956522711'; 48 | ---- 49 | ARROW-1: Initial Arrow Code Commit 50 | 51 | # Check a filter that has to select from multiple batches 52 | query I 53 | SELECT count(*) from "data/test.arrows" WHERE dayname(time::TIMESTAMP) = 'Wednesday'; 54 | ---- 55 | 2927 56 | -------------------------------------------------------------------------------- /test/sql/read_arrow_file.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/read_arrow_file.test 2 | # description: test nanoarrow extension when reading arrow file (with footer) 3 | # group: [nanoarrow] 4 | 5 | # The files here should be generated with "with pa.ipc.new_file(file_path, table.schema) as writer:" 6 | # Require statement will ensure this test is run with this extension loaded 7 | require nanoarrow 8 | 9 | query III 10 | FROM 'data/fruit.arrow' 11 | ---- 12 | apple gala 134.2 13 | apple honeycrisp 158.6 14 | apple fuji NULL 15 | orange navel 142.1 16 | orange valencia 96.7 17 | orange cara cara NULL 18 | -------------------------------------------------------------------------------- /test/sql/test_copy_to.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/test_copy_to.test 2 | # description: test copy to functionality and options 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | CREATE TABLE test AS SELECT * FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 10 | 11 | # Try extensions without the format options 12 | statement ok 13 | COPY test TO '__TEST_DIR__/test.arrows' 14 | 15 | statement ok 16 | COPY test TO 
'__TEST_DIR__/test.arrow' 17 | 18 | query I 19 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrows'); 20 | ---- 21 | 15487 22 | 23 | query I 24 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrow'); 25 | ---- 26 | 15487 27 | 28 | # Let's test the writing options 29 | # row_group_size: The size of a row group. By default, the value is 122,880. A lower value may reduce performance but can be beneficial for streaming. 30 | 31 | statement ok 32 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size 10) 33 | 34 | query I 35 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 36 | ---- 37 | 15487 38 | 39 | statement ok 40 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (chunk_size 10) 41 | 42 | query I 43 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 44 | ---- 45 | 15487 46 | 47 | statement error 48 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size 100, chunk_size 10) 49 | ---- 50 | ROW_GROUP_SIZE and ROW_GROUP_SIZE_BYTES are mutually exclusive 51 | 52 | statement error 53 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size_bytes 100) 54 | ---- 55 | ROW_GROUP_SIZE_BYTES does not work while preserving insertion order. Use "SET preserve_insertion_order=false;" to disable preserving insertion order. 56 | 57 | statement ok 58 | SET preserve_insertion_order=false; 59 | 60 | statement ok 61 | COPY test TO '__TEST_DIR__/test_row_group.arrow' (row_group_size_bytes 100) 62 | 63 | query I 64 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group.arrow'); 65 | ---- 66 | 15487 67 | 68 | # This actually has a "minimum" of 2048 69 | statement ok 70 | COPY test TO '__TEST_DIR__/test_row_group_folder' (chunk_size 10, row_groups_per_file 1, FORMAT ARROW) 71 | 72 | query I 73 | select count(file) from glob('__TEST_DIR__/test_row_group_folder/*'); 74 | ---- 75 | 9 76 | 77 | query I 78 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test_row_group_folder/*'); 79 | ---- 80 | 15487 81 | 82 | statement ok 83 | COPY test TO '__TEST_DIR__/data_kv.arrow' (kv_metadata {'test':'works'}) 84 | 85 | query I 86 | SELECT count(*) FROM read_arrow('__TEST_DIR__/data_kv.arrow'); 87 | ---- 88 | 15487 89 | -------------------------------------------------------------------------------- /test/sql/to_arrow_ipc.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/to_arrow_ipc.test 2 | # description: round trip arrow serialization 3 | # group: [nanoarrow] 4 | 5 | # NOTE: for now there's not much we can test here, since we cannot really pass pointers from the 6 | # serialized blobs to the scan_arrow_ipc function in SQL. Therefore tests of these features 7 | # currently live in the NodeJS client tests. 
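# The buffer round trip is also exercised from Python (test/python/test_arrow_ipc_writer.py
# and test/python/test_integration.py): to_arrow_ipc() returns one row per IPC message as
# (blob, is_header), and the header/schema blob must be prepended to each data blob before
# Arrow can read it. A rough sketch of that pattern, assuming pyarrow is available:
#
#   buffers = con.execute("FROM to_arrow_ipc((FROM T))").fetchall()
#   assert buffers[0][1]                                  # first row carries the schema message
#   stream = pa.py_buffer(buffers[0][0] + buffers[1][0])  # schema + first data batch
#   table = pa.ipc.RecordBatchStreamReader(pa.BufferReader(stream)).read_all()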
8 | 9 | require nanoarrow 10 | 11 | statement ok 12 | SET disabled_optimizers='column_lifetime' 13 | 14 | statement ok 15 | SELECT * FROM to_arrow_ipc((SELECT 'Its working!')); 16 | 17 | # Test operator caching behaviour is sane 18 | statement ok 19 | create table data as select * from range(0,2000) tbl(col) 20 | 21 | statement ok 22 | WITH data_union AS ( 23 | SELECT * FROM data 24 | UNION ALL 25 | SELECT * FROM data 26 | ) 27 | FROM to_arrow_ipc((SELECT * FROM data_union ORDER BY col)) 28 | -------------------------------------------------------------------------------- /test/sql/write_arrow_stream.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/write_arrow_stream.test 2 | # description: test nanoarrow extension 3 | # group: [nanoarrow] 4 | 5 | # Require statement will ensure this test is run with this extension loaded 6 | require nanoarrow 7 | 8 | statement ok 9 | CREATE TABLE test AS SELECT * FROM read_arrow('__WORKING_DIRECTORY__/data/test.arrows'); 10 | 11 | statement ok 12 | COPY test TO '__TEST_DIR__/test.arrows' (FORMAT ARROWS, BATCH_SIZE 100) 13 | 14 | statement ok 15 | CREATE OR REPLACE TABLE written AS SELECT * FROM read_arrow('__TEST_DIR__/test.arrows'); 16 | 17 | query I 18 | SELECT count(*) FROM read_arrow('__TEST_DIR__/test.arrows'); 19 | ---- 20 | 15487 21 | 22 | query I 23 | SELECT sum((test.time = written.time)::INTEGER) FROM test INNER JOIN written ON test.commit = written.commit; 24 | ---- 25 | 15487 26 | -------------------------------------------------------------------------------- /test_local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Cheap version of make test that works with cmake 4 | SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | SOURCE_DIR_NAME="$(basename "${SOURCE_DIR}")" 6 | 7 | "$SOURCE_DIR/build/test/unittest" "*/${SOURCE_DIR_NAME}/*" 8 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [] 3 | } 4 | --------------------------------------------------------------------------------
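Taken together, the SQL, Python, and NodeJS tests above exercise the same flow: write query results out as an Arrow IPC stream and read them back, either from a file or from in-memory buffers. The sketch below strings that flow together outside the test harness. It is illustrative only: it assumes a local release build of the extension (the path mirrors test/python/conftest.py), the duckdb and pyarrow Python packages, and a writable working directory; the table and file names are made up.

import duckdb
import pyarrow.ipc as ipc

# Connect with unsigned extensions allowed and load the locally built extension.
con = duckdb.connect(config={'allow_unsigned_extensions': 'true'})
con.execute("LOAD 'build/release/extension/nanoarrow/nanoarrow.duckdb_extension'")

# Write a small table to an Arrow IPC stream file via COPY ... (FORMAT ARROWS).
con.execute("CREATE TABLE t AS SELECT range AS i, range * 2 AS j FROM range(5)")
con.execute("COPY t TO 'example.arrows' (FORMAT ARROWS)")

# Read it back with the extension's read_arrow table function ...
print(con.execute("SELECT count(*) FROM read_arrow('example.arrows')").fetchone())

# ... and independently with pyarrow, to confirm the output is a valid IPC stream.
print(ipc.open_stream('example.arrows').read_all().schema)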