├── .editorconfig ├── vcpkg.json ├── docker-demo ├── test_verification.sql ├── docker-compose.yml ├── Dockerfile ├── demo.sql └── README.md ├── .clangd ├── assets ├── benchmark_boxplot.png └── benchmark_customer_table.png ├── benchmarks ├── benchmark_sf_6_with_spatial.png ├── benchmark_sf_10_with_spatial.png ├── joined_barplot_benchmark_sf_1.png ├── barplot_benchmark_sf_1_with_spatial.png ├── barplot_benchmark_sf_6_with_spatial.png ├── boxplot_benchmark_sf_1_with_spatial.png ├── boxplot_benchmark_sf_6_with_spatial.png ├── joined_barplot_benchmark_1000000_rows.png ├── joined_barplot_benchmark_sf_6_with_spatial.png ├── tpch_customer_sf_1_results.csv ├── tpch_customer_sf_6_results.csv ├── tpch_orders_sf_1_results.csv └── tpch_customer_sf_10_results.csv ├── .gitignore ├── Makefile ├── extension_config.cmake ├── .gitmodules ├── .github ├── dependabot.yml └── workflows │ ├── _check_secrets.yml │ ├── MainDistributionPipeline.yml │ └── _extension_deploy.yml ├── test └── README.md ├── .vscode ├── tasks.json └── launch.json ├── docs └── UPDATING.md ├── .devcontainer ├── Dockerfile ├── devcontainer.json └── reinstall-cmake.sh ├── .clang-format ├── LICENSE ├── CMakeLists.txt ├── .clang-tidy ├── scripts └── extension-upload.sh ├── src ├── include │ └── sheetreader_extension.hpp └── sheetreader_extension.cpp └── README.md /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | ] 4 | } -------------------------------------------------------------------------------- /docker-demo/test_verification.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM sheetreader('docker-demo/test.xlsx'); 2 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | CompileFlags: 2 | CompilationDatabase: build/debug 3 | Add: -Wno-unqualified-std-cast-call 4 | -------------------------------------------------------------------------------- /assets/benchmark_boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/assets/benchmark_boxplot.png -------------------------------------------------------------------------------- /assets/benchmark_customer_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/assets/benchmark_customer_table.png -------------------------------------------------------------------------------- /benchmarks/benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/benchmark_sf_10_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/benchmark_sf_10_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_sf_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_sf_1.png -------------------------------------------------------------------------------- /benchmarks/barplot_benchmark_sf_1_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/barplot_benchmark_sf_1_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/barplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/barplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/boxplot_benchmark_sf_1_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/boxplot_benchmark_sf_1_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/boxplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/boxplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_1000000_rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_1000000_rows.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | *.xlsx 10 | *.csv 11 | *.txt 12 | *.db 13 | *.db.wal 14 | benchmark/ 15 | .trunk/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=sheetreader 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. 
It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(sheetreader 5 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 6 | LOAD_TESTS 7 | ) 8 | 9 | # Any extra extensions that should be built 10 | # e.g.: duckdb_extension_load(json) -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main 9 | [submodule "src/include/sheetreader-core"] 10 | path = src/include/sheetreader-core 11 | url = https://github.com/polydbms/sheetreader-core.git 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for more information: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | # https://containers.dev/guide/dependabot 6 | 7 | version: 2 8 | updates: 9 | - package-ecosystem: "devcontainers" 10 | directory: "/" 11 | schedule: 12 | interval: weekly 13 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the sheetreader extension, this should be the goal too. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | or 9 | ```bash 10 | make test_debug 11 | ``` -------------------------------------------------------------------------------- /docker-demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | sheetreader-dev: 5 | build: 6 | context: .
7 | dockerfile: Dockerfile 8 | volumes: 9 | - ../:/workspace/sheetreader-duckdb 10 | - ccache_vol:/root/.ccache 11 | environment: 12 | - PATH=/usr/lib/ccache:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 13 | working_dir: /workspace/sheetreader-duckdb 14 | command: /bin/bash 15 | deploy: 16 | resources: 17 | limits: 18 | memory: 12G # Limit to 12GB, leaving 4GB for host system 19 | 20 | volumes: 21 | ccache_vol: 22 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Build release", 8 | "type": "shell", 9 | "command": "GEN=ninja make", 10 | "group": { 11 | "kind": "build", 12 | "isDefault": true 13 | } 14 | }, 15 | { 16 | "label": "Build debug", 17 | "type": "shell", 18 | "command": "GEN=ninja make debug", 19 | "group": { 20 | "kind": "build", 21 | "isDefault": false 22 | } 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_1_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,8.727685526013374 3 | 1,SheetReader 1 Thread,2.9178942814469337 4 | 2,SheetReader 4 Threads,1.6063461303710938 5 | 3,Spatial,8.46089394390583 6 | 4,SheetReader 1 Thread,3.051068462431431 7 | 5,SheetReader 4 Threads,1.420861043035984 8 | 6,Spatial,8.677419312298298 9 | 7,SheetReader 1 Thread,2.913003034889698 10 | 8,SheetReader 4 Threads,1.572079375386238 11 | 9,Spatial,8.883316360414028 12 | 10,SheetReader 1 Thread,2.935627445578575 13 | 11,SheetReader 4 Threads,1.431564912199974 14 | 12,Spatial,8.629714667797089 15 | 13,SheetReader 1 Thread,3.045584127306938 16 | 14,SheetReader 4 Threads,1.4394349306821823 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_6_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,49.53681559860706 3 | 1,SheetReader 1 Thread,17.911359935998917 4 | 2,SheetReader 4 Threads,9.131846889853477 5 | 3,Spatial,51.09752745181322 6 | 4,SheetReader 1 Thread,17.520940147340298 7 | 5,SheetReader 4 Threads,8.596089884638786 8 | 6,Spatial,50.83717566728592 9 | 7,SheetReader 1 Thread,17.522817224264145 10 | 8,SheetReader 4 Threads,8.461024843156338 11 | 9,Spatial,49.879797391593456 12 | 10,SheetReader 1 Thread,17.9565489590168 13 | 11,SheetReader 4 Threads,8.618413478136063 14 | 12,Spatial,49.76275556534529 15 | 13,SheetReader 1 Thread,17.571647956967354 16 | 14,SheetReader 4 Threads,8.666186735033989 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_orders_sf_1_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,150.51047530025244 3 | 1,SheetReader 1 Thread,42.91278725862503 4 | 2,SheetReader 4 Threads,17.066086061298847 5 | 3,Spatial,148.84033582359552 6 | 4,SheetReader 1 Thread,43.02319976687431 7 | 5,SheetReader 4 Threads,17.86900133639574 8 | 6,Spatial,151.8528371155262 9 | 7,SheetReader 1 Thread,44.46525827050209 10 | 8,SheetReader 4 Threads,17.78221821784973 11 | 9,Spatial,147.54299394786358 12 | 10,SheetReader 1 Thread,43.010885283350945 13 | 
11,SheetReader 4 Threads,17.91719976812601 14 | 12,Spatial,159.33181025087833 15 | 13,SheetReader 1 Thread,43.35006716102362 16 | 14,SheetReader 4 Threads,16.704988904297352 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_10_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,85.97506861388683 3 | 1,SheetReader 1 Thread,30.615816429257393 4 | 2,SheetReader 4 Threads,14.14170940220356 5 | 3,Spatial,86.2689319550991 6 | 4,SheetReader 1 Thread,28.927293568849564 7 | 5,SheetReader 4 Threads,14.446914449334145 8 | 6,Spatial,84.92323327809572 9 | 7,SheetReader 1 Thread,29.267103753983974 10 | 8,SheetReader 4 Threads,14.090325027704239 11 | 9,Spatial,85.49243979901075 12 | 10,SheetReader 1 Thread,29.322267293930054 13 | 11,SheetReader 4 Threads,14.052365981042385 14 | 12,Spatial,86.0353828445077 15 | 13,SheetReader 1 Thread,29.225902386009693 16 | 14,SheetReader 4 Threads,14.108775869011879 17 | -------------------------------------------------------------------------------- /docs/UPDATING.md: -------------------------------------------------------------------------------- 1 | # Extension updating 2 | When cloning this template, the target version of DuckDB should be the latest stable release. However, there 3 | will inevitably come a time when a new DuckDB version is released and the extension repository needs updating. This process goes 4 | as follows: 5 | 6 | - Bump submodules 7 | - `./duckdb` should be set to the latest tagged release 8 | - `./extension-ci-tools` should be set to the updated branch corresponding to the latest DuckDB release 9 | - Bump versions in `./.github/workflows` 10 | - `duckdb_version` input in `MainDistributionPipeline.yml` should be set to the latest tagged release 11 | - reusable workflow `_extension_distribution.yml` should be set to the updated branch corresponding to the latest DuckDB release 12 | 13 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/devcontainers/cpp:1-debian-11 2 | 3 | ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="none" 4 | 5 | # Optionally install the cmake for vcpkg 6 | COPY ./reinstall-cmake.sh /tmp/ 7 | 8 | RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ 9 | chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ 10 | fi \ 11 | && rm -f /tmp/reinstall-cmake.sh 12 | 13 | # [Optional] Uncomment this section to install additional vcpkg ports. 14 | # RUN su vscode -c "${VCPKG_ROOT}/vcpkg install <your-port-name-here>" 15 | 16 | # Install additional packages needed for development.
17 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 18 | && apt-get -y install --no-install-recommends clangd -------------------------------------------------------------------------------- /docker-demo/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu as base image 2 | FROM ubuntu:22.04 3 | 4 | # Install dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | wget \ 7 | unzip \ 8 | git \ 9 | cmake \ 10 | build-essential \ 11 | ninja-build \ 12 | libssl-dev \ 13 | python3-dev \ 14 | ccache \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | # Download and install DuckDB 18 | RUN wget https://github.com/duckdb/duckdb/releases/download/v1.4.2/duckdb_cli-linux-amd64.zip \ 19 | && unzip duckdb_cli-linux-amd64.zip \ 20 | && mv duckdb /usr/local/bin/ \ 21 | && chmod +x /usr/local/bin/duckdb \ 22 | && rm duckdb_cli-linux-amd64.zip 23 | 24 | # Create working directory 25 | WORKDIR /workspace 26 | 27 | # Copy the Excel file and demo script 28 | COPY test.xlsx /workspace/ 29 | COPY demo.sql /workspace/ 30 | 31 | # Set the entrypoint to bash for interactive use 32 | CMD ["/bin/bash"] 33 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | TabWidth: 4 4 | IndentWidth: 4 5 | ColumnLimit: 120 6 | AllowShortFunctionsOnASingleLine: false 7 | --- 8 | UseTab: ForIndentation 9 | DerivePointerAlignment: false 10 | PointerAlignment: Right 11 | AlignConsecutiveMacros: true 12 | AlignTrailingComments: true 13 | AllowAllArgumentsOnNextLine: true 14 | AllowAllConstructorInitializersOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: true 16 | AlignAfterOpenBracket: Align 17 | SpaceBeforeCpp11BracedList: true 18 | SpaceBeforeCtorInitializerColon: true 19 | SpaceBeforeInheritanceColon: true 20 | SpacesInAngles: false 21 | SpacesInCStyleCastParentheses: false 22 | SpacesInConditionalStatement: false 23 | AllowShortLambdasOnASingleLine: Inline 24 | AllowShortLoopsOnASingleLine: false 25 | AlwaysBreakTemplateDeclarations: Yes 26 | IncludeBlocks: Regroup 27 | Language: Cpp 28 | AccessModifierOffset: -4 29 | --- 30 | Language: Java 31 | SpaceAfterCStyleCast: true 32 | --- 33 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "(gdb) Launch DuckDB repl", 10 | "type": "cppdbg", 11 | "request": "launch", 12 | "program": "${workspaceFolder}/build/debug/duckdb", 13 | "args": [], 14 | "stopAtEntry": false, 15 | "cwd": "${workspaceFolder}", 16 | "environment": [], 17 | "externalConsole": false, 18 | "MIMode": "gdb", 19 | "setupCommands": [ 20 | { 21 | "description": "Enable pretty-printing for gdb", 22 | "text": "-enable-pretty-printing", 23 | "ignoreFailures": true 24 | }, 25 | { 26 | "description": "Set Disassembly Flavor to Intel", 27 | "text": "-gdb-set disassembly-flavor intel", 28 | "ignoreFailures": true 29 | } 30 | ] 31 | } 32 | ], 33 | "inputs": [] 34 | } -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/cpp 3 | { 4 | "name": "C++", 5 | "build": { 6 | "dockerfile": "Dockerfile" 7 | }, 8 | 9 | // Features to add to the dev container. More info: https://containers.dev/features. 10 | "features": { 11 | "ghcr.io/devcontainers/features/python:1": { 12 | "installJupyterlab": "true" 13 | } 14 | }, 15 | 16 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 17 | // "forwardPorts": [], 18 | 19 | // Use 'postCreateCommand' to run commands after the container is created. 20 | // "postCreateCommand": "gcc -v", 21 | 22 | // Configure tool-specific properties. 23 | // "customizations": {}, 24 | "customizations": { 25 | "vscode": { 26 | "extensions": [ 27 | "llvm-vs-code-extensions.vscode-clangd", 28 | "ms-toolsai.jupyter" 29 | ] 30 | } 31 | } 32 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 33 | // "remoteUser": "root" 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 polydbms 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | # Set extension name here 4 | set(TARGET_NAME sheetreader) 5 | 6 | # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then 7 | # used in cmake with find_package. Feel free to remove or replace with other dependencies. 8 | # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 9 | # find_package(Example-Package REQUIRED) 10 | 11 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 12 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 13 | 14 | project(${TARGET_NAME}) 15 | include_directories(src/include) 16 | include_directories(src/include/sheetreader-core/src/) 17 | include_directories(src/include/sheetreader-core/src/fast_double_parser) 18 | include_directories(src/include/sheetreader-core/src/miniz) 19 | 20 | set(EXTENSION_SOURCES src/sheetreader_extension.cpp src/include/sheetreader-core/src/XlsxFile.cpp src/include/sheetreader-core/src/XlsxSheet.cpp src/include/sheetreader-core/src/miniz/miniz.cpp) 21 | 22 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 23 | # TODO: We might need this at some point -- this is probably faster to build 24 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 25 | 26 | install( 27 | TARGETS ${EXTENSION_NAME} 28 | EXPORT "${DUCKDB_EXPORT_SET}" 29 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 30 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 31 | -------------------------------------------------------------------------------- /docker-demo/demo.sql: -------------------------------------------------------------------------------- 1 | -- SheetReader DuckDB Extension Demo 2 | -- This script demonstrates how to use the sheetreader extension to query Excel files 3 | 4 | -- Step 1: Install the sheetreader extension from community extensions 5 | INSTALL sheetreader FROM community; 6 | 7 | -- Step 2: Load the extension 8 | LOAD sheetreader; 9 | 10 | -- Step 3: Query the Excel file directly 11 | .print '=== Reading test.xlsx with sheetreader ===' 12 | SELECT * FROM sheetreader('test.xlsx'); 13 | 14 | -- Step 4: Get row count 15 | .print '' 16 | .print '=== Row count ===' 17 | SELECT COUNT(*) as total_rows FROM sheetreader('test.xlsx'); 18 | 19 | -- Step 5: Calculate statistics on the data 20 | .print '' 21 | .print '=== Statistics ===' 22 | SELECT 23 | MIN(Numeric0) as min_value, 24 | MAX(Numeric0) as max_value, 25 | AVG(Numeric0) as avg_value, 26 | SUM(Numeric0) as sum_value 27 | FROM sheetreader('test.xlsx'); 28 | 29 | -- Step 6: Create a table from the Excel data 30 | .print '' 31 | .print '=== Creating table from Excel data ===' 32 | CREATE TABLE excel_data AS 33 | FROM sheetreader('test.xlsx'); 34 | 35 | -- Step 7: Query the created table 36 | .print '' 37 | .print '=== Querying the created table ===' 38 | SELECT * FROM excel_data; 39 | 40 | -- Step 8: Filter data (example: values greater than 50) 41 | .print '' 42 | .print '=== Filtering values > 50 ===' 43 | SELECT * FROM excel_data WHERE Numeric0 > 50; 44 | 45 | .print '' 46 | .print '=== Demo completed successfully! 
===' 47 | -------------------------------------------------------------------------------- /.github/workflows/_check_secrets.yml: -------------------------------------------------------------------------------- 1 | name: Check Secrets 2 | on: 3 | workflow_call: 4 | inputs: 5 | stub: 6 | required: false 7 | type: string 8 | default: "stub" 9 | secrets: 10 | S3_BUCKET: 11 | required: true 12 | S3_DEPLOY_ID: 13 | required: true 14 | S3_DEPLOY_KEY: 15 | required: true 16 | S3_REGION: 17 | required: true 18 | 19 | jobs: 20 | test-secrets-accessible: 21 | name: Test secrets accessible 22 | runs-on: ubuntu-latest 23 | environment: Actions 24 | env: 25 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 26 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 27 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 28 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 29 | steps: 30 | # Check whether the bucket name is set (the sed spaces out the characters so the value is not masked in the log) 31 | - name: Test secrets accessible 32 | run: echo ${BUCKET_NAME} | sed -e 's/\(.\)/\1 /g' 33 | - name: Check deploy secrets 34 | run: if [ -z ${AWS_ACCESS_KEY_ID+x} ]; then echo "access key is unset"; else echo "access key is set with length ${#AWS_ACCESS_KEY_ID}"; fi 35 | - name: Check deploy secrets 36 | run: if [ -z ${AWS_SECRET_ACCESS_KEY+x} ]; then echo "secret key is unset"; else echo "secret key is set with length ${#AWS_SECRET_ACCESS_KEY}"; fi 37 | - name: Check deploy secrets 38 | run: if [ -z ${AWS_DEFAULT_REGION+x} ]; then echo "region is unset"; else echo "region is set with length ${#AWS_DEFAULT_REGION}"; fi 39 | -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | # This is useful for debugging issues regarding the secret management: 18 | # 19 | # test-secrets-accessible: 20 | # name: Test secrets accessible 21 | # uses: ./.github/workflows/_check_secrets.yml 22 | # secrets: inherit 23 | 24 | 25 | 26 | duckdb-stable-build: 27 | name: Build extension binaries (DuckDB v1.4.2) 28 | # needs: test-secrets-accessible 29 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.4.2 30 | with: 31 | duckdb_version: v1.4.2 32 | ci_tools_version: v1.4.2 33 | extension_name: sheetreader 34 | exclude_archs: "windows_amd64_rtools" 35 | 36 | # We disable deployment for now 37 | # 38 | # duckdb-stable-deploy: 39 | # name: Deploy extension binaries (DuckDB v1.0.0) 40 | # needs: duckdb-stable-build 41 | # uses: ./.github/workflows/_extension_deploy.yml 42 | # secrets: inherit 43 | # with: 44 | # duckdb_version: v1.0.0 45 | # extension_name: sheetreader 46 | # deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} 47 | # exclude_archs: "windows_amd64_rtools" 48 | -------------------------------------------------------------------------------- /.devcontainer/reinstall-cmake.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 |
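# Reinstalls CMake at the requested version: downloads the official installer script and checksum file from GitHub, verifies the checksum, and installs to /opt/cmake.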
#------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | # 7 | set -e 8 | 9 | CMAKE_VERSION=${1:-"none"} 10 | 11 | if [ "${CMAKE_VERSION}" = "none" ]; then 12 | echo "No CMake version specified, skipping CMake reinstallation" 13 | exit 0 14 | fi 15 | 16 | # Cleanup temporary directory and associated files when exiting the script. 17 | cleanup() { 18 | EXIT_CODE=$? 19 | set +e 20 | if [[ -n "${TMP_DIR}" ]]; then 21 | echo "Executing cleanup of tmp files" 22 | rm -Rf "${TMP_DIR}" 23 | fi 24 | exit $EXIT_CODE 25 | } 26 | trap cleanup EXIT 27 | 28 | 29 | echo "Installing CMake..." 30 | apt-get -y purge --auto-remove cmake 31 | mkdir -p /opt/cmake 32 | 33 | architecture=$(dpkg --print-architecture) 34 | case "${architecture}" in 35 | arm64) 36 | ARCH=aarch64 ;; 37 | amd64) 38 | ARCH=x86_64 ;; 39 | *) 40 | echo "Unsupported architecture ${architecture}." 41 | exit 1 42 | ;; 43 | esac 44 | 45 | CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" 46 | CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" 47 | TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) 48 | 49 | echo "${TMP_DIR}" 50 | cd "${TMP_DIR}" 51 | 52 | curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O 53 | curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O 54 | 55 | sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" 56 | sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license 57 | 58 | ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake 59 | ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest 60 | -------------------------------------------------------------------------------- /docker-demo/README.md: -------------------------------------------------------------------------------- 1 | # SheetReader DuckDB Docker Demo 2 | 3 | Demo of the **sheetreader-duckdb** extension with DuckDB v1.4.0+ compatibility. 4 | 5 | ## Prerequisites 6 | 7 | - Docker and Docker Compose installed and running 8 | 9 | ## DuckDB v1.4.0 Extension Verification 10 | 11 | This setup allows you to verify that the sheetreader extension works correctly with DuckDB v1.4.0+. 
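For reference, the verification script (`test_verification.sql`) contains just a single query against the bundled workbook: ```sql SELECT * FROM sheetreader('docker-demo/test.xlsx'); ```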
12 | 13 | ### Build and Test the Extension 14 | 15 | **Step 1: Navigate to the demo directory** 16 | ```bash 17 | cd docker-demo 18 | ``` 19 | 20 | **Step 2: Build the Docker image** 21 | ```bash 22 | docker compose build 23 | ``` 24 | 25 | **Step 3: Build the extension from source** 26 | ```bash 27 | docker compose run --rm sheetreader-dev bash -c "GEN=ninja NINJA_BUILD_FLAGS='-j2' make" 28 | ``` 29 | 30 | This will: 31 | - Build DuckDB v1.4.0+ from source 32 | - Compile the sheetreader extension with the new API 33 | - Create a DuckDB binary with the extension pre-loaded 34 | 35 | **Step 4: Run the verification test** 36 | ```bash 37 | docker compose run --rm sheetreader-dev bash -c "./build/release/duckdb < docker-demo/test_verification.sql" 38 | ``` 39 | 40 | **Expected output:** 41 | ``` 42 | ┌──────────┐ 43 | │ Numeric0 │ 44 | │ double │ 45 | ├──────────┤ 46 | │ 92.0 │ 47 | │ 48.0 │ 48 | │ 99.0 │ 49 | │ 35.0 │ 50 | │ 97.0 │ 51 | └──────────┘ 52 | ``` 53 | 54 | If you see this output, the extension is working correctly with DuckDB v1.4.0+! ✅ 55 | 56 | --- 57 | 58 | ## Interactive Development 59 | 60 | For interactive development and testing: 61 | 62 | **Start an interactive shell:** 63 | ```bash 64 | docker compose run --rm sheetreader-dev bash 65 | ``` 66 | 67 | **Inside the container, you can:** 68 | ```bash 69 | # Build the extension 70 | GEN=ninja make 71 | 72 | 73 | # start DuckDB interactively 74 | ./build/release/duckdb 75 | ``` 76 | 77 | **Inside DuckDB, try queries:** 78 | ```sql 79 | -- Query the Excel file 80 | SELECT * FROM sheetreader('docker-demo/test.xlsx'); 81 | 82 | ``` 83 | 84 | **Exit:** 85 | ``` 86 | .exit # Exit DuckDB 87 | exit # Exit container 88 | ``` 89 | 90 | --- 91 | 92 | ## Files 93 | 94 | - **Dockerfile** - Ubuntu 22.04 with build dependencies (git, cmake, ninja, etc.) 
95 | - **docker-compose.yml** - Docker Compose setup with volume mounts and ccache 96 | - **test.xlsx** - Sample Excel file with test data 97 | - **test_verification.sql** - Verification query for testing 98 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: '-*,clang-diagnostic-*,bugprone-*,performance-*,google-explicit-constructor,google-build-using-namespace,google-runtime-int,misc-definitions-in-headers,modernize-use-nullptr,modernize-use-override,-bugprone-macro-parentheses,readability-braces-around-statements,-bugprone-branch-clone,readability-identifier-naming,hicpp-exception-baseclass,misc-throw-by-value-catch-by-reference,-bugprone-signed-char-misuse,-bugprone-misplaced-widening-cast,-bugprone-sizeof-expression,-bugprone-narrowing-conversions,-bugprone-easily-swappable-parameters,google-global-names-in-headers,llvm-header-guard,misc-definitions-in-headers,modernize-use-emplace,modernize-use-bool-literals,-performance-inefficient-string-concatenation,-performance-no-int-to-ptr,readability-container-size-empty,cppcoreguidelines-pro-type-cstyle-cast' 2 | WarningsAsErrors: '*' 3 | HeaderFilterRegex: '.*^(re2.h)' 4 | AnalyzeTemporaryDtors: false 5 | FormatStyle: none 6 | CheckOptions: 7 | - key: readability-identifier-naming.ClassCase 8 | value: CamelCase 9 | - key: readability-identifier-naming.EnumCase 10 | value: CamelCase 11 | - key: readability-identifier-naming.TypedefCase 12 | value: lower_case 13 | - key: readability-identifier-naming.TypedefSuffix 14 | value: _t 15 | - key: readability-identifier-naming.FunctionCase 16 | value: CamelCase 17 | - key: readability-identifier-naming.MemberCase 18 | value: lower_case 19 | - key: readability-identifier-naming.ParameterCase 20 | value: lower_case 21 | - key: readability-identifier-naming.ConstantCase 22 | value: aNy_CasE 23 | - key: readability-identifier-naming.ConstantParameterCase 24 | value: lower_case 25 | - key: readability-identifier-naming.NamespaceCase 26 | value: lower_case 27 | - key: readability-identifier-naming.MacroDefinitionCase 28 | value: UPPER_CASE 29 | - key: readability-identifier-naming.StaticConstantCase 30 | value: UPPER_CASE 31 | - key: readability-identifier-naming.ConstantMemberCase 32 | value: aNy_CasE 33 | - key: readability-identifier-naming.StaticVariableCase 34 | value: UPPER_CASE 35 | - key: readability-identifier-naming.ClassConstantCase 36 | value: UPPER_CASE 37 | - key: readability-identifier-naming.EnumConstantCase 38 | value: UPPER_CASE 39 | - key: readability-identifier-naming.ConstexprVariableCase 40 | value: UPPER_CASE 41 | - key: readability-identifier-naming.StaticConstantCase 42 | value: UPPER_CASE 43 | - key: readability-identifier-naming.TemplateTemplateParameterCase 44 | value: UPPER_CASE 45 | - key: readability-identifier-naming.TypeTemplateParameterCase 46 | value: UPPER_CASE 47 | - key: readability-identifier-naming.VariableCase 48 | value: lower_case 49 | - key: modernize-use-emplace.SmartPointers 50 | value: '::std::shared_ptr;::duckdb::unique_ptr;::std::auto_ptr;::std::weak_ptr' 51 | 52 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> 6 | # <name>              : Name of the extension 7 | # <extension_version> : Version (commit / version tag) of the extension 8 | # <duckdb_version>    : Version (commit / version tag) of DuckDB 9 | # <architecture>      : Architecture target of the extension binary 10 | # <s3_bucket>         : S3 bucket to upload to 11 | # <copy_to_latest>    : Set this as the latest version ("true" / "false", default: "false") 12 | # <copy_to_versioned> : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # copy the extension binary; the signature section will be appended to this copy 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, length of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.."
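# Exit with success so CI runs without deploy credentials (e.g. on forks) skip the upload instead of failing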
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /src/include/sheetreader_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.h" 4 | #include "duckdb.hpp" 5 | #include "duckdb/common/typedefs.hpp" 6 | #include "duckdb/common/types.hpp" 7 | #include "duckdb/common/unique_ptr.hpp" 8 | #include "duckdb/common/vector.hpp" 9 | #include "duckdb/function/function.hpp" 10 | #include "sheetreader-core/src/XlsxFile.h" 11 | #include "sheetreader-core/src/XlsxSheet.h" 12 | 13 | namespace duckdb { 14 | 15 | class SheetreaderExtension : public Extension { 16 | public: 17 | void Load(ExtensionLoader &loader) override; 18 | std::string Name() override; 19 | std::string Version() const override; 20 | }; 21 | 22 | //! Contains all data that is determined during the bind function 23 | struct SRBindData : public TableFunctionData { 24 | public: 25 | //! File name with path to file 26 | //! Sheet ID default is 1 27 | SRBindData(string file_name); 28 | //! File name with path to file and name of sheet 29 | //! Throws exception if sheet name is not found 30 | SRBindData(string file_name, string sheet_name); 31 | //! File name with path to file and index of sheet (starts with 1) 32 | //! Throws exception if sheet at index is not found 33 | SRBindData(string file_name, int sheet_index); 34 | 35 | public: 36 | //! The paths of the files we're reading 37 | vector<string> file_names; 38 | 39 | //! All column names (in order) 40 | vector<string> names; 41 | 42 | //! All column DuckDB types (in order) 43 | vector<LogicalType> types; 44 | 45 | //! The .XLSX-file -- created by sheetreader-core 46 | XlsxFile xlsx_file; 47 | //! A sheet of xlsx_file -- created by sheetreader-core 48 | unique_ptr<XlsxSheet> xlsx_sheet; 49 | 50 | //! Number of threads used while parsing 51 | idx_t number_threads = 1; 52 | 53 | //! Number of rows to skip while parsing 54 | idx_t skip_rows = 0; 55 | 56 | //! Coerce all cells to string in user defined column types 57 | bool coerce_to_string = false; 58 | 59 | //! User defined types 60 | vector<LogicalType> user_types = {}; 61 | 62 | //! Use user_types even if they are not compatible with types determined by first/second row 63 | bool force_types = false; 64 | 65 | private: 66 | SRBindData(ClientContext &context, vector<string> file_names, string sheet_name); 67 | }; 68 | //! Keeps state in between calls to the table (copy) function 69 | struct SRGlobalState { 70 | public: 71 | SRGlobalState(ClientContext &context, const SRBindData &bind_data); 72 | 73 | public: 74 | //! Bound data 75 | const SRBindData &bind_data; 76 | 77 | //! Number of chunks read so far 78 | idx_t chunk_count = 0; 79 | 80 | //! State of copying from mCells 81 | size_t max_buffers; 82 | //!
Current index of thread 83 | size_t current_thread; 84 | //! Current index of buffer in thread 85 | size_t current_buffer; 86 | //! Current index of cell in buffer 87 | size_t current_cell; 88 | //! Current index of column in row 89 | unsigned long current_column; 90 | //! Current index of row in sheet 91 | long long current_row; 92 | //! Current index of row per thread 93 | std::vector<size_t> current_locs; 94 | }; 95 | 96 | struct SRLocalState { 97 | public: 98 | SRLocalState(ClientContext &context, SRGlobalState &gstate); 99 | 100 | private: 101 | const SRBindData &bind_data; 102 | }; 103 | 104 | //! Contains SRGlobalState 105 | struct SRGlobalTableFunctionState : public GlobalTableFunctionState { 106 | public: 107 | SRGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input); 108 | static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input); 109 | 110 | public: 111 | SRGlobalState state; 112 | }; 113 | 114 | struct SRLocalTableFunctionState : public LocalTableFunctionState { 115 | public: 116 | SRLocalTableFunctionState(ClientContext &context, SRGlobalState &gstate); 117 | static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input, 118 | GlobalTableFunctionState *global_state); 119 | 120 | public: 121 | SRLocalState state; 122 | }; 123 | } // namespace duckdb 124 | -------------------------------------------------------------------------------- /.github/workflows/_extension_deploy.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml 3 | # 4 | # note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the 5 | # deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as 6 | # this workflow can be configured to use a custom deploy script. 7 | 8 | 9 | name: Extension Deployment 10 | on: 11 | workflow_call: 12 | inputs: 13 | # The name of the extension 14 | extension_name: 15 | required: true 16 | type: string 17 | # DuckDB version to build against 18 | duckdb_version: 19 | required: true 20 | type: string 21 | # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' 22 | exclude_archs: 23 | required: false 24 | type: string 25 | default: "" 26 | # Whether to upload this deployment as the latest. This may overwrite a previous deployment. 27 | deploy_latest: 28 | required: false 29 | type: boolean 30 | default: false 31 | # Whether to upload this deployment under a versioned path. These will not be deleted automatically 32 | deploy_versioned: 33 | required: false 34 | type: boolean 35 | default: false 36 | # Postfix added to artifact names.
Can be used to guarantee unique names when this workflow is called multiple times 37 | artifact_postfix: 38 | required: false 39 | type: string 40 | default: "" 41 | # Override the default deploy script with a custom script 42 | deploy_script: 43 | required: false 44 | type: string 45 | default: "./scripts/extension-upload.sh" 46 | # Override the default matrix parse script with a custom script 47 | matrix_parse_script: 48 | required: false 49 | type: string 50 | default: "./duckdb/scripts/modify_distribution_matrix.py" 51 | secrets: 52 | S3_BUCKET: 53 | required: true 54 | S3_DEPLOY_ID: 55 | required: true 56 | S3_DEPLOY_KEY: 57 | required: true 58 | S3_REGION: 59 | required: true 60 | 61 | jobs: 62 | generate_matrix: 63 | name: Generate matrix 64 | environment: Actions 65 | runs-on: ubuntu-latest 66 | outputs: 67 | deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} 68 | steps: 69 | - uses: actions/checkout@v3 70 | with: 71 | fetch-depth: 0 72 | submodules: 'true' 73 | 74 | - name: Checkout DuckDB to version 75 | run: | 76 | cd duckdb 77 | git checkout ${{ inputs.duckdb_version }} 78 | 79 | - id: parse-matrices 80 | run: | 81 | python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty 82 | deploy_matrix="`cat deploy_matrix.json`" 83 | echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT 84 | echo `cat $GITHUB_OUTPUT` 85 | 86 | deploy: 87 | name: Deploy 88 | environment: Actions 89 | runs-on: ubuntu-latest 90 | needs: generate_matrix 91 | if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} 92 | strategy: 93 | matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} 94 | 95 | steps: 96 | - uses: actions/checkout@v3 97 | with: 98 | fetch-depth: 0 99 | submodules: 'true' 100 | 101 | - name: Checkout DuckDB to version 102 | run: | 103 | cd duckdb 104 | git checkout ${{ inputs.duckdb_version }} 105 | 106 | - uses: actions/download-artifact@v2 107 | with: 108 | name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} 109 | path: | 110 | /tmp/extension 111 | 112 | - name: Deploy 113 | shell: bash 114 | env: 115 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 116 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 117 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 118 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 119 | DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} 120 | run: | 121 | pwd 122 | python3 -m pip install pip awscli 123 | git config --global --add safe.directory '*' 124 | cd duckdb 125 | git fetch --tags 126 | export DUCKDB_VERSION=`git tag --points-at HEAD` 127 | export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} 128 | cd .. 
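# Resolve the extension version the same way: prefer a tag pointing at HEAD, otherwise fall back to the short commit hash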
129 | git fetch --tags 130 | export EXT_VERSION=`git tag --points-at HEAD` 131 | export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} 132 | ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SheetReader DuckDB extension 2 | 3 | `sheetreader` is a DuckDB extension that allows reading XLSX files into DuckDB tables with SheetReader, our blazingly fast XLSX parser (https://github.com/polydbms/sheetreader-core). 4 | 5 | --- 6 | 7 | This repository is based on https://github.com/duckdb/extension-template. 8 | 9 | ## Table of Contents 10 | 11 | - [SheetReader DuckDB extension](#sheetreader-duckdb-extension) 12 | - [Table of Contents](#table-of-contents) 13 | - [Usage](#usage) 14 | - [Parameters](#parameters) 15 | - [More information on SheetReader](#more-information-on-sheetreader) 16 | - [Benchmarks](#benchmarks) 17 | - [Building yourself](#building-yourself) 18 | - [Running the extension](#running-the-extension) 19 | 20 | ## Usage 21 | 22 | Before using SheetReader, you need to install it from the [community extensions](https://community-extensions.duckdb.org/extensions/sheetreader.html) and load it into your DuckDB environment: 23 | 24 | ```sql 25 | INSTALL sheetreader FROM community; 26 | LOAD sheetreader; 27 | ``` 28 | 29 | Now, you can run your first query: 30 | 31 | ```sql 32 | D SELECT * 33 | FROM sheetreader('test.xlsx'); 34 | ``` 35 | 36 | The `sheetreader()` function offers further parameters to load the XLSX file as required: 37 | 38 | ```sql 39 | D CREATE TABLE test AS FROM sheetreader( 40 | 'test.xlsx', 41 | sheet_index=1, 42 | threads=16, 43 | skip_rows=0, 44 | has_header=TRUE, 45 | types=[BOOLEAN,VARCHAR], 46 | coerce_to_string=TRUE, 47 | force_types=TRUE 48 | ); 49 | ``` 50 | 51 | ### Parameters 52 | 53 | | Name | Description | Type | Default | 54 | | :----------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------ | :-------------: | :--------------------------------------------------------------- | 55 | | `sheet_index` | Index of the sheet to read. Starts at 1. | `INTEGER` | `1` | 56 | | `sheet_name` | Name of the sheet to read.<br>Only one of `sheet_index` and `sheet_name` may be set. | `VARCHAR` | `""` | 57 | | `threads` | Number of threads to use while parsing | `INTEGER` | Half of available cores; minimum 1 | 58 | | `skip_rows` | Number of rows to skip | `INTEGER` | `0` | 59 | | `types` | List of types for all columns<br>  • Types currently available:<br>    `VARCHAR`, `BOOLEAN`, `DOUBLE`, `DATE`.<br>  • Useful in combination with `coerce_to_string` and `force_types`. | `LIST(VARCHAR)` | Uses types determined by first & second row (after skipped rows) | 60 | | `coerce_to_string` | Coerce all cells in column of type `VARCHAR` to string (i.e. `VARCHAR`). | `BOOLEAN` | `false` | 61 | | `force_types` | Use `types` even if they are not compatible with types determined by first/second row.<br>Cells that don't match the column type are set to `NULL`, or coerced to string if that option is set. | `BOOLEAN` | `false` | 62 | | `has_header` | If set to `true`:<br>  • Forces the first row to be treated as a header row (only works if all cells are of type `VARCHAR`).<br>  • If successful, the cell contents are used for column names.<br>  • Overrides the default behavior, which doesn't use the first row as headers if all columns have type `VARCHAR`.<br><br>If set to `false`:<br>  • The extension will still try to treat the first row as a header row.<br>  • The difference is that it will not fail if the first row is not usable (i.e. not all cells are of type `VARCHAR`).<br>  • The first row won't be used as headers if all columns have type `VARCHAR`. | `BOOLEAN` | `false` | 63 | 64 | 65 | ## More information on SheetReader 66 | 67 | SheetReader was published in the [Information Systems Journal](https://www.sciencedirect.com/science/article/abs/pii/S0306437923000194). 68 | ``` 69 | @article{DBLP:journals/is/GavriilidisHZM23, 70 | author = {Haralampos Gavriilidis and 71 | Felix Henze and 72 | Eleni Tzirita Zacharatou and 73 | Volker Markl}, 74 | title = {SheetReader: Efficient Specialized Spreadsheet Parsing}, 75 | journal = {Inf. Syst.}, 76 | volume = {115}, 77 | pages = {102183}, 78 | year = {2023}, 79 | url = {https://doi.org/10.1016/j.is.2023.102183}, 80 | doi = {10.1016/J.IS.2023.102183}, 81 | timestamp = {Mon, 26 Jun 2023 20:54:32 +0200}, 82 | biburl = {https://dblp.org/rec/journals/is/GavriilidisHZM23.bib}, 83 | bibsource = {dblp computer science bibliography, https://dblp.org} 84 | } 85 | ``` 86 | 87 | ## Benchmarks 88 | 89 | You can find benchmarks in the above-mentioned paper, comparing SheetReader to other XLSX parsers. 90 | 91 | Here is a plot of preliminary benchmarks comparing the `sheetreader` DuckDB extension to the `spatial` extension's `st_read` function: 92 | 93 | 94 | ![Benchmark](./benchmarks/joined_barplot_benchmark_sf_1.png) 95 | 96 | (*System info: 2x Intel(R) Xeon(R) E5530 @ 2.40GHz, 47GiB RAM*) 97 | 98 | ## Building yourself 99 | 100 | First, clone this repository with the `--recurse-submodules` flag, so that you get all the needed source files. 101 | 102 | To build the extension, run: 103 | ```sh 104 | GEN=ninja make 105 | ``` 106 | The main binaries that will be built are: 107 | ```sh 108 | ./build/release/duckdb 109 | ./build/release/extension/sheetreader/sheetreader.duckdb_extension 110 | ``` 111 | - `duckdb` is the binary for the DuckDB shell with the extension code automatically loaded. 112 | - `sheetreader.duckdb_extension` is the loadable binary as it would be distributed. 113 | 114 | ### Running the extension 115 | 116 | To run the self-built extension code, simply start the shell with `./build/release/duckdb`. 117 | -------------------------------------------------------------------------------- /src/sheetreader_extension.cpp: -------------------------------------------------------------------------------- 1 | #include "duckdb.h" 2 | #include "duckdb/common/assert.hpp" 3 | #include "duckdb/common/helper.hpp" 4 | #include "duckdb/common/multi_file/multi_file_reader.hpp" 5 | #include "duckdb/common/typedefs.hpp" 6 | #include "duckdb/common/types.hpp" 7 | #include "duckdb/common/types/data_chunk.hpp" 8 | #include "duckdb/common/types/date.hpp" 9 | #include "duckdb/common/types/string_type.hpp" 10 | #include "duckdb/common/types/value.hpp" 11 | #include "duckdb/common/types/vector.hpp" 12 | #include "duckdb/common/unique_ptr.hpp" 13 | #include "duckdb/common/vector.hpp" 14 | #include "duckdb/common/vector_size.hpp" 15 | #include "duckdb/function/function.hpp" 16 | #include "sheetreader-core/src/XlsxFile.h" 17 | #include "sheetreader-core/src/XlsxSheet.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #define DUCKDB_EXTENSION_MAIN 24 | 25 | #include "duckdb/common/exception.hpp" 26 | #include "duckdb/common/string_util.hpp" 27 | #include "duckdb/function/table_function.hpp" 28 | #include "sheetreader_extension.hpp" 29 | 30 | #include 31 | #include "duckdb/main/database.hpp" 32 | 33 | namespace duckdb { 34 | 35 | //!
Determine default number of threads 36 | inline idx_t DefaultThreads() { 37 | #ifdef __EMSCRIPTEN__ 38 | // WebAssembly doesn't support threading in MVP builds 39 | return 1; 40 | #else 41 | // Returns 0 if not able to detect 42 | idx_t sys_number_threads = std::thread::hardware_concurrency(); 43 | 44 | // Don't be too greedy 45 | idx_t appropriate_number_threads = sys_number_threads / 2; 46 | 47 | if (appropriate_number_threads <= 0) { 48 | appropriate_number_threads = 1; 49 | } 50 | 51 | return appropriate_number_threads; 52 | #endif 53 | } 54 | 55 | // ===================================== 56 | // Following are a bunch of constructors for classes that hold the state of the sheetreader extension 57 | // Find the definitions & documentation of these classes in the sheetreader_extension.hpp file 58 | // ===================================== 59 | 60 | SRBindData::SRBindData(string file_name) : SRBindData(file_name, 1) { 61 | } 62 | 63 | SRBindData::SRBindData(string file_name, string sheet_name) 64 | : xlsx_file(file_name), xlsx_sheet(make_uniq<XlsxSheet>(xlsx_file.getSheet(sheet_name))), 65 | number_threads(DefaultThreads()) { 66 | } 67 | 68 | SRBindData::SRBindData(string file_name, int sheet_index) 69 | : xlsx_file(file_name), xlsx_sheet(make_uniq<XlsxSheet>(xlsx_file.getSheet(sheet_index))), 70 | number_threads(DefaultThreads()) { 71 | } 72 | 73 | SRGlobalState::SRGlobalState(ClientContext &context, const SRBindData &bind_data) 74 | : bind_data(bind_data), chunk_count(0) { 75 | } 76 | 77 | SRLocalState::SRLocalState(ClientContext &context, SRGlobalState &gstate) : bind_data(gstate.bind_data) { 78 | } 79 | 80 | SRGlobalTableFunctionState::SRGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input) 81 | : state(context, input.bind_data->Cast<SRBindData>()) { 82 | } 83 | 84 | unique_ptr<GlobalTableFunctionState> SRGlobalTableFunctionState::Init(ClientContext &context, 85 | TableFunctionInitInput &input) { 86 | 87 | auto result = make_uniq<SRGlobalTableFunctionState>(context, input); 88 | 89 | return std::move(result); 90 | } 91 | 92 | SRLocalTableFunctionState::SRLocalTableFunctionState(ClientContext &context, SRGlobalState &gstate) 93 | : state(context, gstate) { 94 | } 95 | 96 | unique_ptr<LocalTableFunctionState> SRLocalTableFunctionState::Init(ExecutionContext &context, 97 | TableFunctionInitInput &input, 98 | GlobalTableFunctionState *global_state) { 99 | auto &gstate = global_state->Cast<SRGlobalTableFunctionState>(); 100 | auto result = make_uniq<SRLocalTableFunctionState>(context.client, gstate.state); 101 | 102 | return std::move(result); 103 | } 104 | 105 | // ===================================== 106 | // Following are definitions that are used to copy data from the sheetreader-core to the DuckDB data chunk 107 | // ===================================== 108 | 109 | //! DataPtr is a union that holds pointers to the different data types stored in the vectors of the data chunk 110 | union DataPtr { 111 | string_t *string_data; 112 | double *double_data; 113 | bool *bool_data; 114 | date_t *date_data; 115 | }; 116 | 117 | //! Set cell to NULL 118 | inline void SetNull(const SRBindData &bind_data, DataChunk &output, vector<DataPtr> &flat_vectors, const XlsxCell &cell, 119 | idx_t row_id, idx_t column_id) { 120 | LogicalType expected_type = bind_data.types[column_id]; 121 | 122 | // Value constructor with LogicalType sets the value to NULL 123 | output.data[column_id].SetValue(row_id, Value(expected_type)); 124 | } 125 | 126 | //!
Set all values in the data chunk to NULL 127 | inline void SetAllInvalid(DataChunk &output, idx_t cardinality) { 128 | // Iterate over all columns 129 | for (idx_t col = 0; col < output.ColumnCount(); col++) { 130 | Vector &vec = output.data[col]; 131 | // Validity mask saves the information about NULL values 132 | auto &validity = FlatVector::Validity(vec); 133 | validity.SetAllInvalid(cardinality); 134 | } 135 | } 136 | 137 | //! Set cell to the value of XlsxCell 138 | //! Expects XlsxCell to have the same type as the column 139 | inline void SetCell(const SRBindData &bind_data, DataChunk &output, vector &flat_vectors, const XlsxCell &cell, 140 | idx_t row_id, idx_t column_id) { 141 | 142 | auto &xlsx_file = bind_data.xlsx_file; 143 | 144 | // Get validity mask of the column and set it to valid (i.e. not NULL) 145 | Vector &vec = output.data[column_id]; 146 | auto &validity = FlatVector::Validity(vec); 147 | validity.SetValid(row_id); 148 | 149 | // Set the value of the cell to cell in the data chunk 150 | // Note: bind_data.types[column_id] is the expected type of the column, 151 | // so the type XlsxCell should be checked before calling this function 152 | switch (bind_data.types[column_id].id()) { 153 | case LogicalTypeId::VARCHAR: { 154 | auto value = xlsx_file.getString(cell.data.integer); 155 | // string_t creates values that fail the UTF-8 check, so we use the slow technique 156 | // flat_vectors[j].string_data[i] = string_t(value); 157 | output.data[column_id].SetValue(row_id, Value(value)); 158 | break; 159 | } 160 | case LogicalTypeId::DOUBLE: { 161 | auto value = cell.data.real; 162 | flat_vectors[column_id].double_data[row_id] = value; 163 | break; 164 | } 165 | case LogicalTypeId::BOOLEAN: { 166 | auto value = cell.data.boolean; 167 | flat_vectors[column_id].bool_data[row_id] = value; 168 | break; 169 | } 170 | case LogicalTypeId::DATE: { 171 | // Convert seconds to days 172 | date_t value = date_t((int)(cell.data.real / 86400.0)); 173 | flat_vectors[column_id].date_data[row_id] = value; 174 | break; 175 | } 176 | default: 177 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 178 | } 179 | } 180 | 181 | //! Coerce cell to string and save it in the data chunk 182 | inline void SetCellString(const SRBindData &bind_data, DataChunk &output, vector &flat_vectors, 183 | const XlsxCell &cell, idx_t row_id, idx_t column_id) { 184 | 185 | auto &xlsx_file = bind_data.xlsx_file; 186 | 187 | // Get validity mask of the column and set it to valid (i.e. not NULL) 188 | Vector &vec = output.data[column_id]; 189 | auto &validity = FlatVector::Validity(vec); 190 | validity.SetValid(row_id); 191 | 192 | // Similar to SetCell() only difference: 193 | // Use coercion method depending on the type of the XlsxCell 194 | switch (cell.type) { 195 | case CellType::T_STRING_REF: { 196 | auto value = xlsx_file.getString(cell.data.integer); 197 | output.data[column_id].SetValue(row_id, Value(value)); 198 | break; 199 | } 200 | case CellType::T_NUMERIC: { 201 | auto value = cell.data.real; 202 | string str = std::to_string(value); 203 | output.data[column_id].SetValue(row_id, Value(str)); 204 | break; 205 | } 206 | case CellType::T_BOOLEAN: { 207 | auto value = cell.data.boolean; 208 | string str = value ? 
"TRUE" : "FALSE"; 209 | output.data[column_id].SetValue(row_id, Value(str)); 210 | break; 211 | } 212 | case CellType::T_DATE: { 213 | date_t value = date_t((int)(cell.data.real / 86400.0)); 214 | string str = Date::ToString(value); 215 | output.data[column_id].SetValue(row_id, Value(str)); 216 | break; 217 | } 218 | default: 219 | throw InternalException("This shouldn't happen. Unsupported Cell type"); 220 | } 221 | } 222 | 223 | //! Check if the types of the XlsxCell and the column are compatible 224 | //! Types are compatible with VARCHAR if coercing to string is enabled 225 | bool TypesCompatible(const LogicalType &expected_type, const CellType &cell_type, bool coerce_to_string) { 226 | switch (expected_type.id()) { 227 | case LogicalTypeId::VARCHAR: 228 | if (coerce_to_string) { 229 | switch (cell_type) { 230 | case CellType::T_STRING_REF: 231 | case CellType::T_NUMERIC: 232 | case CellType::T_BOOLEAN: 233 | case CellType::T_DATE: 234 | return true; 235 | default: 236 | return false; 237 | } 238 | } 239 | return cell_type == CellType::T_STRING_REF; 240 | case LogicalTypeId::DOUBLE: 241 | return cell_type == CellType::T_NUMERIC; 242 | case LogicalTypeId::BOOLEAN: 243 | return cell_type == CellType::T_BOOLEAN; 244 | case LogicalTypeId::DATE: 245 | return cell_type == CellType::T_DATE; 246 | default: 247 | // TODO: Add support for T_STRING and T_STRING_INLINE 248 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 249 | } 250 | } 251 | 252 | //! Check if current_row is within the limit of the current chunk 253 | bool CheckRowLimitReached(SRGlobalState &gstate) { 254 | // Need offset, since current_row is index of the current row in whole table. 255 | // So we subtract the number of rows (determined by chunk) already copied 256 | long long row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 257 | // Limit is the last row of the current chunk (should be 2048 == STANDARD_VECTOR_SIZE) 258 | long long limit = row_offset + STANDARD_VECTOR_SIZE; 259 | long long skip_rows = gstate.bind_data.xlsx_sheet->mSkipRows; 260 | bool limit_reached = gstate.current_row - skip_rows >= limit; 261 | return limit_reached; 262 | } 263 | 264 | //! Get the number of rows copied so far 265 | idx_t GetCardinality(SRGlobalState &gstate) { 266 | // Same reason as in CheckRowLimitReached 267 | long long row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 268 | long long skip_rows = gstate.bind_data.xlsx_sheet->mSkipRows; 269 | // This is the case when no new rows are copied and last chunk was not full (last iteration) 270 | if (gstate.current_row + 1 < skip_rows + row_offset) { 271 | return 0; 272 | } 273 | return gstate.current_row - skip_rows - row_offset + 1; 274 | } 275 | 276 | /*! 277 | Summary of data is stored in mCells: 278 | ==================================== 279 | 280 | General layout: 281 | --------------- 282 | 283 | mCells = [thread[0], thread[1], thread[2], ...] 284 | thread[current_thread] = [buffer[0], buffer[1], buffer[2], ...] 285 | buffer[current_buffer] = [cell[0], cell[1], cell[2], ...] 286 | */ 287 | /*! 
288 | - Copy data from sheetreader-core's mCells to DuckDB data chunk 289 | - Copies STANDARD_VECTOR_SIZE rows at a time 290 | - This function is stateful, so it keeps track of the current location in the mCells 291 | to maintain state between calls 292 | Returns Cardinality (number of rows copied) 293 | */ 294 | size_t StatefulCopy(SRGlobalState &gstate, const SRBindData &bind_data, DataChunk &output, 295 | vector &flat_vectors) { 296 | 297 | auto &sheet = bind_data.xlsx_sheet; 298 | 299 | // Every thread has a list of buffers 300 | D_ASSERT(bind_data.number_threads == sheet->mCells.size()); 301 | 302 | idx_t number_threads = bind_data.number_threads; 303 | 304 | if (number_threads == 0) { 305 | return 0; 306 | } 307 | 308 | size_t row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 309 | 310 | // Helper function to calculate the adjusted row 311 | auto calc_adjusted_row = [row_offset](long long current_row, unsigned long skip_rows) { 312 | return current_row - skip_rows - row_offset; 313 | }; 314 | 315 | // Initialize state for first call 316 | if (gstate.current_locs.empty()) { 317 | // Get number of buffers from first thread (is always the maximum) 318 | gstate.max_buffers = sheet->mCells[0].size(); 319 | gstate.current_thread = 0; 320 | gstate.current_buffer = 0; 321 | gstate.current_cell = 0; 322 | gstate.current_column = 0; 323 | gstate.current_row = -1; 324 | // Initialize current_locs for all threads 325 | gstate.current_locs = std::vector(number_threads, 0); 326 | } 327 | 328 | // Set all values to NULL per default, since sheetreader-core stores information about empty cells only by skipping 329 | // them in mCells. Since we iterate over mCells, empty cells are implicitly skipped. So we wouldn't know if a cell 330 | // in the chunk is empty if we don't set it to NULL here and set it to valid when we find it in mCells (see 331 | // SetValue) 332 | SetAllInvalid(output, STANDARD_VECTOR_SIZE); 333 | 334 | //! To get the correct order of rows we iterate for(buffer_index) { for(thread_index) { for(cell_index) } } 335 | //! This is due to how sheetreader-core writes the data to the buffers (stored in mCells) 336 | for (; gstate.current_buffer < gstate.max_buffers; ++gstate.current_buffer) { 337 | for (; gstate.current_thread < sheet->mCells.size(); ++gstate.current_thread) { 338 | 339 | // If there are no more buffers to read, prepare for finishing copying 340 | if (sheet->mCells[gstate.current_thread].empty()) { 341 | // Set to maxBuffers, so this is the last iteration 342 | gstate.current_buffer = gstate.max_buffers; 343 | 344 | // Return number of copied rows in this chunk 345 | return GetCardinality(gstate); 346 | } 347 | 348 | //! Current cell buffer 349 | const std::vector cells = sheet->mCells[gstate.current_thread].front(); 350 | //! Location info for current thread 351 | const std::vector &locs_infos = sheet->mLocationInfos[gstate.current_thread]; 352 | //! Current location index in current thread 353 | size_t ¤t_loc = gstate.current_locs[gstate.current_thread]; 354 | 355 | // This is a weird implementation detail of sheetreader-core: 356 | // currentCell <= cells.size() because there might be location info after last cell 357 | for (; gstate.current_cell <= cells.size(); ++gstate.current_cell) { 358 | 359 | // Description of the following loop: 360 | // Update currentRow & currentColumn when location info is available for current cell at currentLoc. 361 | // After setting those values: Advance to next location info. 
362 | // 363 | // This means that the values won't be updated if there is no location info for the current cell 364 | // (e.g. not first cell in row) 365 | // 366 | // Edge case 0: 367 | // Loop is executed n+1 times for first location info, where n is the number of skip_rows (specified as 368 | // parameter for interleaved) This is because, SheetReader creates location infos for the skipped lines 369 | // with cell == column == buffer == 0 sames as for the first "real" row 370 | // 371 | // Edge case 1: 372 | // For empty cells, sheetreader-core also generates a location info that points to the same cell as the 373 | // next location info. By using the condition for the while loop, we skip these empty cells 374 | while (current_loc < locs_infos.size() && locs_infos[current_loc].buffer == gstate.current_buffer && 375 | locs_infos[current_loc].cell == gstate.current_cell) { 376 | 377 | gstate.current_column = locs_infos[current_loc].column; 378 | // Not sure whether row is ever -1ul, but this is how it's handled in sheetreader-core's nextRow() 379 | if (locs_infos[current_loc].row == -1ul) { 380 | ++gstate.current_row; 381 | } else { 382 | gstate.current_row = locs_infos[current_loc].row; 383 | } 384 | 385 | long long adjusted_row = calc_adjusted_row(gstate.current_row, sheet->mSkipRows); 386 | 387 | // This only happens for header rows -- we want to skip them 388 | if (adjusted_row < 0) { 389 | ++current_loc; 390 | // Skip to next row 391 | if (current_loc < locs_infos.size()) { 392 | gstate.current_cell = locs_infos[current_loc].cell; 393 | } else { 394 | throw InternalException("Skipped more rows than available in first buffer -- consider " 395 | "decreasing number of threads"); 396 | } 397 | continue; 398 | } 399 | 400 | // Increment index to location info for next iteration 401 | ++current_loc; 402 | 403 | // If we reached the row limit of the current chunk, we return the number of copied rows 404 | if (CheckRowLimitReached(gstate)) { 405 | // Subtract 1, because we increment current_row before checking the limit 406 | return (GetCardinality(gstate) - 1); 407 | } 408 | } 409 | // We need to check this here, because we iterate up to cells.size() to get the last location info 410 | if (gstate.current_cell >= cells.size()) { 411 | break; 412 | } 413 | 414 | // Use short variable name for better readability 415 | const auto current_column = gstate.current_column; 416 | 417 | // If this cell is in a column that was not present in the first row, we throw an error 418 | if (current_column >= bind_data.types.size()) { 419 | throw InvalidInputException( 420 | "Row " + std::to_string(gstate.current_row) + "has more columns than the first row. Has: " + 421 | std::to_string(current_column + 1) + " Expected: " + std::to_string(bind_data.types.size())); 422 | } 423 | 424 | //! Content of current cell 425 | const XlsxCell &cell = cells[gstate.current_cell]; 426 | //! Number of rows we skipped while parsing 427 | long long mSkipRows = sheet->mSkipRows; 428 | long long adjusted_row = calc_adjusted_row(gstate.current_row, mSkipRows); 429 | 430 | bool types_compatible = 431 | TypesCompatible(bind_data.types[current_column], cell.type, bind_data.coerce_to_string); 432 | 433 | // sheetreader-core doesn't determine empty cells to be T_NONE, instead it skips the cell, 434 | // so it's not stored in mCells. 
We handle this by setting all cells as Invalid (aka null) 435 | // and set them valid when they appear in mCells 436 | if (cell.type == CellType::T_NONE || cell.type == CellType::T_ERROR || !types_compatible) { 437 | SetNull(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 438 | } else if (bind_data.types[current_column] == LogicalType::VARCHAR && bind_data.coerce_to_string) { 439 | SetCellString(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 440 | } else { 441 | SetCell(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 442 | } 443 | 444 | // Advance to next column 445 | ++gstate.current_column; 446 | } 447 | 448 | // If we reached the last cell in the current buffer, we remove it from the thread 449 | sheet->mCells[gstate.current_thread].pop_front(); 450 | // Reset for next buffer 451 | gstate.current_cell = 0; 452 | } 453 | // Reset thread index for next buffer index 454 | gstate.current_thread = 0; 455 | } 456 | // Return number of copied rows in this chunk when all buffers are read (i.e. curren_buffer == max_buffers) 457 | return GetCardinality(gstate); 458 | } 459 | 460 | //! Finish the current chunk 461 | //! - Set the cardinality of the chunk 462 | //! - Increment the chunk count 463 | inline void FinishChunk(DataChunk &output, idx_t cardinality, SRGlobalState &gstate) { 464 | 465 | // Indicate how many rows are in the chunk 466 | // If cardinality is 0, it means that the chunk is empty and no more rows are to be expected 467 | output.SetCardinality(cardinality); 468 | 469 | // Increment number of chunks read so far 470 | gstate.chunk_count++; 471 | 472 | return; 473 | } 474 | 475 | //! Copy data from sheetreader-core to DuckDB data chunk 476 | //! - Is called after bind function 477 | //! - Is called multiple times until all data is copied are no more rows are needed (e.g. for LIMIT clause) 478 | inline void SheetreaderCopyTableFun(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { 479 | 480 | //! Data from bind function 481 | const SRBindData &bind_data = data_p.bind_data->Cast(); 482 | //! State persisted in between table (copy) function calls 483 | SRGlobalState &gstate = data_p.global_state->Cast().state; 484 | 485 | //! Number of columns (i.e. number of vectors in the data chunk) 486 | const idx_t column_count = output.ColumnCount(); 487 | 488 | D_ASSERT(column_count == bind_data.types.size()); 489 | 490 | // ===================================== 491 | // Store FlatVectors for all columns (they have different data types) 492 | // ===================================== 493 | 494 | //! 
Holds pointers to the data of the vectors in the data chunk 495 | vector flat_vectors; 496 | 497 | for (idx_t col = 0; col < column_count; col++) { 498 | switch (bind_data.types[col].id()) { 499 | case LogicalTypeId::VARCHAR: { 500 | Vector &vec = output.data[col]; 501 | string_t *data_vec = FlatVector::GetData(vec); 502 | DataPtr data; 503 | data.string_data = data_vec; 504 | // Store pointer to data 505 | flat_vectors.push_back(data); 506 | break; 507 | } 508 | case LogicalTypeId::DOUBLE: { 509 | Vector &vec = output.data[col]; 510 | auto data_vec = FlatVector::GetData(vec); 511 | DataPtr data; 512 | data.double_data = data_vec; 513 | flat_vectors.push_back(data); 514 | break; 515 | } 516 | case LogicalTypeId::BOOLEAN: { 517 | Vector &vec = output.data[col]; 518 | auto data_vec = FlatVector::GetData(vec); 519 | DataPtr data; 520 | data.bool_data = data_vec; 521 | flat_vectors.push_back(data); 522 | break; 523 | } 524 | case LogicalTypeId::DATE: { 525 | Vector &vec = output.data[col]; 526 | auto data_vec = FlatVector::GetData(vec); 527 | DataPtr data; 528 | data.date_data = data_vec; 529 | flat_vectors.push_back(data); 530 | break; 531 | } 532 | default: 533 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 534 | } 535 | } 536 | 537 | // ===================================== 538 | // Copy data from sheetreader-core's mCells to DuckDB data chunk 539 | // ===================================== 540 | 541 | // This version: 542 | // - Uses SetValue only for VARCHAR, for other types it uses directly the flat vectors 543 | // - Doesn't use nextRow() but directly iterates over the buffers 544 | // - Has more features (coercion to string, handling empty cells, etc.) 545 | 546 | //! Number of rows copied in this iteration 547 | auto cardinality = StatefulCopy(gstate, bind_data, output, flat_vectors); 548 | 549 | FinishChunk(output, cardinality, gstate); 550 | 551 | return; 552 | } 553 | 554 | // ===================================== 555 | // Following are definitions for the bind function 556 | // ===================================== 557 | 558 | //! Converts the cell types from sheetreader-core to DuckDB types (column_types) 559 | //! and it also sets the column names (uses generic names) 560 | inline bool ConvertCellTypes(vector &column_types, vector &column_names, 561 | vector &cell_types) { 562 | idx_t current_column_index = 0; 563 | //! 
Indicates if the first row contains only string values 564 | bool first_row_all_string = true; 565 | 566 | for (auto &col_type : cell_types) { 567 | switch (col_type) { 568 | case CellType::T_STRING_REF: 569 | column_types.push_back(LogicalType::VARCHAR); 570 | column_names.push_back("String" + std::to_string(current_column_index)); 571 | break; 572 | case CellType::T_STRING: 573 | case CellType::T_STRING_INLINE: 574 | // TODO 575 | throw BinderException("Inline & dynamic String types not supported yet"); 576 | break; 577 | case CellType::T_NUMERIC: 578 | column_types.push_back(LogicalType::DOUBLE); 579 | column_names.push_back("Numeric" + std::to_string(current_column_index)); 580 | first_row_all_string = false; 581 | break; 582 | case CellType::T_BOOLEAN: 583 | column_types.push_back(LogicalType::BOOLEAN); 584 | column_names.push_back("Boolean" + std::to_string(current_column_index)); 585 | first_row_all_string = false; 586 | break; 587 | case CellType::T_DATE: 588 | column_types.push_back(LogicalType::DATE); 589 | column_names.push_back("Date" + std::to_string(current_column_index)); 590 | first_row_all_string = false; 591 | break; 592 | default: 593 | throw BinderException("Unknown cell type in column in column " + std::to_string(current_column_index)); 594 | } 595 | current_column_index++; 596 | } 597 | 598 | return first_row_all_string; 599 | } 600 | 601 | //! Get the names of the columns from the first row 602 | //! Assumes that the first row contains only string values 603 | inline vector GetHeaderNames(vector &row, SRBindData &bind_data) { 604 | 605 | vector column_names; 606 | 607 | for (idx_t j = 0; j < row.size(); j++) { 608 | switch (row[j].type) { 609 | case CellType::T_STRING_REF: { 610 | auto value = bind_data.xlsx_file.getString(row[j].data.integer); 611 | column_names.push_back(value); 612 | break; 613 | } 614 | case CellType::T_STRING: 615 | case CellType::T_STRING_INLINE: { 616 | // TODO 617 | throw BinderException("Inline & dynamic String types not supported yet"); 618 | break; 619 | } 620 | default: 621 | throw BinderException("Header row contains non-string values"); 622 | } 623 | } 624 | 625 | return column_names; 626 | } 627 | 628 | //! Bind function for the sheetreader extension 629 | //! - Gets (named) parameters (filename etc.) of table function and stores them 630 | //! - Parses the .Xlsx-file 631 | //! - Reads the first & second row to determine the types of the columns 632 | //! - Reads the first & second row to determine the names of the columns (auto detects if the first row is a header) 633 | //! - Writes the determined types in `return_types` and the names in `names` 634 | //! - Creates the bind data object (is subtype of FunctionData) which contains all necessary information for the copy 635 | //! 
and most importantly stores the XlsxFile & XlsxSheet objects 636 | inline unique_ptr SheetreaderBindFun(ClientContext &context, TableFunctionBindInput &input, 637 | vector &return_types, vector &names) { 638 | 639 | // ===================================== 640 | // Get input parameters & prepare for parsing 641 | // ===================================== 642 | 643 | // Get the file name from the input parameters & verify it exists 644 | auto file_reader = MultiFileReader::Create(input.table_function); 645 | auto file_list = file_reader->CreateFileList(context, input.inputs[0]); 646 | auto file_infos = file_list->GetAllFiles(); 647 | 648 | if (file_infos.empty()) { 649 | throw BinderException("No files found in path"); 650 | } else if (file_infos.size() > 1) { 651 | throw BinderException("Only one file can be read at a time"); 652 | } 653 | 654 | // Extract the file path from OpenFileInfo 655 | string file_name = file_infos[0].path; 656 | 657 | //! User specified sheet name 658 | string sheet_name; 659 | //! User specified sheet index -- starts with 1 660 | int sheet_index; 661 | //! Is set when the user specifies the sheet index with e.g. `sheet_index=2` 662 | bool sheet_index_set = false; 663 | 664 | //! User specified option to use header 665 | bool use_header = false; 666 | 667 | // Get named parameters that are needed for creating XlsxFile & XlsxSheet objects and therefore for creating 668 | // bind_data 669 | for (auto &kv : input.named_parameters) { 670 | auto loption = StringUtil::Lower(kv.first); 671 | if (loption == "sheet_name") { 672 | sheet_name = StringValue::Get(kv.second); 673 | } else if (loption == "sheet_index") { 674 | sheet_index = IntegerValue::Get(kv.second); 675 | sheet_index_set = true; 676 | } else if (loption == "has_header") { 677 | use_header = BooleanValue::Get(kv.second); 678 | } else { 679 | continue; 680 | } 681 | } 682 | 683 | if (!sheet_name.empty() && sheet_index_set) { 684 | throw BinderException("Sheet index & sheet name cannot be set at the same time."); 685 | } 686 | 687 | //! Contains all important data collected in this bind function & is returned to be used by table (copy) function 688 | unique_ptr bind_data; 689 | 690 | try { 691 | if (!sheet_name.empty()) { 692 | bind_data = make_uniq(file_name, sheet_name); 693 | } else if (sheet_index_set) { 694 | bind_data = make_uniq(file_name, sheet_index); 695 | } else { 696 | // Default: sheet_index=1 697 | bind_data = make_uniq(file_name); 698 | } 699 | } catch (std::exception &e) { 700 | throw BinderException(e.what()); 701 | } 702 | 703 | //! Is set when the user specifies the types of the columns with e.g. 
`types=[VARCHAR,DOUBLE]` 704 | bool has_user_types = false; 705 | 706 | // Get all left named parameters 707 | // You can find the documentation of the named parameters in the README.md and header file 708 | for (auto &kv : input.named_parameters) { 709 | auto loption = StringUtil::Lower(kv.first); 710 | if (loption == "threads") { 711 | bind_data->number_threads = IntegerValue::Get(kv.second); 712 | if (bind_data->number_threads <= 0) { 713 | throw BinderException("Number of threads must be greater than 0"); 714 | } 715 | } else if (loption == "skip_rows") { 716 | // Default: 0 717 | bind_data->skip_rows = IntegerValue::Get(kv.second); 718 | } else if (loption == "coerce_to_string") { 719 | bind_data->coerce_to_string = BooleanValue::Get(kv.second); 720 | } else if (loption == "force_types") { 721 | bind_data->force_types = BooleanValue::Get(kv.second); 722 | } else if (loption == "types") { 723 | // Get all types as strings defined in list/array 724 | auto &children = ListValue::GetChildren(kv.second); 725 | // Convert strings to LogicalTypes & check if they are supported 726 | for (auto &child : children) { 727 | string raw_type = StringValue::Get(child); 728 | LogicalType logical_type = TransformStringToLogicalType(raw_type); 729 | if (logical_type.id() == LogicalTypeId::USER) { 730 | throw BinderException("Unrecognized type \"%s\" for %s definition", raw_type, kv.first); 731 | } 732 | switch (logical_type.id()) { 733 | case LogicalTypeId::VARCHAR: 734 | case LogicalTypeId::DOUBLE: 735 | case LogicalTypeId::BOOLEAN: 736 | case LogicalTypeId::DATE: { 737 | break; 738 | } 739 | default: { 740 | throw BinderException("Unsupported type \"%s\" for %s definition", raw_type, kv.first); 741 | } 742 | } 743 | bind_data->user_types.push_back(logical_type); 744 | } 745 | // Indicate that user provided types 746 | has_user_types = true; 747 | 748 | // We already handled them before 749 | } else if (loption == "sheet_name" || loption == "sheet_index" || loption == "has_header") { 750 | continue; 751 | } else { 752 | throw BinderException("Unknown named parameter"); 753 | } 754 | } 755 | 756 | // Doesn't change the parsing (only when combined with specifyTypes) -- we simply store it, to read it later while 757 | // copying 758 | bind_data->xlsx_sheet->mHeaders = use_header; 759 | 760 | // If number threads > 1, we set parallel true 761 | if (bind_data->number_threads > 1) { 762 | bind_data->xlsx_file.mParallelStrings = true; 763 | } else { 764 | bind_data->xlsx_file.mParallelStrings = false; 765 | } 766 | 767 | // ===================================== 768 | // Parsing & check parsing result 769 | // ===================================== 770 | 771 | // Parse the shared strings file 772 | bind_data->xlsx_file.parseSharedStrings(); 773 | 774 | //! Used for better readability 775 | auto &sheet = bind_data->xlsx_sheet; 776 | 777 | // Parse the sheet 778 | bool success = sheet->interleaved(bind_data->skip_rows, 0, bind_data->number_threads); 779 | 780 | if (!success) { 781 | throw BinderException("Failed to read sheet"); 782 | } 783 | 784 | bind_data->xlsx_file.finalize(); 785 | 786 | //! Number of columns in the sheet 787 | auto number_columns = sheet->mDimension.first; 788 | //! 
Number of rows in the sheet 789 | auto number_rows = sheet->mDimension.second; 790 | 791 | if (number_columns == 0 || number_rows == 0) { 792 | throw BinderException("Sheet appears to be empty"); 793 | } 794 | 795 | // ===================================== 796 | // Determine column types & names 797 | // ===================================== 798 | 799 | //! Cell types in the first row after skipped rows 800 | vector cell_types_first_row; 801 | //! Cell types in the second row after skipped rows 802 | vector cell_types_second_row; 803 | //! Cell values in the first row after skipped rows 804 | vector cells_first_row; 805 | 806 | // First buffer of first thread 807 | auto first_buffer = &sheet->mCells[0].front(); 808 | 809 | // Probing the first two rows to get the types 810 | if (first_buffer->size() < number_columns * 2) { 811 | throw BinderException("Internal SheetReader extension error: Need minimum of two rows in first buffer to " 812 | "determine column types and auto detect header row"); 813 | } 814 | 815 | for (idx_t i = 0; i < number_columns; i++) { 816 | cell_types_first_row.push_back(sheet->mCells[0].front()[i].type); 817 | cells_first_row.push_back(sheet->mCells[0].front()[i]); 818 | } 819 | 820 | for (idx_t i = number_columns; i < number_columns * 2; i++) { 821 | cell_types_second_row.push_back(sheet->mCells[0].front()[i].type); 822 | } 823 | 824 | // Convert CellType to LogicalType 825 | 826 | //! DuckDB types of the cells in the first row 827 | vector column_types_first_row; 828 | //! Column names of the cells in the first row 829 | vector column_names_first_row; 830 | 831 | // Check if first row contains only string values, get DuckDB types & generic column names 832 | bool first_row_all_string = ConvertCellTypes(column_types_first_row, column_names_first_row, cell_types_first_row); 833 | 834 | if (use_header && !first_row_all_string) { 835 | throw BinderException("First row must contain only strings when has_header is set to true"); 836 | } 837 | 838 | vector column_types_second_row; 839 | vector column_names_second_row; 840 | //! Indicates whether a header row was detected 841 | bool header_detected = false; 842 | 843 | if (number_rows > 1) { 844 | // Check if second row contains only string values, get DuckDB types & generic column names 845 | bool second_row_all_string = 846 | ConvertCellTypes(column_types_second_row, column_names_second_row, cell_types_second_row); 847 | 848 | // If the first row contains only string values, but the second row doesn't, we assume that the first row is a 849 | // header row 850 | if (use_header || (first_row_all_string && !second_row_all_string)) { 851 | header_detected = true; 852 | 853 | // Since the first row is a header row, we use the cell types of the second row 854 | return_types = column_types_second_row; 855 | bind_data->types = column_types_second_row; 856 | 857 | //! 
Column names determined from the first row 858 | vector header_names; 859 | 860 | // Get header names from cell values of first row 861 | for (idx_t j = 0; j < cells_first_row.size(); j++) { 862 | switch (cells_first_row[j].type) { 863 | case CellType::T_STRING_REF: { 864 | auto value = bind_data->xlsx_file.getString(cells_first_row[j].data.integer); 865 | header_names.push_back(value); 866 | break; 867 | } 868 | case CellType::T_STRING: 869 | case CellType::T_STRING_INLINE: { 870 | // TODO 871 | throw BinderException("Inline & dynamic String types not supported yet"); 872 | break; 873 | } 874 | default: 875 | throw BinderException("Header row contains non-string values"); 876 | } 877 | } 878 | 879 | // Set column names to header names 880 | names = header_names; 881 | bind_data->names = header_names; 882 | } else { 883 | // If first row is not a header row, we use the cell types of the first row for the column types 884 | return_types = column_types_first_row; 885 | bind_data->types = column_types_first_row; 886 | 887 | // Use generic column names 888 | names = column_names_first_row; 889 | bind_data->names = column_names_first_row; 890 | } 891 | } 892 | 893 | // Since header is only used for determining column names, we skip it 894 | if (header_detected) { 895 | bind_data->skip_rows++; 896 | bind_data->xlsx_sheet->mSkipRows++; 897 | } 898 | 899 | // If user has specified types, we try to use them 900 | if (has_user_types) { 901 | if (bind_data->user_types.size() < number_columns) { 902 | throw BinderException("Number of user defined types is less than number of columns in sheet"); 903 | } 904 | 905 | idx_t column_index = 0; 906 | for (auto &column_type : return_types) { 907 | 908 | LogicalType user_type = bind_data->user_types[column_index]; 909 | 910 | // Check if user defined type is same as previously determined column type or can be coerced to string 911 | // If forced_types == true, the compatibility check is skipped 912 | if (!bind_data->force_types && user_type.id() != column_type.id() && 913 | !(user_type == LogicalTypeId::VARCHAR && bind_data->coerce_to_string)) { 914 | // TODO: EnumUtil does not work -- find appropriate replacement 915 | // throw BinderException("User defined type %s for column with index %d is not compatible with actual 916 | // type %s", 917 | // EnumUtil::ToString(user_type), column_index, 918 | // EnumUtil::ToString(column_type)); 919 | throw BinderException("User defined type for column with index %d is not compatible with actual type", 920 | column_index); 921 | } 922 | column_index++; 923 | } 924 | 925 | // Add column names, if they are new user defined columns 926 | vector additional_column_names; 927 | 928 | while (column_index < bind_data->user_types.size()) { 929 | additional_column_names.push_back("Column " + std::to_string(column_index)); 930 | column_index++; 931 | } 932 | 933 | return_types = bind_data->user_types; 934 | bind_data->types = bind_data->user_types; 935 | 936 | // Concat additional column names 937 | bind_data->names.insert(bind_data->names.end(), additional_column_names.begin(), additional_column_names.end()); 938 | names = bind_data->names; 939 | 940 | D_ASSERT(return_types.size() == names.size()); 941 | } 942 | 943 | // First row is discarded (is only needed for versions that use nextRow()) 944 | for (idx_t i = 0; i < bind_data->skip_rows; i++) { 945 | sheet->nextRow(); 946 | } 947 | 948 | return std::move(bind_data); 949 | } 950 | 951 | static void LoadInternal(ExtensionLoader &loader) { 952 | // Register a table function 953 | 
TableFunction sheetreader_table_function("sheetreader", {LogicalType::VARCHAR}, SheetreaderCopyTableFun, 954 | SheetreaderBindFun, SRGlobalTableFunctionState::Init, 955 | SRLocalTableFunctionState::Init); 956 | 957 | // Define all named parameters 958 | sheetreader_table_function.named_parameters["sheet_name"] = LogicalType::VARCHAR; 959 | sheetreader_table_function.named_parameters["sheet_index"] = LogicalType::INTEGER; 960 | sheetreader_table_function.named_parameters["threads"] = LogicalType::INTEGER; 961 | sheetreader_table_function.named_parameters["skip_rows"] = LogicalType::INTEGER; 962 | sheetreader_table_function.named_parameters["has_header"] = LogicalType::BOOLEAN; 963 | // TODO: Support STRUCT, i.e. { 'column_name': 'type', ... } 964 | // We would use ANY here, similar to read_csv.cpp, but we expect a STRUCT or LIST 965 | // sheetreader_table_function.named_parameters["types"] = LogicalType::ANY; 966 | sheetreader_table_function.named_parameters["types"] = LogicalType::LIST(LogicalType::VARCHAR); 967 | sheetreader_table_function.named_parameters["force_types"] = LogicalType::BOOLEAN; 968 | sheetreader_table_function.named_parameters["coerce_to_string"] = LogicalType::BOOLEAN; 969 | 970 | loader.RegisterFunction(sheetreader_table_function); 971 | } 972 | 973 | void SheetreaderExtension::Load(ExtensionLoader &loader) { 974 | LoadInternal(loader); 975 | } 976 | std::string SheetreaderExtension::Name() { 977 | return "sheetreader"; 978 | } 979 | 980 | std::string SheetreaderExtension::Version() const { 981 | #ifdef EXT_VERSION_SHEETREADER 982 | return EXT_VERSION_SHEETREADER; 983 | #else 984 | return ""; 985 | #endif 986 | } 987 | 988 | } // namespace duckdb 989 | 990 | extern "C" { 991 | 992 | DUCKDB_EXTENSION_API void sheetreader_init(duckdb::DatabaseInstance &db) { 993 | duckdb::DuckDB db_wrapper(db); 994 | db_wrapper.LoadStaticExtension(); 995 | } 996 | 997 | DUCKDB_EXTENSION_API const char *sheetreader_version() { 998 | return duckdb::DuckDB::LibraryVersion(); 999 | } 1000 | 1001 | DUCKDB_EXTENSION_API void sheetreader_duckdb_cpp_init(duckdb::ExtensionLoader &loader) { 1002 | duckdb::SheetreaderExtension extension; 1003 | extension.Load(loader); 1004 | } 1005 | } 1006 | 1007 | #ifndef DUCKDB_EXTENSION_MAIN 1008 | #error DUCKDB_EXTENSION_MAIN not defined 1009 | #endif 1010 | --------------------------------------------------------------------------------