├── .editorconfig ├── vcpkg.json ├── docker-demo ├── test_verification.sql ├── docker-compose.yml ├── Dockerfile ├── demo.sql └── README.md ├── .clangd ├── assets ├── benchmark_boxplot.png └── benchmark_customer_table.png ├── benchmarks ├── benchmark_sf_6_with_spatial.png ├── benchmark_sf_10_with_spatial.png ├── joined_barplot_benchmark_sf_1.png ├── barplot_benchmark_sf_1_with_spatial.png ├── barplot_benchmark_sf_6_with_spatial.png ├── boxplot_benchmark_sf_1_with_spatial.png ├── boxplot_benchmark_sf_6_with_spatial.png ├── joined_barplot_benchmark_1000000_rows.png ├── joined_barplot_benchmark_sf_6_with_spatial.png ├── tpch_customer_sf_1_results.csv ├── tpch_customer_sf_6_results.csv ├── tpch_orders_sf_1_results.csv └── tpch_customer_sf_10_results.csv ├── .gitignore ├── Makefile ├── extension_config.cmake ├── .gitmodules ├── .github ├── dependabot.yml └── workflows │ ├── _check_secrets.yml │ ├── MainDistributionPipeline.yml │ └── _extension_deploy.yml ├── test └── README.md ├── .vscode ├── tasks.json └── launch.json ├── docs └── UPDATING.md ├── .devcontainer ├── Dockerfile ├── devcontainer.json └── reinstall-cmake.sh ├── .clang-format ├── LICENSE ├── CMakeLists.txt ├── .clang-tidy ├── scripts └── extension-upload.sh ├── src ├── include │ └── sheetreader_extension.hpp └── sheetreader_extension.cpp └── README.md /.editorconfig: -------------------------------------------------------------------------------- 1 | duckdb/.editorconfig -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | ] 4 | } -------------------------------------------------------------------------------- /docker-demo/test_verification.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM sheetreader('docker-demo/test.xlsx'); 2 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | CompileFlags: 2 | CompilationDatabase: build/debug 3 | Add: -Wno-unqualified-std-cast-call 4 | -------------------------------------------------------------------------------- /assets/benchmark_boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/assets/benchmark_boxplot.png -------------------------------------------------------------------------------- /assets/benchmark_customer_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/assets/benchmark_customer_table.png -------------------------------------------------------------------------------- /benchmarks/benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/benchmark_sf_10_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/benchmark_sf_10_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_sf_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_sf_1.png -------------------------------------------------------------------------------- /benchmarks/barplot_benchmark_sf_1_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/barplot_benchmark_sf_1_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/barplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/barplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/boxplot_benchmark_sf_1_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/boxplot_benchmark_sf_1_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/boxplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/boxplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_1000000_rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_1000000_rows.png -------------------------------------------------------------------------------- /benchmarks/joined_barplot_benchmark_sf_6_with_spatial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polydbms/sheetreader-duckdb/HEAD/benchmarks/joined_barplot_benchmark_sf_6_with_spatial.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | *.xlsx 10 | *.csv 11 | *.txt 12 | *.db 13 | *.db.wal 14 | benchmark/ 15 | .trunk/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=sheetreader 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. 
It specifies which extension to load 2 | 3 | # Extension from this repo 4 | duckdb_extension_load(sheetreader 5 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 6 | LOAD_TESTS 7 | ) 8 | 9 | # Any extra extensions that should be built 10 | # e.g.: duckdb_extension_load(json) -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb 4 | branch = main 5 | [submodule "extension-ci-tools"] 6 | path = extension-ci-tools 7 | url = https://github.com/duckdb/extension-ci-tools 8 | branch = main 9 | [submodule "src/include/sheetreader-core"] 10 | path = src/include/sheetreader-core 11 | url = https://github.com/polydbms/sheetreader-core.git 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for more information: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | # https://containers.dev/guide/dependabot 6 | 7 | version: 2 8 | updates: 9 | - package-ecosystem: "devcontainers" 10 | directory: "/" 11 | schedule: 12 | interval: weekly 13 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing this extension 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the sheetreader extension, this should be the goal too. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | or 9 | ```bash 10 | make test_debug 11 | ``` -------------------------------------------------------------------------------- /docker-demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | sheetreader-dev: 5 | build: 6 | context: .
7 | dockerfile: Dockerfile 8 | volumes: 9 | - ../:/workspace/sheetreader-duckdb 10 | - ccache_vol:/root/.ccache 11 | environment: 12 | - PATH=/usr/lib/ccache:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 13 | working_dir: /workspace/sheetreader-duckdb 14 | command: /bin/bash 15 | deploy: 16 | resources: 17 | limits: 18 | memory: 12G # Limit to 12GB, leaving 4GB for host system 19 | 20 | volumes: 21 | ccache_vol: 22 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Build release", 8 | "type": "shell", 9 | "command": "GEN=ninja make", 10 | "group": { 11 | "kind": "build", 12 | "isDefault": true 13 | } 14 | }, 15 | { 16 | "label": "Build debug", 17 | "type": "shell", 18 | "command": "GEN=ninja make debug", 19 | "group": { 20 | "kind": "build", 21 | "isDefault": false 22 | } 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_1_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,8.727685526013374 3 | 1,SheetReader 1 Thread,2.9178942814469337 4 | 2,SheetReader 4 Threads,1.6063461303710938 5 | 3,Spatial,8.46089394390583 6 | 4,SheetReader 1 Thread,3.051068462431431 7 | 5,SheetReader 4 Threads,1.420861043035984 8 | 6,Spatial,8.677419312298298 9 | 7,SheetReader 1 Thread,2.913003034889698 10 | 8,SheetReader 4 Threads,1.572079375386238 11 | 9,Spatial,8.883316360414028 12 | 10,SheetReader 1 Thread,2.935627445578575 13 | 11,SheetReader 4 Threads,1.431564912199974 14 | 12,Spatial,8.629714667797089 15 | 13,SheetReader 1 Thread,3.045584127306938 16 | 14,SheetReader 4 Threads,1.4394349306821823 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_6_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,49.53681559860706 3 | 1,SheetReader 1 Thread,17.911359935998917 4 | 2,SheetReader 4 Threads,9.131846889853477 5 | 3,Spatial,51.09752745181322 6 | 4,SheetReader 1 Thread,17.520940147340298 7 | 5,SheetReader 4 Threads,8.596089884638786 8 | 6,Spatial,50.83717566728592 9 | 7,SheetReader 1 Thread,17.522817224264145 10 | 8,SheetReader 4 Threads,8.461024843156338 11 | 9,Spatial,49.879797391593456 12 | 10,SheetReader 1 Thread,17.9565489590168 13 | 11,SheetReader 4 Threads,8.618413478136063 14 | 12,Spatial,49.76275556534529 15 | 13,SheetReader 1 Thread,17.571647956967354 16 | 14,SheetReader 4 Threads,8.666186735033989 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_orders_sf_1_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,150.51047530025244 3 | 1,SheetReader 1 Thread,42.91278725862503 4 | 2,SheetReader 4 Threads,17.066086061298847 5 | 3,Spatial,148.84033582359552 6 | 4,SheetReader 1 Thread,43.02319976687431 7 | 5,SheetReader 4 Threads,17.86900133639574 8 | 6,Spatial,151.8528371155262 9 | 7,SheetReader 1 Thread,44.46525827050209 10 | 8,SheetReader 4 Threads,17.78221821784973 11 | 9,Spatial,147.54299394786358 12 | 10,SheetReader 1 Thread,43.010885283350945 13 | 
11,SheetReader 4 Threads,17.91719976812601 14 | 12,Spatial,159.33181025087833 15 | 13,SheetReader 1 Thread,43.35006716102362 16 | 14,SheetReader 4 Threads,16.704988904297352 17 | -------------------------------------------------------------------------------- /benchmarks/tpch_customer_sf_10_results.csv: -------------------------------------------------------------------------------- 1 | ,function,time 2 | 0,Spatial,85.97506861388683 3 | 1,SheetReader 1 Thread,30.615816429257393 4 | 2,SheetReader 4 Threads,14.14170940220356 5 | 3,Spatial,86.2689319550991 6 | 4,SheetReader 1 Thread,28.927293568849564 7 | 5,SheetReader 4 Threads,14.446914449334145 8 | 6,Spatial,84.92323327809572 9 | 7,SheetReader 1 Thread,29.267103753983974 10 | 8,SheetReader 4 Threads,14.090325027704239 11 | 9,Spatial,85.49243979901075 12 | 10,SheetReader 1 Thread,29.322267293930054 13 | 11,SheetReader 4 Threads,14.052365981042385 14 | 12,Spatial,86.0353828445077 15 | 13,SheetReader 1 Thread,29.225902386009693 16 | 14,SheetReader 4 Threads,14.108775869011879 17 | -------------------------------------------------------------------------------- /docs/UPDATING.md: -------------------------------------------------------------------------------- 1 | # Extension updating 2 | When cloning this template, the target version of DuckDB should be the latest stable release. However, there 3 | will inevitably come a time when a new DuckDB version is released and the extension repository needs updating. This process goes 4 | as follows: 5 | 6 | - Bump submodules 7 | - `./duckdb` should be set to the latest tagged release 8 | - `./extension-ci-tools` should be set to the updated branch corresponding to the latest DuckDB release 9 | - Bump versions in `./.github/workflows` 10 | - `duckdb_version` input in `MainDistributionPipeline.yml` should be set to the latest tagged release 11 | - reusable workflow `_extension_distribution.yml` should be set to the updated branch corresponding to the latest DuckDB release 12 | 13 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/devcontainers/cpp:1-debian-11 2 | 3 | ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="none" 4 | 5 | # Optionally install the cmake for vcpkg 6 | COPY ./reinstall-cmake.sh /tmp/ 7 | 8 | RUN if [ "${REINSTALL_CMAKE_VERSION_FROM_SOURCE}" != "none" ]; then \ 9 | chmod +x /tmp/reinstall-cmake.sh && /tmp/reinstall-cmake.sh ${REINSTALL_CMAKE_VERSION_FROM_SOURCE}; \ 10 | fi \ 11 | && rm -f /tmp/reinstall-cmake.sh 12 | 13 | # [Optional] Uncomment this section to install additional vcpkg ports. 14 | # RUN su vscode -c "${VCPKG_ROOT}/vcpkg install <your-port-name-here>" 15 | 16 | # Install additional packages needed for development.
17 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 18 | && apt-get -y install --no-install-recommends clangd -------------------------------------------------------------------------------- /docker-demo/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu as base image 2 | FROM ubuntu:22.04 3 | 4 | # Install dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | wget \ 7 | unzip \ 8 | git \ 9 | cmake \ 10 | build-essential \ 11 | ninja-build \ 12 | libssl-dev \ 13 | python3-dev \ 14 | ccache \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | # Download and install DuckDB 18 | RUN wget https://github.com/duckdb/duckdb/releases/download/v1.4.2/duckdb_cli-linux-amd64.zip \ 19 | && unzip duckdb_cli-linux-amd64.zip \ 20 | && mv duckdb /usr/local/bin/ \ 21 | && chmod +x /usr/local/bin/duckdb \ 22 | && rm duckdb_cli-linux-amd64.zip 23 | 24 | # Create working directory 25 | WORKDIR /workspace 26 | 27 | # Copy the Excel file and demo script 28 | COPY test.xlsx /workspace/ 29 | COPY demo.sql /workspace/ 30 | 31 | # Set the entrypoint to bash for interactive use 32 | CMD ["/bin/bash"] 33 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | TabWidth: 4 4 | IndentWidth: 4 5 | ColumnLimit: 120 6 | AllowShortFunctionsOnASingleLine: false 7 | --- 8 | UseTab: ForIndentation 9 | DerivePointerAlignment: false 10 | PointerAlignment: Right 11 | AlignConsecutiveMacros: true 12 | AlignTrailingComments: true 13 | AllowAllArgumentsOnNextLine: true 14 | AllowAllConstructorInitializersOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: true 16 | AlignAfterOpenBracket: Align 17 | SpaceBeforeCpp11BracedList: true 18 | SpaceBeforeCtorInitializerColon: true 19 | SpaceBeforeInheritanceColon: true 20 | SpacesInAngles: false 21 | SpacesInCStyleCastParentheses: false 22 | SpacesInConditionalStatement: false 23 | AllowShortLambdasOnASingleLine: Inline 24 | AllowShortLoopsOnASingleLine: false 25 | AlwaysBreakTemplateDeclarations: Yes 26 | IncludeBlocks: Regroup 27 | Language: Cpp 28 | AccessModifierOffset: -4 29 | --- 30 | Language: Java 31 | SpaceAfterCStyleCast: true 32 | --- 33 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "(gdb) Launch DuckDB repl", 10 | "type": "cppdbg", 11 | "request": "launch", 12 | "program": "${workspaceFolder}/build/debug/duckdb", 13 | "args": [], 14 | "stopAtEntry": false, 15 | "cwd": "${workspaceFolder}", 16 | "environment": [], 17 | "externalConsole": false, 18 | "MIMode": "gdb", 19 | "setupCommands": [ 20 | { 21 | "description": "Enable pretty-printing for gdb", 22 | "text": "-enable-pretty-printing", 23 | "ignoreFailures": true 24 | }, 25 | { 26 | "description": "Set Disassembly Flavor to Intel", 27 | "text": "-gdb-set disassembly-flavor intel", 28 | "ignoreFailures": true 29 | } 30 | ] 31 | } 32 | ], 33 | "inputs": [] 34 | } -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/cpp 3 | { 4 | "name": "C++", 5 | "build": { 6 | "dockerfile": "Dockerfile" 7 | }, 8 | 9 | // Features to add to the dev container. More info: https://containers.dev/features. 10 | "features": { 11 | "ghcr.io/devcontainers/features/python:1": { 12 | "installJupyterlab": "true" 13 | } 14 | }, 15 | 16 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 17 | // "forwardPorts": [], 18 | 19 | // Use 'postCreateCommand' to run commands after the container is created. 20 | // "postCreateCommand": "gcc -v", 21 | 22 | // Configure tool-specific properties. 23 | // "customizations": {}, 24 | "customizations": { 25 | "vscode": { 26 | "extensions": [ 27 | "llvm-vs-code-extensions.vscode-clangd", 28 | "ms-toolsai.jupyter" 29 | ] 30 | } 31 | } 32 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 33 | // "remoteUser": "root" 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 polydbms 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | 3 | # Set extension name here 4 | set(TARGET_NAME sheetreader) 5 | 6 | # DuckDB's extension distribution supports vcpkg. As such, dependencies can be added in ./vcpkg.json and then 7 | # used in cmake with find_package. Feel free to remove or replace with other dependencies. 8 | # Note that it should also be removed from vcpkg.json to prevent needlessly installing it.. 9 | # find_package(Example-Package REQUIRED) 10 | 11 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 12 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) 13 | 14 | project(${TARGET_NAME}) 15 | include_directories(src/include) 16 | include_directories(src/include/sheetreader-core/src/) 17 | include_directories(src/include/sheetreader-core/src/fast_double_parser) 18 | include_directories(src/include/sheetreader-core/src/miniz) 19 | 20 | set(EXTENSION_SOURCES src/sheetreader_extension.cpp src/include/sheetreader-core/src/XlsxFile.cpp src/include/sheetreader-core/src/XlsxSheet.cpp src/include/sheetreader-core/src/miniz/miniz.cpp) 21 | 22 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) 23 | # TODO: We might need this at some point -- this is probably faster to build 24 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) 25 | 26 | install( 27 | TARGETS ${EXTENSION_NAME} 28 | EXPORT "${DUCKDB_EXPORT_SET}" 29 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 30 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 31 | -------------------------------------------------------------------------------- /docker-demo/demo.sql: -------------------------------------------------------------------------------- 1 | -- SheetReader DuckDB Extension Demo 2 | -- This script demonstrates how to use the sheetreader extension to query Excel files 3 | 4 | -- Step 1: Install the sheetreader extension from community extensions 5 | INSTALL sheetreader FROM community; 6 | 7 | -- Step 2: Load the extension 8 | LOAD sheetreader; 9 | 10 | -- Step 3: Query the Excel file directly 11 | .print '=== Reading test.xlsx with sheetreader ===' 12 | SELECT * FROM sheetreader('test.xlsx'); 13 | 14 | -- Step 4: Get row count 15 | .print '' 16 | .print '=== Row count ===' 17 | SELECT COUNT(*) as total_rows FROM sheetreader('test.xlsx'); 18 | 19 | -- Step 5: Calculate statistics on the data 20 | .print '' 21 | .print '=== Statistics ===' 22 | SELECT 23 | MIN(Numeric0) as min_value, 24 | MAX(Numeric0) as max_value, 25 | AVG(Numeric0) as avg_value, 26 | SUM(Numeric0) as sum_value 27 | FROM sheetreader('test.xlsx'); 28 | 29 | -- Step 6: Create a table from the Excel data 30 | .print '' 31 | .print '=== Creating table from Excel data ===' 32 | CREATE TABLE excel_data AS 33 | FROM sheetreader('test.xlsx'); 34 | 35 | -- Step 7: Query the created table 36 | .print '' 37 | .print '=== Querying the created table ===' 38 | SELECT * FROM excel_data; 39 | 40 | -- Step 8: Filter data (example: values greater than 50) 41 | .print '' 42 | .print '=== Filtering values > 50 ===' 43 | SELECT * FROM excel_data WHERE Numeric0 > 50; 44 | 45 | .print '' 46 | .print '=== Demo completed successfully! 
===' 47 | -------------------------------------------------------------------------------- /.github/workflows/_check_secrets.yml: -------------------------------------------------------------------------------- 1 | name: Check Secrets 2 | on: 3 | workflow_call: 4 | inputs: 5 | stub: 6 | required: false 7 | type: string 8 | default: "stub" 9 | secrets: 10 | S3_BUCKET: 11 | required: true 12 | S3_DEPLOY_ID: 13 | required: true 14 | S3_DEPLOY_KEY: 15 | required: true 16 | S3_REGION: 17 | required: true 18 | 19 | jobs: 20 | test-secrets-accessible: 21 | name: Test secrets accessible 22 | runs-on: ubuntu-latest 23 | environment: Actions 24 | env: 25 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 26 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 27 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 28 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 29 | steps: 30 | # Check whether the bucket name is set (the sed spaces out the characters so the value is not masked in the log) 31 | - name: Test secrets accessible 32 | run: echo ${BUCKET_NAME} | sed -e 's/\(.\)/\1 /g' 33 | - name: Check deploy secrets 34 | run: if [ -z ${AWS_ACCESS_KEY_ID+x} ]; then echo "access key is unset"; else echo "access key is set with length ${#AWS_ACCESS_KEY_ID}"; fi 35 | - name: Check deploy secrets 36 | run: if [ -z ${AWS_SECRET_ACCESS_KEY+x} ]; then echo "secret key is unset"; else echo "secret key is set with length ${#AWS_SECRET_ACCESS_KEY}"; fi 37 | - name: Check deploy secrets 38 | run: if [ -z ${AWS_DEFAULT_REGION+x} ]; then echo "region is unset"; else echo "region is set with length ${#AWS_DEFAULT_REGION}"; fi 39 | -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | # This is useful for debugging issues regarding the secret management: 18 | # 19 | # test-secrets-accessible: 20 | # name: Test secrets accessible 21 | # uses: ./.github/workflows/_check_secrets.yml 22 | # secrets: inherit 23 | 24 | 25 | 26 | duckdb-stable-build: 27 | name: Build extension binaries (DuckDB v1.4.2) 28 | # needs: test-secrets-accessible 29 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.4.2 30 | with: 31 | duckdb_version: v1.4.2 32 | ci_tools_version: v1.4.2 33 | extension_name: sheetreader 34 | exclude_archs: "windows_amd64_rtools" 35 | 36 | # We disable deployment for now 37 | # 38 | # duckdb-stable-deploy: 39 | # name: Deploy extension binaries (DuckDB v1.0.0) 40 | # needs: duckdb-stable-build 41 | # uses: ./.github/workflows/_extension_deploy.yml 42 | # secrets: inherit 43 | # with: 44 | # duckdb_version: v1.0.0 45 | # extension_name: sheetreader 46 | # deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} 47 | # exclude_archs: "windows_amd64_rtools" 48 | -------------------------------------------------------------------------------- /.devcontainer/reinstall-cmake.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 |
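# Reinstalls CMake at the requested version: downloads the official installer script and checksum file from GitHub, verifies the checksum, and installs to /opt/cmake.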
#------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | # 7 | set -e 8 | 9 | CMAKE_VERSION=${1:-"none"} 10 | 11 | if [ "${CMAKE_VERSION}" = "none" ]; then 12 | echo "No CMake version specified, skipping CMake reinstallation" 13 | exit 0 14 | fi 15 | 16 | # Cleanup temporary directory and associated files when exiting the script. 17 | cleanup() { 18 | EXIT_CODE=$? 19 | set +e 20 | if [[ -n "${TMP_DIR}" ]]; then 21 | echo "Executing cleanup of tmp files" 22 | rm -Rf "${TMP_DIR}" 23 | fi 24 | exit $EXIT_CODE 25 | } 26 | trap cleanup EXIT 27 | 28 | 29 | echo "Installing CMake..." 30 | apt-get -y purge --auto-remove cmake 31 | mkdir -p /opt/cmake 32 | 33 | architecture=$(dpkg --print-architecture) 34 | case "${architecture}" in 35 | arm64) 36 | ARCH=aarch64 ;; 37 | amd64) 38 | ARCH=x86_64 ;; 39 | *) 40 | echo "Unsupported architecture ${architecture}." 41 | exit 1 42 | ;; 43 | esac 44 | 45 | CMAKE_BINARY_NAME="cmake-${CMAKE_VERSION}-linux-${ARCH}.sh" 46 | CMAKE_CHECKSUM_NAME="cmake-${CMAKE_VERSION}-SHA-256.txt" 47 | TMP_DIR=$(mktemp -d -t cmake-XXXXXXXXXX) 48 | 49 | echo "${TMP_DIR}" 50 | cd "${TMP_DIR}" 51 | 52 | curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_BINARY_NAME}" -O 53 | curl -sSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_CHECKSUM_NAME}" -O 54 | 55 | sha256sum -c --ignore-missing "${CMAKE_CHECKSUM_NAME}" 56 | sh "${TMP_DIR}/${CMAKE_BINARY_NAME}" --prefix=/opt/cmake --skip-license 57 | 58 | ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake 59 | ln -s /opt/cmake/bin/ctest /usr/local/bin/ctest 60 | -------------------------------------------------------------------------------- /docker-demo/README.md: -------------------------------------------------------------------------------- 1 | # SheetReader DuckDB Docker Demo 2 | 3 | Demo of the **sheetreader-duckdb** extension with DuckDB v1.4.0+ compatibility. 4 | 5 | ## Prerequisites 6 | 7 | - Docker and Docker Compose installed and running 8 | 9 | ## DuckDB v1.4.0 Extension Verification 10 | 11 | This setup allows you to verify that the sheetreader extension works correctly with DuckDB v1.4.0+. 
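For reference, the verification script (`test_verification.sql`) contains just a single query against the bundled workbook: ```sql SELECT * FROM sheetreader('docker-demo/test.xlsx'); ```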
12 | 13 | ### Build and Test the Extension 14 | 15 | **Step 1: Navigate to the demo directory** 16 | ```bash 17 | cd docker-demo 18 | ``` 19 | 20 | **Step 2: Build the Docker image** 21 | ```bash 22 | docker compose build 23 | ``` 24 | 25 | **Step 3: Build the extension from source** 26 | ```bash 27 | docker compose run --rm sheetreader-dev bash -c "GEN=ninja NINJA_BUILD_FLAGS='-j2' make" 28 | ``` 29 | 30 | This will: 31 | - Build DuckDB v1.4.0+ from source 32 | - Compile the sheetreader extension with the new API 33 | - Create a DuckDB binary with the extension pre-loaded 34 | 35 | **Step 4: Run the verification test** 36 | ```bash 37 | docker compose run --rm sheetreader-dev bash -c "./build/release/duckdb < docker-demo/test_verification.sql" 38 | ``` 39 | 40 | **Expected output:** 41 | ``` 42 | ┌──────────┐ 43 | │ Numeric0 │ 44 | │ double │ 45 | ├──────────┤ 46 | │ 92.0 │ 47 | │ 48.0 │ 48 | │ 99.0 │ 49 | │ 35.0 │ 50 | │ 97.0 │ 51 | └──────────┘ 52 | ``` 53 | 54 | If you see this output, the extension is working correctly with DuckDB v1.4.0+! ✅ 55 | 56 | --- 57 | 58 | ## Interactive Development 59 | 60 | For interactive development and testing: 61 | 62 | **Start an interactive shell:** 63 | ```bash 64 | docker compose run --rm sheetreader-dev bash 65 | ``` 66 | 67 | **Inside the container, you can:** 68 | ```bash 69 | # Build the extension 70 | GEN=ninja make 71 | 72 | 73 | # start DuckDB interactively 74 | ./build/release/duckdb 75 | ``` 76 | 77 | **Inside DuckDB, try queries:** 78 | ```sql 79 | -- Query the Excel file 80 | SELECT * FROM sheetreader('docker-demo/test.xlsx'); 81 | 82 | ``` 83 | 84 | **Exit:** 85 | ``` 86 | .exit # Exit DuckDB 87 | exit # Exit container 88 | ``` 89 | 90 | --- 91 | 92 | ## Files 93 | 94 | - **Dockerfile** - Ubuntu 22.04 with build dependencies (git, cmake, ninja, etc.) 
95 | - **docker-compose.yml** - Docker Compose setup with volume mounts and ccache 96 | - **test.xlsx** - Sample Excel file with test data 97 | - **test_verification.sql** - Verification query for testing 98 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: '-*,clang-diagnostic-*,bugprone-*,performance-*,google-explicit-constructor,google-build-using-namespace,google-runtime-int,misc-definitions-in-headers,modernize-use-nullptr,modernize-use-override,-bugprone-macro-parentheses,readability-braces-around-statements,-bugprone-branch-clone,readability-identifier-naming,hicpp-exception-baseclass,misc-throw-by-value-catch-by-reference,-bugprone-signed-char-misuse,-bugprone-misplaced-widening-cast,-bugprone-sizeof-expression,-bugprone-narrowing-conversions,-bugprone-easily-swappable-parameters,google-global-names-in-headers,llvm-header-guard,misc-definitions-in-headers,modernize-use-emplace,modernize-use-bool-literals,-performance-inefficient-string-concatenation,-performance-no-int-to-ptr,readability-container-size-empty,cppcoreguidelines-pro-type-cstyle-cast' 2 | WarningsAsErrors: '*' 3 | HeaderFilterRegex: '.*^(re2.h)' 4 | AnalyzeTemporaryDtors: false 5 | FormatStyle: none 6 | CheckOptions: 7 | - key: readability-identifier-naming.ClassCase 8 | value: CamelCase 9 | - key: readability-identifier-naming.EnumCase 10 | value: CamelCase 11 | - key: readability-identifier-naming.TypedefCase 12 | value: lower_case 13 | - key: readability-identifier-naming.TypedefSuffix 14 | value: _t 15 | - key: readability-identifier-naming.FunctionCase 16 | value: CamelCase 17 | - key: readability-identifier-naming.MemberCase 18 | value: lower_case 19 | - key: readability-identifier-naming.ParameterCase 20 | value: lower_case 21 | - key: readability-identifier-naming.ConstantCase 22 | value: aNy_CasE 23 | - key: readability-identifier-naming.ConstantParameterCase 24 | value: lower_case 25 | - key: readability-identifier-naming.NamespaceCase 26 | value: lower_case 27 | - key: readability-identifier-naming.MacroDefinitionCase 28 | value: UPPER_CASE 29 | - key: readability-identifier-naming.StaticConstantCase 30 | value: UPPER_CASE 31 | - key: readability-identifier-naming.ConstantMemberCase 32 | value: aNy_CasE 33 | - key: readability-identifier-naming.StaticVariableCase 34 | value: UPPER_CASE 35 | - key: readability-identifier-naming.ClassConstantCase 36 | value: UPPER_CASE 37 | - key: readability-identifier-naming.EnumConstantCase 38 | value: UPPER_CASE 39 | - key: readability-identifier-naming.ConstexprVariableCase 40 | value: UPPER_CASE 41 | - key: readability-identifier-naming.StaticConstantCase 42 | value: UPPER_CASE 43 | - key: readability-identifier-naming.TemplateTemplateParameterCase 44 | value: UPPER_CASE 45 | - key: readability-identifier-naming.TypeTemplateParameterCase 46 | value: UPPER_CASE 47 | - key: readability-identifier-naming.VariableCase 48 | value: lower_case 49 | - key: modernize-use-emplace.SmartPointers 50 | value: '::std::shared_ptr;::duckdb::unique_ptr;::std::auto_ptr;::std::weak_ptr' 51 | 52 | -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned> 6 | # <name>              : Name of the extension 7 | # <extension_version> : Version (commit / version tag) of the extension 8 | # <duckdb_version>    : Version (commit / version tag) of DuckDB 9 | # <architecture>      : Architecture target of the extension binary 10 | # <s3_bucket>         : S3 bucket to upload to 11 | # <copy_to_latest>    : Set this as the latest version ("true" / "false", default: "false") 12 | # <copy_to_versioned> : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # copy the extension binary; the signature section will be appended to this copy 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, length of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 | echo "No AWS key found, skipping.."
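# Exit with success so CI runs without deploy credentials (e.g. on forks) skip the upload instead of failing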
71 | exit 0 72 | fi 73 | 74 | # upload versioned version 75 | if [[ $7 = 'true' ]]; then 76 | if [[ $4 == wasm* ]]; then 77 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 78 | else 79 | aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 80 | fi 81 | fi 82 | 83 | # upload to latest version 84 | if [[ $6 = 'true' ]]; then 85 | if [[ $4 == wasm* ]]; then 86 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm" 87 | else 88 | aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read 89 | fi 90 | fi 91 | -------------------------------------------------------------------------------- /src/include/sheetreader_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.h" 4 | #include "duckdb.hpp" 5 | #include "duckdb/common/typedefs.hpp" 6 | #include "duckdb/common/types.hpp" 7 | #include "duckdb/common/unique_ptr.hpp" 8 | #include "duckdb/common/vector.hpp" 9 | #include "duckdb/function/function.hpp" 10 | #include "sheetreader-core/src/XlsxFile.h" 11 | #include "sheetreader-core/src/XlsxSheet.h" 12 | 13 | namespace duckdb { 14 | 15 | class SheetreaderExtension : public Extension { 16 | public: 17 | void Load(ExtensionLoader &loader) override; 18 | std::string Name() override; 19 | std::string Version() const override; 20 | }; 21 | 22 | //! Contains all data that is determined during the bind function 23 | struct SRBindData : public TableFunctionData { 24 | public: 25 | //! File name with path to file 26 | //! Sheet ID default is 1 27 | SRBindData(string file_name); 28 | //! File name with path to file and name of sheet 29 | //! Throws exception if sheet name is not found 30 | SRBindData(string file_name, string sheet_name); 31 | //! File name with path to file and index of sheet (starts with 1) 32 | //! Throws exception if sheet at index is not found 33 | SRBindData(string file_name, int sheet_index); 34 | 35 | public: 36 | //! The paths of the files we're reading 37 | vector<string> file_names; 38 | 39 | //! All column names (in order) 40 | vector<string> names; 41 | 42 | //! All column DuckDB types (in order) 43 | vector<LogicalType> types; 44 | 45 | //! The .XLSX-file -- created by sheetreader-core 46 | XlsxFile xlsx_file; 47 | //! A sheet of xlsx_file -- created by sheetreader-core 48 | unique_ptr<XlsxSheet> xlsx_sheet; 49 | 50 | //! Number of threads used while parsing 51 | idx_t number_threads = 1; 52 | 53 | //! Number of rows to skip while parsing 54 | idx_t skip_rows = 0; 55 | 56 | //! Coerce all cells to string in user defined column types 57 | bool coerce_to_string = false; 58 | 59 | //! User defined types 60 | vector<LogicalType> user_types = {}; 61 | 62 | //! Use user_types even if they are not compatible with types determined by first/second row 63 | bool force_types = false; 64 | 65 | private: 66 | SRBindData(ClientContext &context, vector<string> file_names, string sheet_name); 67 | }; 68 | //! Keeps state in between calls to the table (copy) function 69 | struct SRGlobalState { 70 | public: 71 | SRGlobalState(ClientContext &context, const SRBindData &bind_data); 72 | 73 | public: 74 | //! Bound data 75 | const SRBindData &bind_data; 76 | 77 | //! Number of chunks read so far 78 | idx_t chunk_count = 0; 79 | 80 | //! State of copying from mCells 81 | size_t max_buffers; 82 | //!
Current index of thread 83 | size_t current_thread; 84 | //! Current index of buffer in thread 85 | size_t current_buffer; 86 | //! Current index of cell in buffer 87 | size_t current_cell; 88 | //! Current index of column in row 89 | unsigned long current_column; 90 | //! Current index of row in sheet 91 | long long current_row; 92 | //! Current index of row per thread 93 | std::vector<size_t> current_locs; 94 | }; 95 | 96 | struct SRLocalState { 97 | public: 98 | SRLocalState(ClientContext &context, SRGlobalState &gstate); 99 | 100 | private: 101 | const SRBindData &bind_data; 102 | }; 103 | 104 | //! Contains SRGlobalState 105 | struct SRGlobalTableFunctionState : public GlobalTableFunctionState { 106 | public: 107 | SRGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input); 108 | static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input); 109 | 110 | public: 111 | SRGlobalState state; 112 | }; 113 | 114 | struct SRLocalTableFunctionState : public LocalTableFunctionState { 115 | public: 116 | SRLocalTableFunctionState(ClientContext &context, SRGlobalState &gstate); 117 | static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input, 118 | GlobalTableFunctionState *global_state); 119 | 120 | public: 121 | SRLocalState state; 122 | }; 123 | } // namespace duckdb 124 | -------------------------------------------------------------------------------- /.github/workflows/_extension_deploy.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Reusable workflow that deploys the artifacts produced by github.com/duckdb/duckdb/.github/workflows/_extension_distribution.yml 3 | # 4 | # note: this workflow needs to be located in the extension repository, as it requires secrets to be passed to the 5 | # deploy script. However, it should generally not be necessary to modify this workflow in your extension repository, as 6 | # this workflow can be configured to use a custom deploy script. 7 | 8 | 9 | name: Extension Deployment 10 | on: 11 | workflow_call: 12 | inputs: 13 | # The name of the extension 14 | extension_name: 15 | required: true 16 | type: string 17 | # DuckDB version to build against 18 | duckdb_version: 19 | required: true 20 | type: string 21 | # ';' separated list of architectures to exclude, for example: 'linux_amd64;osx_arm64' 22 | exclude_archs: 23 | required: false 24 | type: string 25 | default: "" 26 | # Whether to upload this deployment as the latest. This may overwrite a previous deployment. 27 | deploy_latest: 28 | required: false 29 | type: boolean 30 | default: false 31 | # Whether to upload this deployment under a versioned path. These will not be deleted automatically 32 | deploy_versioned: 33 | required: false 34 | type: boolean 35 | default: false 36 | # Postfix added to artifact names.
Can be used to guarantee unique names when this workflow is called multiple times 37 | artifact_postfix: 38 | required: false 39 | type: string 40 | default: "" 41 | # Override the default deploy script with a custom script 42 | deploy_script: 43 | required: false 44 | type: string 45 | default: "./scripts/extension-upload.sh" 46 | # Override the default matrix parse script with a custom script 47 | matrix_parse_script: 48 | required: false 49 | type: string 50 | default: "./duckdb/scripts/modify_distribution_matrix.py" 51 | secrets: 52 | S3_BUCKET: 53 | required: true 54 | S3_DEPLOY_ID: 55 | required: true 56 | S3_DEPLOY_KEY: 57 | required: true 58 | S3_REGION: 59 | required: true 60 | 61 | jobs: 62 | generate_matrix: 63 | name: Generate matrix 64 | environment: Actions 65 | runs-on: ubuntu-latest 66 | outputs: 67 | deploy_matrix: ${{ steps.parse-matrices.outputs.deploy_matrix }} 68 | steps: 69 | - uses: actions/checkout@v3 70 | with: 71 | fetch-depth: 0 72 | submodules: 'true' 73 | 74 | - name: Checkout DuckDB to version 75 | run: | 76 | cd duckdb 77 | git checkout ${{ inputs.duckdb_version }} 78 | 79 | - id: parse-matrices 80 | run: | 81 | python3 ${{ inputs.matrix_parse_script }} --input ./duckdb/.github/config/distribution_matrix.json --deploy_matrix --output deploy_matrix.json --exclude "${{ inputs.exclude_archs }}" --pretty 82 | deploy_matrix="`cat deploy_matrix.json`" 83 | echo deploy_matrix=$deploy_matrix >> $GITHUB_OUTPUT 84 | echo `cat $GITHUB_OUTPUT` 85 | 86 | deploy: 87 | name: Deploy 88 | environment: Actions 89 | runs-on: ubuntu-latest 90 | needs: generate_matrix 91 | if: ${{ needs.generate_matrix.outputs.deploy_matrix != '{}' && needs.generate_matrix.outputs.deploy_matrix != '' }} 92 | strategy: 93 | matrix: ${{fromJson(needs.generate_matrix.outputs.deploy_matrix)}} 94 | 95 | steps: 96 | - uses: actions/checkout@v3 97 | with: 98 | fetch-depth: 0 99 | submodules: 'true' 100 | 101 | - name: Checkout DuckDB to version 102 | run: | 103 | cd duckdb 104 | git checkout ${{ inputs.duckdb_version }} 105 | 106 | - uses: actions/download-artifact@v2 107 | with: 108 | name: ${{ inputs.extension_name }}-${{ inputs.duckdb_version }}-extension-${{matrix.duckdb_arch}}${{inputs.artifact_postfix}}${{startsWith(matrix.duckdb, 'wasm') && '.wasm' || ''}} 109 | path: | 110 | /tmp/extension 111 | 112 | - name: Deploy 113 | shell: bash 114 | env: 115 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEPLOY_ID }} 116 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEPLOY_KEY }} 117 | AWS_DEFAULT_REGION: ${{ secrets.S3_REGION }} 118 | BUCKET_NAME: ${{ secrets.S3_BUCKET }} 119 | DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.S3_DUCKDB_ORG_EXTENSION_SIGNING_PK }} 120 | run: | 121 | pwd 122 | python3 -m pip install pip awscli 123 | git config --global --add safe.directory '*' 124 | cd duckdb 125 | git fetch --tags 126 | export DUCKDB_VERSION=`git tag --points-at HEAD` 127 | export DUCKDB_VERSION=${DUCKDB_VERSION:=`git log -1 --format=%h`} 128 | cd .. 
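# Resolve the extension version the same way: prefer a tag pointing at HEAD, otherwise fall back to the short commit hash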
129 | git fetch --tags 130 | export EXT_VERSION=`git tag --points-at HEAD` 131 | export EXT_VERSION=${EXT_VERSION:=`git log -1 --format=%h`} 132 | ${{ inputs.deploy_script }} ${{ inputs.extension_name }} $EXT_VERSION $DUCKDB_VERSION ${{ matrix.duckdb_arch }} $BUCKET_NAME ${{inputs.deploy_latest || 'true' && 'false'}} ${{inputs.deploy_versioned || 'true' && 'false'}} 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SheetReader DuckDB extension 2 | 3 | `sheetreader` is a DuckDB extension that allows reading XLSX files into DuckDB tables with SheetReader, our blazingly fast XLSX parser (https://github.com/polydbms/sheetreader-core). 4 | 5 | --- 6 | 7 | This repository is based on https://github.com/duckdb/extension-template. 8 | 9 | ## Table of Contents 10 | 11 | - [SheetReader DuckDB extension](#sheetreader-duckdb-extension) 12 | - [Table of Contents](#table-of-contents) 13 | - [Usage](#usage) 14 | - [Parameters](#parameters) 15 | - [More information on SheetReader](#more-information-on-sheetreader) 16 | - [Benchmarks](#benchmarks) 17 | - [Building yourself](#building-yourself) 18 | - [Running the extension](#running-the-extension) 19 | 20 | ## Usage 21 | 22 | Before using SheetReader, you need to install it from the [community extensions](https://community-extensions.duckdb.org/extensions/sheetreader.html) and load it into your DuckDB environment: 23 | 24 | ```sql 25 | INSTALL sheetreader FROM community; 26 | LOAD sheetreader; 27 | ``` 28 | 29 | Now, you can run your first query: 30 | 31 | ```sql 32 | D SELECT * 33 | FROM sheetreader('test.xlsx'); 34 | ``` 35 | 36 | The `sheetreader()` function offers further parameters to load the XLSX file as required: 37 | 38 | ```sql 39 | D CREATE TABLE test AS FROM sheetreader( 40 | 'test.xlsx', 41 | sheet_index=1, 42 | threads=16, 43 | skip_rows=0, 44 | has_header=TRUE, 45 | types=[BOOLEAN,VARCHAR], 46 | coerce_to_string=TRUE, 47 | force_types=TRUE 48 | ); 49 | ``` 50 | 51 | ### Parameters 52 | 53 | | Name | Description | Type | Default | 54 | | :----------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------ | :-------------: | :--------------------------------------------------------------- | 55 | | `sheet_index` | Index of the sheet to read. Starts at 1. | `INTEGER` | `1` | 56 | | `sheet_name` | Name of the sheet to read.<br>Only one of `sheet_index` and `sheet_name` may be set. | `VARCHAR` | `""` | 57 | | `threads` | Number of threads to use while parsing | `INTEGER` | Half of available cores; minimum 1 | 58 | | `skip_rows` | Number of rows to skip | `INTEGER` | `0` | 59 | | `types` | List of types for all columns<br>  • Types currently available:<br>    `VARCHAR`, `BOOLEAN`, `DOUBLE`, `DATE`.<br>  • Useful in combination with `coerce_to_string` and `force_types`. | `LIST(VARCHAR)` | Uses types determined by first & second row (after skipped rows) | 60 | | `coerce_to_string` | Coerce all cells in column of type `VARCHAR` to string (i.e. `VARCHAR`). | `BOOLEAN` | `false` | 61 | | `force_types` | Use `types` even if they are not compatible with types determined by first/second row.<br>Cells that don't match the column type are set to `NULL`, or coerced to string if that option is set. | `BOOLEAN` | `false` | 62 | | `has_header` | If set to `true`:<br>  • Forces the first row to be treated as a header row (only works if all cells are of type `VARCHAR`).<br>  • If successful, the cell contents are used for column names.<br>  • Overrides the default behavior, which doesn't use the first row as headers if all columns have type `VARCHAR`.<br><br>If set to `false`:<br>  • The extension will still try to treat the first row as a header row.<br>  • The difference is that it will not fail if the first row is not usable (i.e. not all cells are of type `VARCHAR`).<br>  • The first row won't be used as headers if all columns have type `VARCHAR`. | `BOOLEAN` | `false` | 63 | 64 | 65 | ## More information on SheetReader 66 | 67 | SheetReader was published in the [Information Systems Journal](https://www.sciencedirect.com/science/article/abs/pii/S0306437923000194). 68 | ``` 69 | @article{DBLP:journals/is/GavriilidisHZM23, 70 | author = {Haralampos Gavriilidis and 71 | Felix Henze and 72 | Eleni Tzirita Zacharatou and 73 | Volker Markl}, 74 | title = {SheetReader: Efficient Specialized Spreadsheet Parsing}, 75 | journal = {Inf. Syst.}, 76 | volume = {115}, 77 | pages = {102183}, 78 | year = {2023}, 79 | url = {https://doi.org/10.1016/j.is.2023.102183}, 80 | doi = {10.1016/J.IS.2023.102183}, 81 | timestamp = {Mon, 26 Jun 2023 20:54:32 +0200}, 82 | biburl = {https://dblp.org/rec/journals/is/GavriilidisHZM23.bib}, 83 | bibsource = {dblp computer science bibliography, https://dblp.org} 84 | } 85 | ``` 86 | 87 | ## Benchmarks 88 | 89 | You can find benchmarks in the above-mentioned paper, comparing SheetReader to other XLSX parsers. 90 | 91 | Here is a plot of preliminary benchmarks comparing the `sheetreader` DuckDB extension to the `spatial` extension's `st_read` function: 92 | 93 | 94 | ![Benchmark](./benchmarks/joined_barplot_benchmark_sf_1.png) 95 | 96 | (*System info: 2x Intel(R) Xeon(R) E5530 @ 2.40GHz, 47GiB RAM*) 97 | 98 | ## Building yourself 99 | 100 | First, clone this repository with the `--recurse-submodules` flag, so that you get all the needed source files. 101 | 102 | To build the extension, run: 103 | ```sh 104 | GEN=ninja make 105 | ``` 106 | The main binaries that will be built are: 107 | ```sh 108 | ./build/release/duckdb 109 | ./build/release/extension/sheetreader/sheetreader.duckdb_extension 110 | ``` 111 | - `duckdb` is the binary for the DuckDB shell with the extension code automatically loaded. 112 | - `sheetreader.duckdb_extension` is the loadable binary as it would be distributed. 113 | 114 | ### Running the extension 115 | 116 | To run the self-built extension code, simply start the shell with `./build/release/duckdb`. 117 | -------------------------------------------------------------------------------- /src/sheetreader_extension.cpp: -------------------------------------------------------------------------------- 1 | #include "duckdb.h" 2 | #include "duckdb/common/assert.hpp" 3 | #include "duckdb/common/helper.hpp" 4 | #include "duckdb/common/multi_file/multi_file_reader.hpp" 5 | #include "duckdb/common/typedefs.hpp" 6 | #include "duckdb/common/types.hpp" 7 | #include "duckdb/common/types/data_chunk.hpp" 8 | #include "duckdb/common/types/date.hpp" 9 | #include "duckdb/common/types/string_type.hpp" 10 | #include "duckdb/common/types/value.hpp" 11 | #include "duckdb/common/types/vector.hpp" 12 | #include "duckdb/common/unique_ptr.hpp" 13 | #include "duckdb/common/vector.hpp" 14 | #include "duckdb/common/vector_size.hpp" 15 | #include "duckdb/function/function.hpp" 16 | #include "sheetreader-core/src/XlsxFile.h" 17 | #include "sheetreader-core/src/XlsxSheet.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #define DUCKDB_EXTENSION_MAIN 24 | 25 | #include "duckdb/common/exception.hpp" 26 | #include "duckdb/common/string_util.hpp" 27 | #include "duckdb/function/table_function.hpp" 28 | #include "sheetreader_extension.hpp" 29 | 30 | #include 31 | #include "duckdb/main/database.hpp" 32 | 33 | namespace duckdb { 34 | 35 | //!
Determine default number of threads 36 | inline idx_t DefaultThreads() { 37 | #ifdef __EMSCRIPTEN__ 38 | // WebAssembly doesn't support threading in MVP builds 39 | return 1; 40 | #else 41 | // Returns 0 if not able to detect 42 | idx_t sys_number_threads = std::thread::hardware_concurrency(); 43 | 44 | // Don't be too greedy 45 | idx_t appropriate_number_threads = sys_number_threads / 2; 46 | 47 | if (appropriate_number_threads <= 0) { 48 | appropriate_number_threads = 1; 49 | } 50 | 51 | return appropriate_number_threads; 52 | #endif 53 | } 54 | 55 | // ===================================== 56 | // Following are a bunch of constructors for classes that hold the state of the sheetreader extension 57 | // Find the definitions & documentation of these classes in the sheetreader_extension.hpp file 58 | // ===================================== 59 | 60 | SRBindData::SRBindData(string file_name) : SRBindData(file_name, 1) { 61 | } 62 | 63 | SRBindData::SRBindData(string file_name, string sheet_name) 64 | : xlsx_file(file_name), xlsx_sheet(make_uniq<XlsxSheet>(xlsx_file.getSheet(sheet_name))), 65 | number_threads(DefaultThreads()) { 66 | } 67 | 68 | SRBindData::SRBindData(string file_name, int sheet_index) 69 | : xlsx_file(file_name), xlsx_sheet(make_uniq<XlsxSheet>(xlsx_file.getSheet(sheet_index))), 70 | number_threads(DefaultThreads()) { 71 | } 72 | 73 | SRGlobalState::SRGlobalState(ClientContext &context, const SRBindData &bind_data) 74 | : bind_data(bind_data), chunk_count(0) { 75 | } 76 | 77 | SRLocalState::SRLocalState(ClientContext &context, SRGlobalState &gstate) : bind_data(gstate.bind_data) { 78 | } 79 | 80 | SRGlobalTableFunctionState::SRGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input) 81 | : state(context, input.bind_data->Cast<SRBindData>()) { 82 | } 83 | 84 | unique_ptr<GlobalTableFunctionState> SRGlobalTableFunctionState::Init(ClientContext &context, 85 | TableFunctionInitInput &input) { 86 | 87 | auto result = make_uniq<SRGlobalTableFunctionState>(context, input); 88 | 89 | return std::move(result); 90 | } 91 | 92 | SRLocalTableFunctionState::SRLocalTableFunctionState(ClientContext &context, SRGlobalState &gstate) 93 | : state(context, gstate) { 94 | } 95 | 96 | unique_ptr<LocalTableFunctionState> SRLocalTableFunctionState::Init(ExecutionContext &context, 97 | TableFunctionInitInput &input, 98 | GlobalTableFunctionState *global_state) { 99 | auto &gstate = global_state->Cast<SRGlobalTableFunctionState>(); 100 | auto result = make_uniq<SRLocalTableFunctionState>(context.client, gstate.state); 101 | 102 | return std::move(result); 103 | } 104 | 105 | // ===================================== 106 | // Following are definitions that are used to copy data from the sheetreader-core to the DuckDB data chunk 107 | // ===================================== 108 | 109 | //! DataPtr is a union that holds pointers to the different data types stored in the vectors of the data chunk 110 | union DataPtr { 111 | string_t *string_data; 112 | double *double_data; 113 | bool *bool_data; 114 | date_t *date_data; 115 | }; 116 | 117 | //! Set cell to NULL 118 | inline void SetNull(const SRBindData &bind_data, DataChunk &output, vector<DataPtr> &flat_vectors, const XlsxCell &cell, 119 | idx_t row_id, idx_t column_id) { 120 | LogicalType expected_type = bind_data.types[column_id]; 121 | 122 | // Value constructor with LogicalType sets the value to NULL 123 | output.data[column_id].SetValue(row_id, Value(expected_type)); 124 | } 125 | 126 | //!
Set all values in the data chunk to NULL 127 | inline void SetAllInvalid(DataChunk &output, idx_t cardinality) { 128 | // Iterate over all columns 129 | for (idx_t col = 0; col < output.ColumnCount(); col++) { 130 | Vector &vec = output.data[col]; 131 | // Validity mask saves the information about NULL values 132 | auto &validity = FlatVector::Validity(vec); 133 | validity.SetAllInvalid(cardinality); 134 | } 135 | } 136 | 137 | //! Set cell to the value of XlsxCell 138 | //! Expects XlsxCell to have the same type as the column 139 | inline void SetCell(const SRBindData &bind_data, DataChunk &output, vector &flat_vectors, const XlsxCell &cell, 140 | idx_t row_id, idx_t column_id) { 141 | 142 | auto &xlsx_file = bind_data.xlsx_file; 143 | 144 | // Get validity mask of the column and set it to valid (i.e. not NULL) 145 | Vector &vec = output.data[column_id]; 146 | auto &validity = FlatVector::Validity(vec); 147 | validity.SetValid(row_id); 148 | 149 | // Set the value of the cell to cell in the data chunk 150 | // Note: bind_data.types[column_id] is the expected type of the column, 151 | // so the type XlsxCell should be checked before calling this function 152 | switch (bind_data.types[column_id].id()) { 153 | case LogicalTypeId::VARCHAR: { 154 | auto value = xlsx_file.getString(cell.data.integer); 155 | // string_t creates values that fail the UTF-8 check, so we use the slow technique 156 | // flat_vectors[j].string_data[i] = string_t(value); 157 | output.data[column_id].SetValue(row_id, Value(value)); 158 | break; 159 | } 160 | case LogicalTypeId::DOUBLE: { 161 | auto value = cell.data.real; 162 | flat_vectors[column_id].double_data[row_id] = value; 163 | break; 164 | } 165 | case LogicalTypeId::BOOLEAN: { 166 | auto value = cell.data.boolean; 167 | flat_vectors[column_id].bool_data[row_id] = value; 168 | break; 169 | } 170 | case LogicalTypeId::DATE: { 171 | // Convert seconds to days 172 | date_t value = date_t((int)(cell.data.real / 86400.0)); 173 | flat_vectors[column_id].date_data[row_id] = value; 174 | break; 175 | } 176 | default: 177 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 178 | } 179 | } 180 | 181 | //! Coerce cell to string and save it in the data chunk 182 | inline void SetCellString(const SRBindData &bind_data, DataChunk &output, vector &flat_vectors, 183 | const XlsxCell &cell, idx_t row_id, idx_t column_id) { 184 | 185 | auto &xlsx_file = bind_data.xlsx_file; 186 | 187 | // Get validity mask of the column and set it to valid (i.e. not NULL) 188 | Vector &vec = output.data[column_id]; 189 | auto &validity = FlatVector::Validity(vec); 190 | validity.SetValid(row_id); 191 | 192 | // Similar to SetCell() only difference: 193 | // Use coercion method depending on the type of the XlsxCell 194 | switch (cell.type) { 195 | case CellType::T_STRING_REF: { 196 | auto value = xlsx_file.getString(cell.data.integer); 197 | output.data[column_id].SetValue(row_id, Value(value)); 198 | break; 199 | } 200 | case CellType::T_NUMERIC: { 201 | auto value = cell.data.real; 202 | string str = std::to_string(value); 203 | output.data[column_id].SetValue(row_id, Value(str)); 204 | break; 205 | } 206 | case CellType::T_BOOLEAN: { 207 | auto value = cell.data.boolean; 208 | string str = value ? 
"TRUE" : "FALSE"; 209 | output.data[column_id].SetValue(row_id, Value(str)); 210 | break; 211 | } 212 | case CellType::T_DATE: { 213 | date_t value = date_t((int)(cell.data.real / 86400.0)); 214 | string str = Date::ToString(value); 215 | output.data[column_id].SetValue(row_id, Value(str)); 216 | break; 217 | } 218 | default: 219 | throw InternalException("This shouldn't happen. Unsupported Cell type"); 220 | } 221 | } 222 | 223 | //! Check if the types of the XlsxCell and the column are compatible 224 | //! Types are compatible with VARCHAR if coercing to string is enabled 225 | bool TypesCompatible(const LogicalType &expected_type, const CellType &cell_type, bool coerce_to_string) { 226 | switch (expected_type.id()) { 227 | case LogicalTypeId::VARCHAR: 228 | if (coerce_to_string) { 229 | switch (cell_type) { 230 | case CellType::T_STRING_REF: 231 | case CellType::T_NUMERIC: 232 | case CellType::T_BOOLEAN: 233 | case CellType::T_DATE: 234 | return true; 235 | default: 236 | return false; 237 | } 238 | } 239 | return cell_type == CellType::T_STRING_REF; 240 | case LogicalTypeId::DOUBLE: 241 | return cell_type == CellType::T_NUMERIC; 242 | case LogicalTypeId::BOOLEAN: 243 | return cell_type == CellType::T_BOOLEAN; 244 | case LogicalTypeId::DATE: 245 | return cell_type == CellType::T_DATE; 246 | default: 247 | // TODO: Add support for T_STRING and T_STRING_INLINE 248 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 249 | } 250 | } 251 | 252 | //! Check if current_row is within the limit of the current chunk 253 | bool CheckRowLimitReached(SRGlobalState &gstate) { 254 | // Need offset, since current_row is index of the current row in whole table. 255 | // So we subtract the number of rows (determined by chunk) already copied 256 | long long row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 257 | // Limit is the last row of the current chunk (should be 2048 == STANDARD_VECTOR_SIZE) 258 | long long limit = row_offset + STANDARD_VECTOR_SIZE; 259 | long long skip_rows = gstate.bind_data.xlsx_sheet->mSkipRows; 260 | bool limit_reached = gstate.current_row - skip_rows >= limit; 261 | return limit_reached; 262 | } 263 | 264 | //! Get the number of rows copied so far 265 | idx_t GetCardinality(SRGlobalState &gstate) { 266 | // Same reason as in CheckRowLimitReached 267 | long long row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 268 | long long skip_rows = gstate.bind_data.xlsx_sheet->mSkipRows; 269 | // This is the case when no new rows are copied and last chunk was not full (last iteration) 270 | if (gstate.current_row + 1 < skip_rows + row_offset) { 271 | return 0; 272 | } 273 | return gstate.current_row - skip_rows - row_offset + 1; 274 | } 275 | 276 | /*! 277 | Summary of data is stored in mCells: 278 | ==================================== 279 | 280 | General layout: 281 | --------------- 282 | 283 | mCells = [thread[0], thread[1], thread[2], ...] 284 | thread[current_thread] = [buffer[0], buffer[1], buffer[2], ...] 285 | buffer[current_buffer] = [cell[0], cell[1], cell[2], ...] 286 | */ 287 | /*! 
288 | - Copy data from sheetreader-core's mCells to DuckDB data chunk 289 | - Copies STANDARD_VECTOR_SIZE rows at a time 290 | - This function is stateful, so it keeps track of the current location in the mCells 291 | to maintain state between calls 292 | Returns Cardinality (number of rows copied) 293 | */ 294 | size_t StatefulCopy(SRGlobalState &gstate, const SRBindData &bind_data, DataChunk &output, 295 | vector &flat_vectors) { 296 | 297 | auto &sheet = bind_data.xlsx_sheet; 298 | 299 | // Every thread has a list of buffers 300 | D_ASSERT(bind_data.number_threads == sheet->mCells.size()); 301 | 302 | idx_t number_threads = bind_data.number_threads; 303 | 304 | if (number_threads == 0) { 305 | return 0; 306 | } 307 | 308 | size_t row_offset = gstate.chunk_count * STANDARD_VECTOR_SIZE; 309 | 310 | // Helper function to calculate the adjusted row 311 | auto calc_adjusted_row = [row_offset](long long current_row, unsigned long skip_rows) { 312 | return current_row - skip_rows - row_offset; 313 | }; 314 | 315 | // Initialize state for first call 316 | if (gstate.current_locs.empty()) { 317 | // Get number of buffers from first thread (is always the maximum) 318 | gstate.max_buffers = sheet->mCells[0].size(); 319 | gstate.current_thread = 0; 320 | gstate.current_buffer = 0; 321 | gstate.current_cell = 0; 322 | gstate.current_column = 0; 323 | gstate.current_row = -1; 324 | // Initialize current_locs for all threads 325 | gstate.current_locs = std::vector(number_threads, 0); 326 | } 327 | 328 | // Set all values to NULL per default, since sheetreader-core stores information about empty cells only by skipping 329 | // them in mCells. Since we iterate over mCells, empty cells are implicitly skipped. So we wouldn't know if a cell 330 | // in the chunk is empty if we don't set it to NULL here and set it to valid when we find it in mCells (see 331 | // SetValue) 332 | SetAllInvalid(output, STANDARD_VECTOR_SIZE); 333 | 334 | //! To get the correct order of rows we iterate for(buffer_index) { for(thread_index) { for(cell_index) } } 335 | //! This is due to how sheetreader-core writes the data to the buffers (stored in mCells) 336 | for (; gstate.current_buffer < gstate.max_buffers; ++gstate.current_buffer) { 337 | for (; gstate.current_thread < sheet->mCells.size(); ++gstate.current_thread) { 338 | 339 | // If there are no more buffers to read, prepare for finishing copying 340 | if (sheet->mCells[gstate.current_thread].empty()) { 341 | // Set to maxBuffers, so this is the last iteration 342 | gstate.current_buffer = gstate.max_buffers; 343 | 344 | // Return number of copied rows in this chunk 345 | return GetCardinality(gstate); 346 | } 347 | 348 | //! Current cell buffer 349 | const std::vector cells = sheet->mCells[gstate.current_thread].front(); 350 | //! Location info for current thread 351 | const std::vector &locs_infos = sheet->mLocationInfos[gstate.current_thread]; 352 | //! Current location index in current thread 353 | size_t ¤t_loc = gstate.current_locs[gstate.current_thread]; 354 | 355 | // This is a weird implementation detail of sheetreader-core: 356 | // currentCell <= cells.size() because there might be location info after last cell 357 | for (; gstate.current_cell <= cells.size(); ++gstate.current_cell) { 358 | 359 | // Description of the following loop: 360 | // Update currentRow & currentColumn when location info is available for current cell at currentLoc. 361 | // After setting those values: Advance to next location info. 
362 | // 363 | // This means that the values won't be updated if there is no location info for the current cell 364 | // (e.g. not first cell in row) 365 | // 366 | // Edge case 0: 367 | // Loop is executed n+1 times for first location info, where n is the number of skip_rows (specified as 368 | // parameter for interleaved) This is because, SheetReader creates location infos for the skipped lines 369 | // with cell == column == buffer == 0 sames as for the first "real" row 370 | // 371 | // Edge case 1: 372 | // For empty cells, sheetreader-core also generates a location info that points to the same cell as the 373 | // next location info. By using the condition for the while loop, we skip these empty cells 374 | while (current_loc < locs_infos.size() && locs_infos[current_loc].buffer == gstate.current_buffer && 375 | locs_infos[current_loc].cell == gstate.current_cell) { 376 | 377 | gstate.current_column = locs_infos[current_loc].column; 378 | // Not sure whether row is ever -1ul, but this is how it's handled in sheetreader-core's nextRow() 379 | if (locs_infos[current_loc].row == -1ul) { 380 | ++gstate.current_row; 381 | } else { 382 | gstate.current_row = locs_infos[current_loc].row; 383 | } 384 | 385 | long long adjusted_row = calc_adjusted_row(gstate.current_row, sheet->mSkipRows); 386 | 387 | // This only happens for header rows -- we want to skip them 388 | if (adjusted_row < 0) { 389 | ++current_loc; 390 | // Skip to next row 391 | if (current_loc < locs_infos.size()) { 392 | gstate.current_cell = locs_infos[current_loc].cell; 393 | } else { 394 | throw InternalException("Skipped more rows than available in first buffer -- consider " 395 | "decreasing number of threads"); 396 | } 397 | continue; 398 | } 399 | 400 | // Increment index to location info for next iteration 401 | ++current_loc; 402 | 403 | // If we reached the row limit of the current chunk, we return the number of copied rows 404 | if (CheckRowLimitReached(gstate)) { 405 | // Subtract 1, because we increment current_row before checking the limit 406 | return (GetCardinality(gstate) - 1); 407 | } 408 | } 409 | // We need to check this here, because we iterate up to cells.size() to get the last location info 410 | if (gstate.current_cell >= cells.size()) { 411 | break; 412 | } 413 | 414 | // Use short variable name for better readability 415 | const auto current_column = gstate.current_column; 416 | 417 | // If this cell is in a column that was not present in the first row, we throw an error 418 | if (current_column >= bind_data.types.size()) { 419 | throw InvalidInputException( 420 | "Row " + std::to_string(gstate.current_row) + "has more columns than the first row. Has: " + 421 | std::to_string(current_column + 1) + " Expected: " + std::to_string(bind_data.types.size())); 422 | } 423 | 424 | //! Content of current cell 425 | const XlsxCell &cell = cells[gstate.current_cell]; 426 | //! Number of rows we skipped while parsing 427 | long long mSkipRows = sheet->mSkipRows; 428 | long long adjusted_row = calc_adjusted_row(gstate.current_row, mSkipRows); 429 | 430 | bool types_compatible = 431 | TypesCompatible(bind_data.types[current_column], cell.type, bind_data.coerce_to_string); 432 | 433 | // sheetreader-core doesn't determine empty cells to be T_NONE, instead it skips the cell, 434 | // so it's not stored in mCells. 
We handle this by setting all cells as Invalid (aka null) 435 | // and set them valid when they appear in mCells 436 | if (cell.type == CellType::T_NONE || cell.type == CellType::T_ERROR || !types_compatible) { 437 | SetNull(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 438 | } else if (bind_data.types[current_column] == LogicalType::VARCHAR && bind_data.coerce_to_string) { 439 | SetCellString(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 440 | } else { 441 | SetCell(bind_data, output, flat_vectors, cell, adjusted_row, current_column); 442 | } 443 | 444 | // Advance to next column 445 | ++gstate.current_column; 446 | } 447 | 448 | // If we reached the last cell in the current buffer, we remove it from the thread 449 | sheet->mCells[gstate.current_thread].pop_front(); 450 | // Reset for next buffer 451 | gstate.current_cell = 0; 452 | } 453 | // Reset thread index for next buffer index 454 | gstate.current_thread = 0; 455 | } 456 | // Return number of copied rows in this chunk when all buffers are read (i.e. curren_buffer == max_buffers) 457 | return GetCardinality(gstate); 458 | } 459 | 460 | //! Finish the current chunk 461 | //! - Set the cardinality of the chunk 462 | //! - Increment the chunk count 463 | inline void FinishChunk(DataChunk &output, idx_t cardinality, SRGlobalState &gstate) { 464 | 465 | // Indicate how many rows are in the chunk 466 | // If cardinality is 0, it means that the chunk is empty and no more rows are to be expected 467 | output.SetCardinality(cardinality); 468 | 469 | // Increment number of chunks read so far 470 | gstate.chunk_count++; 471 | 472 | return; 473 | } 474 | 475 | //! Copy data from sheetreader-core to DuckDB data chunk 476 | //! - Is called after bind function 477 | //! - Is called multiple times until all data is copied are no more rows are needed (e.g. for LIMIT clause) 478 | inline void SheetreaderCopyTableFun(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { 479 | 480 | //! Data from bind function 481 | const SRBindData &bind_data = data_p.bind_data->Cast(); 482 | //! State persisted in between table (copy) function calls 483 | SRGlobalState &gstate = data_p.global_state->Cast().state; 484 | 485 | //! Number of columns (i.e. number of vectors in the data chunk) 486 | const idx_t column_count = output.ColumnCount(); 487 | 488 | D_ASSERT(column_count == bind_data.types.size()); 489 | 490 | // ===================================== 491 | // Store FlatVectors for all columns (they have different data types) 492 | // ===================================== 493 | 494 | //! 
Holds pointers to the data of the vectors in the data chunk 495 | vector flat_vectors; 496 | 497 | for (idx_t col = 0; col < column_count; col++) { 498 | switch (bind_data.types[col].id()) { 499 | case LogicalTypeId::VARCHAR: { 500 | Vector &vec = output.data[col]; 501 | string_t *data_vec = FlatVector::GetData(vec); 502 | DataPtr data; 503 | data.string_data = data_vec; 504 | // Store pointer to data 505 | flat_vectors.push_back(data); 506 | break; 507 | } 508 | case LogicalTypeId::DOUBLE: { 509 | Vector &vec = output.data[col]; 510 | auto data_vec = FlatVector::GetData(vec); 511 | DataPtr data; 512 | data.double_data = data_vec; 513 | flat_vectors.push_back(data); 514 | break; 515 | } 516 | case LogicalTypeId::BOOLEAN: { 517 | Vector &vec = output.data[col]; 518 | auto data_vec = FlatVector::GetData(vec); 519 | DataPtr data; 520 | data.bool_data = data_vec; 521 | flat_vectors.push_back(data); 522 | break; 523 | } 524 | case LogicalTypeId::DATE: { 525 | Vector &vec = output.data[col]; 526 | auto data_vec = FlatVector::GetData(vec); 527 | DataPtr data; 528 | data.date_data = data_vec; 529 | flat_vectors.push_back(data); 530 | break; 531 | } 532 | default: 533 | throw InternalException("This shouldn't happen. Unsupported Logical type"); 534 | } 535 | } 536 | 537 | // ===================================== 538 | // Copy data from sheetreader-core's mCells to DuckDB data chunk 539 | // ===================================== 540 | 541 | // This version: 542 | // - Uses SetValue only for VARCHAR, for other types it uses directly the flat vectors 543 | // - Doesn't use nextRow() but directly iterates over the buffers 544 | // - Has more features (coercion to string, handling empty cells, etc.) 545 | 546 | //! Number of rows copied in this iteration 547 | auto cardinality = StatefulCopy(gstate, bind_data, output, flat_vectors); 548 | 549 | FinishChunk(output, cardinality, gstate); 550 | 551 | return; 552 | } 553 | 554 | // ===================================== 555 | // Following are definitions for the bind function 556 | // ===================================== 557 | 558 | //! Converts the cell types from sheetreader-core to DuckDB types (column_types) 559 | //! and it also sets the column names (uses generic names) 560 | inline bool ConvertCellTypes(vector &column_types, vector &column_names, 561 | vector &cell_types) { 562 | idx_t current_column_index = 0; 563 | //! 
Indicates if the first row contains only string values 564 | bool first_row_all_string = true; 565 | 566 | for (auto &col_type : cell_types) { 567 | switch (col_type) { 568 | case CellType::T_STRING_REF: 569 | column_types.push_back(LogicalType::VARCHAR); 570 | column_names.push_back("String" + std::to_string(current_column_index)); 571 | break; 572 | case CellType::T_STRING: 573 | case CellType::T_STRING_INLINE: 574 | // TODO 575 | throw BinderException("Inline & dynamic String types not supported yet"); 576 | break; 577 | case CellType::T_NUMERIC: 578 | column_types.push_back(LogicalType::DOUBLE); 579 | column_names.push_back("Numeric" + std::to_string(current_column_index)); 580 | first_row_all_string = false; 581 | break; 582 | case CellType::T_BOOLEAN: 583 | column_types.push_back(LogicalType::BOOLEAN); 584 | column_names.push_back("Boolean" + std::to_string(current_column_index)); 585 | first_row_all_string = false; 586 | break; 587 | case CellType::T_DATE: 588 | column_types.push_back(LogicalType::DATE); 589 | column_names.push_back("Date" + std::to_string(current_column_index)); 590 | first_row_all_string = false; 591 | break; 592 | default: 593 | throw BinderException("Unknown cell type in column in column " + std::to_string(current_column_index)); 594 | } 595 | current_column_index++; 596 | } 597 | 598 | return first_row_all_string; 599 | } 600 | 601 | //! Get the names of the columns from the first row 602 | //! Assumes that the first row contains only string values 603 | inline vector GetHeaderNames(vector &row, SRBindData &bind_data) { 604 | 605 | vector column_names; 606 | 607 | for (idx_t j = 0; j < row.size(); j++) { 608 | switch (row[j].type) { 609 | case CellType::T_STRING_REF: { 610 | auto value = bind_data.xlsx_file.getString(row[j].data.integer); 611 | column_names.push_back(value); 612 | break; 613 | } 614 | case CellType::T_STRING: 615 | case CellType::T_STRING_INLINE: { 616 | // TODO 617 | throw BinderException("Inline & dynamic String types not supported yet"); 618 | break; 619 | } 620 | default: 621 | throw BinderException("Header row contains non-string values"); 622 | } 623 | } 624 | 625 | return column_names; 626 | } 627 | 628 | //! Bind function for the sheetreader extension 629 | //! - Gets (named) parameters (filename etc.) of table function and stores them 630 | //! - Parses the .Xlsx-file 631 | //! - Reads the first & second row to determine the types of the columns 632 | //! - Reads the first & second row to determine the names of the columns (auto detects if the first row is a header) 633 | //! - Writes the determined types in `return_types` and the names in `names` 634 | //! - Creates the bind data object (is subtype of FunctionData) which contains all necessary information for the copy 635 | //! 
and most importantly stores the XlsxFile & XlsxSheet objects 636 | inline unique_ptr SheetreaderBindFun(ClientContext &context, TableFunctionBindInput &input, 637 | vector &return_types, vector &names) { 638 | 639 | // ===================================== 640 | // Get input parameters & prepare for parsing 641 | // ===================================== 642 | 643 | // Get the file name from the input parameters & verify it exists 644 | auto file_reader = MultiFileReader::Create(input.table_function); 645 | auto file_list = file_reader->CreateFileList(context, input.inputs[0]); 646 | auto file_infos = file_list->GetAllFiles(); 647 | 648 | if (file_infos.empty()) { 649 | throw BinderException("No files found in path"); 650 | } else if (file_infos.size() > 1) { 651 | throw BinderException("Only one file can be read at a time"); 652 | } 653 | 654 | // Extract the file path from OpenFileInfo 655 | string file_name = file_infos[0].path; 656 | 657 | //! User specified sheet name 658 | string sheet_name; 659 | //! User specified sheet index -- starts with 1 660 | int sheet_index; 661 | //! Is set when the user specifies the sheet index with e.g. `sheet_index=2` 662 | bool sheet_index_set = false; 663 | 664 | //! User specified option to use header 665 | bool use_header = false; 666 | 667 | // Get named parameters that are needed for creating XlsxFile & XlsxSheet objects and therefore for creating 668 | // bind_data 669 | for (auto &kv : input.named_parameters) { 670 | auto loption = StringUtil::Lower(kv.first); 671 | if (loption == "sheet_name") { 672 | sheet_name = StringValue::Get(kv.second); 673 | } else if (loption == "sheet_index") { 674 | sheet_index = IntegerValue::Get(kv.second); 675 | sheet_index_set = true; 676 | } else if (loption == "has_header") { 677 | use_header = BooleanValue::Get(kv.second); 678 | } else { 679 | continue; 680 | } 681 | } 682 | 683 | if (!sheet_name.empty() && sheet_index_set) { 684 | throw BinderException("Sheet index & sheet name cannot be set at the same time."); 685 | } 686 | 687 | //! Contains all important data collected in this bind function & is returned to be used by table (copy) function 688 | unique_ptr bind_data; 689 | 690 | try { 691 | if (!sheet_name.empty()) { 692 | bind_data = make_uniq(file_name, sheet_name); 693 | } else if (sheet_index_set) { 694 | bind_data = make_uniq(file_name, sheet_index); 695 | } else { 696 | // Default: sheet_index=1 697 | bind_data = make_uniq(file_name); 698 | } 699 | } catch (std::exception &e) { 700 | throw BinderException(e.what()); 701 | } 702 | 703 | //! Is set when the user specifies the types of the columns with e.g. 
`types=[VARCHAR,DOUBLE]` 704 | bool has_user_types = false; 705 | 706 | // Get all left named parameters 707 | // You can find the documentation of the named parameters in the README.md and header file 708 | for (auto &kv : input.named_parameters) { 709 | auto loption = StringUtil::Lower(kv.first); 710 | if (loption == "threads") { 711 | bind_data->number_threads = IntegerValue::Get(kv.second); 712 | if (bind_data->number_threads <= 0) { 713 | throw BinderException("Number of threads must be greater than 0"); 714 | } 715 | } else if (loption == "skip_rows") { 716 | // Default: 0 717 | bind_data->skip_rows = IntegerValue::Get(kv.second); 718 | } else if (loption == "coerce_to_string") { 719 | bind_data->coerce_to_string = BooleanValue::Get(kv.second); 720 | } else if (loption == "force_types") { 721 | bind_data->force_types = BooleanValue::Get(kv.second); 722 | } else if (loption == "types") { 723 | // Get all types as strings defined in list/array 724 | auto &children = ListValue::GetChildren(kv.second); 725 | // Convert strings to LogicalTypes & check if they are supported 726 | for (auto &child : children) { 727 | string raw_type = StringValue::Get(child); 728 | LogicalType logical_type = TransformStringToLogicalType(raw_type); 729 | if (logical_type.id() == LogicalTypeId::USER) { 730 | throw BinderException("Unrecognized type \"%s\" for %s definition", raw_type, kv.first); 731 | } 732 | switch (logical_type.id()) { 733 | case LogicalTypeId::VARCHAR: 734 | case LogicalTypeId::DOUBLE: 735 | case LogicalTypeId::BOOLEAN: 736 | case LogicalTypeId::DATE: { 737 | break; 738 | } 739 | default: { 740 | throw BinderException("Unsupported type \"%s\" for %s definition", raw_type, kv.first); 741 | } 742 | } 743 | bind_data->user_types.push_back(logical_type); 744 | } 745 | // Indicate that user provided types 746 | has_user_types = true; 747 | 748 | // We already handled them before 749 | } else if (loption == "sheet_name" || loption == "sheet_index" || loption == "has_header") { 750 | continue; 751 | } else { 752 | throw BinderException("Unknown named parameter"); 753 | } 754 | } 755 | 756 | // Doesn't change the parsing (only when combined with specifyTypes) -- we simply store it, to read it later while 757 | // copying 758 | bind_data->xlsx_sheet->mHeaders = use_header; 759 | 760 | // If number threads > 1, we set parallel true 761 | if (bind_data->number_threads > 1) { 762 | bind_data->xlsx_file.mParallelStrings = true; 763 | } else { 764 | bind_data->xlsx_file.mParallelStrings = false; 765 | } 766 | 767 | // ===================================== 768 | // Parsing & check parsing result 769 | // ===================================== 770 | 771 | // Parse the shared strings file 772 | bind_data->xlsx_file.parseSharedStrings(); 773 | 774 | //! Used for better readability 775 | auto &sheet = bind_data->xlsx_sheet; 776 | 777 | // Parse the sheet 778 | bool success = sheet->interleaved(bind_data->skip_rows, 0, bind_data->number_threads); 779 | 780 | if (!success) { 781 | throw BinderException("Failed to read sheet"); 782 | } 783 | 784 | bind_data->xlsx_file.finalize(); 785 | 786 | //! Number of columns in the sheet 787 | auto number_columns = sheet->mDimension.first; 788 | //! 
Number of rows in the sheet 789 | auto number_rows = sheet->mDimension.second; 790 | 791 | if (number_columns == 0 || number_rows == 0) { 792 | throw BinderException("Sheet appears to be empty"); 793 | } 794 | 795 | // ===================================== 796 | // Determine column types & names 797 | // ===================================== 798 | 799 | //! Cell types in the first row after skipped rows 800 | vector cell_types_first_row; 801 | //! Cell types in the second row after skipped rows 802 | vector cell_types_second_row; 803 | //! Cell values in the first row after skipped rows 804 | vector cells_first_row; 805 | 806 | // First buffer of first thread 807 | auto first_buffer = &sheet->mCells[0].front(); 808 | 809 | // Probing the first two rows to get the types 810 | if (first_buffer->size() < number_columns * 2) { 811 | throw BinderException("Internal SheetReader extension error: Need minimum of two rows in first buffer to " 812 | "determine column types and auto detect header row"); 813 | } 814 | 815 | for (idx_t i = 0; i < number_columns; i++) { 816 | cell_types_first_row.push_back(sheet->mCells[0].front()[i].type); 817 | cells_first_row.push_back(sheet->mCells[0].front()[i]); 818 | } 819 | 820 | for (idx_t i = number_columns; i < number_columns * 2; i++) { 821 | cell_types_second_row.push_back(sheet->mCells[0].front()[i].type); 822 | } 823 | 824 | // Convert CellType to LogicalType 825 | 826 | //! DuckDB types of the cells in the first row 827 | vector column_types_first_row; 828 | //! Column names of the cells in the first row 829 | vector column_names_first_row; 830 | 831 | // Check if first row contains only string values, get DuckDB types & generic column names 832 | bool first_row_all_string = ConvertCellTypes(column_types_first_row, column_names_first_row, cell_types_first_row); 833 | 834 | if (use_header && !first_row_all_string) { 835 | throw BinderException("First row must contain only strings when has_header is set to true"); 836 | } 837 | 838 | vector column_types_second_row; 839 | vector column_names_second_row; 840 | //! Indicates whether a header row was detected 841 | bool header_detected = false; 842 | 843 | if (number_rows > 1) { 844 | // Check if second row contains only string values, get DuckDB types & generic column names 845 | bool second_row_all_string = 846 | ConvertCellTypes(column_types_second_row, column_names_second_row, cell_types_second_row); 847 | 848 | // If the first row contains only string values, but the second row doesn't, we assume that the first row is a 849 | // header row 850 | if (use_header || (first_row_all_string && !second_row_all_string)) { 851 | header_detected = true; 852 | 853 | // Since the first row is a header row, we use the cell types of the second row 854 | return_types = column_types_second_row; 855 | bind_data->types = column_types_second_row; 856 | 857 | //! 
Column names determined from the first row 858 | vector header_names; 859 | 860 | // Get header names from cell values of first row 861 | for (idx_t j = 0; j < cells_first_row.size(); j++) { 862 | switch (cells_first_row[j].type) { 863 | case CellType::T_STRING_REF: { 864 | auto value = bind_data->xlsx_file.getString(cells_first_row[j].data.integer); 865 | header_names.push_back(value); 866 | break; 867 | } 868 | case CellType::T_STRING: 869 | case CellType::T_STRING_INLINE: { 870 | // TODO 871 | throw BinderException("Inline & dynamic String types not supported yet"); 872 | break; 873 | } 874 | default: 875 | throw BinderException("Header row contains non-string values"); 876 | } 877 | } 878 | 879 | // Set column names to header names 880 | names = header_names; 881 | bind_data->names = header_names; 882 | } else { 883 | // If first row is not a header row, we use the cell types of the first row for the column types 884 | return_types = column_types_first_row; 885 | bind_data->types = column_types_first_row; 886 | 887 | // Use generic column names 888 | names = column_names_first_row; 889 | bind_data->names = column_names_first_row; 890 | } 891 | } 892 | 893 | // Since header is only used for determining column names, we skip it 894 | if (header_detected) { 895 | bind_data->skip_rows++; 896 | bind_data->xlsx_sheet->mSkipRows++; 897 | } 898 | 899 | // If user has specified types, we try to use them 900 | if (has_user_types) { 901 | if (bind_data->user_types.size() < number_columns) { 902 | throw BinderException("Number of user defined types is less than number of columns in sheet"); 903 | } 904 | 905 | idx_t column_index = 0; 906 | for (auto &column_type : return_types) { 907 | 908 | LogicalType user_type = bind_data->user_types[column_index]; 909 | 910 | // Check if user defined type is same as previously determined column type or can be coerced to string 911 | // If forced_types == true, the compatibility check is skipped 912 | if (!bind_data->force_types && user_type.id() != column_type.id() && 913 | !(user_type == LogicalTypeId::VARCHAR && bind_data->coerce_to_string)) { 914 | // TODO: EnumUtil does not work -- find appropriate replacement 915 | // throw BinderException("User defined type %s for column with index %d is not compatible with actual 916 | // type %s", 917 | // EnumUtil::ToString(user_type), column_index, 918 | // EnumUtil::ToString(column_type)); 919 | throw BinderException("User defined type for column with index %d is not compatible with actual type", 920 | column_index); 921 | } 922 | column_index++; 923 | } 924 | 925 | // Add column names, if they are new user defined columns 926 | vector additional_column_names; 927 | 928 | while (column_index < bind_data->user_types.size()) { 929 | additional_column_names.push_back("Column " + std::to_string(column_index)); 930 | column_index++; 931 | } 932 | 933 | return_types = bind_data->user_types; 934 | bind_data->types = bind_data->user_types; 935 | 936 | // Concat additional column names 937 | bind_data->names.insert(bind_data->names.end(), additional_column_names.begin(), additional_column_names.end()); 938 | names = bind_data->names; 939 | 940 | D_ASSERT(return_types.size() == names.size()); 941 | } 942 | 943 | // First row is discarded (is only needed for versions that use nextRow()) 944 | for (idx_t i = 0; i < bind_data->skip_rows; i++) { 945 | sheet->nextRow(); 946 | } 947 | 948 | return std::move(bind_data); 949 | } 950 | 951 | static void LoadInternal(ExtensionLoader &loader) { 952 | // Register a table function 953 | 
TableFunction sheetreader_table_function("sheetreader", {LogicalType::VARCHAR}, SheetreaderCopyTableFun, 954 | SheetreaderBindFun, SRGlobalTableFunctionState::Init, 955 | SRLocalTableFunctionState::Init); 956 | 957 | // Define all named parameters 958 | sheetreader_table_function.named_parameters["sheet_name"] = LogicalType::VARCHAR; 959 | sheetreader_table_function.named_parameters["sheet_index"] = LogicalType::INTEGER; 960 | sheetreader_table_function.named_parameters["threads"] = LogicalType::INTEGER; 961 | sheetreader_table_function.named_parameters["skip_rows"] = LogicalType::INTEGER; 962 | sheetreader_table_function.named_parameters["has_header"] = LogicalType::BOOLEAN; 963 | // TODO: Support STRUCT, i.e. { 'column_name': 'type', ... } 964 | // We would use ANY here, similar to read_csv.cpp, but we expect a STRUCT or LIST 965 | // sheetreader_table_function.named_parameters["types"] = LogicalType::ANY; 966 | sheetreader_table_function.named_parameters["types"] = LogicalType::LIST(LogicalType::VARCHAR); 967 | sheetreader_table_function.named_parameters["force_types"] = LogicalType::BOOLEAN; 968 | sheetreader_table_function.named_parameters["coerce_to_string"] = LogicalType::BOOLEAN; 969 | 970 | loader.RegisterFunction(sheetreader_table_function); 971 | } 972 | 973 | void SheetreaderExtension::Load(ExtensionLoader &loader) { 974 | LoadInternal(loader); 975 | } 976 | std::string SheetreaderExtension::Name() { 977 | return "sheetreader"; 978 | } 979 | 980 | std::string SheetreaderExtension::Version() const { 981 | #ifdef EXT_VERSION_SHEETREADER 982 | return EXT_VERSION_SHEETREADER; 983 | #else 984 | return ""; 985 | #endif 986 | } 987 | 988 | } // namespace duckdb 989 | 990 | extern "C" { 991 | 992 | DUCKDB_EXTENSION_API void sheetreader_init(duckdb::DatabaseInstance &db) { 993 | duckdb::DuckDB db_wrapper(db); 994 | db_wrapper.LoadStaticExtension(); 995 | } 996 | 997 | DUCKDB_EXTENSION_API const char *sheetreader_version() { 998 | return duckdb::DuckDB::LibraryVersion(); 999 | } 1000 | 1001 | DUCKDB_EXTENSION_API void sheetreader_duckdb_cpp_init(duckdb::ExtensionLoader &loader) { 1002 | duckdb::SheetreaderExtension extension; 1003 | extension.Load(loader); 1004 | } 1005 | } 1006 | 1007 | #ifndef DUCKDB_EXTENSION_MAIN 1008 | #error DUCKDB_EXTENSION_MAIN not defined 1009 | #endif 1010 | --------------------------------------------------------------------------------