├── .editorconfig
├── .github
│   └── workflows
│       ├── MainDistributionPipeline.yml
│       └── schedule-1.2.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── docs
│   ├── README.md
│   ├── UPDATING.md
│   └── duckdb-shellfs.jpg
├── extension_config.cmake
├── scripts
│   ├── bootstrap-template.py
│   └── extension-upload.sh
├── src
│   ├── include
│   │   └── shellfs_extension.hpp
│   ├── shell_file_system.cpp
│   ├── shell_file_system.hpp
│   └── shellfs_extension.cpp
├── test
│   ├── README.md
│   └── sql
│       ├── json.test
│       └── shellfs.test
└── vcpkg.json

/.editorconfig:
--------------------------------------------------------------------------------
duckdb/.editorconfig
--------------------------------------------------------------------------------
/.github/workflows/MainDistributionPipeline.yml:
--------------------------------------------------------------------------------
#
# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
#
name: Main Extension Distribution Pipeline
on:
  push:
  pull_request:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * *' # Runs every night at 02:00 UTC

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
  cancel-in-progress: true

jobs:
  duckdb-stable-build:
    name: Build extension binaries
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
    with:
      duckdb_version: main
      ci_tools_version: main
      extension_name: shellfs
      exclude_archs: "wasm_mvp;wasm_eh;wasm_threads"
--------------------------------------------------------------------------------
/.github/workflows/schedule-1.2.yml:
--------------------------------------------------------------------------------
name: Scheduled Trigger for 1.2

on:
  schedule:
    - cron: '0 12 * * *' # Runs at 12:00 UTC every day
  workflow_dispatch: # Allows manual trigger

jobs:
  trigger:
    runs-on: ubuntu-latest
    permissions:
      actions: write # Allow triggering workflows
    steps:
      - name: Checkout repository # Required for gh to work
        uses: actions/checkout@v4

      - name: Install GitHub CLI
        run: |
          sudo apt update && sudo apt install gh -y

      - name: Authenticate GH CLI
        run: |
          echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token

      - name: Trigger Workflow on the v1.2 branch
        run: |
          gh workflow run MainDistributionPipeline.yml --ref v1.2
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build
.idea
cmake-build-debug
duckdb_unittest_tempdir/
.DS_Store
testext
test/python/__pycache__/
.Rhistory
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "duckdb"]
	path = duckdb
	url = https://github.com/duckdb/duckdb
	branch = main
[submodule "extension-ci-tools"]
	path = extension-ci-tools
	url = https://github.com/duckdb/extension-ci-tools
	branch = main
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.5)

# Set extension name here
set(TARGET_NAME shellfs)

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

project(${TARGET_NAME})
include_directories(src/include)

set(EXTENSION_SOURCES src/shellfs_extension.cpp src/shell_file_system.cpp)

build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})

install(
  TARGETS ${EXTENSION_NAME}
  EXPORT "${DUCKDB_EXPORT_SET}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2024 Rusty Conover

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Configuration of extension
EXT_NAME=shellfs
EXT_CONFIG=${PROJ_DIR}extension_config.cmake

# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
# DuckDB Shellfs Extension

![DuckDB Shellfs Extension logo](duckdb-shellfs.jpg)

The `shellfs` extension for DuckDB enables the use of Unix pipes for input and output.

By appending a pipe character `|` to a filename, DuckDB will treat it as a series of commands to execute and capture the output. Conversely, if you prefix a filename with `|`, DuckDB will treat it as an output pipe.

While the examples provided are simple, in practical scenarios, you might use this feature to run another program that generates CSV, JSON, or other formats to manage complexity that DuckDB cannot handle directly.

The implementation uses `popen()` to create the pipe between processes.
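
As a rough illustration, the mapping from a pipe "filename" to a `popen()` call looks like the sketch below. This is a simplified, hypothetical helper (`open_pipe` is not part of the extension's API); the real logic lives in `src/shell_file_system.cpp`.

```cpp
#include <cstdio>
#include <string>

// Hypothetical helper: shows how a pipe "filename" maps onto popen().
FILE *open_pipe(const std::string &path) {
    if (!path.empty() && path.front() == '|') {
        // Leading '|': the remainder is a command whose stdin we write to.
        return popen(path.substr(1).c_str(), "w");
    }
    // Trailing '|': strip it and read the command's stdout.
    return popen(path.substr(0, path.size() - 1).c_str(), "r");
}
```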

## Installation

**`shellfs` is a [DuckDB Community Extension](https://github.com/duckdb/community-extensions).**

You can install and load it with the following SQL:

```sql
install shellfs from community;
load shellfs;
```

---

## Examples

### Reading input from a pipe

```sql

-- Install the extension.
install shellfs from community;
load shellfs;

-- Generate a sequence and return only the numbers that contain a 2.
SELECT * from read_csv('seq 1 100 | grep 2 |');
┌─────────┐
│ column0 │
│  int64  │
├─────────┤
│       2 │
│      12 │
│      20 │
│      21 │
│      22 │
└─────────┘

-- Get the first two multiples of 7 between 1 and 35 to
-- demonstrate how commands can be chained together.
SELECT * from read_csv('seq 1 35 | awk "\$1 % 7 == 0" | head -n 2 |');
┌─────────┐
│ column0 │
│  int64  │
├─────────┤
│       7 │
│      14 │
└─────────┘

-- Fetch some JSON from a web API using curl.
SELECT abbreviation, unixtime from
read_json('curl -s http://worldtimeapi.org/api/timezone/Etc/UTC |');
┌──────────────┬────────────┐
│ abbreviation │  unixtime  │
│   varchar    │   int64    │
├──────────────┼────────────┤
│ UTC          │ 1715983565 │
└──────────────┴────────────┘
```


Create a program to generate CSV in Python:

```python
#!/usr/bin/env python3

print("counter1,counter2")
for i in range(10000000):
    print(f"{i},{i}")
```

Run that program and determine the number of distinct values it produces:

```sql
select count(distinct counter1)
from read_csv('./test-csv.py |');
┌──────────────────────────┐
│ count(DISTINCT counter1) │
│          int64           │
├──────────────────────────┤
│                 10000000 │
└──────────────────────────┘
```

When a command is not found or cannot be executed, this is the result:

```sql
SELECT count(distinct column0) from read_csv('foo |');
sh: foo: command not found
┌─────────────────────────┐
│ count(DISTINCT column0) │
│          int64          │
├─────────────────────────┤
│                       0 │
└─────────────────────────┘
```

No exception is raised in this case because the `popen()` implementation starts the process with [`fork()`](https://man7.org/linux/man-pages/man2/fork.2.html) (or the appropriate system call for the operating system), and the failure only happens later, when the child process calls [`exec()`](https://man7.org/linux/man-pages/man3/exec.3.html). From DuckDB's point of view, the child simply produced no output.

### Writing output to a pipe

```sql
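-- Everything after the leading '|' is passed to the shell, so ordinary shell
-- redirection and further pipes work inside the command string, as the
-- examples below show.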
-- Write all numbers from 1 to 30 out, but then filter via grep
-- for only lines that contain 6.
COPY (select * from unnest(generate_series(1, 30)))
TO '| grep 6 > numbers.csv' (FORMAT 'CSV');
6
16
26

-- Copy the result set to the clipboard on Mac OS X using pbcopy
COPY (select 'hello' as type, * from unnest(generate_series(1, 30)))
TO '| grep 3 | pbcopy' (FORMAT 'CSV');
type,"generate_series(1, 30)"
hello,3
hello,13
hello,23
hello,30

-- Write an encrypted file out via openssl
COPY (select 'hello' as type, * from unnest(generate_series(1, 30)))
TO '| openssl enc -aes-256-cbc -salt -in - -out example.enc -pbkdf2 -iter 1000 -pass pass:testing12345' (FORMAT 'JSON');

```

## Configuration

This extension introduces a new configuration option:

`ignore_sigpipe` - a boolean option that, when set to true, ignores the SIGPIPE signal. This is useful when writing to a pipe that stops reading input. For example:

```sql
COPY (select 'hello' as type, * from unnest(generate_series(1, 300))) TO '| head -n 100';
```

In this scenario, DuckDB attempts to write 300 lines to the pipe, but the `head` command only reads the first 100 lines. After `head` reads the first 100 lines and exits, it closes the pipe. The next time DuckDB tries to write to the pipe, it receives a SIGPIPE signal. By default, this causes DuckDB to exit. However, if `ignore_sigpipe` is set to true, the SIGPIPE signal is ignored, allowing DuckDB to continue without error even if the pipe is closed.

You can enable this option with the following command:

```sql
set ignore_sigpipe = true;
```

## Caveats

When using `read_text()` or `read_blob()`, the data read from a pipe is limited to 2GB, which is the maximum length of a single row's value.

When using `read_csv()` or `read_json()`, the contents of the pipe can be of unlimited size, since they are processed in a streaming fashion.

A demonstration of this would be:

```python
#!/usr/bin/env python3

print("counter1,counter2")
for i in range(10000000):
    print(f"{i},{i}")
```

```sql
select count(distinct counter1) from read_csv('./test-csv.py |');
┌──────────────────────────┐
│ count(DISTINCT counter1) │
│          int64           │
├──────────────────────────┤
│                 10000000 │
└──────────────────────────┘
```

If a `LIMIT` clause is used, you may see an error like this:

```sql
select * from read_csv('./test-csv.py |') limit 3;
┌──────────┬──────────┐
│ counter1 │ counter2 │
│  int64   │  int64   │
├──────────┼──────────┤
│        0 │        0 │
│        1 │        1 │
│        2 │        2 │
└──────────┴──────────┘
Traceback (most recent call last):
  File "/Users/rusty/Development/duckdb-shell-extension/./test-csv.py", line 5, in <module>
    print(f"{i},{i}")
BrokenPipeError: [Errno 32] Broken pipe
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe
```

DuckDB continues to run, but the program that was producing output received a SIGPIPE signal because DuckDB closed the pipe after reading the necessary number of rows. It is up to the user of DuckDB to decide whether to suppress this behavior by setting the `ignore_sigpipe` configuration parameter.
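
For reference, the suppression itself is the standard POSIX one-liner. The sketch below (with an illustrative `ignore_sigpipe()` wrapper name) is essentially what the extension does internally when this option is enabled:

```cpp
#include <csignal>

// Ignore SIGPIPE for the whole process; subsequent writes to a closed pipe
// then fail with errno == EPIPE instead of terminating the process.
void ignore_sigpipe() {
    signal(SIGPIPE, SIG_IGN);
}
```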

## Building

### Build steps
To build the extension, run:
```sh
make
```
The main binaries that will be built are:
```sh
./build/release/duckdb
./build/release/test/unittest
./build/release/extension/shellfs/shellfs.duckdb_extension
```
- `duckdb` is the binary for the duckdb shell with the extension code automatically loaded.
- `unittest` is the test runner of duckdb. Again, the extension is already linked into the binary.
- `shellfs.duckdb_extension` is the loadable binary as it would be distributed.

## Running the extension
To run the extension code, simply start the shell with `./build/release/duckdb`.

Now we can use the features from the extension directly in DuckDB.

## Running the tests
Different tests can be created for DuckDB extensions. The primary way of testing DuckDB extensions should be the SQL tests in `./test/sql`. These SQL tests can be run using:
```sh
make test
```

### Installing the deployed binaries

To install your extension binaries from S3, you will need to do two things. Firstly, DuckDB should be launched with the
`allow_unsigned_extensions` option set to true. How to set this will depend on the client you're using. Some examples:

CLI:
```shell
duckdb -unsigned
```

Python:
```python
con = duckdb.connect(':memory:', config={'allow_unsigned_extensions' : 'true'})
```

NodeJS:
```js
db = new duckdb.Database(':memory:', {"allow_unsigned_extensions": "true"});
```

Secondly, you will need to set the repository endpoint in DuckDB to the HTTP URL of your bucket plus the version of the extension
you want to install. To do this, run the following SQL query in DuckDB:
```sql
SET custom_extension_repository='bucket.s3.eu-west-1.amazonaws.com/shellfs/latest';
```
Note that the `/latest` path will allow you to install the latest extension version available for your current version of
DuckDB. To install a specific version, pass that version instead.

After running these steps, you can install and load your extension using the regular INSTALL/LOAD commands in DuckDB:

```sql
INSTALL shellfs;
LOAD shellfs;
```
--------------------------------------------------------------------------------
/docs/UPDATING.md:
--------------------------------------------------------------------------------
# Extension updating
When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there
will inevitably come a time when a new DuckDB version is released and the extension repository needs updating.
This process goes
as follows:

- Bump submodules
  - `./duckdb` should be set to the latest tagged release
  - `./extension-ci-tools` should be set to the updated branch corresponding to the latest DuckDB release
- Bump versions in `./.github/workflows`
  - the `duckdb_version` input in `MainDistributionPipeline.yml` should be set to the latest tagged release
  - the reusable workflow `_extension_distribution.yml` should be set to the updated branch corresponding to the latest DuckDB release

--------------------------------------------------------------------------------
/docs/duckdb-shellfs.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Query-farm/shellfs/ae31efcb4d299d39d68fd2941a57f4bd0d1b5d1e/docs/duckdb-shellfs.jpg
--------------------------------------------------------------------------------
/extension_config.cmake:
--------------------------------------------------------------------------------
# This file is included by DuckDB's build system. It specifies which extensions to load.

# Extension from this repo
duckdb_extension_load(shellfs
    SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
    LOAD_TESTS
)

# Any extra extensions that should be built
# e.g.: duckdb_extension_load(json)
--------------------------------------------------------------------------------
/scripts/bootstrap-template.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import sys, os, shutil, re
from pathlib import Path

shutil.copyfile('docs/NEXT_README.md', 'README.md')
os.remove('docs/NEXT_README.md')
os.remove('docs/README.md')

if len(sys.argv) != 2:
    raise Exception('usage: python3 bootstrap-template.py <extension_name>')

name_extension = sys.argv[1]

def is_snake_case(s):
    # Define the regex pattern for snake_case names that may contain numbers
    pattern = r'^[a-z0-9]+(_[a-z0-9]+)*$'

    # re.match returns None when the string does not match the pattern
    return re.match(pattern, s) is not None

if name_extension[0].isdigit():
    raise Exception("Please don't start your extension name with a number.")

if not is_snake_case(name_extension):
    raise Exception('Please enter the name of your extension in valid snake_case, containing only lowercase letters and numbers.')

def to_camel_case(snake_str):
    return "".join(x.capitalize() for x in snake_str.lower().split("_"))

def replace(file_name, to_find, to_replace):
    with open(file_name, 'r', encoding="utf8") as file:
        filedata = file.read()
    filedata = filedata.replace(to_find, to_replace)
    with open(file_name, 'w', encoding="utf8") as file:
        file.write(filedata)

files_to_search = []
files_to_search.extend(Path('./.github').rglob('./**/*.yml'))
files_to_search.extend(Path('./test').rglob('./**/*.test'))
files_to_search.extend(Path('./src').rglob('./**/*.hpp'))
files_to_search.extend(Path('./src').rglob('./**/*.cpp'))
files_to_search.extend(Path('./src').rglob('./**/*.txt'))
files_to_search.extend(Path('./src').rglob('./*.md'))

def replace_everywhere(to_find, to_replace):
    for path in files_to_search:
        replace(path, to_find, to_replace)
        replace(path, to_find.capitalize(), to_camel_case(to_replace))
        replace(path, to_find.upper(), to_replace.upper())

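    # The top-level build files below are not matched by the globs above, so
    # they are patched individually.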
replace("./Makefile", to_find, to_replace) 57 | replace("./Makefile", to_find.capitalize(), to_camel_case(to_replace)) 58 | replace("./Makefile", to_find.upper(), to_replace.upper()) 59 | replace("./README.md", to_find, to_replace) 60 | replace("./extension_config.cmake", to_find, to_replace) 61 | 62 | replace_everywhere("quack", name_extension) 63 | replace_everywhere("Quack", name_extension.capitalize()) 64 | replace_everywhere("", name_extension) 65 | 66 | string_to_replace = name_extension 67 | string_to_find = "quack" 68 | 69 | # rename files 70 | os.rename(f'test/sql/{string_to_find}.test', f'test/sql/{string_to_replace}.test') 71 | os.rename(f'src/{string_to_find}_extension.cpp', f'src/{string_to_replace}_extension.cpp') 72 | os.rename(f'src/include/{string_to_find}_extension.hpp', f'src/include/{string_to_replace}_extension.hpp') 73 | 74 | # remove template-specific files 75 | os.remove('.github/workflows/ExtensionTemplate.yml') 76 | 77 | # finally, remove this bootstrap file 78 | os.remove(__file__) -------------------------------------------------------------------------------- /scripts/extension-upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Extension upload script 4 | 5 | # Usage: ./extension-upload.sh 6 | # : Name of the extension 7 | # : Version (commit / version tag) of the extension 8 | # : Version (commit / version tag) of DuckDB 9 | # : Architecture target of the extension binary 10 | # : S3 bucket to upload to 11 | # : Set this as the latest version ("true" / "false", default: "false") 12 | # : Set this as a versioned version that will prevent its deletion 13 | 14 | set -e 15 | 16 | if [[ $4 == wasm* ]]; then 17 | ext="/tmp/extension/$1.duckdb_extension.wasm" 18 | else 19 | ext="/tmp/extension/$1.duckdb_extension" 20 | fi 21 | 22 | echo $ext 23 | 24 | script_dir="$(dirname "$(readlink -f "$0")")" 25 | 26 | # calculate SHA256 hash of extension binary 27 | cat $ext > $ext.append 28 | 29 | if [[ $4 == wasm* ]]; then 30 | # 0 for custom section 31 | # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256) 32 | # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02] 33 | echo -n -e '\x00' >> $ext.append 34 | echo -n -e '\x93\x02' >> $ext.append 35 | # 10 in hex = 16 in decimal, lenght of name, 1 byte 36 | echo -n -e '\x10' >> $ext.append 37 | echo -n -e 'duckdb_signature' >> $ext.append 38 | # the name of the WebAssembly custom section, 16 bytes 39 | # 100 in hex, 256 in decimal 40 | # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)], 41 | # for a grand total of 2 bytes 42 | echo -n -e '\x80\x02' >> $ext.append 43 | fi 44 | 45 | # (Optionally) Sign binary 46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then 47 | echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem 48 | $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash 49 | openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign 50 | rm -f private.pem 51 | fi 52 | 53 | # Signature is always there, potentially defaulting to 256 zeros 54 | truncate -s 256 $ext.sign 55 | 56 | # append signature to extension binary 57 | cat $ext.sign >> $ext.append 58 | 59 | # compress extension binary 60 | if [[ $4 == wasm_* ]]; then 61 | brotli < $ext.append > "$ext.compressed" 62 | else 63 | gzip < $ext.append > "$ext.compressed" 64 | fi 65 | 66 | set -e 67 | 68 | # Abort if AWS key is not set 69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then 70 
    echo "No AWS key found, skipping.."
    exit 0
fi

# upload versioned version
if [[ $7 = 'true' ]]; then
    if [[ $4 == wasm* ]]; then
        aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
    else
        aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read
    fi
fi

# upload to latest version
if [[ $6 = 'true' ]]; then
    if [[ $4 == wasm* ]]; then
        aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
    else
        aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read
    fi
fi
--------------------------------------------------------------------------------
/src/include/shellfs_extension.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb.hpp"

namespace duckdb {

class ShellfsExtension : public Extension {
public:
    void Load(DuckDB &db) override;
    std::string Name() override;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/shell_file_system.cpp:
--------------------------------------------------------------------------------
#include "shell_file_system.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/file_opener.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb/common/limits.hpp"

#ifndef _WIN32
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include <csignal>

#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#else
#include "duckdb/common/windows_util.hpp"

#include <cstdio>
#include <io.h>
#include <process.h>

#endif

namespace duckdb
{

struct ShellFileHandle : public FileHandle
{
public:
    ShellFileHandle(FileSystem &file_system, string path, FILE *pipe, FileOpenFlags flags)
        : FileHandle(file_system, std::move(path), std::move(flags)), pipe(pipe)
    {
    }
    ~ShellFileHandle() override
    {
        ShellFileHandle::Close();
    }

    FILE *pipe;

public:
    void Close() override
    {
        if (!pipe)
        {
            return;
        }

        int result;

#ifndef _WIN32
        result = pclose(pipe);
#else
        result = _pclose(pipe);
#endif
        // Indicate that the pipe has been closed.
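        // Clearing the pointer also makes the destructor's call to Close() a no-op.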
        pipe = NULL;

        if (result == -1)
        {
            throw IOException("Could not close pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, path,
                              strerror(errno));
        }
#ifndef _WIN32
        else if (WIFEXITED(result))
        {
            int exit_status = WEXITSTATUS(result);
            if (exit_status != 0)
            {
                throw IOException("Pipe process exited with non-zero exit code=\"%d\": %s", exit_status, path);
            }
        }
        else if (WIFSIGNALED(result))
        {
            int signal_number = WTERMSIG(result);
            throw IOException("Pipe process exited with signal signal=\"%d\": %s", signal_number, path);
        }
#endif
    }
};

void ShellFileSystem::Reset(FileHandle &handle)
{
    throw InternalException("Cannot reset shell file system");
}

int64_t ShellFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes)
{
    FILE *pipe = handle.Cast<ShellFileHandle>().pipe;

    if (!pipe)
    {
        return 0;
    }

    int64_t bytes_read = fread(buffer, 1, nr_bytes, pipe);
    if (bytes_read == 0 && ferror(pipe))
    {
        throw IOException("Could not read from pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, handle.path,
                          strerror(errno));
    }
    if (bytes_read == 0)
    {
        // fread() returned 0 bytes, so presume EOF and close the handle here rather
        // than waiting for the destructor; that way any errors raised while closing
        // the pipe are caught in the query rather than during cleanup.
        handle.Close();
    }
    return bytes_read;
}

int64_t ShellFileSystem::Write(FileHandle &handle, void *buffer, int64_t nr_bytes)
{
    FILE *pipe = handle.Cast<ShellFileHandle>().pipe;
    int64_t bytes_written = 0;

    while (nr_bytes > 0)
    {
        auto bytes_to_write = MinValue<idx_t>(idx_t(NumericLimits<int32_t>::Maximum()), idx_t(nr_bytes));
        int64_t current_bytes_written = fwrite(buffer, 1, bytes_to_write, pipe);
        if (current_bytes_written <= 0)
        {
            throw IOException("Could not write to pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, handle.path,
                              strerror(errno));
        }
        bytes_written += current_bytes_written;
        buffer = (void *)(data_ptr_cast(buffer) + current_bytes_written);
        nr_bytes -= current_bytes_written;
    }

    return bytes_written;
}

int64_t ShellFileSystem::GetFileSize(FileHandle &handle)
{
    // You can't know the size of the data that will come over a pipe,
    // and some code uses the size to allocate buffers, so don't return
    // a very large number.
    return 0;
}

unique_ptr<FileHandle> ShellFileSystem::OpenFile(const string &path, FileOpenFlags flags,
                                                 optional_ptr<FileOpener> opener)
{
    FILE *pipe;
    if (path.front() == '|')
    {
        // We want to write to the pipe.
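        // Strip the leading '|' and pass the rest of the string to the shell;
        // mode "w" connects writes on this handle to the command's standard input.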
#ifndef _WIN32
        pipe = popen(path.substr(1, path.size()).c_str(), "w");
#else
        pipe = _popen(path.substr(1, path.size()).c_str(), "w");
#endif
    }
    else
    {
        // We want to read from the pipe, so strip the trailing '|'.
#ifndef _WIN32
        pipe = popen(path.substr(0, path.size() - 1).c_str(), "r");
#else
        pipe = _popen(path.substr(0, path.size() - 1).c_str(), "r");
#endif
    }

#ifndef _WIN32
    Value value;
    bool ignore_sigpipe = false;
    if (FileOpener::TryGetCurrentSetting(opener, "ignore_sigpipe", value))
    {
        ignore_sigpipe = value.GetValue<bool>();
    }

    if (ignore_sigpipe)
    {
        signal(SIGPIPE, SIG_IGN);
    }
#endif

    return make_uniq<ShellFileHandle>(*this, path, pipe, flags);
}

bool ShellFileSystem::CanHandleFile(const string &fpath)
{
    if (fpath.empty())
    {
        return false;
    }
    // If the filename ends with | or starts with |
    // it can be handled by this file system.
    return fpath.back() == '|' || fpath.front() == '|';
}

} // namespace duckdb
--------------------------------------------------------------------------------
/src/shell_file_system.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb/common/file_system.hpp"

namespace duckdb
{

class ShellFileSystem : public FileSystem
{
public:
    duckdb::unique_ptr<FileHandle> OpenFile(const string &path, FileOpenFlags flags,
                                            optional_ptr<FileOpener> opener = nullptr) final;

    int64_t Read(FileHandle &handle, void *buffer, int64_t nr_bytes) override;
    int64_t Write(FileHandle &handle, void *buffer, int64_t nr_bytes) override;

    int64_t GetFileSize(FileHandle &handle) override;

    vector<string> Glob(const string &path, FileOpener *opener = nullptr) override
    {
        // A pipe command is not a pattern; it only ever matches itself.
        return {path};
    }

    bool FileExists(const string &filename, optional_ptr<FileOpener> opener = nullptr) override
    {
        return false;
    }

    void Reset(FileHandle &handle) override;
    bool OnDiskFile(FileHandle &handle) override
    {
        return false;
    }
    bool CanSeek() override
    {
        return false;
    }

    bool CanHandleFile(const string &fpath) override;

    bool IsPipe(const string &filename, optional_ptr<FileOpener> opener) override
    {
        return true;
    }
    void FileSync(FileHandle &handle) override
    {
    }

    std::string GetName() const override
    {
        return "ShellFileSystem";
    }
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/shellfs_extension.cpp:
--------------------------------------------------------------------------------
#define DUCKDB_EXTENSION_MAIN

#include "shellfs_extension.hpp"
#include "shell_file_system.hpp"
#include "duckdb.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/function/scalar_function.hpp"
#include "duckdb/main/extension_util.hpp"

namespace duckdb {

static void LoadInternal(DatabaseInstance &instance) {
    // Register the shell file system with DuckDB's virtual file system.
    auto &fs = instance.GetFileSystem();

    fs.RegisterSubSystem(make_uniq<ShellFileSystem>());

    auto &config = DBConfig::GetConfig(instance);

    // When writing to a pipe, ignore the SIGPIPE error and consider that the write succeeded.
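    // Registered as a DuckDB setting so users can toggle it at runtime with
    // `SET ignore_sigpipe = true;`; ShellFileSystem::OpenFile checks it each
    // time a pipe is opened.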
    config.AddExtensionOption("ignore_sigpipe", "Ignore SIGPIPE", LogicalType::BOOLEAN, Value(false));
}

void ShellfsExtension::Load(DuckDB &db) {
    LoadInternal(*db.instance);
}
std::string ShellfsExtension::Name() {
    return "shellfs";
}

} // namespace duckdb

extern "C" {

DUCKDB_EXTENSION_API void shellfs_init(duckdb::DatabaseInstance &db) {
    duckdb::DuckDB db_wrapper(db);
    db_wrapper.LoadExtension<duckdb::ShellfsExtension>();
}

DUCKDB_EXTENSION_API const char *shellfs_version() {
    return duckdb::DuckDB::LibraryVersion();
}
}

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
# Testing this extension
This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the shellfs extension, this should probably be the goal too.

The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests:
```bash
make test
```
or
```bash
make test_debug
```
--------------------------------------------------------------------------------
/test/sql/json.test:
--------------------------------------------------------------------------------
# name: test/sql/json.test
# description: test shellfs extension
# group: [shellfs]

# Require statement will ensure this test is run with this extension loaded
require shellfs

require json

query I
SELECT count(*) FROM (DESCRIBE select * from read_json('curl -s http://worldtimeapi.org/api/timezone/Etc/UTC |'));
----
15
--------------------------------------------------------------------------------
/test/sql/shellfs.test:
--------------------------------------------------------------------------------
# name: test/sql/shellfs.test
# description: test shellfs extension
# group: [shellfs]

# Require statement will ensure this test is run with this extension loaded
require shellfs

# Confirm the extension works
query I
SELECT count(distinct column0) from read_csv('seq 0 99 | awk ''{print $1 "," $1}'' |')
----
100

query I
SELECT count(distinct column0) from read_csv('seq 0 99 | awk ''{print $1 "," $1}'' | grep 2 |')
----
19

statement ok
COPY (select * from unnest(generate_series(1, 100))) TO '| grep 20 > __TEST_DIR__/grep-test-pipe.csv';

query I
SELECT column0 FROM read_csv_auto('__TEST_DIR__/grep-test-pipe.csv');
----
20
--------------------------------------------------------------------------------
/vcpkg.json:
--------------------------------------------------------------------------------
{
  "dependencies": [
  ]
}
--------------------------------------------------------------------------------