├── .editorconfig
├── .github
    └── workflows
    │   ├── MainDistributionPipeline.yml
    │   └── schedule-1.2.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── docs
    ├── README.md
    ├── UPDATING.md
    └── duckdb-shellfs.jpg
├── extension_config.cmake
├── scripts
    ├── bootstrap-template.py
    └── extension-upload.sh
├── src
    ├── include
    │   └── shellfs_extension.hpp
    ├── shell_file_system.cpp
    ├── shell_file_system.hpp
    └── shellfs_extension.cpp
├── test
    ├── README.md
    └── sql
    │   ├── json.test
    │   └── shellfs.test
└── vcpkg.json


/.editorconfig:
--------------------------------------------------------------------------------
1 | duckdb/.editorconfig


--------------------------------------------------------------------------------
/.github/workflows/MainDistributionPipeline.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
 3 | #
 4 | name: Main Extension Distribution Pipeline
 5 | on:
 6 |   push:
 7 |   pull_request:
 8 |   workflow_dispatch:
 9 |   schedule:
10 |     - cron: '0 2 * * *'  # Runs every night at 02:00 UTC
11 | 
12 | concurrency:
13 |   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
14 |   cancel-in-progress: true
15 | 
16 | jobs:
17 |   duckdb-stable-build:
18 |     name: Build extension binaries
19 |     uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
20 |     with:
21 |       duckdb_version: main
22 |       ci_tools_version: main
23 |       extension_name: shellfs
24 |       exclude_archs: "wasm_mvp;wasm_eh;wasm_threads"
25 | 


--------------------------------------------------------------------------------
/.github/workflows/schedule-1.2.yml:
--------------------------------------------------------------------------------
 1 | name: Scheduled Trigger for 1.2
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: '0 12 * * *'  # Runs at 12:00 UTC every day
 6 |   workflow_dispatch:  # Allows manual trigger
 7 | 
 8 | jobs:
 9 |   trigger:
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       actions: write  # Allow triggering workflows
13 |     steps:
14 |       - name: Checkout repository  # Required for gh to work
15 |         uses: actions/checkout@v4
16 | 
17 |       - name: Install GitHub CLI
18 |         run: |
19 |           sudo apt update && sudo apt install gh -y
20 | 
21 |       - name: Authenticate GH CLI
22 |         run: |
23 |           echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token
24 | 
25 |       - name: Trigger Workflow on my-branch
26 |         run: |
27 |           gh workflow run MainDistributionPipeline.yml --ref v1.2
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .idea
3 | cmake-build-debug
4 | duckdb_unittest_tempdir/
5 | .DS_Store
6 | testext
7 | test/python/__pycache__/
8 | .Rhistory
9 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "duckdb"]
2 | 	path = duckdb
3 | 	url = https://github.com/duckdb/duckdb
4 | 	branch = main
5 | [submodule "extension-ci-tools"]
6 | 	path = extension-ci-tools
7 | 	url = https://github.com/duckdb/extension-ci-tools
8 | 	branch = main


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | 
 3 | # Set extension name here
 4 | set(TARGET_NAME shellfs)
 5 | 
 6 | set(EXTENSION_NAME ${TARGET_NAME}_extension)
 7 | set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)
 8 | 
 9 | project(${TARGET_NAME})
10 | include_directories(src/include)
11 | 
12 | set(EXTENSION_SOURCES src/shellfs_extension.cpp src/shell_file_system.cpp)
13 | 
14 | build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
15 | build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})
16 | 
17 | install(
18 |   TARGETS ${EXTENSION_NAME}
19 |   EXPORT "${DUCKDB_EXPORT_SET}"
20 |   LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
21 |   ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2024 Rusty Conover <rusty@conover.me>
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
2 | 
3 | # Configuration of extension
4 | EXT_NAME=shellfs
5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake
6 | 
7 | # Include the Makefile from extension-ci-tools
8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
  1 | # DuckDB Shellfs Extension
  2 | 
  3 | ![DuckDB Shellfs Extension logo](duckdb-shellfs.jpg)
  4 | 
  5 | The `shellfs` extension for DuckDB enables the use of Unix pipes for input and output.
  6 | 
  7 | By appending a pipe character `|` to a filename, DuckDB will treat it as a series of commands to execute and capture the output. Conversely, if you prefix a filename with `|`, DuckDB will treat it as an output pipe.
  8 | 
  9 | While the examples provided are simple, in practical scenarios, you might use this feature to run another program that generates CSV, JSON, or other formats to manage complexity that DuckDB cannot handle directly.
 10 | 
 11 | The implementation uses `popen()` to create the pipe between processes.
 12 | 
 13 | ## Installation
 14 | 
 15 | **`shellfs` is a [DuckDB Community Extension](https://github.com/duckdb/community-extensions).**
 16 | 
 17 | You can now use this by using this SQL:
 18 | 
 19 | ```sql
 20 | install shellfs from community;
 21 | load shellfs;
 22 | ```
 23 | 
 24 | ---
 25 | 
 26 | ## Examples
 27 | 
 28 | ### Reading input from a pipe
 29 | 
 30 | ```sql
 31 | 
 32 | -- Install the extension.
 33 | install shellfs from community;
 34 | load shellfs;
 35 | 
 36 | -- Generate a sequence only return numbers that contain a 2
 37 | SELECT * from read_csv('seq 1 100 | grep 2 |');
 38 | ┌─────────┐
 39 | │ column0 │
 40 | │  int64  │
 41 | ├─────────┤
 42 | │       2 │
 43 | │      12 │
 44 | │      20 │
 45 | │      21 │
 46 | │      22 │
 47 | └─────────┘
 48 | 
 49 | -- Get the first multiples of 7 between 1 and 3 5
 50 | -- demonstrate how commands can be chained together
 51 | SELECT * from read_csv('seq 1 35 | awk "\$1 % 7 == 0" | head -n 2 |');
 52 | ┌─────────┐
 53 | │ column0 │
 54 | │  int64  │
 55 | ├─────────┤
 56 | │       7 │
 57 | │      14 │
 58 | └─────────┘
 59 | 
 60 | -- Do some arbitrary curl
 61 | SELECT abbreviation, unixtime from
 62 | read_json('curl -s http://worldtimeapi.org/api/timezone/Etc/UTC  |');
 63 | ┌──────────────┬────────────┐
 64 | │ abbreviation │  unixtime  │
 65 | │   varchar    │   int64    │
 66 | ├──────────────┼────────────┤
 67 | │ UTC          │ 1715983565 │
 68 | └──────────────┴────────────┘
 69 | ```
 70 | 
 71 | 
 72 | Create a program to generate CSV in Python:
 73 | 
 74 | ```python
 75 | #!/usr/bin/env python3
 76 | 
 77 | print("counter1,counter2")
 78 | for i in range(10000000):
 79 |     print(f"{i},{i}")
 80 | ```
 81 | 
 82 | Run that program and determine the number of distinct values it produces:
 83 | 
 84 | ```sql
 85 | select count(distinct counter1)
 86 | from read_csv('./test-csv.py |');
 87 | ┌──────────────────────────┐
 88 | │ count(DISTINCT counter1) │
 89 | │          int64           │
 90 | ├──────────────────────────┤
 91 | │                 10000000 │
 92 | └──────────────────────────┘
 93 | ```
 94 | 
 95 | When a command is not found or able to be executed, this is the result:
 96 | 
 97 | ```sql
 98 |  SELECT count(distinct column0) from read_csv('foo |');
 99 | sh: foo: command not found
100 | ┌─────────────────────────┐
101 | │ count(DISTINCT column0) │
102 | │          int64          │
103 | ├─────────────────────────┤
104 | │                       0 │
105 | └─────────────────────────┘
106 | ```
107 | 
108 | The reason why there isn't an exception raised in this cause is because the `popen()` implementation starts a process with [`fork()`](https://man7.org/linux/man-pages/man2/fork.2.html) or the appropriate system call for the operating system, but when the the child process calls [`exec()`](https://man7.org/linux/man-pages/man3/exec.3.html) that fails, and there was no output produced by the child process.
109 | 
110 | ### Writing output to a pipe
111 | 
112 | ```sql
113 | -- Write all numbers from 1 to 30 out, but then filter via grep
114 | -- for only lines that contain 6.
115 | COPY (select * from unnest(generate_series(1, 30)))
116 | TO '| grep 6 > numbers.csv' (FORMAT 'CSV');
117 | 6
118 | 16
119 | 26
120 | 
121 | -- Copy the result set to the clipboard on Mac OS X using pbcopy
122 | COPY (select 'hello' as type, from unnest(generate_series(1, 30)))
123 | TO '| grep 3 | pbcopy' (FORMAT 'CSV');
124 | type,"generate_series(1, 30)"
125 | hello,3
126 | hello,13
127 | hello,23
128 | hello,30
129 | 
130 | -- Write an encrypted file out via openssl
131 | COPY (select 'hello' as type, * from unnest(generate_series(1, 30)))
132 | TO '| openssl enc -aes-256-cbc -salt -in - -out example.enc -pbkdf2 -iter 1000 -pass pass:testing12345' (FORMAT 'JSON');
133 | 
134 | ```
135 | 
136 | ## Configuration
137 | 
138 | This extension introduces a new configuration option:
139 | 
140 | `ignore_sigpipe` - a boolean option that, when set to true, ignores the SIGPIPE signal. This is useful when writing to a pipe that stops reading input. For example:
141 | 
142 | ```sql
143 | COPY (select 'hello' as type, * from unnest(generate_series(1, 300))) TO '| head -n 100';
144 | ```
145 | 
146 | In this scenario, DuckDB attempts to write 300 lines to the pipe, but the `head` command only reads the first 100 lines. After `head` reads the first 100 lines and exits, it closes the pipe. The next time DuckDB tries to write to the pipe, it receives a SIGPIPE signal. By default, this causes DuckDB to exit. However, if `ignore_sigpipe` is set to true, the SIGPIPE signal is ignored, allowing DuckDB to continue without error even if the pipe is closed.
147 | 
148 | You can enable this option by setting it with the following command:
149 | 
150 | ```sql
151 | set ignore_sigpipe = true;
152 | ```
153 | 
154 | ## Caveats
155 | 
156 | When using `read_text()` or `read_blob()` the contents of the data read from a pipe is limited to 2GB in size.  This is the maximum length of a single row's value.
157 | 
158 | When using `read_csv()` or `read_json()` the contents of the pipe can be unlimited as it is processed in a streaming fashion.
159 | 
160 | A demonstration of this would be:
161 | 
162 | ```python
163 | #!/usr/bin/env python3
164 | 
165 | print("counter1,counter2")
166 | for i in range(10000000):
167 |     print(f"{i},{i}")
168 | ```
169 | 
170 | ```sql
171 | select count(distinct counter1) from read_csv('./test-csv.py |');
172 | ┌──────────────────────────┐
173 | │ count(DISTINCT counter1) │
174 | │          int64           │
175 | ├──────────────────────────┤
176 | │                 10000000 │
177 | └──────────────────────────┘
178 | ```
179 | 
180 | If a `limit` clause is used you may see an error like this:
181 | 
182 | ```sql
183 | select * from read_csv('./test-csv.py |') limit 3;
184 | ┌──────────┬──────────┐
185 | │ counter1 │ counter2 │
186 | │  int64   │  int64   │
187 | ├──────────┼──────────┤
188 | │        0 │        0 │
189 | │        1 │        1 │
190 | │        2 │        2 │
191 | └──────────┴──────────┘
192 | Traceback (most recent call last):
193 |   File "/Users/rusty/Development/duckdb-shell-extension/./test-csv.py", line 5, in <module>
194 |     print(f"{i},{i}")
195 | BrokenPipeError: [Errno 32] Broken pipe
196 | Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
197 | BrokenPipeError: [Errno 32] Broken pipe
198 | ```
199 | 
200 | DuckDB continues to run, but the program that was producing output received a SIGPIPE signal because DuckDB closed the pipe after reading the necessary number of rows.  It is up to the user of DuckDB to decide whether to suppress this behavior by setting the `ignore_sigpipe` configuration parameter.
201 | 
202 | ## Building
203 | 
204 | ### Build steps
205 | Now to build the extension, run:
206 | ```sh
207 | make
208 | ```
209 | The main binaries that will be built are:
210 | ```sh
211 | ./build/release/duckdb
212 | ./build/release/test/unittest
213 | ./build/release/extension/shellfs/shellfs.duckdb_extension
214 | ```
215 | - `duckdb` is the binary for the duckdb shell with the extension code automatically loaded.
216 | - `unittest` is the test runner of duckdb. Again, the extension is already linked into the binary.
217 | - `shellfs.duckdb_extension` is the loadable binary as it would be distributed.
218 | 
219 | ## Running the extension
220 | To run the extension code, simply start the shell with `./build/release/duckdb`.
221 | 
222 | Now we can use the features from the extension directly in DuckDB.
223 | 
224 | ## Running the tests
225 | Different tests can be created for DuckDB extensions. The primary way of testing DuckDB extensions should be the SQL tests in `./test/sql`. These SQL tests can be run using:
226 | ```sh
227 | make test
228 | ```
229 | 
230 | ### Installing the deployed binaries
231 | 
232 | To install your extension binaries from S3, you will need to do two things. Firstly, DuckDB should be launched with the
233 | `allow_unsigned_extensions` option set to true. How to set this will depend on the client you're using. Some examples:
234 | 
235 | CLI:
236 | ```shell
237 | duckdb -unsigned
238 | ```
239 | 
240 | Python:
241 | ```python
242 | con = duckdb.connect(':memory:', config={'allow_unsigned_extensions' : 'true'})
243 | ```
244 | 
245 | NodeJS:
246 | ```js
247 | db = new duckdb.Database(':memory:', {"allow_unsigned_extensions": "true"});
248 | ```
249 | 
250 | Secondly, you will need to set the repository endpoint in DuckDB to the HTTP url of your bucket + version of the extension
251 | you want to install. To do this run the following SQL query in DuckDB:
252 | ```sql
253 | SET custom_extension_repository='bucket.s3.eu-west-1.amazonaws.com/shellfs/latest';
254 | ```
255 | Note that the `/latest` path will allow you to install the latest extension version available for your current version of
256 | DuckDB. To specify a specific version, you can pass the version instead.
257 | 
258 | After running these steps, you can install and load your extension using the regular INSTALL/LOAD commands in DuckDB:
259 | 
260 | ```sql
261 | INSTALL shellfs
262 | LOAD shellfs
263 | ```
264 | 


--------------------------------------------------------------------------------
/docs/UPDATING.md:
--------------------------------------------------------------------------------
 1 | # Extension updating 
 2 | When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there 
 3 | will inevitably come a time when a new DuckDB is released and the extension repository needs updating. This process goes
 4 | as follows:
 5 | 
 6 | - Bump submodules
 7 |   - `./duckdb` should be set to latest tagged release
 8 |   - `./extension-ci-tools` should be set to updated branch corresponding to latest DuckDB release
 9 | - Bump versions in `./github/workflows`
10 |   - `duckdb_version` input in `MainDistributionPipeline.yml` should be set to latest tagged release
11 |   - reusable workflow `_extension_distribution.yml` should be set to updated branch corresponding to latest DuckDB release
12 | 
13 | 


--------------------------------------------------------------------------------
/docs/duckdb-shellfs.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Query-farm/shellfs/ae31efcb4d299d39d68fd2941a57f4bd0d1b5d1e/docs/duckdb-shellfs.jpg


--------------------------------------------------------------------------------
/extension_config.cmake:
--------------------------------------------------------------------------------
 1 | # This file is included by DuckDB's build system. It specifies which extension to load
 2 | 
 3 | # Extension from this repo
 4 | duckdb_extension_load(shellfs
 5 |     SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
 6 |     LOAD_TESTS
 7 | )
 8 | 
 9 | # Any extra extensions that should be built
10 | # e.g.: duckdb_extension_load(json)


--------------------------------------------------------------------------------
/scripts/bootstrap-template.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import sys, os, shutil, re
 4 | from pathlib import Path
 5 | 
 6 | shutil.copyfile(f'docs/NEXT_README.md', f'README.md')
 7 | os.remove(f'docs/NEXT_README.md')
 8 | os.remove(f'docs/README.md')
 9 | 
10 | if (len(sys.argv) != 2):
11 |     raise Exception('usage: python3 bootstrap-template.py <name_for_extension_in_snake_case>')
12 | 
13 | name_extension = sys.argv[1]
14 | 
15 | def is_snake_case(s):
16 |     # Define the regex pattern for snake case with numbers
17 |     pattern = r'^[a-z0-9]+(_[a-z0-9]+)*$'
18 | 
19 |     # Use re.match to check if the string matches the pattern
20 |     if re.match(pattern, s):
21 |         return True
22 |     else:
23 |         return False
24 | 
25 | if name_extension[0].isdigit():
26 |     raise Exception('Please dont start your extension name with a number.')
27 | 
28 | if not is_snake_case(name_extension):
29 |     raise Exception('Please enter the name of your extension in valid snake_case containing only lower case letters and numbers')
30 | 
31 | def to_camel_case(snake_str):
32 |     return "".join(x.capitalize() for x in snake_str.lower().split("_"))
33 | 
34 | def replace(file_name, to_find, to_replace):
35 |     with open(file_name, 'r', encoding="utf8") as file :
36 |         filedata = file.read()
37 |     filedata = filedata.replace(to_find, to_replace)
38 |     with open(file_name, 'w', encoding="utf8") as file:
39 |         file.write(filedata)
40 | 
41 | files_to_search = []
42 | files_to_search.extend(Path('./.github').rglob('./**/*.yml'))
43 | files_to_search.extend(Path('./test').rglob('./**/*.test'))
44 | files_to_search.extend(Path('./src').rglob('./**/*.hpp'))
45 | files_to_search.extend(Path('./src').rglob('./**/*.cpp'))
46 | files_to_search.extend(Path('./src').rglob('./**/*.txt'))
47 | files_to_search.extend(Path('./src').rglob('./*.md'))
48 | 
49 | def replace_everywhere(to_find, to_replace):
50 |     for path in files_to_search:
51 |         replace(path, to_find, to_replace)
52 |         replace(path, to_find.capitalize(), to_camel_case(to_replace))
53 |         replace(path, to_find.upper(), to_replace.upper())
54 |     
55 |     replace("./CMakeLists.txt", to_find, to_replace)
56 |     replace("./Makefile", to_find, to_replace)
57 |     replace("./Makefile", to_find.capitalize(), to_camel_case(to_replace))
58 |     replace("./Makefile", to_find.upper(), to_replace.upper())
59 |     replace("./README.md", to_find, to_replace)
60 |     replace("./extension_config.cmake", to_find, to_replace)
61 | 
62 | replace_everywhere("quack", name_extension)
63 | replace_everywhere("Quack", name_extension.capitalize())
64 | replace_everywhere("<extension_name>", name_extension)
65 | 
66 | string_to_replace = name_extension
67 | string_to_find = "quack"
68 | 
69 | # rename files
70 | os.rename(f'test/sql/{string_to_find}.test', f'test/sql/{string_to_replace}.test')
71 | os.rename(f'src/{string_to_find}_extension.cpp', f'src/{string_to_replace}_extension.cpp')
72 | os.rename(f'src/include/{string_to_find}_extension.hpp', f'src/include/{string_to_replace}_extension.hpp')
73 | 
74 | # remove template-specific files
75 | os.remove('.github/workflows/ExtensionTemplate.yml')
76 | 
77 | # finally, remove this bootstrap file
78 | os.remove(__file__)


--------------------------------------------------------------------------------
/scripts/extension-upload.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Extension upload script
 4 | 
 5 | # Usage: ./extension-upload.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned>
 6 | # <name>                : Name of the extension
 7 | # <extension_version>   : Version (commit / version tag) of the extension
 8 | # <duckdb_version>      : Version (commit / version tag) of DuckDB
 9 | # <architecture>        : Architecture target of the extension binary
10 | # <s3_bucket>           : S3 bucket to upload to
11 | # <copy_to_latest>      : Set this as the latest version ("true" / "false", default: "false")
12 | # <copy_to_versioned>   : Set this as a versioned version that will prevent its deletion
13 | 
14 | set -e
15 | 
16 | if [[ $4 == wasm* ]]; then
17 |   ext="/tmp/extension/$1.duckdb_extension.wasm"
18 | else
19 |   ext="/tmp/extension/$1.duckdb_extension"
20 | fi
21 | 
22 | echo $ext
23 | 
24 | script_dir="$(dirname "$(readlink -f "$0")")"
25 | 
26 | # calculate SHA256 hash of extension binary
27 | cat $ext > $ext.append
28 | 
29 | if [[ $4 == wasm* ]]; then
30 |   # 0 for custom section
31 |   # 113 in hex = 275 in decimal, total lenght of what follows (1 + 16 + 2 + 256)
32 |   # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
33 |   echo -n -e '\x00' >> $ext.append
34 |   echo -n -e '\x93\x02' >> $ext.append
35 |   # 10 in hex = 16 in decimal, lenght of name, 1 byte
36 |   echo -n -e '\x10' >> $ext.append
37 |   echo -n -e 'duckdb_signature' >> $ext.append
38 |   # the name of the WebAssembly custom section, 16 bytes
39 |   # 100 in hex, 256 in decimal
40 |   # [1(continuation) + 0000000(payload) = ff, 0(continuation) + 10(payload)],
41 |   # for a grand total of 2 bytes
42 |   echo -n -e '\x80\x02' >> $ext.append
43 | fi
44 | 
45 | # (Optionally) Sign binary
46 | if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then
47 |   echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
48 |   $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash
49 |   openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign
50 |   rm -f private.pem
51 | fi
52 | 
53 | # Signature is always there, potentially defaulting to 256 zeros
54 | truncate -s 256 $ext.sign
55 | 
56 | # append signature to extension binary
57 | cat $ext.sign >> $ext.append
58 | 
59 | # compress extension binary
60 | if [[ $4 == wasm_* ]]; then
61 |   brotli < $ext.append > "$ext.compressed"
62 | else
63 |   gzip < $ext.append > "$ext.compressed"
64 | fi
65 | 
66 | set -e
67 | 
68 | # Abort if AWS key is not set
69 | if [ -z "$AWS_ACCESS_KEY_ID" ]; then
70 |     echo "No AWS key found, skipping.."
71 |     exit 0
72 | fi
73 | 
74 | # upload versioned version
75 | if [[ $7 = 'true' ]]; then
76 |   if [[ $4 == wasm* ]]; then
77 |     aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
78 |   else
79 |     aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read
80 |   fi
81 | fi
82 | 
83 | # upload to latest version
84 | if [[ $6 = 'true' ]]; then
85 |   if [[ $4 == wasm* ]]; then
86 |     aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
87 |   else
88 |     aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read
89 |   fi
90 | fi
91 | 


--------------------------------------------------------------------------------
/src/include/shellfs_extension.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "duckdb.hpp"
 4 | 
 5 | namespace duckdb {
 6 | 
 7 | class ShellfsExtension : public Extension {
 8 | public:
 9 | 	void Load(DuckDB &db) override;
10 | 	std::string Name() override;
11 | };
12 | 
13 | } // namespace duckdb
14 | 


--------------------------------------------------------------------------------
/src/shell_file_system.cpp:
--------------------------------------------------------------------------------
  1 | #include "shell_file_system.hpp"
  2 | #include "duckdb/common/exception.hpp"
  3 | #include "duckdb/common/file_opener.hpp"
  4 | #include "duckdb/common/file_system.hpp"
  5 | #include "duckdb/common/helper.hpp"
  6 | #include "duckdb/common/limits.hpp"
  7 | 
  8 | #ifndef _WIN32
  9 | #include <dirent.h>
 10 | #include <fcntl.h>
 11 | #include <string.h>
 12 | #include <sys/types.h>
 13 | #include <unistd.h>
 14 | 
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <sys/wait.h>
 18 | 
 19 | #include <iostream>
 20 | #include <stdexcept>
 21 | #include <cstring>
 22 | #include <cstdlib>
 23 | 
 24 | #else
 25 | #include "duckdb/common/windows_util.hpp"
 26 | 
 27 | #include <io.h>
 28 | #include <string>
 29 | #include <stdio.h>
 30 | 
 31 | #endif
 32 | 
 33 | namespace duckdb
 34 | {
 35 | 
 36 | 	struct ShellFileHandle : public FileHandle
 37 | 	{
 38 | 	public:
 39 | 		ShellFileHandle(FileSystem &file_system, string path, FILE *pipe, FileOpenFlags flags) : FileHandle(file_system, std::move(path), std::move(flags)), pipe(pipe)
 40 | 		{
 41 | 		}
 42 | 		~ShellFileHandle() override
 43 | 		{
 44 | 			ShellFileHandle::Close();
 45 | 		}
 46 | 
 47 | 		FILE *pipe;
 48 | 
 49 | 	public:
 50 | 		void Close() override
 51 | 		{
 52 | 			if (!pipe)
 53 | 			{
 54 | 				return;
 55 | 			}
 56 | 
 57 | 			int result;
 58 | 
 59 | #ifndef _WIN32
 60 | 			result = pclose(pipe);
 61 | #else
 62 | 			result = _pclose(pipe);
 63 | #endif
 64 | 			// Indicate that the pipe has been closed.
 65 | 			pipe = NULL;
 66 | 
 67 | 			if (result == -1)
 68 | 			{
 69 | 				throw IOException("Could not close pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, path,
 70 | 													strerror(errno));
 71 | 			}
 72 | 			else
 73 | 			{
 74 | #ifndef _WIN32
 75 | 				if (WIFEXITED(result))
 76 | 				{
 77 | 					int exit_status = WEXITSTATUS(result);
 78 | 					if (exit_status != 0)
 79 | 					{
 80 | 						throw IOException("Pipe process exited with non-zero exit code=\"%d\": %s", exit_status, path);
 81 | 					}
 82 | 					else if (WIFSIGNALED(result))
 83 | 					{
 84 | 						int signal_number = WTERMSIG(result);
 85 | 						throw IOException("Pipe process exited with signal signal=\"%d\": %s", signal_number, path);
 86 | 					}
 87 | 					else if (exit_status != 0)
 88 | 					{
 89 | 						throw IOException("Pipe process exited abnormally: %s", path);
 90 | 					}
 91 | 				}
 92 | #endif
 93 | 			}
 94 | 		};
 95 | 	};
 96 | 
 97 | 	void ShellFileSystem::Reset(FileHandle &handle)
 98 | 	{
 99 | 		throw InternalException("Cannot reset shell file system");
100 | 	}
101 | 
102 | 	int64_t ShellFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes)
103 | 	{
104 | 		FILE *pipe = handle.Cast<ShellFileHandle>().pipe;
105 | 
106 | 		if (!pipe)
107 | 		{
108 | 			return 0;
109 | 		}
110 | 
111 | 		int64_t bytes_read = fread(buffer, 1, nr_bytes, pipe);
112 | 		if (bytes_read == -1)
113 | 		{
114 | 			throw IOException("Could not read from pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, handle.path,
115 | 												strerror(errno));
116 | 		}
117 | 		if (bytes_read == 0)
118 | 		{
119 | 			// Since the last read() returned 0 bytes, presume that EOF has been encountered, and rather than
120 | 			// having the close, by doing this if there are errors with the pipe they are caught in the query
121 | 			// rather than in the destructor.
122 | 			handle.Close();
123 | 		}
124 | 		return bytes_read;
125 | 	}
126 | 
127 | 	int64_t ShellFileSystem::Write(FileHandle &handle, void *buffer, int64_t nr_bytes)
128 | 	{
129 | 		FILE *pipe = handle.Cast<ShellFileHandle>().pipe;
130 | 		int64_t bytes_written = 0;
131 | 
132 | 		while (nr_bytes > 0)
133 | 		{
134 | 			auto bytes_to_write = MinValue<idx_t>(idx_t(NumericLimits<int32_t>::Maximum()), idx_t(nr_bytes));
135 | 			int64_t current_bytes_written = fwrite(buffer, 1, bytes_to_write, pipe);
136 | 			if (current_bytes_written <= 0)
137 | 			{
138 | 				throw IOException("Could not write to pipe \"%s\": %s", {{"errno", std::to_string(errno)}}, handle.path,
139 | 													strerror(errno));
140 | 			}
141 | 			bytes_written += current_bytes_written;
142 | 			buffer = (void *)(data_ptr_cast(buffer) + current_bytes_written);
143 | 			nr_bytes -= current_bytes_written;
144 | 		}
145 | 
146 | 		return bytes_written;
147 | 	}
148 | 
149 | 	int64_t ShellFileSystem::GetFileSize(FileHandle &handle)
150 | 	{
151 | 		// You can't know the size of the data that will come over a pipe
152 | 		// some code uses the size to allocate buffers, so don't return
153 | 		// a very large number.
154 | 		return 0;
155 | 	}
156 | 
157 | 	unique_ptr<FileHandle> ShellFileSystem::OpenFile(const string &path, FileOpenFlags flags,
158 | 																									 optional_ptr<FileOpener> opener)
159 | 	{
160 | 		FILE *pipe;
161 | 		if (path.front() == '|')
162 | 		{
163 | 			// We want to write to the pipe.
164 | #ifndef _WIN32
165 | 			pipe = popen(path.substr(1, path.size()).c_str(), "w");
166 | #else
167 | 			pipe = _popen(path.substr(1, path.size()).c_str(), "w");
168 | #endif
169 | 		}
170 | 		else
171 | 		{
172 | 			// We want to read from the pipe
173 | #ifndef _WIN32
174 | 			pipe = popen(path.substr(0, path.size() - 1).c_str(), "r");
175 | #else
176 | 			pipe = _popen(path.substr(0, path.size() - 1).c_str(), "r");
177 | #endif
178 | 		}
179 | 
180 | #ifndef _WIN32
181 | 		Value value;
182 | 		bool ignore_sigpipe = false;
183 | 		if (FileOpener::TryGetCurrentSetting(opener, "ignore_sigpipe", value))
184 | 		{
185 | 			ignore_sigpipe = value.GetValue<bool>();
186 | 		}
187 | 
188 | 		if (ignore_sigpipe)
189 | 		{
190 | 			signal(SIGPIPE, SIG_IGN);
191 | 		}
192 | #endif
193 | 
194 | 		return make_uniq<ShellFileHandle>(*this, path, pipe, flags);
195 | 	}
196 | 
197 | 	bool ShellFileSystem::CanHandleFile(const string &fpath)
198 | 	{
199 | 		if (fpath.empty())
200 | 		{
201 | 			return false;
202 | 		}
203 | 		// If the filename ends with | or starts with |
204 | 		// it can be handled by this file system.
205 | 		return fpath.back() == '|' || fpath.front() == '|';
206 | 	}
207 | 
208 | } // namespace duckdb
209 | 


--------------------------------------------------------------------------------
/src/shell_file_system.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "duckdb/common/file_system.hpp"
 4 | 
 5 | namespace duckdb
 6 | {
 7 | 
 8 | 	class ShellFileSystem : public FileSystem
 9 | 	{
10 | 	public:
11 | 		duckdb::unique_ptr<FileHandle> OpenFile(const string &path, FileOpenFlags flags,
12 | 																						optional_ptr<FileOpener> opener = nullptr) final;
13 | 
14 | 		int64_t Read(FileHandle &handle, void *buffer, int64_t nr_bytes) override;
15 | 		int64_t Write(FileHandle &handle, void *buffer, int64_t nr_bytes) override;
16 | 
17 | 		int64_t GetFileSize(FileHandle &handle) override;
18 | 
19 | 		vector<OpenFileInfo> Glob(const string &path, FileOpener *opener = nullptr) override
20 | 		{
21 | 			return {path};
22 | 		}
23 | 
24 | 		bool FileExists(const string &filename, optional_ptr<FileOpener> opener = nullptr) override
25 | 		{
26 | 			return false;
27 | 		}
28 | 
29 | 		void Reset(FileHandle &handle) override;
30 | 		bool OnDiskFile(FileHandle &handle) override
31 | 		{
32 | 			return false;
33 | 		};
34 | 		bool CanSeek() override
35 | 		{
36 | 			return false;
37 | 		}
38 | 
39 | 		bool CanHandleFile(const string &fpath) override;
40 | 
41 | 		bool IsPipe(const string &filename, optional_ptr<FileOpener> opener) override
42 | 		{
43 | 			return true;
44 | 		}
45 | 		void FileSync(FileHandle &handle) override
46 | 		{
47 | 		}
48 | 
49 | 		std::string GetName() const override
50 | 		{
51 | 			return "ShellFileSystem";
52 | 		}
53 | 	};
54 | 
55 | } // namespace duckdb
56 | 


--------------------------------------------------------------------------------
/src/shellfs_extension.cpp:
--------------------------------------------------------------------------------
 1 | #define DUCKDB_EXTENSION_MAIN
 2 | 
 3 | #include "shellfs_extension.hpp"
 4 | #include "shell_file_system.hpp"
 5 | #include "duckdb.hpp"
 6 | #include "duckdb/common/exception.hpp"
 7 | #include "duckdb/common/string_util.hpp"
 8 | #include "duckdb/function/scalar_function.hpp"
 9 | #include "duckdb/main/extension_util.hpp"
10 | 
11 | namespace duckdb {
12 | 
13 | static void LoadInternal(DatabaseInstance &instance) {
14 |     // Register a scalar function
15 | 	auto &fs = instance.GetFileSystem();
16 | 
17 | 	fs.RegisterSubSystem(make_uniq<ShellFileSystem>());
18 | 
19 | 	auto &config = DBConfig::GetConfig(instance);
20 | 
21 |     // When writing to a PIPE ignore the SIGPIPE error and consider that the write succeeded.
22 | 	config.AddExtensionOption("ignore_sigpipe", "Ignore SIGPIPE", LogicalType::BOOLEAN, Value(false));
23 | 
24 | }
25 | void ShellfsExtension::Load(DuckDB &db) {
26 | 	LoadInternal(*db.instance);
27 | }
28 | std::string ShellfsExtension::Name() {
29 | 	return "shellfs";
30 | }
31 | 
32 | } // namespace duckdb
33 | 
34 | extern "C" {
35 | 
36 | DUCKDB_EXTENSION_API void shellfs_init(duckdb::DatabaseInstance &db) {
37 |     duckdb::DuckDB db_wrapper(db);
38 |     db_wrapper.LoadExtension<duckdb::ShellfsExtension>();
39 | }
40 | 
41 | DUCKDB_EXTENSION_API const char *shellfs_version() {
42 | 	return duckdb::DuckDB::LibraryVersion();
43 | }
44 | }
45 | 
46 | #ifndef DUCKDB_EXTENSION_MAIN
47 | #error DUCKDB_EXTENSION_MAIN not defined
48 | #endif
49 | 


--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
 1 | # Testing this extension
 2 | This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most its tests in this format as SQL statements, so for the quack extension, this should probably be the goal too.
 3 | 
 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests:
 5 | ```bash
 6 | make test
 7 | ```
 8 | or 
 9 | ```bash
10 | make test_debug
11 | ```


--------------------------------------------------------------------------------
/test/sql/json.test:
--------------------------------------------------------------------------------
 1 | # name: test/sql/json.test
 2 | # description: test shellfs extension
 3 | # group: [shellfs]
 4 | 
 5 | # Require statement will ensure this test is run with this extension loaded
 6 | require shellfs
 7 | 
 8 | require json
 9 | 
10 | query I
11 | SELECT count(*) FROM (DESCRIBE select * from read_json('curl -s http://worldtimeapi.org/api/timezone/Etc/UTC  |'));
12 | ----
13 | 15
14 | 


--------------------------------------------------------------------------------
/test/sql/shellfs.test:
--------------------------------------------------------------------------------
 1 | # name: test/sql/shellfs.test
 2 | # description: test shellfs extension
 3 | # group: [shellfs]
 4 | 
 5 | # Require statement will ensure this test is run with this extension loaded
 6 | require shellfs
 7 | 
 8 | # Confirm the extension works
 9 | query I
10 | SELECT count(distinct column0) from read_csv('seq 0 99 | awk ''{print $1 "," $1}'' |')
11 | ----
12 | 100
13 | 
14 | query I
15 | SELECT count(distinct column0) from read_csv('seq 0 99 | awk ''{print $1 "," $1}'' | grep 2 |')
16 | ----
17 | 19
18 | 
19 | statement ok
20 | COPY (select * from unnest(generate_series(1, 100))) TO '| grep 20 > __TEST_DIR__/grep-test-pipe.csv';
21 | 
22 | query I
23 | SELECT column0 FROM read_csv_auto('__TEST_DIR__/grep-test-pipe.csv');
24 | ----
25 | 20
26 | 


--------------------------------------------------------------------------------
/vcpkg.json:
--------------------------------------------------------------------------------
1 | {
2 |   "dependencies": [
3 |   ]
4 | }
5 | 


--------------------------------------------------------------------------------