├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── release.yml
├── Dockerfile
├── LICENSE
├── README.md
├── examples
│   ├── flight_insert.py
│   ├── flight_read.py
│   └── flight_watch.py
├── main.py
├── public
│   └── index.html
└── requirements.txt

/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # See documentation for possible values
9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | on:
2 |   release:
3 |     types: [created]
4 | 
5 | env:
6 |   REGISTRY: ghcr.io
7 |   IMAGE_NAME: ${{ github.repository }}
8 | 
9 | jobs:
10 |   releases-matrix:
11 |     name: Moo Builder
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         goos: [linux]
16 |         goarch: [amd64]
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v3
20 |       - name: Log in to the Container registry
21 |         uses: docker/login-action@v2.1.0
22 |         with:
23 |           registry: ${{ env.REGISTRY }}
24 |           username: ${{ github.actor }}
25 |           password: ${{ secrets.GITHUB_TOKEN }}
26 | 
27 |       - name: Extract metadata (tags, labels) for Docker
28 |         id: meta
29 |         uses: docker/metadata-action@v4.3.0
30 |         with:
31 |           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
32 | 
33 |       - name: Build and push Docker image
34 |         uses: docker/build-push-action@v4.0.0
35 |         with:
36 |           context: .
37 |           push: true
38 |           tags: ${{ steps.meta.outputs.tags }}
39 |           labels: ${{ steps.meta.outputs.labels }}
40 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM node:20-alpine AS build
2 | ENV VITE_SELFSERVICE="true"
3 | WORKDIR /app
4 | RUN apk add git
5 | RUN git clone -b selfservice https://github.com/quackscience/quack-ui /app
6 | RUN npm install -g pnpm
7 | RUN npx update-browserslist-db@latest
8 | RUN npm install && npm run build
9 | 
10 | FROM python:3.8.10-slim
11 | ENV VITE_SELFSERVICE="true"
12 | WORKDIR /app
13 | ADD requirements.txt .
14 | RUN apt update && apt install -y binutils wget git \
15 |     && pip install -r requirements.txt \
16 |     && rm -rf /var/lib/apt/lists/* && rm -rf ~/.cache/pip/*
17 | ADD main.py .
18 | COPY --from=build /app/dist ./public
19 | EXPOSE 8123
20 | EXPOSE 8815
21 | CMD ["python3","./main.py"]
22 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 duckheads
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | # :baby_chick: [QuackFlight](https://quackpy.fly.dev/?user=default#U0VMRUNUCiAgICB0b3duLAogICAgZGlzdHJpY3QsCiAgICBjb3VudCgpIEFTIGMsCkZST00gcmVhZF9wYXJxdWV0KCdodHRwczovL2RhdGFzZXRzLWRvY3VtZW50YXRpb24uczMuZXUtd2VzdC0zLmFtYXpvbmF3cy5jb20vaG91c2VfcGFycXVldC9ob3VzZV8wLnBhcnF1ZXQnKQpXSEVSRSByZWFkX3BhcnF1ZXQudG93biA9PSAnTE9ORE9OJwpHUk9VUCBCWQogICAgdG93biwKICAgIGRpc3RyaWN0Ck9SREVSIEJZIGMgREVTQwpMSU1JVCAxMA==)
6 | 
7 | 
8 | _Serverless OLAP API built on top of DuckDB exposing HTTP/S and Arrow Flight SQL interfaces_
9 | 
10 | 
11 | > [!IMPORTANT]
12 | > - Arrow Flight API for modern data clients _(DuckDB Airport)_
13 | > - Easy HTTP API with multiple formats _(JSON, CSV, Parquet)_
14 | > - Unlocked concurrent inserts and querying on DuckDB
15 | > - Persistent storage with multi-user authentication
16 | > - Native access to any DuckDB Extension & Format
17 | > - Embedded SQL Query Interface for instant usage
18 | 
19 | 
20 | <br>
21 | 22 | ![quackflight_banner_yellow](https://github.com/user-attachments/assets/3f55d787-7888-4647-a856-ba78111ed657) 23 | 24 | ### :seedling: Get Started 25 | Run using [docker](https://github.com/quackscience/quackflight/pkgs/container/quackflight) or build from source 26 | ```bash 27 | docker pull ghcr.io/quackscience/quackflight:latest 28 | docker run -ti --rm -p 8123:8123 -p 8815:8815 ghcr.io/quackscience/quackflight:latest 29 | ``` 30 | 31 | ### 👉 Usage 32 | 33 | > See the [Examples](https://github.com/quackscience/quackflight/tree/main/examples) directory for quick starters 34 | 35 | > [!NOTE] 36 | > Quackpipe executes queries in `:memory:` unless _authentication_ details are provided for data persistence 37 | 38 |
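
Once the container is up, a quick smoke test of the HTTP interface from Python (a minimal sketch; it assumes the `requests` package is installed, which is not part of this repo's requirements):

```python
import requests

# /ping is the health-check route exposed by main.py
print(requests.get("http://localhost:8123/ping").text)  # -> Ok

# Stateless query: no credentials, so it runs against the shared :memory: database
print(requests.get("http://localhost:8123/", params={"query": "SELECT version()"}).text)
```
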
39 | 40 | #### 🕸️ HTTP API 41 | Execute DuckDB queries using the HTTP POST/GET API _(compatible with the ClickHouse HTTP API)_ 42 | ```bash 43 | curl -X POST "http://user:persistence@localhost:8123" \ 44 | -H "Content-Type: application/json" \ 45 | -d 'SELECT version()' 46 | ``` 47 | 48 |
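
The same endpoint also honors the ClickHouse-style `FORMAT` clause and the `default_format` query parameter, and basic-auth credentials select a persistent per-user database file. A minimal Python sketch (assumes `requests` is installed; `user`/`persistence` are the placeholder credentials used throughout this README):

```python
import requests

BASE = "http://localhost:8123/"

# CSV output via the FORMAT clause embedded in the query
print(requests.post(BASE, data="SELECT 1 AS x, 2 AS y FORMAT CSV").text)

# Authenticated query: the credentials are hashed into a persistent .db file,
# so the table survives across requests
r = requests.post(
    BASE,
    data="CREATE TABLE IF NOT EXISTS t AS SELECT 42 AS answer; SELECT * FROM t",
    params={"default_format": "JSONEachRow"},
    auth=("user", "persistence"),
)
print(r.text)
```
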
49 | 50 | 51 | 52 | #### ✈️ FLIGHT API 53 | Execute DuckDB queries using the _experimental_ Flight GRPC API and [Airport](https://github.com/Query-farm/duckdb-airport-extension) 54 | 55 | > [!NOTE] 56 | > Quackpipe executes queries in `:memory:` unless an `authorization` header is provided for data persistence 57 | 58 | ##### 🎫 Pass Airport Security 59 | ```sql 60 | CREATE PERSISTENT SECRET airport_flight ( 61 | · type airport, 62 | ‣ auth_token 'user:persistence', 63 | · scope 'grpc://localhost:8815' 64 | · ); 65 | ``` 66 | 67 | ##### 🎫 Take Airport Flights 68 | ```sql 69 | D select flight_descriptor, endpoint from airport_list_flights('grpc://127.0.0.1:8815', null); 70 | ┌─────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ 71 | │ flight_descriptor │ endpoint │ 72 | │ union(cmd blob, path varchar[]) │ struct(ticket blob, "location" varchar[], expiration_time timestamp, app_metadata blob)[] │ 73 | ├─────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ 74 | │ show_databases │ [{'ticket': SHOW DATABASES, 'location': [grpc://localhost:8815], 'expiration_time': NULL, 'app_metadata': }] │ 75 | │ show_tables │ [{'ticket': SHOW TABLES, 'location': [grpc://localhost:8815], 'expiration_time': NULL, 'app_metadata': }] │ 76 | │ show_version │ [{'ticket': SELECT version(), 'location': [grpc://localhost:8815], 'expiration_time': NULL, 'app_metadata': }] │ 77 | └─────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ 78 | 79 | D select * from airport_take_flight('grpc://localhost:8815/', ['show_version']); 80 | ┌─────────────┐ 81 | │ "version"() │ 82 | │ varchar │ 83 | ├─────────────┤ 84 | │ v1.2.0 │ 85 | └─────────────┘ 86 | ``` 87 | 88 | ##### 🎫 ATTACH Flights Tables 89 | ```sql 90 | D --- Attach to Flight Server 91 | D ATTACH 'deltalake' (TYPE AIRPORT, location 'grpc://localhost:8815/'); 92 | 93 | D --- Create Schema + Table 94 | D CREATE SCHEMA deltalake.test1; 95 | D CREATE TABLE deltalake.test1.people ( 96 | name VARCHAR, 97 | love_of_duckdb INT, 98 | tags VARCHAR[] 99 | ); 100 | 101 | D --- Insert into Flight Table 102 | D INSERT INTO deltalake.test1.people values 103 | ('rusty', 5, ['airport', 'datasketches']); 104 | 105 | D --- Select from Flight Table 106 | D SELECT * FROM deltalake.test1.people; 107 | ┌─────────┬────────────────┬─────────────────────────┐ 108 | │ name │ love_of_duckdb │ tags │ 109 | │ varchar │ int32 │ varchar[] │ 110 | ├─────────┼────────────────┼─────────────────────────┤ 111 | │ rusty │ 5 │ [airport, datasketches] │ 112 | ├─────────┴────────────────┴─────────────────────────┤ 113 | │ 1 row. 
114 | └────────────────────────────────────────────────────┘
115 | ```
116 | 
117 | > Flight Tables can be accessed via the HTTP API using the schema name
118 | ```sql
119 | USE test1; SELECT * FROM people;
120 | ```
121 | ![image](https://github.com/user-attachments/assets/82d9c7bf-cbf2-49d3-b4dc-a57a0ddaf46a)
122 | 
123 | ##### 🎫 Take Custom Flights w/ Custom Headers + Ticket
124 | ```sql
125 | D SELECT * FROM airport_take_flight('grpc://localhost:8815', 'SELECT 1', headers := MAP{'authorization':'user:persistence'} );
126 | ┌───────┐
127 | │   1   │
128 | │ int32 │
129 | ├───────┤
130 | │     1 │
131 | └───────┘
132 | ```
133 | 
134 | ##### 🎫 Take Python Flights
135 | ```python
136 | from pyarrow.flight import FlightClient, Ticket, FlightCallOptions
137 | import pandas
138 | import tabulate  # required by pandas.DataFrame.to_markdown()
139 | 
140 | 
141 | sql = """SELECT version()"""
142 | 
143 | flight_ticket = Ticket(sql)
144 | 
145 | token = (b"authorization", b"user:persistence")
146 | options = FlightCallOptions(headers=[token])
147 | client = FlightClient("grpc://localhost:8815")
148 | 
149 | reader = client.do_get(flight_ticket, options)
150 | arrow_table = reader.read_all()
151 | # Use pyarrow and pandas to view and analyze data
152 | data_frame = arrow_table.to_pandas()
153 | print(data_frame.to_markdown())
154 | ```
155 | ```
156 | |    | "version"()   |
157 | |---:|:--------------|
158 | |  0 | v1.2.0        |
159 | ```
160 | 
161 | <br>
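
The default flights shown earlier with `airport_list_flights` can also be enumerated straight from Python (a minimal sketch using `pyarrow`, with the same placeholder credentials as above):

```python
from pyarrow import flight

client = flight.FlightClient("grpc://localhost:8815")
options = flight.FlightCallOptions(headers=[(b"authorization", b"user:persistence")])

# ListFlights: each FlightInfo carries a descriptor plus endpoints with redeemable tickets
for info in client.list_flights(options=options):
    print(info.descriptor, [endpoint.ticket for endpoint in info.endpoints])
```
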
162 | 
163 | ### 📺 SQL User-Interface
164 | QuackFlight ships with a DuckDB SQL user interface based on [duck-ui](https://github.com/caioricciuti/duck-ui)
165 | 
166 | 
167 | 
168 | 
169 | 
170 | ```mermaid
171 | 
172 | sequenceDiagram
173 |     participant Client
174 |     participant QuackFlight
175 |     participant DuckDB
176 | 
177 | 
178 |     Client ->> QuackFlight: ListFlights
179 |     QuackFlight ->> Client: Return Flights Table
180 |     Client ->> QuackFlight: GetFlightInfo
181 |     QuackFlight ->> DuckDB: DuckDB Execute
182 |     DuckDB ->> QuackFlight: Arrow Results Stream
183 |     QuackFlight ->> Client: FlightInfo(ticket)
184 |     Client ->> QuackFlight: do_get(ticket)
185 |     QuackFlight ->> Client: Stream of Results
186 | 
187 | ```
188 | 
189 | <br>
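
The same handshake the diagram describes, driven from Python (a sketch with `pyarrow`; `get_flight_info` plans the query, `do_get` redeems the returned ticket; `to_pandas()` requires pandas):

```python
from pyarrow import flight

client = flight.FlightClient("grpc://localhost:8815")
options = flight.FlightCallOptions(headers=[(b"authorization", b"user:persistence")])

# GetFlightInfo: the server plans the command and answers with endpoints
info = client.get_flight_info(flight.FlightDescriptor.for_command(b"SELECT version()"), options)

# do_get: stream the results for the first endpoint's ticket
reader = client.do_get(info.endpoints[0].ticket, options)
print(reader.read_all().to_pandas())
```
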
190 | 
191 | 
192 | ###### :black_joker: Disclaimers
193 | 
194 | [^1]: DuckDB ® is a trademark of DuckDB Foundation. All rights reserved by their respective owners.
195 | [^2]: ClickHouse ® is a trademark of ClickHouse Inc. No direct affiliation or endorsement.
196 | [^3]: Released under the MIT license. See LICENSE for details. All rights reserved by their respective owners.
197 | [^4]: Flight implementation inspired by [Duck Takes Flight](https://www.definite.app/blog/duck-takes-flight)
198 | 
--------------------------------------------------------------------------------
/examples/flight_insert.py:
--------------------------------------------------------------------------------
1 | import pyarrow as pa
2 | import pyarrow.flight as flight
3 | import random
4 | import time
5 | from datetime import datetime
6 | import logging
7 | import json
8 | 
9 | # Configure logging
10 | logging.basicConfig(
11 |     level=logging.INFO,
12 |     format='%(asctime)s - %(levelname)s - %(message)s'
13 | )
14 | logger = logging.getLogger('flight_loader')
15 | 
16 | def connect_with_retry(max_attempts=5):
17 |     """Connect to Flight server with retry logic"""
18 |     for attempt in range(max_attempts):
19 |         try:
20 |             # Create authentication headers for persistence
21 |             token = (b"authorization", b"user:persistence")
22 |             options = flight.FlightCallOptions(headers=[token])
23 | 
24 |             # Create client and test connection
25 |             client = flight.FlightClient("grpc://localhost:8815")
26 |             ticket = flight.Ticket("SELECT 1".encode())
27 |             reader = client.do_get(ticket, options)
28 |             list(reader)  # Consume the result
29 | 
30 |             logger.info("Successfully connected to Flight server with persistence")
31 |             return client, options
32 |         except Exception as e:
33 |             if attempt < max_attempts - 1:
34 |                 logger.warning(f"Connection attempt {attempt + 1} failed: {e}, retrying in 1 second...")
35 |                 time.sleep(1)
36 |             else:
37 |                 logger.error(f"Failed to connect after {max_attempts} attempts")
38 |                 raise
39 | 
40 | def generate_batch(batch_id):
41 |     """Generate a batch of test data"""
42 |     num_rows = 1_000  # Smaller batch size for more frequent updates
43 |     data = {
44 |         "batch_id": [batch_id] * num_rows,
45 |         "timestamp": [datetime.now().isoformat()] * num_rows,
46 |         "value": [random.uniform(0, 100) for _ in range(num_rows)],
47 |         "category": [random.choice(['A', 'B', 'C', 'D']) for _ in range(num_rows)]
48 |     }
49 |     return data
50 | 
51 | def continuous_load(client, options):
52 |     """Continuously load data to the Flight server"""
53 |     batch_id = 0
54 |     table_name = "concurrent_test"  # Use a constant table name
55 | 
56 |     # Create table using flight ticket
57 |     create_table_sql = f"""
58 |     CREATE TABLE IF NOT EXISTS {table_name} (
59 |         batch_id BIGINT,
60 |         timestamp VARCHAR,
61 |         value DOUBLE,
62 |         category VARCHAR
63 |     )
64 |     """
65 | 
66 |     try:
67 |         # Create table
68 |         ticket = flight.Ticket(create_table_sql.encode())
69 |         reader = client.do_get(ticket, options)
70 |         list(reader)  # Consume the result
71 |         logger.info(f"Table {table_name} created successfully")
72 | 
73 |         while True:
74 |             try:
75 |                 # Generate data
76 |                 data = generate_batch(batch_id)
77 | 
78 |                 # Insert data using simple INSERT VALUES
79 |                 values = []
80 |                 for i in range(len(data['batch_id'])):
81 |                     values.append(f"({data['batch_id'][i]}, '{data['timestamp'][i]}', {data['value'][i]}, '{data['category'][i]}')")
82 | 
83 |                 insert_sql = f"""
84 |                 INSERT INTO {table_name} (batch_id, timestamp, value, category)
85 |                 VALUES {','.join(values)}
86 |                 """
87 | 
88 |                 # Execute insert
89 |                 ticket = flight.Ticket(insert_sql.encode())
90 |                 reader = client.do_get(ticket, options)
91 |                 list(reader)  # Consume the result
92 | 
93 |                 logger.info(f"Uploaded batch {batch_id} with {len(data['batch_id'])} rows")
94 |                 batch_id += 1
95 |                 time.sleep(2)  # Write every 2 seconds
96 | 
97 |             except Exception as e:
98 |                 logger.error(f"Error uploading batch {batch_id}: {str(e)}")
99 |                 time.sleep(1)  # Wait a bit before retrying on error
100 | 
101 |     except Exception as e:
102 |         logger.error(f"Error setting up table: {str(e)}")
103 |         raise
104 | 
105 | if __name__ == "__main__":
106 |     logger.info("Starting continuous data loader...")
107 |     client, options = connect_with_retry()
108 |     continuous_load(client, options)
109 | 
--------------------------------------------------------------------------------
/examples/flight_read.py:
--------------------------------------------------------------------------------
1 | from pyarrow.flight import FlightClient, Ticket, FlightCallOptions
2 | import json
3 | import pandas
4 | import tabulate
5 | 
6 | # Create a persistent table (if it doesn't exist yet) and read it back
7 | sql = """CREATE TABLE IF NOT EXISTS test AS SELECT version(), now(); SELECT * FROM test;"""
8 | 
9 | flight_ticket = Ticket(sql)
10 | 
11 | token = (b"authorization", b"user:persistence")
12 | options = FlightCallOptions(headers=[token])
13 | client = FlightClient("grpc://localhost:8815")
14 | 
15 | reader = client.do_get(flight_ticket, options)
16 | arrow_table = reader.read_all()
17 | # Use pyarrow and pandas to view and analyze data
18 | data_frame = arrow_table.to_pandas()
19 | print(data_frame.to_markdown())
20 | 
--------------------------------------------------------------------------------
/examples/flight_watch.py:
--------------------------------------------------------------------------------
1 | import pyarrow as pa
2 | import pyarrow.flight as flight
3 | import time
4 | from datetime import datetime
5 | import logging
6 | 
7 | # Configure logging
8 | logging.basicConfig(
9 |     level=logging.INFO,
10 |     format='%(asctime)s - %(levelname)s - %(message)s'
11 | )
12 | logger = logging.getLogger('flight_monitor')
13 | 
14 | def connect_with_retry(max_attempts=5):
15 |     """Connect to Flight server with retry logic"""
16 |     for attempt in range(max_attempts):
17 |         try:
18 |             # Create authentication headers for persistence
19 |             token = (b"authorization", b"user:persistence")
20 |             options = flight.FlightCallOptions(headers=[token])
21 | 
22 |             # Create client and test connection
23 |             client = flight.FlightClient("grpc://localhost:8815")
24 |             ticket = flight.Ticket("SELECT 1".encode())
25 |             reader = client.do_get(ticket, options)
26 |             list(reader)  # Consume the result
27 | 
28 |             logger.info("Successfully connected to Flight server with persistence")
29 |             return client, options
30 |         except Exception as e:
31 |             if attempt < max_attempts - 1:
32 |                 logger.warning(f"Connection attempt {attempt + 1} failed: {e}, retrying in 1 second...")
33 |                 time.sleep(1)
34 |             else:
35 |                 logger.error(f"Failed to connect after {max_attempts} attempts")
36 |                 raise
37 | 
38 | def monitor_table(client, options):
39 |     """Monitor the concurrent_test table"""
40 |     table_name = "concurrent_test"
41 | 
42 |     try:
43 |         while True:
44 |             try:
45 |                 # Get total count
46 |                 count_sql = f"SELECT COUNT(*) as total FROM {table_name}"
47 |                 ticket = flight.Ticket(count_sql.encode())
48 |                 reader = client.do_get(ticket, options)
49 |                 table = reader.read_all().to_pandas()
50 |                 total_count = table['total'][0]
51 | 
52 |                 # Get a random sample row (ORDER BY RANDOM picks an arbitrary row)
53 |                 sample_sql = f"""
54 |                 SELECT * FROM {table_name}
55 |                 ORDER BY RANDOM()
56 |                 LIMIT 1
57 |                 """
58 |                 ticket = flight.Ticket(sample_sql.encode())
59 |                 reader = client.do_get(ticket, options)
60 |                 sample = reader.read_all().to_pandas()
61 | 
62 |                 logger.info(f"Total rows: {total_count}")
63 |                 logger.info(f"Sample row:\n{sample.to_string()}")
64 |                 logger.info("-" * 50)
65 | 
66 |                 time.sleep(5)  # Check every 5 seconds
67 | 
68 |             except Exception as e:
69 |                 logger.error(f"Error monitoring table: {str(e)}")
70 |                 time.sleep(1)  # Wait a bit before retrying on error
71 | 
72 |     except KeyboardInterrupt:
73 |         logger.info("Monitoring stopped by user")
74 |     except Exception as e:
75 |         logger.error(f"Fatal error in monitoring: {str(e)}")
76 |         raise
77 | 
78 | if __name__ == "__main__":
79 |     logger.info("Starting table monitor...")
80 |     client, options = connect_with_retry()
81 |     monitor_table(client, options)
82 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import duckdb
4 | import json
5 | import time
6 | import tempfile
7 | import hashlib
8 | import base64
9 | import threading
10 | import msgpack
11 | from dataclasses import dataclass
12 | from typing import Optional, Dict, Any, List
13 | 
14 | from flask import Flask, request, jsonify
15 | from flask_httpauth import HTTPBasicAuth
16 | from flask_cors import CORS
17 | from cachetools import LRUCache
18 | import pyarrow as pa
19 | import pyarrow.flight as flight
20 | import zstandard as zstd
21 | 
22 | import signal
23 | import threading
24 | import logging
25 | import sys
26 | from threading import Lock
27 | 
28 | # Configure logging
29 | logging.basicConfig(
30 |     level=logging.DEBUG,
31 |     format='%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
32 |     handlers=[
33 |         logging.StreamHandler(sys.stdout),
34 |         logging.FileHandler('server.log')
35 |     ]
36 | )
37 | logger = logging.getLogger('server')
38 | # Global flag for server state
39 | running = True
40 | 
41 | 
42 | def signal_handler(signum, frame):
43 |     """Handle shutdown signals"""
44 |     global running
45 |     logger.info("Shutdown signal received")
46 |     running = False
47 | 
48 | 
49 | # Force Self-Service for UI
50 | os.environ['VITE_SELFSERVICE'] = 'true'
51 | # Default path for temp databases
52 | dbpath = os.getenv('DBPATH', '/tmp/')
53 | # Check for custom UI path
54 | custom_ui_path = os.getenv('CUSTOM_UI_PATH')
55 | 
56 | if custom_ui_path:
57 |     app = Flask(__name__, static_folder=custom_ui_path, static_url_path="")
58 | else:
59 |     app = Flask(__name__, static_folder="public", static_url_path="")
60 | 
61 | auth = HTTPBasicAuth()
62 | CORS(app)
63 | 
64 | # Initialize LRU Cache
65 | cache = LRUCache(maxsize=10)
66 | 
67 | # Thread-safe connection management: one persistent DuckDB connection per user
68 | from threading import Lock
69 | from typing import Dict, Optional
70 | 
71 | class ConnectionManager:
72 |     def __init__(self):
73 |         self._connections: Dict[str, duckdb.DuckDBPyConnection] = {}
74 |         self._lock = Lock()
75 | 
76 |         # Create default in-memory connection
77 |         self._default_conn = duckdb.connect(':memory:')
78 |         self._setup_extensions(self._default_conn)
79 | 
80 |     def _setup_extensions(self, conn: duckdb.DuckDBPyConnection):
81 |         """Set up required extensions for a connection"""
82 |         try:
83 |             conn.install_extension("chsql", repository="community")
84 |             conn.install_extension("chsql_native", repository="community")
conn.install_extension("chsql_native", repository="community") 85 | conn.load_extension("chsql") 86 | conn.load_extension("chsql_native") 87 | except Exception as e: 88 | logger.warning(f"Failed to initialize extensions: {e}") 89 | 90 | def get_connection(self, auth_hash: Optional[str] = None) -> duckdb.DuckDBPyConnection: 91 | """Get or create a connection for the given auth hash""" 92 | if not auth_hash: 93 | return self._default_conn 94 | 95 | with self._lock: 96 | if auth_hash not in self._connections: 97 | db_file = os.path.join(dbpath, f"{auth_hash}.db") 98 | logger.info(f'Creating new connection for {db_file}') 99 | conn = duckdb.connect(db_file) 100 | self._setup_extensions(conn) 101 | self._connections[auth_hash] = conn 102 | return self._connections[auth_hash] 103 | 104 | # Create global connection manager 105 | connection_manager = ConnectionManager() 106 | 107 | # Replace the global conn variable with a property that uses the connection manager 108 | def get_current_connection() -> duckdb.DuckDBPyConnection: 109 | """Get the current connection based on authentication""" 110 | auth = request.authorization if hasattr(request, 'authorization') else None 111 | if auth and auth.username and auth.password: 112 | user_pass_hash = hashlib.sha256((auth.username + auth.password).encode()).hexdigest() 113 | return connection_manager.get_connection(user_pass_hash) 114 | return connection_manager.get_connection() 115 | 116 | # Remove the global conn variable and replace with this property 117 | @property 118 | def conn(): 119 | return get_current_connection() 120 | 121 | 122 | @auth.verify_password 123 | def verify(username, password): 124 | if not (username and password): 125 | logger.debug('Using stateless session') 126 | return True 127 | 128 | logger.info(f"Using http auth: {username}:") 129 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 130 | # Just verify the connection exists/can be created 131 | connection_manager.get_connection(user_pass_hash) 132 | return True 133 | 134 | 135 | def convert_to_ndjson(result): 136 | columns = result.description 137 | data = result.fetchall() 138 | ndjson_lines = [] 139 | for row in data: 140 | row_dict = {columns[i][0]: row[i] for i in range(len(columns))} 141 | ndjson_lines.append(json.dumps(row_dict)) 142 | return '\n'.join(ndjson_lines).encode() 143 | 144 | 145 | def convert_to_clickhouse_jsoncompact(result, query_time): 146 | columns = result.description 147 | data = result.fetchall() 148 | meta = [{"name": col[0], "type": col[1]} for col in columns] 149 | json_result = { 150 | "meta": meta, 151 | "data": data, 152 | "rows": len(data), 153 | "rows_before_limit_at_least": len(data), 154 | "statistics": { 155 | "elapsed": query_time, 156 | "rows_read": len(data), 157 | "bytes_read": sum(len(str(item)) for row in data for item in row) 158 | } 159 | } 160 | return json.dumps(json_result) 161 | 162 | 163 | def convert_to_clickhouse_json(result, query_time): 164 | columns = result.description 165 | data = result.fetchall() 166 | meta = [{"name": col[0], "type": col[1]} for col in columns] 167 | data_list = [] 168 | for row in data: 169 | row_dict = {columns[i][0]: row[i] for i in range(len(columns))} 170 | data_list.append(row_dict) 171 | json_result = { 172 | "meta": meta, 173 | "data": data_list, 174 | "rows": len(data), 175 | "statistics": { 176 | "elapsed": query_time, 177 | "rows_read": len(data), 178 | "bytes_read": sum(len(str(item)) for row in data for item in row) 179 | } 180 | } 181 | return 
json.dumps(json_result) 182 | 183 | 184 | def convert_to_csv_tsv(result, delimiter=','): 185 | columns = result.description 186 | data = result.fetchall() 187 | lines = [] 188 | header = delimiter.join([col[0] for col in columns]) 189 | lines.append(header) 190 | for row in data: 191 | line = delimiter.join([str(item) for item in row]) 192 | lines.append(line) 193 | return '\n'.join(lines).encode() 194 | 195 | 196 | def handle_insert_query(query, format, data=None, conn=None): 197 | if conn is None: 198 | conn = get_current_connection() 199 | table_name = query.split("INTO")[1].split()[0].strip() 200 | temp_file_name = None 201 | if format.lower() == 'jsoneachrow' and data is not None: 202 | temp_file_name = save_to_tempfile(data) 203 | if temp_file_name: 204 | try: 205 | ingest_query = f"COPY {table_name} FROM '{temp_file_name}' (FORMAT 'json')" 206 | conn.execute(ingest_query) 207 | except Exception as e: 208 | return b"", str(e).encode() 209 | finally: 210 | os.remove(temp_file_name) 211 | return b"Ok", b"" 212 | 213 | 214 | def save_to_tempfile(data): 215 | temp_file = tempfile.NamedTemporaryFile( 216 | delete=False, mode='w+', encoding='utf-8') 217 | temp_file.write(data) 218 | temp_file.flush() 219 | temp_file.close() 220 | return temp_file.name 221 | 222 | 223 | def duckdb_query_with_errmsg(query, format='JSONCompact', data=None, request_method="GET"): 224 | try: 225 | # Get connection for current request 226 | current_conn = get_current_connection() 227 | 228 | if request_method == "POST" and query.strip().lower().startswith('insert into') and data: 229 | return handle_insert_query(query, format, data, current_conn) 230 | start_time = time.time() 231 | result = current_conn.execute(query) 232 | query_time = time.time() - start_time 233 | if format.lower() == 'jsoncompact': 234 | output = convert_to_clickhouse_jsoncompact(result, query_time) 235 | elif format.lower() == 'json': 236 | output = convert_to_clickhouse_json(result, query_time) 237 | elif format.lower() == 'jsoneachrow': 238 | output = convert_to_ndjson(result) 239 | elif format.lower() == 'tsv': 240 | output = convert_to_csv_tsv(result, delimiter='\t') 241 | elif format.lower() == 'csv': 242 | output = convert_to_csv_tsv(result, delimiter=',') 243 | else: 244 | output = result.fetchall() 245 | if isinstance(output, list): 246 | output = json.dumps(output).encode() 247 | return output, b"" 248 | except Exception as e: 249 | return b"", str(e).encode() 250 | 251 | 252 | def sanitize_query(query): 253 | pattern = re.compile(r"(?i)\s*FORMAT\s+(\w+)\s*") 254 | match = re.search(pattern, query) 255 | if match: 256 | format_value = match.group(1).lower() 257 | query = re.sub(pattern, ' ', query).strip() 258 | return query, format_value.lower() 259 | return query, None 260 | 261 | 262 | @app.route('/', methods=["GET", "HEAD"]) 263 | @auth.login_required 264 | def clickhouse(): 265 | query = request.args.get('query', default="", type=str) 266 | format = request.args.get( 267 | 'default_format', default="JSONCompact", type=str) 268 | database = request.args.get('database', default="", type=str) 269 | query_id = request.args.get('query_id', default=None, type=str) 270 | data = None 271 | query, sanitized_format = sanitize_query(query) 272 | if sanitized_format: 273 | format = sanitized_format 274 | print( 275 | f"Received request: method={request.method}, query={query}, format={format}, database={database}") 276 | if query_id is not None and not query: 277 | if query_id in cache: 278 | return cache[query_id], 200 279 | if not 
query:
280 |         return app.send_static_file('index.html')
281 |     if request.method == "POST":
282 |         data = request.get_data(as_text=True)
283 |     if database:
284 |         query = f"ATTACH '{database}' AS db; USE db; {query}"
285 |     result, errmsg = duckdb_query_with_errmsg(
286 |         query.strip(), format, data, request.method)
287 |     if query_id and len(errmsg) == 0:
288 |         cache[query_id] = result
289 |     if len(errmsg) == 0:
290 |         if request.method == "HEAD":
291 |             response = app.response_class(status=200)
292 |             response.headers['Content-Type'] = 'application/json'
293 |             response.headers['Accept-Ranges'] = 'bytes'
294 |             content_length = len(result) if isinstance(
295 |                 result, bytes) else len(result.decode())
296 |             response.headers['Content-Length'] = content_length
297 |             return response
298 |         return result, 200
299 |     if len(result) > 0:
300 |         print("warning:", errmsg)
301 |         return result, 200
302 |     print("Error occurred:", errmsg)
303 |     return errmsg, 400
304 | 
305 | 
306 | @app.route('/', methods=["POST"])
307 | @auth.login_required
308 | def play():
309 |     query = request.args.get('query', default=None, type=str)
310 |     body = request.get_data() or None
311 |     format = request.args.get(
312 |         'default_format', default="JSONCompact", type=str)
313 |     database = request.args.get('database', default="", type=str)
314 |     query_id = request.args.get('query_id', default=None, type=str)
315 |     if query_id is not None and not query:
316 |         if query_id in cache:
317 |             return cache[query_id], 200
318 |     if query is None:
319 |         query = ""
320 |     if body is not None:
321 |         data = " ".join(body.decode('utf-8').strip().splitlines())
322 |         query = f"{query} {data}"
323 |     if not query:
324 |         return "Error: no query parameter provided", 400
325 |     if database:
326 |         query = f"ATTACH '{database}' AS db; USE db; {query}"
327 |     query, sanitized_format = sanitize_query(query)
328 |     if sanitized_format:
329 |         format = sanitized_format
330 |     print("DEBUG POST", query, format)
331 |     result, errmsg = duckdb_query_with_errmsg(query.strip(), format)
332 |     if len(errmsg) == 0:
333 |         return result, 200
334 |     if len(result) > 0:
335 |         print("warning:", errmsg)
336 |         return result, 200
337 |     return errmsg, 400
338 | 
339 | 
340 | @app.route('/play', methods=["GET"])
341 | def handle_play():
342 |     return app.send_static_file('index.html')
343 | 
344 | 
345 | @app.route('/ping', methods=["GET"])
346 | def handle_ping():
347 |     return "Ok", 200
348 | 
349 | 
350 | @app.errorhandler(404)
351 | def handle_404(e):
352 |     return app.send_static_file('index.html')
353 | 
354 | 
355 | host = os.getenv('HOST', '0.0.0.0')
356 | port = int(os.getenv('PORT', 8123))
357 | flight_host = os.getenv('FLIGHT_HOST', 'localhost')
358 | flight_port = int(os.getenv('FLIGHT_PORT', 8815))
359 | path = os.getenv('DATA', '.duckdb_data')
360 | 
361 | def parse_ticket(ticket):
362 |     try:
363 |         # Try to decode the ticket as a JSON object
364 |         ticket_obj = json.loads(ticket.ticket.decode("utf-8"))
365 |         if isinstance(ticket_obj, str):
366 |             # If the JSON object is a string, parse it again
367 |             ticket_obj = json.loads(ticket_obj)
368 |         if "query" in ticket_obj:
369 |             return ticket_obj["query"]
370 |     except (json.JSONDecodeError, AttributeError):
371 |         pass
372 |     # If decoding fails or "query" is missing, return the raw ticket as a string
373 |     return ticket.ticket.decode("utf-8")
374 | 
375 | @dataclass
376 | class FlightSchemaMetadata:
377 |     type: str
378 |     catalog: str
379 |     schema: str
380 |     name: str
381 |     comment: Optional[str]
382 |     input_schema: pa.Schema
383 |     description: 
Optional[str] = None 384 | action_name: Optional[str] = None 385 | 386 | def serialize(self) -> bytes: 387 | metadata = { 388 | 'type': self.type, 389 | 'catalog': self.catalog, 390 | 'schema': self.schema, 391 | 'name': self.name, 392 | 'comment': self.comment, 393 | 'input_schema': self.input_schema.serialize().to_pybytes() 394 | } 395 | if self.description: 396 | metadata['description'] = self.description 397 | if self.action_name: 398 | metadata['action_name'] = self.action_name 399 | return msgpack.packb(metadata) 400 | 401 | @classmethod 402 | def deserialize(cls, data: bytes) -> 'FlightSchemaMetadata': 403 | metadata = msgpack.unpackb(data) 404 | metadata['input_schema'] = pa.ipc.read_schema(metadata['input_schema']) 405 | return cls(**metadata) 406 | 407 | 408 | # Add this helper class for schema serialization 409 | @dataclass 410 | class SerializedSchema: 411 | schema: str 412 | description: str 413 | tags: Dict[str, str] 414 | contents: Dict[str, Optional[str]] 415 | type: str 416 | 417 | def to_dict(self) -> Dict: 418 | return { 419 | "schema": self.schema, 420 | "description": self.description, 421 | "tags": self.tags, 422 | "type": self.type, 423 | "contents": { 424 | "url": None, 425 | "sha256": None, 426 | "serialized": None 427 | } 428 | } 429 | 430 | 431 | # Patch the main function where the ticket is processed 432 | if __name__ == '__main__': 433 | # Set up signal handlers 434 | signal.signal(signal.SIGINT, signal_handler) 435 | 436 | def run_flask(): 437 | """Run Flask server""" 438 | logger.info("Starting Flask server") 439 | try: 440 | app.run(host=host, port=port, use_reloader=False) 441 | except Exception as e: 442 | logger.exception("Flask server error") 443 | finally: 444 | logger.info("Flask server stopped") 445 | 446 | def run_flight_server(): 447 | """Run Flight server""" 448 | class HeaderMiddleware(flight.ServerMiddleware): 449 | def __init__(self): 450 | self.authorization = None 451 | self.headers = {} # Store all headers 452 | 453 | def call_completed(self, exception=None): 454 | pass 455 | 456 | class HeaderMiddlewareFactory(flight.ServerMiddlewareFactory): 457 | def start_call(self, info, headers): 458 | logger.debug(f"Info received: {info}") 459 | logger.debug(f"Headers received: {headers}") 460 | middleware = HeaderMiddleware() 461 | 462 | # Store all headers in the middleware 463 | middleware.headers = headers 464 | 465 | if "authorization" in headers: 466 | # Get first value from list 467 | auth = headers["authorization"][0] 468 | auth = auth[7:] if auth.startswith('Bearer ') else auth 469 | middleware.authorization = auth 470 | 471 | return middleware 472 | 473 | class DuckDBFlightServer(flight.FlightServerBase): 474 | def __init__(self, location=f"grpc://{flight_host}:{flight_port}", db_path=":memory:"): 475 | middleware = {"auth": HeaderMiddlewareFactory()} 476 | super().__init__(location=location, middleware=middleware) 477 | self._location = location 478 | logger.info(f"Initializing Flight server at {location}") 479 | self.conn = duckdb.connect(db_path) 480 | 481 | # Define schema for catalog listing 482 | catalog_schema = pa.schema([ 483 | ('catalog_name', pa.string()), 484 | ('schema_name', pa.string()), 485 | ('description', pa.string()) 486 | ]) 487 | 488 | # Define schema for table listing 489 | table_schema = pa.schema([ 490 | ('table_name', pa.string()), 491 | ('schema_name', pa.string()), 492 | ('catalog_name', pa.string()), 493 | ('table_type', pa.string()) 494 | ]) 495 | 496 | self.flights = [ 497 | { 498 | "command": "show_databases", 
499 | "ticket": flight.Ticket("SHOW DATABASES".encode("utf-8")), 500 | "location": [self._location], 501 | "schema": catalog_schema 502 | }, 503 | { 504 | "command": "show_tables", 505 | "ticket": flight.Ticket("SHOW TABLES".encode("utf-8")), 506 | "location": [self._location], 507 | "schema": table_schema 508 | }, 509 | { 510 | "command": "show_version", 511 | "ticket": flight.Ticket("SELECT version()".encode("utf-8")), 512 | "location": [self._location], 513 | "schema": pa.schema([('version', pa.string())]) 514 | }, 515 | { 516 | "command": "list_schemas", 517 | "ticket": flight.Ticket("SHOW ALL TABLES".encode("utf-8")), 518 | "location": [self._location], 519 | "schema": table_schema 520 | } 521 | ] 522 | 523 | def _get_connection_from_context(self, context) -> duckdb.DuckDBPyConnection: 524 | """Get the appropriate connection based on Flight context""" 525 | middleware = context.get_middleware("auth") 526 | if middleware and middleware.authorization: 527 | auth_header = middleware.authorization 528 | if isinstance(auth_header, str): 529 | if ':' in auth_header: 530 | username, password = auth_header.split(':', 1) 531 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 532 | else: 533 | user_pass_hash = auth_header 534 | return connection_manager.get_connection(user_pass_hash) 535 | return connection_manager.get_connection() 536 | 537 | def do_action(self, context, action): 538 | """Handle Flight actions""" 539 | logger.debug(f"Action Request: {action}") 540 | 541 | if action.type == "list_schemas": 542 | try: 543 | # Parse the request body 544 | body = json.loads(action.body.to_pybytes().decode('utf-8')) 545 | catalog_name = body.get("catalog_name", "main") 546 | 547 | # Query schemas from DuckDB 548 | query = """ 549 | SELECT 550 | schema_name as schema, 551 | 'DuckDB Schema' as description, 552 | '{}' as tags, 553 | 'table' as type 554 | FROM information_schema.schemata 555 | WHERE catalog_name = ? 
556 | """ 557 | result = self.conn.execute(query, [catalog_name]).fetchall() 558 | 559 | # Convert results to SerializedSchema objects 560 | schemas = [] 561 | for row in result: 562 | schema = SerializedSchema( 563 | schema=catalog_name, 564 | description=row[1], 565 | tags=json.loads(row[2]), 566 | contents={"url": None, "sha256": None, "serialized": None}, 567 | type=row[3] 568 | ) 569 | schemas.append(schema.to_dict()) 570 | 571 | # Create the catalog root structure 572 | catalog_root = { 573 | "contents": { 574 | "url": None, 575 | "sha256": None, 576 | "serialized": None 577 | }, 578 | "schemas": schemas 579 | } 580 | 581 | # Serialize with msgpack 582 | packed_data = msgpack.packb(catalog_root) 583 | 584 | # Compress with zstd 585 | compressor = zstd.ZstdCompressor() 586 | compressed_data = compressor.compress(packed_data) 587 | 588 | # Create result with decompressed length and compressed data 589 | decompressed_length = len(packed_data) 590 | length_bytes = decompressed_length.to_bytes(4, byteorder='little') 591 | 592 | # Return results as flight.Result objects 593 | yield flight.Result(pa.py_buffer(length_bytes)) 594 | yield flight.Result(pa.py_buffer(compressed_data)) 595 | 596 | except Exception as e: 597 | logger.exception("Error in list_schemas action") 598 | raise flight.FlightUnavailableError(f"Failed to list schemas: {str(e)}") 599 | 600 | elif action.type == "create_schema": 601 | try: 602 | # Set up authenticated connection first 603 | middleware = context.get_middleware("auth") 604 | if middleware and middleware.authorization: 605 | auth_header = middleware.authorization 606 | logger.info(f"Using authorization from middleware: {auth_header}") 607 | if isinstance(auth_header, str): 608 | if ':' in auth_header: 609 | username, password = auth_header.split(':', 1) 610 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 611 | else: 612 | user_pass_hash = auth_header 613 | 614 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 615 | logger.info(f'Using database file: {db_file}') 616 | self.conn = duckdb.connect(db_file) 617 | 618 | # Try msgpack first 619 | try: 620 | body = msgpack.unpackb(action.body.to_pybytes()) 621 | except: 622 | # Fall back to UTF-8 if msgpack fails 623 | body = action.body.to_pybytes().decode('utf-8') 624 | 625 | # Extract schema name from the full path (e.g., deltalake.test1 -> test1) 626 | schema_name = body.split('.')[-1] if '.' 
in body else body 627 | 628 | # Create schema in the authenticated database 629 | query = f"CREATE SCHEMA IF NOT EXISTS {schema_name}" 630 | logger.debug(f"Creating schema with query: {query}") 631 | self.conn.execute(query) 632 | 633 | except Exception as e: 634 | logger.exception("Error in create_schema action") 635 | raise flight.FlightUnavailableError(f"Failed to create schema: {str(e)}") 636 | 637 | elif action.type == "create_table": 638 | try: 639 | # Set up authenticated connection first 640 | middleware = context.get_middleware("auth") 641 | if middleware and middleware.authorization: 642 | auth_header = middleware.authorization 643 | logger.info(f"Using authorization from middleware: {auth_header}") 644 | if isinstance(auth_header, str): 645 | if ':' in auth_header: 646 | username, password = auth_header.split(':', 1) 647 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 648 | else: 649 | user_pass_hash = auth_header 650 | 651 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 652 | logger.info(f'Using database file: {db_file}') 653 | self.conn = duckdb.connect(db_file) 654 | 655 | # Get the raw bytes and parse table info 656 | body_bytes = action.body.to_pybytes() 657 | logger.debug(f"Raw table creation bytes: {body_bytes.hex()}") 658 | 659 | try: 660 | # Parse Arrow IPC format 661 | reader = pa.ipc.open_stream(pa.py_buffer(body_bytes)) 662 | table = reader.read_all() 663 | 664 | logger.debug(f"Arrow schema: {table.schema}") 665 | logger.debug(f"Column names: {table.column_names}") 666 | 667 | # Get metadata from schema 668 | schema_metadata = table.schema.metadata 669 | catalog_name = schema_metadata.get(b'catalog_name', b'').decode('utf-8') 670 | schema_name = schema_metadata.get(b'schema_name', b'').decode('utf-8') 671 | table_name = schema_metadata.get(b'table_name', b'').decode('utf-8') 672 | 673 | # Extract actual schema name (e.g., test1 from deltalake.test1) 674 | actual_schema = schema_name.split('.')[-1] if '.' in schema_name else schema_name 675 | 676 | # Get columns from schema 677 | columns = [] 678 | for field in table.schema: 679 | columns.append({ 680 | 'name': field.name, 681 | 'type': self._arrow_to_duckdb_type(field.type) 682 | }) 683 | 684 | logger.debug(f"Parsed metadata - catalog: {catalog_name}, schema: {schema_name}, table: {table_name}") 685 | logger.debug(f"Columns: {columns}") 686 | 687 | if not actual_schema or not table_name: 688 | raise flight.FlightUnavailableError( 689 | f"Missing schema_name or table_name in request. 
Found catalog={catalog_name}, schema={schema_name}, table={table_name}") 690 | 691 | column_defs = [] 692 | for col in columns: 693 | name = col.get('name') 694 | type_ = col.get('type') 695 | if not name or not type_: 696 | raise flight.FlightUnavailableError(f"Invalid column definition: {col}") 697 | column_defs.append(f"{name} {type_}") 698 | 699 | # Create table in the authenticated database 700 | query = f"""CREATE TABLE IF NOT EXISTS {actual_schema}.{table_name} ( 701 | {', '.join(column_defs)} 702 | )""" 703 | 704 | logger.debug(f"Creating table with query: {query}") 705 | self.conn.execute(query) 706 | 707 | # Create and return FlightInfo for the newly created table 708 | schema_metadata = FlightSchemaMetadata( 709 | type="table", 710 | catalog=catalog_name, 711 | schema=schema_name, 712 | name=table_name, 713 | comment=None, 714 | input_schema=table.schema 715 | ) 716 | 717 | flight_info = flight.FlightInfo( 718 | table.schema, 719 | flight.FlightDescriptor.for_path(table_name.encode()), 720 | [flight.FlightEndpoint( 721 | ticket=flight.Ticket( 722 | f"SELECT * FROM {catalog_name}.{schema_name}.{table_name}".encode() 723 | ), 724 | locations=[self._location] 725 | )], 726 | -1, # total_records 727 | -1, # total_bytes 728 | schema_metadata.serialize() 729 | ) 730 | 731 | yield flight.Result(flight_info.serialize()) 732 | 733 | except Exception as e: 734 | logger.exception("Failed to parse Arrow IPC data") 735 | raise flight.FlightUnavailableError(f"Invalid Arrow IPC data in request: {str(e)}") 736 | 737 | except Exception as e: 738 | logger.exception("Error in create_table action") 739 | raise flight.FlightUnavailableError(f"Failed to create table: {str(e)}") 740 | 741 | else: 742 | raise flight.FlightUnavailableError(f"Action '{action.type}' not implemented") 743 | 744 | def do_get(self, context, ticket): 745 | """Handle 'GET' requests""" 746 | logger.debug("do_get called") 747 | try: 748 | # Access middleware and set up connection 749 | middleware = context.get_middleware("auth") 750 | if middleware and middleware.authorization: 751 | auth_header = middleware.authorization 752 | logger.info(f"Using authorization from middleware: {auth_header}") 753 | if isinstance(auth_header, str): 754 | if ':' in auth_header: 755 | username, password = auth_header.split(':', 1) 756 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 757 | else: 758 | user_pass_hash = auth_header 759 | 760 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 761 | logger.info(f'Using database file: {db_file}') 762 | self.conn = duckdb.connect(db_file) 763 | 764 | except Exception as e: 765 | logger.debug(f"Middleware access error: {e}") 766 | 767 | query = parse_ticket(ticket) 768 | 769 | # Rewrite query to use local schema instead of deltalake catalog 770 | if query.lower().startswith("select"): 771 | # Extract schema and table from deltalake.schema.table pattern 772 | parts = query.split() 773 | for i, part in enumerate(parts): 774 | if "deltalake." 
in part.lower(): 775 | # Remove the catalog prefix, keeping schema and table 776 | parts[i] = part.split(".", 1)[1] 777 | query = " ".join(parts) 778 | 779 | logger.info(f"Executing query: {query}") 780 | try: 781 | result_table = self.conn.execute(query).fetch_arrow_table() 782 | batches = result_table.to_batches(max_chunksize=1024) 783 | if not batches: 784 | logger.debug("No data in result") 785 | schema = result_table.schema 786 | return flight.RecordBatchStream(pa.Table.from_batches([], schema)) 787 | logger.debug(f"Returning {len(batches)} batches") 788 | return flight.RecordBatchStream(pa.Table.from_batches(batches)) 789 | except Exception as e: 790 | logger.exception(f"Query execution error: {str(e)}") 791 | raise 792 | 793 | def do_put(self, context, descriptor, reader, writer): 794 | """Handle 'PUT' requests""" 795 | table = reader.read_all() 796 | table_name = descriptor.path[0].decode('utf-8') 797 | self.conn.register("temp_table", table) 798 | self.conn.execute( 799 | f"INSERT INTO {table_name} SELECT * FROM temp_table") 800 | 801 | def get_flight_info(self, context, descriptor): 802 | """Implement 'get_flight_info'""" 803 | try: 804 | # Set up authenticated connection 805 | middleware = context.get_middleware("auth") 806 | if middleware and middleware.authorization: 807 | auth_header = middleware.authorization 808 | logger.info(f"Using authorization from middleware: {auth_header}") 809 | if isinstance(auth_header, str): 810 | if ':' in auth_header: 811 | username, password = auth_header.split(':', 1) 812 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 813 | else: 814 | user_pass_hash = auth_header 815 | 816 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 817 | logger.info(f'Using database file: {db_file}') 818 | self.conn = duckdb.connect(db_file) 819 | 820 | if descriptor.command is not None: 821 | query = descriptor.command.decode("utf-8") 822 | result_table = self.conn.execute(query).fetch_arrow_table() 823 | schema = result_table.schema 824 | endpoints = [flight.FlightEndpoint( 825 | ticket=flight.Ticket(query.encode("utf-8")), 826 | locations=[self._location] 827 | )] 828 | return flight.FlightInfo(schema, descriptor, endpoints, -1, -1) 829 | elif descriptor.path is not None: 830 | for flight_info in self.flights: 831 | if descriptor.path[0].decode("utf-8") == flight_info["command"]: 832 | query = flight_info["ticket"].ticket.decode("utf-8") 833 | logger.info(f"Attempting flight with query: {query}") 834 | try: 835 | result_table = self.conn.execute(query).fetch_arrow_table() 836 | schema = result_table.schema 837 | endpoints = [flight.FlightEndpoint( 838 | ticket=flight.Ticket(query.encode("utf-8")), 839 | locations=[self._location] 840 | )] 841 | return flight.FlightInfo(schema, descriptor, endpoints, -1, -1) 842 | 843 | except Exception as e: 844 | logger.exception(f"Flight execution error: {str(e)}") 845 | raise flight.FlightUnavailableError("Failed taking off") 846 | else: 847 | raise flight.FlightUnavailableError( 848 | "No command or path provided in the descriptor") 849 | except Exception as e: 850 | logger.exception("Error in get_flight_info") 851 | raise flight.FlightUnavailableError(f"Failed to get flight info: {str(e)}") 852 | 853 | def list_flights(self, context, criteria): 854 | """List available flights with metadata""" 855 | logger.info("Listing available flights") 856 | 857 | try: 858 | # Set up authenticated connection 859 | middleware = context.get_middleware("auth") 860 | if middleware and 
middleware.authorization: 861 | auth_header = middleware.authorization 862 | logger.info(f"Using authorization from middleware: {auth_header}") 863 | if isinstance(auth_header, str): 864 | if ':' in auth_header: 865 | username, password = auth_header.split(':', 1) 866 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 867 | else: 868 | user_pass_hash = auth_header 869 | 870 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 871 | logger.info(f'Using database file: {db_file}') 872 | self.conn = duckdb.connect(db_file) 873 | 874 | headers = middleware.headers if middleware else {} 875 | catalog_filter = None 876 | schema_filter = None 877 | 878 | # Extract filters from headers 879 | if "airport-list-flights-filter-catalog" in headers: 880 | catalog_filter = headers["airport-list-flights-filter-catalog"][0] 881 | if "airport-list-flights-filter-schema" in headers: 882 | schema_filter = headers["airport-list-flights-filter-schema"][0] 883 | 884 | logger.debug(f"Filtering flights - catalog: {catalog_filter}, schema: {schema_filter}") 885 | 886 | if catalog_filter and schema_filter: 887 | # Query for tables in the specific catalog and schema 888 | query = f""" 889 | SELECT 890 | table_name, 891 | table_schema as schema_name, 892 | table_catalog as catalog_name, 893 | table_type, 894 | column_name, 895 | data_type 896 | FROM information_schema.tables 897 | JOIN information_schema.columns USING (table_catalog, table_schema, table_name) 898 | WHERE table_catalog = '{catalog_filter}' 899 | AND table_schema = '{schema_filter}' 900 | ORDER BY table_name, ordinal_position 901 | """ 902 | 903 | try: 904 | result = self.conn.execute(query).fetchall() 905 | 906 | # Group results by table 907 | tables = {} 908 | for row in result: 909 | table_name = row[0] 910 | if table_name not in tables: 911 | tables[table_name] = { 912 | 'schema_name': row[1], 913 | 'catalog_name': row[2], 914 | 'table_type': row[3], 915 | 'columns': [] 916 | } 917 | tables[table_name]['columns'].append({ 918 | 'name': row[4], 919 | 'type': row[5] 920 | }) 921 | 922 | # Create flight info for each table 923 | for table_name, table_info in tables.items(): 924 | # Create Arrow schema from columns 925 | fields = [] 926 | for col in table_info['columns']: 927 | # Convert DuckDB type to Arrow type 928 | arrow_type = pa.string() # Default to string 929 | if 'INT' in col['type'].upper(): 930 | arrow_type = pa.int64() 931 | elif 'DOUBLE' in col['type'].upper() or 'FLOAT' in col['type'].upper(): 932 | arrow_type = pa.float64() 933 | elif 'BOOLEAN' in col['type'].upper(): 934 | arrow_type = pa.bool_() 935 | fields.append(pa.field(col['name'], arrow_type)) 936 | 937 | schema = pa.schema(fields) 938 | 939 | # Create metadata for the table 940 | schema_metadata = FlightSchemaMetadata( 941 | type="table", 942 | catalog=table_info['catalog_name'], 943 | schema=table_info['schema_name'], 944 | name=table_name, 945 | comment=None, 946 | input_schema=schema 947 | ) 948 | 949 | # Create flight info 950 | flight_info = flight.FlightInfo( 951 | schema, 952 | flight.FlightDescriptor.for_path([table_name.encode()]), 953 | [flight.FlightEndpoint( 954 | ticket=flight.Ticket( 955 | f"SELECT * FROM {table_info['catalog_name']}.{table_info['schema_name']}.{table_name}".encode() 956 | ), 957 | locations=[self._location] 958 | )], 959 | -1, # total_records 960 | -1, # total_bytes 961 | schema_metadata.serialize() 962 | ) 963 | 964 | yield flight_info 965 | 966 | except Exception as e: 967 | logger.exception(f"Error querying tables: 
{str(e)}") 968 | raise flight.FlightUnavailableError(f"Failed to list tables: {str(e)}") 969 | 970 | else: 971 | # Return default flights when no specific filters 972 | for flight_info in self.flights: 973 | schema_metadata = FlightSchemaMetadata( 974 | type="table", 975 | catalog="main", 976 | schema="public", 977 | name=flight_info["command"], 978 | comment=None, 979 | input_schema=flight_info["schema"] 980 | ) 981 | 982 | yield flight_info 983 | 984 | except Exception as e: 985 | logger.exception("Error in list_flights") 986 | raise flight.FlightUnavailableError(f"Failed to list flights: {str(e)}") 987 | 988 | def _arrow_to_duckdb_type(self, arrow_type): 989 | """Convert Arrow type to DuckDB type""" 990 | if pa.types.is_string(arrow_type): 991 | return 'VARCHAR' 992 | elif pa.types.is_int32(arrow_type): 993 | return 'INTEGER' 994 | elif pa.types.is_int64(arrow_type): 995 | return 'BIGINT' 996 | elif pa.types.is_float32(arrow_type): 997 | return 'FLOAT' 998 | elif pa.types.is_float64(arrow_type): 999 | return 'DOUBLE' 1000 | elif pa.types.is_boolean(arrow_type): 1001 | return 'BOOLEAN' 1002 | elif pa.types.is_list(arrow_type): 1003 | return f'{self._arrow_to_duckdb_type(arrow_type.value_type)}[]' 1004 | else: 1005 | return 'VARCHAR' # Default to VARCHAR for unknown types 1006 | 1007 | def do_exchange(self, context, descriptor, reader, writer): 1008 | """Handle data exchange (PUT/INSERT operations)""" 1009 | logger.debug("do_exchange called") 1010 | try: 1011 | # Get headers from middleware 1012 | middleware = context.get_middleware("auth") 1013 | headers = middleware.headers if middleware else {} 1014 | 1015 | # Set up authenticated connection 1016 | if middleware and middleware.authorization: 1017 | auth_header = middleware.authorization 1018 | logger.info(f"Using authorization from middleware: {auth_header}") 1019 | if isinstance(auth_header, str): 1020 | if ':' in auth_header: 1021 | username, password = auth_header.split(':', 1) 1022 | user_pass_hash = hashlib.sha256((username + password).encode()).hexdigest() 1023 | else: 1024 | user_pass_hash = auth_header 1025 | 1026 | db_file = os.path.join(dbpath, f"{user_pass_hash}.db") 1027 | logger.info(f'Using database file: {db_file}') 1028 | self.conn = duckdb.connect(db_file) 1029 | 1030 | # Get operation type from headers 1031 | operation = headers.get("airport-operation", [None])[0] 1032 | logger.debug(f"Exchange operation: {operation}") 1033 | 1034 | if operation == "insert": 1035 | # Get table path from headers 1036 | table_path = headers.get("airport-flight-path", [None])[0] 1037 | if not table_path: 1038 | raise flight.FlightUnavailableError("No table path provided for insert operation") 1039 | 1040 | logger.debug(f"Inserting into table: {table_path}") 1041 | 1042 | try: 1043 | # Read schema from reader 1044 | schema = reader.schema 1045 | logger.debug(f"Received schema: {schema}") 1046 | 1047 | # Create response schema early 1048 | response_schema = pa.schema([('rows_inserted', pa.int64())]) 1049 | writer.begin(response_schema) 1050 | 1051 | # Process data in batches 1052 | total_rows = 0 1053 | batch_num = 0 1054 | 1055 | # Read all batches 1056 | try: 1057 | while True: 1058 | try: 1059 | batch, metadata = reader.read_chunk() 1060 | if batch is None: 1061 | break 1062 | 1063 | batch_num += 1 1064 | logger.debug(f"Processing batch {batch_num} with {len(batch)} rows") 1065 | 1066 | # Create temporary table for this batch 1067 | temp_table = pa.Table.from_batches([batch]) 1068 | temp_name = f"temp_insert_table_{batch_num}" 1069 | 
1070 |                                     # Register and insert this batch
1071 |                                     self.conn.register(temp_name, temp_table)
1072 |                                     # table_path already names the target (e.g. "test1.people"), so use it directly
1073 |                                     query = f"INSERT INTO {table_path} SELECT * FROM {temp_name}"
1074 |                                     logger.debug(f"Executing insert query: {query}")
1075 |                                     self.conn.execute(query)
1076 | 
1077 |                                     total_rows += len(batch)
1078 | 
1079 |                                 except StopIteration:
1080 |                                     logger.debug("Reached end of input stream")
1081 |                                     break
1082 |                                 except Exception as e:
1083 |                                     logger.exception(f"Error reading batch")
1084 |                                     raise
1085 | 
1086 |                         logger.debug(f"Inserted total of {total_rows} rows")
1087 | 
1088 |                         # Write response
1089 |                         response_table = pa.Table.from_pylist(
1090 |                             [{'rows_inserted': total_rows}],
1091 |                             schema=response_schema
1092 |                         )
1093 |                         writer.write_table(response_table)
1094 |                         writer.close()
1095 | 
1096 |                     except Exception as e:
1097 |                         logger.exception("Error during insert operation")
1098 |                         raise flight.FlightUnavailableError(f"Insert operation failed: {str(e)}")
1099 | 
1100 |                 else:
1101 |                     raise flight.FlightUnavailableError(f"Unsupported operation: {operation}")
1102 | 
1103 |             except Exception as e:
1104 |                 logger.exception("Error in do_exchange")
1105 |                 raise flight.FlightUnavailableError(f"Exchange operation failed: {str(e)}")
1106 | 
1107 |         server = DuckDBFlightServer()
1108 |         logger.info(
1109 |             f"Starting DuckDB Flight server on {flight_host}:{flight_port}")
1110 |         server.serve()
1111 | 
1112 |     # Start Flask server in a daemon thread
1113 |     flask_thread = threading.Thread(target=run_flask, daemon=True)
1114 |     flask_thread.start()
1115 | 
1116 |     # Run Flight server in a daemon thread as well
1117 |     flight_thread = threading.Thread(target=run_flight_server, daemon=True)
1118 |     flight_thread.start()
1119 | 
1120 |     # Keep main thread alive until signal
1121 |     try:
1122 |         while running:
1123 |             time.sleep(1)
1124 |     except KeyboardInterrupt:
1125 |         logger.info("KeyboardInterrupt received")
1126 |     finally:
1127 |         logger.info("Shutting down...")
1128 | 
--------------------------------------------------------------------------------
/public/index.html:
--------------------------------------------------------------------------------
[content not recoverable from this dump: index.html is the single-page "quackpipe" SQL query UI — a query editor with a run control ("Ctrl/Cmd+Enter"), a results pane, and a 🌑/🌞 theme toggle; its inline markup, styles, and scripts were lost in extraction]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask
2 | flask_httpauth
3 | flask-cors
4 | cachetools
5 | duckdb
6 | msgpack
7 | PyArrow
8 | zstandard
9 | 
--------------------------------------------------------------------------------