├── .gitignore
├── README.md
├── duck_iceberg_demo
│   ├── __init__.py
│   ├── deploy_duckdb_aws
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── config.py
│   │   ├── main.py
│   │   └── requirements.txt
│   ├── deploy_duckdb_gcp
│   │   ├── README.md
│   │   ├── build_cloud_run.sh
│   │   ├── config.py
│   │   ├── main.py
│   │   └── requirements.txt
│   ├── duck_iceberg_demo.ipynb
│   ├── local_duck_cloud_aws_iceberg_demo.ipynb
│   └── local_duck_cloud_gcp_iceberg_demo.ipynb
├── poetry.lock
├── pyproject.toml
└── tests
    └── __init__.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .venv/
2 | /duck_iceberg_demo/deploy_duckdb_gcp/**.ipynb
3 | /duck_iceberg_demo/deploy_duckdb_aws/**.ipynb
4 | **/__pycache__/
5 | **/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # This repo contains code referenced in the following blog posts
2 | - [**How We Migrated Our Data Warehouse from Snowflake to DuckDB**](https://www.definite.app/blog/duckdb-datawarehouse)
3 | - [**Why Databricks paid $1B for a 40 person startup (Tabular)**](https://www.definite.app/blog/databricks-tabular-acquisition)
4 | - [**Comparing Iceberg Query Engines**](https://www.definite.app/blog/iceberg-query-engine)
5 | - [**Running Iceberg and Serverless DuckDB in Google Cloud**](https://www.definite.app/blog/cloud-iceberg-duckdb)
6 | - [**Running Iceberg and Serverless DuckDB in AWS**](https://www.definite.app/blog/cloud-iceberg-duckdb-aws)
7 | 
8 | 
9 | # Installation
10 | To install all dependencies, run:
11 | ```
12 | poetry install
13 | ```
14 | 
15 | # DuckDB Flask App
16 | A simple Flask app for running DuckDB in GCP and AWS can be found in `duck_iceberg_demo/deploy_duckdb_gcp` and `duck_iceberg_demo/deploy_duckdb_aws`.
--------------------------------------------------------------------------------
/duck_iceberg_demo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steven-luabase/duckdb-iceberg-demo/e2d1751f000e2b6cb740f9d977538db9c40b27e0/duck_iceberg_demo/__init__.py
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | WORKDIR /project
4 | 
5 | COPY . .
6 | 
7 | RUN pip install -r requirements.txt
8 | 
9 | # Expose the Flask app port from the container to the host
10 | EXPOSE 5000
11 | 
12 | # Start the application
13 | CMD ["python", "main.py"]
14 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/README.md:
--------------------------------------------------------------------------------
1 | # DuckDB AWS Flask App
2 | This directory contains a Flask app that runs SQL queries with DuckDB against files in AWS S3.
3 | 
4 | 
5 | ## Configuration Setup
6 | To set up, edit `config.py`:
7 | ```
8 | ACCESS_KEY = ""
9 | SECRET_KEY_NAME = ""
10 | S3_BUCKET_REGION = ""
11 | SECRET_REGION = ""
12 | ```
13 | Get your AWS Access and Secret Keys. Store your Secret Key in AWS [Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/create_secret.html) and put the name of the secret in `config.py` for the `SECRET_KEY_NAME` variable.
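`main.py` parses the stored secret string as JSON and reads its `secret_key` field, so the secret's value should be saved in a shape like this (placeholder value):
```
{"secret_key": "<YOUR_AWS_SECRET_ACCESS_KEY>"}
```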
14 | 
15 | ## Running The App Locally
16 | You may need to authenticate with the AWS CLI first:
17 | ```
18 | aws sso login
19 | ```
20 | #### Using Flask
21 | `python main.py`
22 | #### In Docker
23 | Build with the linux/amd64 architecture to be compatible with AWS ECS.
24 | ```
25 | docker buildx build --platform=linux/amd64 -t duckdb-deploy .
26 | ```
27 | ```
28 | docker run -p 5000:5000 -v ~/.aws:/root/.aws -it duckdb-deploy
29 | ```
30 | 
31 | ## Deploy the app to AWS ECS
32 | See blog post: [**Running Iceberg and Serverless DuckDB in AWS**](https://www.definite.app/blog/cloud-iceberg-duckdb-aws)
33 | 
34 | ## Query Files in S3
35 | ```
36 | import requests
37 | 
38 | sql = f'''
39 | select
40 |     count(*)
41 | from read_parquet('s3://<your_bucket>/some_data.parquet');
42 | '''
43 | 
44 | url = 'http://127.0.0.1:5000/query'  # if running locally
45 | query = {
46 |     "query": sql
47 | }
48 | response = requests.post(url, params=query)
49 | response.json()
50 | ```
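The endpoint wraps the rows from DuckDB's `fetchall()` in JSON, so a successful response to the query above has a shape like:
```
{"result": [[41994806]]}
```
Errors are returned as `{"error": "..."}` with a 400 or 500 status code.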
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/config.py:
--------------------------------------------------------------------------------
1 | ACCESS_KEY = ""
2 | SECRET_KEY_NAME = ""
3 | S3_BUCKET_REGION = ""
4 | SECRET_REGION = ""
5 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/main.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | from flask import Flask, request, jsonify
3 | import boto3
4 | from botocore.exceptions import ClientError
5 | import config
6 | import json
7 | 
8 | 
9 | 
10 | app = Flask(__name__)
11 | 
12 | # get a secret from AWS Secrets Manager
13 | def get_secret(secret_name, region_name):
14 |     session = boto3.session.Session()
15 |     client = session.client(
16 |         service_name='secretsmanager',
17 |         region_name=region_name
18 |     )
19 | 
20 |     try:
21 |         get_secret_value_response = client.get_secret_value(
22 |             SecretId=secret_name
23 |         )
24 |     except ClientError as e:
25 |         raise e
26 | 
27 |     secret = json.loads(get_secret_value_response['SecretString'])
28 |     return secret.get('secret_key')
29 | 
30 | 
31 | def init_duckdb_connection():
32 |     access_key = config.ACCESS_KEY
33 |     secret_key = get_secret(config.SECRET_KEY_NAME, config.SECRET_REGION)
34 |     con = duckdb.connect()
35 |     setup_sql = f"""
36 |     INSTALL iceberg;
37 |     LOAD iceberg;
38 | 
39 |     INSTALL httpfs;
40 |     LOAD httpfs;
41 | 
42 |     CREATE SECRET (
43 |         TYPE S3,
44 |         KEY_ID '{access_key}',
45 |         SECRET '{secret_key}',
46 |         REGION '{config.S3_BUCKET_REGION}'
47 |     );
48 |     """
49 |     con.execute(setup_sql)
50 |     return con
51 | 
52 | 
53 | # global duckdb connection, initialized once at startup
54 | duckdb_conn = init_duckdb_connection()
55 | 
56 | 
57 | @app.route("/query", methods=["POST"])
58 | def query_iceberg():
59 |     try:
60 |         query = request.args.get("query")
61 |         if not query:
62 |             return jsonify({"error": "Query parameter 'query' is required"}), 400
63 |         result = duckdb_conn.execute(query).fetchall()
64 |         return jsonify({"result": result}), 200
65 |     except Exception as e:
66 |         return jsonify({"error": str(e)}), 500
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     app.run(host='0.0.0.0', port=5000, debug=True)
71 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==3.0.3
2 | duckdb==1.0.0
3 | boto3==1.34.157
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/README.md:
--------------------------------------------------------------------------------
1 | # DuckDB Flask App
2 | This directory contains a Flask app that runs SQL queries with DuckDB against files in Google Cloud Storage.
3 | 
4 | 
5 | ## Configuration Setup
6 | To set up, edit `config.py`:
7 | ```
8 | PROJECT_ID = ""
9 | HMAC_KEY = ""
10 | HMAC_SECRET_KEY_NAME = ""
11 | ```
12 | Create an HMAC key and secret for your Google Cloud Storage buckets [here](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). Store your HMAC secret in Google Secret Manager and put the name of the secret as `HMAC_SECRET_KEY_NAME` and the HMAC key as `HMAC_KEY` in `config.py`.
13 | 
14 | ## Running Locally
15 | You may need to authenticate with the `gcloud` CLI first:
16 | ```
17 | gcloud auth login
18 | ```
19 | Then run:
20 | ```
21 | python main.py
22 | ```
23 | 
24 | ## Deploy the app to Cloud Run
25 | ```
26 | bash build_cloud_run.sh
27 | ```
28 | 
29 | ## Query Files in GCS
30 | ```
31 | import requests
32 | 
33 | sql = f'''
34 | select
35 |     count(*)
36 | from read_parquet('gs://<your_bucket>/some_data.parquet');
37 | '''
38 | 
39 | url = '<your_cloud_run_url>/query'  # replace with your Cloud Run service URL
40 | query = {
41 |     "query": sql
42 | }
43 | response = requests.post(url, params=query)
44 | response.json()
45 | ```
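Because the app reads the SQL statement from the URL query string (`request.args`), you can also call the endpoint with curl; a minimal sketch with a placeholder service URL:
```
curl -G -X POST '<your_cloud_run_url>/query' --data-urlencode 'query=select 42'
```
Here `-G` moves the url-encoded data into the query string, while `-X POST` keeps the POST method the Flask route expects.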
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/build_cloud_run.sh:
--------------------------------------------------------------------------------
1 | SERVICE_NAME=duck-iceberg-demo
2 | REGION=us-east1
3 | PROJECT_ID=YOUR_PROJECT_ID
4 | gcloud config set project ${PROJECT_ID}
5 | gcloud run deploy ${SERVICE_NAME} \
6 |     --source . \
7 |     --platform managed \
8 |     --region ${REGION} \
9 |     --allow-unauthenticated \
10 |     --memory 8Gi \
11 |     --cpu 2
12 | 
13 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/config.py:
--------------------------------------------------------------------------------
1 | PROJECT_ID = ""
2 | HMAC_KEY = ""
3 | HMAC_SECRET_KEY_NAME = ""
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/main.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | from flask import Flask, request, jsonify
3 | from google.cloud import secretmanager
4 | import config
5 | 
6 | 
7 | app = Flask(__name__)
8 | 
9 | # get a secret from Google Secret Manager
10 | def get_secret(secret_name: str, project_id: str):
11 |     client = secretmanager.SecretManagerServiceClient()
12 |     secret = client.access_secret_version(
13 |         name=f"projects/{project_id}/secrets/{secret_name}/versions/latest"
14 |     )
15 |     return secret.payload.data.decode("utf-8")
16 | 
17 | 
18 | def init_duckdb_connection():
19 |     hmac_key = config.HMAC_KEY
20 |     hmac_secret = get_secret(config.HMAC_SECRET_KEY_NAME, config.PROJECT_ID)
21 |     con = duckdb.connect()
22 |     setup_sql = f"""
23 |     INSTALL iceberg;
24 |     LOAD iceberg;
25 | 
26 |     INSTALL httpfs;
27 |     LOAD httpfs;
28 | 
29 |     CREATE SECRET (
30 |         TYPE GCS,
31 |         KEY_ID '{hmac_key}',
32 |         SECRET '{hmac_secret}'
33 |     );
34 |     """
35 |     con.execute(setup_sql)
36 |     return con
37 | 
38 | 
39 | # global duckdb connection, initialized once at startup
40 | duckdb_conn = init_duckdb_connection()
41 | 
42 | 
43 | @app.route("/query", methods=["POST"])
44 | def query_iceberg():
45 |     try:
46 |         query = request.args.get("query")
47 |         if not query:
48 |             return jsonify({"error": "Query parameter 'query' is required"}), 400
49 |         result = duckdb_conn.execute(query).fetchall()
50 |         return jsonify({"result": result}), 200
51 |     except Exception as e:
52 |         return jsonify({"error": str(e)}), 500
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     app.run(debug=True)
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==3.0.3
2 | duckdb==1.0.0
3 | google-cloud-secret-manager==2.20.1
--------------------------------------------------------------------------------
/duck_iceberg_demo/duck_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import sqlite3"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 2,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stdout",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "41994806\n"
27 |      ]
28 |     }
29 |    ],
30 |    "source": [
31 |     "# get Q2 2023 through April 2024 (latest available data)\n",
32 |     "trips_ls = []\n",
33 |     "months = [\n",
34 |     "    '2023-04',\n",
35 |     "    '2023-05', \n",
36 |     "    '2023-06', \n",
37 |     "    '2023-07', \n",
38 |     "    '2023-08', \n",
39 |     "    '2023-09', \n",
40 |     "    '2023-10', \n",
41 |     "    '2023-11', \n",
42 |     "    '2023-12', \n",
43 |     "    '2024-01', \n",
44 |     "    '2024-02', \n",
45 |     "    '2024-03', \n",
46 |     "    '2024-04'\n",
47 |     "    ]\n",
48 |     "for month in months:\n",
49 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
50 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
51 |     "    trips_ls.append(table)\n",
52 |     "\n",
53 |     "# concatenate all tables\n",
54 |     "trips = pa.concat_tables(trips_ls)\n",
55 |     "print(trips.num_rows)"
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": 3,
61 |    "metadata": {},
62 |    "outputs": [
63 |     {
64 |      "data": {
65 |       "text/plain": [
66 |        "VendorID: int32\n",
67 |        "tpep_pickup_datetime: timestamp[us]\n",
68 |        "tpep_dropoff_datetime: timestamp[us]\n",
69 |        "passenger_count: int64\n",
70 |        "trip_distance: double\n",
71 |        "RatecodeID: int64\n",
72 |        "store_and_fwd_flag: string\n",
73 |        "PULocationID: int32\n",
74 |        "DOLocationID: int32\n",
75 |        "payment_type: int64\n",
76 |        "fare_amount: double\n",
77 |        "extra: double\n",
78 |        "mta_tax: double\n",
79 |        "tip_amount: double\n",
80 |        "tolls_amount: double\n",
81 |        "improvement_surcharge: double\n",
82 |        "total_amount: double\n",
83 |        "congestion_surcharge: double\n",
84 |        "Airport_fee: double"
85 |       ]
86 |      },
87 |      "execution_count": 3,
88 |      "metadata": {},
89 |      "output_type": "execute_result"
90 |     }
91 |    ],
92 |    "source": [
93 |     "trips.schema"
94 |    ]
95 |   },
96 |   {
97 |    "cell_type": "code",
98 |    "execution_count": 4,
99 |    "metadata": {},
100 |    "outputs": [
101 |     {
102 |      "name": "stdout",
103 |      "output_type": "stream",
104 |      "text": [
105 |       "265\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "# get location zone mapping\n",
111 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
112 |     "print(zones.num_rows)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 5,
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "data": {
122 |       "text/plain": [
123 |        "LocationID: int64\n",
124 |        "Borough: string\n",
125 |        "Zone: string\n",
126 |        "service_zone: string"
127 |       ]
128 |      },
129 |      "execution_count": 5,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     }
133 |    ],
134 |    "source": [
135 |     "zones.schema"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 6,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# create iceberg catalog using sqlite\n",
145 |     "warehouse_path = \"/duck_iceberg_demo/\"\n",
146 |     "name_space = 'demo_db'\n",
147 |     "# create iceberg catalog using sqlite\n",
148 |     "catalog = SqlCatalog(\n",
149 |     "    name_space,\n",
150 |     "    **{\n",
151 |     "        \"uri\": f\"sqlite:///{warehouse_path}/pyiceberg_catalog.db\",\n",
152 |     "        \"warehouse\": f\"file://{warehouse_path}\",\n",
153 |     "    },\n",
154 |     ")\n",
155 |     "\n",
156 |     "# create a namespace for Iceberg\n",
157 |     "catalog.create_namespace(name_space)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "def get_iceberg_tables(database_path, table_namespace=None, table_name=None):\n",
167 |     "    \"\"\"\n",
168 |     "    Connect to the SQLite database and retrieve the list of Iceberg tables.\n",
169 |     "    Optionally filter by namespace and table name.\n",
170 |     "\n",
171 |     "    Parameters:\n",
172 |     "    database_path (str): The path to the SQLite database file.\n",
173 |     "    table_namespace (str, optional): The namespace of the table to search for.\n",
174 |     "    table_name (str, optional): The name of the table to search for.\n",
175 |     "\n",
176 |     "    Returns:\n",
177 |     "    list: A list of dictionaries, each representing an Iceberg table.\n",
178 |     "\n",
179 |     "    Raises:\n",
180 |     "    ValueError: If only one of table_namespace or table_name is provided.\n",
181 |     "    \"\"\"\n",
182 |     "    # Check if both namespace and table name are provided together\n",
183 |     "    if (table_namespace and not table_name) or (table_name and not table_namespace):\n",
184 |     "        raise ValueError(\"Both table_namespace and table_name must be provided together.\")\n",
185 |     "    \n",
186 |     "    # Connect to the SQLite database\n",
187 |     "    con_meta = sqlite3.connect(database_path)\n",
188 |     "    con_meta.row_factory = sqlite3.Row\n",
189 |     "\n",
190 |     "    # Create a cursor object to execute SQL queries\n",
191 |     "    cursor = con_meta.cursor()\n",
192 |     "\n",
193 |     "    # Base query to list tables in the database\n",
194 |     "    query = 'SELECT * FROM \"iceberg_tables\" WHERE 1=1'\n",
195 |     "    params = []\n",
196 |     "\n",
197 |     "    # Add conditions to the query based on provided namespace and table name\n",
198 |     "    if table_namespace and table_name:\n",
199 |     "        query += ' AND \"table_namespace\" = ? AND \"table_name\" = ?'\n",
200 |     "        params.append(table_namespace)\n",
201 |     "        params.append(table_name)\n",
202 |     "\n",
203 |     "    # Execute the query with parameters\n",
204 |     "    cursor.execute(query, params)\n",
205 |     "\n",
206 |     "    # Fetch all results\n",
207 |     "    results = cursor.fetchall()\n",
208 |     "\n",
209 |     "    # Convert results to list of dictionaries\n",
210 |     "    table_list = []\n",
211 |     "    for row in results:\n",
212 |     "        row_dict = {key: row[key] for key in row.keys()}\n",
213 |     "        table_list.append(row_dict)\n",
214 |     "\n",
215 |     "    # Close the connection\n",
216 |     "    con_meta.close()\n",
217 |     "\n",
218 |     "    return table_list\n",
219 |     "\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 8,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "def create_metadata_for_tables(tables):\n",
229 |     "    \"\"\"\n",
230 |     "    Iterate through all tables and create metadata files.\n",
231 |     "\n",
232 |     "    Parameters:\n",
233 |     "    tables (list): A list of dictionaries, each representing an Iceberg table with a 'metadata_location'.\n",
234 |     "    \"\"\"\n",
235 |     "    for table in tables:\n",
236 |     "        metadata_location = table['metadata_location'].replace('file://', '')\n",
237 |     "        metadata_dir = os.path.dirname(metadata_location)\n",
238 |     "        new_metadata_file = os.path.join(metadata_dir, 'v1.metadata.json')\n",
239 |     "        version_hint_file = os.path.join(metadata_dir, 'version-hint.text')\n",
240 |     "\n",
241 |     "        # Ensure the metadata directory exists\n",
242 |     "        os.makedirs(metadata_dir, exist_ok=True)\n",
243 |     "\n",
244 |     "        # Copy the metadata file to v1.metadata.json\n",
245 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
246 |     "        print(f\"Copied metadata file to {new_metadata_file}\")\n",
247 |     "\n",
248 |     "        # Create the version-hint.text file with content \"1\"\n",
249 |     "        with open(version_hint_file, 'w') as f:\n",
250 |     "            f.write('1')\n",
251 |     "        print(f\"Created {version_hint_file} with content '1'\")"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 13,
257 |    "metadata": {},
258 |    "outputs": [],
259 |    "source": [
260 |     "# add tables to iceberg catalog\n",
261 |     "for table, table_name in [\n",
262 |     "    (trips, \"trips\"),\n",
263 |     "    (zones, \"zones\"),\n",
264 |     "]: \n",
265 |     "\t# create the iceberg table\n",
266 |     "    iceberg_table = catalog.create_table(\n",
267 |     "        f\"{name_space}.{table_name}\",\n",
268 |     "        schema=table.schema,\n",
269 |     "    )\n",
270 |     "\n",
271 |     "    # add data to iceberg table\n",
272 |     "    iceberg_table.append(table)\n",
273 |     "\n",
274 |     "    # copy catalog metadata to iceberg table\n",
275 |     "    catalog_records = get_iceberg_tables(f\"{warehouse_path}/pyiceberg_catalog.db\", name_space, table_name)\n",
276 |     "    create_metadata_for_tables(catalog_records)\n",
277 |     "    \n",
278 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 12,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "## uncomment to append more data to iceberg tables, to simulate new data coming in\n",
288 |     "# for table, table_name in [\n",
289 |     "#     (trips, \"trips\"),\n",
290 |     "#     (zones, \"zones\"),\n",
291 |     "# ]: \n",
292 |     "#     iceberg_table = catalog.load_table(f\"{name_space}.{table_name}\")\n",
293 |     "#     # add data to iceberg table\n",
294 |     "#     iceberg_table.append(table)\n",
295 |     "\n",
296 |     "#     # copy catalog metadata to iceberg table\n",
297 |     "#     catalog_records = get_iceberg_tables(f\"{warehouse_path}/pyiceberg_catalog.db\", name_space, table_name)\n",
298 |     "#     create_metadata_for_tables(catalog_records)\n",
299 |     "    \n",
300 |     "#     print(f\"Loaded {table_name}, {table.num_rows} rows\")"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 10,
306 |    "metadata": {},
307 |    "outputs": [
308 |     {
309 |      "data": {
310 |       "text/plain": [
311 |        "<duckdb.duckdb.DuckDBPyConnection object at 0x...>"
312 |       ]
313 |      },
314 |      "execution_count": 10,
315 |      "metadata": {},
316 |      "output_type": "execute_result"
317 |     }
318 |    ],
319 |    "source": [
320 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
321 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
322 |     "setup_sql = '''\n",
323 |     "INSTALL iceberg;\n",
324 |     "LOAD iceberg;\n",
325 |     "'''\n",
326 |     "res = con.execute(setup_sql)\n",
327 |     "\n",
328 |     "# create the schema and views of iceberg tables in duckdb\n",
329 |     "database_path = f'{warehouse_path}/demo_db.db'\n",
330 |     "\n",
331 |     "create_view_sql = f'''\n",
332 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
333 |     "\n",
334 |     "CREATE VIEW taxi.trips AS\n",
335 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
336 |     "\n",
337 |     "CREATE VIEW taxi.zones AS\n",
338 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
339 |     "'''\n",
340 |     "\n",
341 |     "con.execute(create_view_sql)\n"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 11,
347 |    "metadata": {},
348 |    "outputs": [
349 |     {
350 |      "data": {
382 |       "text/plain": [
383 |        "   count_star()\n",
384 |        "0      41994806"
385 |       ]
386 |      },
387 |      "execution_count": 11,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "sql = f'''\n",
394 |     "select \n",
395 |     "    count(*)\n",
396 |     "from taxi.trips\n",
397 |     "'''\n",
398 |     "\n",
399 |     "res = con.execute(sql)\n",
400 |     "res.fetchdf()"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": 107,
406 |    "metadata": {},
407 |    "outputs": [
408 |     {
409 |      "name": "stdout",
410 |      "output_type": "stream",
411 |      "text": [
412 |       "CPU times: user 6.63 s, sys: 170 ms, total: 6.8 s\n",
413 |       "Wall time: 3.59 s\n"
414 |      ]
415 |     },
416 |     {
417 |      "data": {
581 |       "text/plain": [
582 |        "         month  avg_passenger_count  avg_trip_distance  total_trip_distance  \\\n",
583 |        "0   2023-04-01             1.382822           4.096190         2.693788e+07   \n",
584 |        "1   2023-05-01             1.358801           4.345793         3.053931e+07   \n",
585 |        "2   2023-06-01             1.369012           4.368754         2.889720e+07   \n",
586 |        "3   2023-07-01             1.401961           4.489437         2.610242e+07   \n",
587 |        "4   2023-08-01             1.386979           4.782777         2.701505e+07   \n",
588 |        "5   2023-09-01             1.356404           4.274258         2.433541e+07   \n",
589 |        "6   2023-10-01             1.359725           3.926687         2.766170e+07   \n",
590 |        "7   2023-11-01             1.358013           3.632733         2.426470e+07   \n",
591 |        "8   2023-12-01             1.408160           3.676252         2.482600e+07   \n",
592 |        "9   2024-01-01             1.339277           3.652175         2.165464e+07   \n",
593 |        "10  2024-02-01             1.325943           3.860858         2.322331e+07   \n",
594 |        "11  2024-03-01             1.337624           4.517421         3.236832e+07   \n",
595 |        "12  2024-04-01             1.334142           5.283850         3.713788e+07   \n",
596 |        "\n",
597 |        "    avg_total_amount  total_amount  total_trips  \n",
598 |        "0          28.269478  1.859093e+08      6576326  \n",
599 |        "1          28.962935  2.035320e+08      7027328  \n",
600 |        "2          29.068591  1.922747e+08      6614518  \n",
601 |        "3          28.568068  1.661001e+08      5814186  \n",
602 |        "4          28.628030  1.617026e+08      5648402  \n",
603 |        "5          29.781914  1.695628e+08      5693482  \n",
604 |        "6          29.171275  2.054982e+08      7044538  \n",
605 |        "7          28.695792  1.916725e+08      6679462  \n",
606 |        "8          28.541505  1.927429e+08      6753074  \n",
607 |        "9          26.801600  1.589133e+08      5929246  \n",
608 |        "10         26.624412  1.601476e+08      6015066  \n",
609 |        "11         27.120594  1.943251e+08      7165222  \n",
610 |        "12         27.493425  1.932393e+08      7028564  "
611 |       ]
612 |      },
613 |      "execution_count": 107,
614 |      "metadata": {},
615 |      "output_type": "execute_result"
616 |     }
617 |    ],
618 |    "source": [
619 |     "sql = f'''\n",
620 |     "select \n",
621 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
622 |     "    avg(passenger_count) as avg_passenger_count,\n",
623 |     "    avg(trip_distance) as avg_trip_distance,\n",
624 |     "    sum(trip_distance) as total_trip_distance,\n",
625 |     "    avg(total_amount) as avg_total_amount,\n",
626 |     "    sum(total_amount) as total_amount,\n",
627 |     "    count(*) as total_trips\n",
628 |     "from taxi.trips\n",
629 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
630 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
631 |     "group by 1\n",
632 |     "order by 1\n",
633 |     "'''\n",
634 |     "\n",
635 |     "%time res = con.execute(sql)\n",
636 |     "res.fetchdf()"
637 |    ]
638 |   },
639 |   {
640 |    "cell_type": "code",
641 |    "execution_count": 109,
642 |    "metadata": {},
643 |    "outputs": [
644 |     {
645 |      "name": "stdout",
646 |      "output_type": "stream",
647 |      "text": [
648 |       "CPU times: user 5.79 s, sys: 63.4 ms, total: 5.86 s\n",
649 |       "Wall time: 2.99 s\n"
650 |      ]
651 |     },
652 |     {
653 |      "data": {
731 |       "text/plain": [
732 |        "         Borough  total_trips  total_amount\n",
733 |        "0      Manhattan    148707792  3.690928e+09\n",
734 |        "1         Queens      9018616  4.853421e+08\n",
735 |        "2       Brooklyn      6494860  3.254173e+08\n",
736 |        "3        Unknown      1469164  4.296070e+07\n",
737 |        "4          Bronx      1013362  5.287546e+07\n",
738 |        "5            N/A       729832  8.671968e+07\n",
739 |        "6            EWR       500656  6.250183e+07\n",
740 |        "7  Staten Island        44948  4.510061e+06"
741 |       ]
742 |      },
743 |      "execution_count": 109,
744 |      "metadata": {},
745 |      "output_type": "execute_result"
746 |     }
747 |    ],
748 |    "source": [
749 |     "sql = f'''\n",
750 |     "select \n",
751 |     "    zones.Borough,\n",
752 |     "    count(*) as total_trips,\n",
753 |     "    sum(total_amount) as total_amount\n",
754 |     "from taxi.zones as zones\n",
755 |     "left join taxi.trips as trips\n",
756 |     "    on zones.LocationID = trips.DOLocationID\n",
757 |     "group by 1 \n",
758 |     "order by 2 desc\n",
759 |     "'''\n",
760 |     "\n",
761 |     "%time res = con.execute(sql)\n",
762 |     "res.fetchdf()"
763 |    ]
764 |   },
765 |   {
766 |    "cell_type": "code",
767 |    "execution_count": 111,
768 |    "metadata": {},
769 |    "outputs": [
770 |     {
771 |      "name": "stdout",
772 |      "output_type": "stream",
773 |      "text": [
774 |       "CPU times: user 43.2 s, sys: 9.8 s, total: 53 s\n",
775 |       "Wall time: 26.2 s\n"
776 |      ]
777 |     },
778 |     {
779 |      "data": {
929 |       "text/plain": [
930 |        "   pickup_borough dropoff_borough  trip_count\n",
931 |        "0           Bronx           Bronx      311200\n",
932 |        "1           Bronx       Manhattan      270232\n",
933 |        "2           Bronx          Queens       57432\n",
934 |        "3           Bronx        Brooklyn       55544\n",
935 |        "4           Bronx             N/A        4176\n",
936 |        "5           Bronx         Unknown        1848\n",
937 |        "6           Bronx   Staten Island        1304\n",
938 |        "7           Bronx             EWR         280\n",
939 |        "8        Brooklyn        Brooklyn     1452112\n",
940 |        "9        Brooklyn       Manhattan     1045936\n",
941 |        "10       Brooklyn          Queens      345792\n",
942 |        "11       Brooklyn           Bronx       55736\n",
943 |        "12       Brooklyn             N/A        8016\n",
944 |        "13       Brooklyn             EWR        7376\n",
945 |        "14       Brooklyn         Unknown        7168\n",
946 |        "15       Brooklyn   Staten Island        5144\n",
947 |        "16            EWR             EWR       37680\n",
948 |        "17            EWR         Unknown        1896\n",
949 |        "18            EWR             N/A        1640\n",
950 |        "19            EWR       Manhattan        1032"
951 |       ]
952 |      },
953 |      "execution_count": 111,
954 |      "metadata": {},
955 |      "output_type": "execute_result"
956 |     }
957 |    ],
958 |    "source": [
959 |     "sql = f'''\n",
960 |     "select \n",
961 |     "    starting_zone.Borough as pickup_borough,\n",
962 |     "    ending_zone.Borough as dropoff_borough,\n",
963 |     "    count(*) as trip_count\n",
964 |     "from\n",
965 |     "taxi.trips as trips\n",
966 |     "left join taxi.zones as starting_zone\n",
967 |     "    on trips.PULocationID = starting_zone.LocationID\n",
968 |     "left join taxi.zones as ending_zone\n",
969 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
970 |     "group by 1, 2\n",
971 |     "order by 1 asc, 3 desc\n",
972 |     "'''\n",
973 |     "\n",
974 |     "%time res = con.execute(sql)\n",
975 |     "res.fetchdf().head(20)"
976 |    ]
977 |   },
978 |   {
979 |    "cell_type": "code",
980 |    "execution_count": null,
981 |    "metadata": {},
982 |    "outputs": [],
983 |    "source": []
984 |   }
985 |  ],
986 |  "metadata": {
987 |   "kernelspec": {
988 |    "display_name": ".venv",
989 |    "language": "python",
990 |    "name": "python3"
991 |   },
992 |   "language_info": {
993 |    "codemirror_mode": {
994 |     "name": "ipython",
995 |     "version": 3
996 |    },
997 |    "file_extension": ".py",
998 |    "mimetype": "text/x-python",
999 |    "name": "python",
1000 |    "nbconvert_exporter": "python",
1001 |    "pygments_lexer": "ipython3",
1002 |    "version": "3.10.11"
1003 |   }
1004 |  },
1005 |  "nbformat": 4,
1006 |  "nbformat_minor": 2
1007 | }
1008 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/local_duck_cloud_aws_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import gcsfs\n",
15 |     "import boto3\n",
16 |     "\n",
17 |     "import os\n",
18 |     "os.environ['AWS_DEFAULT_REGION'] = 'us-east-2' # set the region for where your s3 bucket is if different from your default region"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": [
27 |     "# get Q2 2023 through April 2024 (latest available data)\n",
28 |     "trips_ls = []\n",
29 |     "months = [\n",
30 |     "    '2023-04',\n",
31 |     "    '2023-05', \n",
32 |     "    '2023-06', \n",
33 |     "    '2023-07', \n",
34 |     "    '2023-08', \n",
35 |     "    '2023-09', \n",
36 |     "    '2023-10', \n",
37 |     "    '2023-11', \n",
38 |     "    '2023-12', \n",
39 |     "    '2024-01', \n",
40 |     "    '2024-02', \n",
41 |     "    '2024-03', \n",
42 |     "    '2024-04'\n",
43 |     "    ]\n",
44 |     "for month in months:\n",
45 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
46 |     "    # NOTE: this initial data read doesn't require DuckDB, something like pandas works as well\n",
47 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
48 |     "    trips_ls.append(table)\n",
49 |     "\n",
50 |     "# concatenate all tables\n",
51 |     "trips = pa.concat_tables(trips_ls)\n",
52 |     "print(\"Rows in trips: \",trips.num_rows)\n",
53 |     "\n",
54 |     "# get location zone mapping\n",
55 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
56 |     "print(\"Rows in zones: \",zones.num_rows)"
57 |    ]
58 |   },
59 |   {
60 |    "cell_type": "code",
61 |    "execution_count": null,
62 |    "metadata": {},
63 |    "outputs": [],
64 |    "source": [
65 |     "# create Iceberg catalog using Postgres and S3\n",
66 |     "catalog_name = \"demo_iceberg\"\n",
67 |     "catalog_uri = \"\" #replace with Postgres URI\n",
68 |     "warehouse_path = \"s3://\" #replace with bucket name you created in S3\n",
69 |     "\n",
70 |     "catalog = SqlCatalog(\n",
71 |     "    catalog_name,\n",
72 |     "    **{\n",
73 |     "        \"uri\": catalog_uri,\n",
74 |     "        \"warehouse\": warehouse_path,\n",
75 |     "    },\n",
76 |     ")\n",
77 |     "\n",
78 |     "# create a namespace for Iceberg\n",
79 |     "name_space = 'taxi'\n",
80 |     "try:\n",
81 |     "    catalog.create_namespace(name_space)\n",
82 |     "except Exception as e:\n",
83 |     "    print(e)"
84 |    ]
85 |   },
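  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`SqlCatalog` takes a SQLAlchemy-style connection string, so `catalog_uri` looks like `postgresql://user:password@host:5432/dbname` (placeholder values)."
   ]
  },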
86 |   {
87 |    "cell_type": "code",
88 |    "execution_count": null,
89 |    "metadata": {},
90 |    "outputs": [],
91 |    "source": [
92 |     "region_name='us-east-2' # replace with your s3 region\n",
93 |     "\n",
94 |     "def add_version_hint(iceberg_table):\n",
95 |     "    \"\"\"\n",
96 |     "    Adds version hint file to Iceberg table metadata\n",
97 |     "    Addresses issue mentioned here: https://github.com/duckdb/duckdb_iceberg/issues/29\n",
98 |     "    Determines if Iceberg table is in local file system or in GCS/S3\n",
99 |     "    \"\"\"\n",
100 |     "    metadata_location = iceberg_table.metadata_location\n",
101 |     "    protocol = metadata_location.split(\":\")[0]\n",
102 |     "\n",
103 |     "    if protocol == \"file\":\n",
104 |     "        metadata_location = metadata_location[7:]\n",
105 |     "    elif protocol == \"gs\" or protocol == \"s3\":\n",
106 |     "        metadata_location = metadata_location[5:]\n",
107 |     "    else:\n",
108 |     "        print(f\"Unsupported metadata location: {metadata_location}\")\n",
109 |     "        return\n",
110 |     "\n",
111 |     "    metadata_dir = os.path.dirname(metadata_location)\n",
112 |     "    new_metadata_file = os.path.join(metadata_dir, \"v1.metadata.json\")\n",
113 |     "    version_hint_file = os.path.join(metadata_dir, \"version-hint.text\")\n",
114 |     "\n",
115 |     "    if protocol == \"file\":\n",
116 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
117 |     "        with open(version_hint_file, \"w\") as f:\n",
118 |     "            f.write(\"1\")\n",
119 |     "    elif protocol == \"gs\":\n",
120 |     "        fs = gcsfs.GCSFileSystem()\n",
121 |     "        fs.copy(metadata_location, new_metadata_file)\n",
122 |     "        with fs.open(version_hint_file, \"w\") as f:\n",
123 |     "            f.write(\"1\")\n",
124 |     "    elif protocol == \"s3\":\n",
125 |     "        s3 = boto3.client('s3')\n",
126 |     "        bucket_name = metadata_location.split('/')[0]\n",
127 |     "        s3_file_key = '/'.join(metadata_location.split('/')[1:])\n",
128 |     "        new_s3_file_key = os.path.join(os.path.dirname(s3_file_key), \"v1.metadata.json\")\n",
129 |     "        version_hint_key = os.path.join(os.path.dirname(s3_file_key), \"version-hint.text\")\n",
130 |     "\n",
131 |     "        s3.copy({'Bucket': bucket_name, 'Key': s3_file_key}, bucket_name, new_s3_file_key)\n",
132 |     "        s3.put_object(Bucket=bucket_name, Key=version_hint_key, Body='1')\n",
133 |     "\n",
134 |     "    print(f\"Copied metadata file to {new_metadata_file}\")\n",
135 |     "    print(f\"Created {version_hint_file} with content '1'\")"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# add tables to iceberg catalog and load data into S3\n",
145 |     "for table, table_name in [\n",
146 |     "    (trips, \"trips\"),\n",
147 |     "    (zones, \"zones\"),\n",
148 |     "]: \n",
149 |     "\t# create the iceberg table\n",
150 |     "    iceberg_table = catalog.create_table(\n",
151 |     "        f\"{name_space}.{table_name}\",\n",
152 |     "        schema=table.schema,\n",
153 |     "    )\n",
154 |     "\n",
155 |     "    # add data to iceberg table in S3\n",
156 |     "    iceberg_table.append(table)\n",
157 |     "\n",
158 |     "    # copy catalog version hint metadata into S3\n",
159 |     "    add_version_hint(iceberg_table)\n",
160 |     "    \n",
161 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
171 |     "import duckdb\n",
172 |     "\n",
173 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
174 |     "setup_sql = '''\n",
175 |     "INSTALL iceberg;\n",
176 |     "LOAD iceberg;\n",
177 |     "\n",
178 |     "CREATE SECRET (\n",
179 |     "    TYPE S3,\n",
180 |     "    KEY_ID '',\n",
181 |     "    SECRET '',\n",
182 |     "    REGION ''\n",
183 |     ");\n",
184 |     "'''\n",
185 |     "res = con.execute(setup_sql)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "# create the schema and views of iceberg tables in duckdb\n",
195 |     "database_path = f'{warehouse_path}{name_space}.db'\n",
196 |     "\n",
197 |     "create_view_sql = f'''\n",
198 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
199 |     "\n",
200 |     "CREATE VIEW taxi.trips AS\n",
201 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
202 |     "\n",
203 |     "CREATE VIEW taxi.zones AS\n",
204 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
205 |     "'''\n",
206 |     "\n",
207 |     "con.execute(create_view_sql)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "sql = f'''\n",
217 |     "select \n",
218 |     "    count(*)\n",
219 |     "from taxi.trips\n",
220 |     "'''\n",
221 |     "\n",
222 |     "%time res = con.execute(sql)\n",
223 |     "res.fetchdf()"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "sql = f'''\n",
233 |     "select \n",
234 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
235 |     "    avg(passenger_count) as avg_passenger_count,\n",
236 |     "    avg(trip_distance) as avg_trip_distance,\n",
237 |     "    sum(trip_distance) as total_trip_distance,\n",
238 |     "    avg(total_amount) as avg_total_amount,\n",
239 |     "    sum(total_amount) as total_amount,\n",
240 |     "    count(*) as total_trips\n",
241 |     "from taxi.trips\n",
242 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
243 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
244 |     "group by 1\n",
245 |     "order by 1\n",
246 |     "'''\n",
247 |     "\n",
248 |     "%time res = con.execute(sql)\n",
249 |     "res.fetchdf()"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "sql = f'''\n",
259 |     "select \n",
260 |     "    zones.Borough,\n",
261 |     "    count(*) as total_trips,\n",
262 |     "    sum(total_amount) as total_amount\n",
263 |     "from taxi.zones as zones\n",
264 |     "left join taxi.trips as trips\n",
265 |     "    on zones.LocationID = trips.DOLocationID\n",
266 |     "group by 1 \n",
267 |     "order by 2 desc\n",
268 |     "'''\n",
269 |     "\n",
270 |     "%time res = con.execute(sql)\n",
271 |     "res.fetchdf()"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "sql = f'''\n",
281 |     "select \n",
282 |     "    starting_zone.Borough as pickup_borough,\n",
283 |     "    ending_zone.Borough as dropoff_borough,\n",
284 |     "    count(*) as trip_count\n",
285 |     "from\n",
286 |     "taxi.trips as trips\n",
287 |     "left join taxi.zones as starting_zone\n",
288 |     "    on trips.PULocationID = starting_zone.LocationID\n",
289 |     "left join taxi.zones as ending_zone\n",
290 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
291 |     "group by 1, 2\n",
292 |     "order by 1 asc, 3 desc\n",
293 |     "'''\n",
294 |     "\n",
295 |     "%time res = con.execute(sql)\n",
296 |     "res.fetchdf().head(20)"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": null,
302 |    "metadata": {},
303 |    "outputs": [],
304 |    "source": []
305 |   }
306 |  ],
307 |  "metadata": {
308 |   "kernelspec": {
309 |    "display_name": ".venv",
310 |    "language": "python",
311 |    "name": "python3"
312 |   },
313 |   "language_info": {
314 |    "codemirror_mode": {
315 |     "name": "ipython",
316 |     "version": 3
317 |    },
318 |    "file_extension": ".py",
319 |    "mimetype": "text/x-python",
320 |    "name": "python",
321 |    "nbconvert_exporter": "python",
322 |    "pygments_lexer": "ipython3",
323 |    "version": "3.10.11"
324 |   }
325 |  },
326 |  "nbformat": 4,
327 |  "nbformat_minor": 2
328 | }
329 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/local_duck_cloud_gcp_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import gcsfs"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": null,
20 |    "metadata": {},
21 |    "outputs": [],
22 |    "source": [
23 |     "# get Q2 2023 through April 2024 (latest available data)\n",
24 |     "trips_ls = []\n",
25 |     "months = [\n",
26 |     "    '2023-04',\n",
27 |     "    '2023-05', \n",
28 |     "    '2023-06', \n",
29 |     "    '2023-07', \n",
30 |     "    '2023-08', \n",
31 |     "    '2023-09', \n",
32 |     "    '2023-10', \n",
33 |     "    '2023-11', \n",
34 |     "    '2023-12', \n",
35 |     "    '2024-01', \n",
36 |     "    '2024-02', \n",
37 |     "    '2024-03', \n",
38 |     "    '2024-04'\n",
39 |     "    ]\n",
40 |     "for month in months:\n",
41 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
42 |     "    # NOTE: this initial data read doesn't require DuckDB, something like pandas works as well\n",
43 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
44 |     "    trips_ls.append(table)\n",
45 |     "\n",
46 |     "# concatenate all tables\n",
47 |     "trips = pa.concat_tables(trips_ls)\n",
48 |     "print(\"Rows in trips: \",trips.num_rows)\n",
49 |     "\n",
50 |     "# get location zone mapping\n",
51 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
52 |     "print(\"Rows in zones: \",zones.num_rows)"
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": null,
58 |    "metadata": {},
59 |    "outputs": [],
60 |    "source": [
61 |     "# create Iceberg catalog using Postgres and GCS\n",
62 |     "catalog_name = \"demo_iceberg\"\n",
63 |     "catalog_uri = \"\" #replace with Postgres URI\n",
64 |     "warehouse_path = \"gs://\" #replace with bucket name you created in GCS\n",
65 |     "\n",
66 |     "catalog = SqlCatalog(\n",
67 |     "    catalog_name,\n",
68 |     "    **{\n",
69 |     "        \"uri\": catalog_uri,\n",
70 |     "        \"warehouse\": warehouse_path,\n",
71 |     "    },\n",
72 |     ")\n",
73 |     "\n",
74 |     "# create a namespace for Iceberg\n",
75 |     "name_space = 'taxi'\n",
76 |     "try:\n",
77 |     "    catalog.create_namespace(name_space)\n",
78 |     "except Exception as e:\n",
79 |     "    print(e)"
80 |    ]
81 |   },
82 |   {
83 |    "cell_type": "code",
84 |    "execution_count": null,
85 |    "metadata": {},
86 |    "outputs": [],
87 |    "source": [
88 |     "def add_version_hint(iceberg_table):\n",
89 |     "    \"\"\"\n",
90 |     "    Adds version hint file to Iceberg table metadata\n",
91 |     "    Addresses issue mentioned here: https://github.com/duckdb/duckdb_iceberg/issues/29\n",
92 |     "    Determines if Iceberg table is in local file system or in GCS\n",
93 |     "    \"\"\"\n",
94 |     "    metadata_location = iceberg_table.metadata_location\n",
95 |     "    protocol = metadata_location.split(\":\")[0]\n",
96 |     "\n",
97 |     "    if protocol == \"file\":\n",
98 |     "        metadata_location = metadata_location[7:]\n",
99 |     "    elif protocol == \"gs\":\n",
100 |     "        metadata_location = metadata_location[5:]\n",
101 |     "    else:\n",
102 |     "        print(f\"Unsupported metadata location: {metadata_location}\")\n",
103 |     "        return\n",
104 |     "\n",
105 |     "    metadata_dir = os.path.dirname(metadata_location)\n",
106 |     "    new_metadata_file = os.path.join(metadata_dir, \"v1.metadata.json\")\n",
107 |     "    version_hint_file = os.path.join(metadata_dir, \"version-hint.text\")\n",
108 |     "\n",
109 |     "    if protocol == \"file\":\n",
110 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
111 |     "        with open(version_hint_file, \"w\") as f:\n",
112 |     "            f.write(\"1\")\n",
113 |     "    elif protocol == \"gs\":\n",
114 |     "        fs = gcsfs.GCSFileSystem()\n",
115 |     "        fs.copy(metadata_location, new_metadata_file)\n",
116 |     "        with fs.open(version_hint_file, \"w\") as f:\n",
117 |     "            f.write(\"1\")\n",
118 |     "\n",
119 |     "    print(f\"Copied metadata file to {new_metadata_file}\")\n",
120 |     "    print(f\"Created {version_hint_file} with content '1'\")"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "# add tables to iceberg catalog and load data into GCS\n",
130 |     "for table, table_name in [\n",
131 |     "    (trips, \"trips\"),\n",
132 |     "    (zones, \"zones\"),\n",
133 |     "]: \n",
134 |     "\t# create the iceberg table\n",
135 |     "    iceberg_table = catalog.create_table(\n",
136 |     "        f\"{name_space}.{table_name}\",\n",
137 |     "        schema=table.schema,\n",
138 |     "    )\n",
139 |     "\n",
140 |     "    # add data to iceberg table in GCS\n",
141 |     "    iceberg_table.append(table)\n",
142 |     "\n",
143 |     "    # copy catalog version hint metadata into GCS\n",
144 |     "    add_version_hint(iceberg_table)\n",
145 |     "    \n",
146 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
156 |     "import duckdb\n",
157 |     "\n",
158 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
159 |     "setup_sql = '''\n",
160 |     "INSTALL iceberg;\n",
161 |     "LOAD iceberg;\n",
162 |     "\n",
163 |     "CREATE SECRET (\n",
164 |     "    TYPE GCS,\n",
165 |     "    KEY_ID '',\n",
166 |     "    SECRET ''\n",
167 |     ");\n",
168 |     "'''\n",
169 |     "res = con.execute(setup_sql)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "# create the schema and views of iceberg tables in duckdb\n",
179 |     "database_path = f'{warehouse_path}/{name_space}.db'\n",
180 |     "\n",
181 |     "create_view_sql = f'''\n",
182 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
183 |     "\n",
184 |     "CREATE VIEW taxi.trips AS\n",
185 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
186 |     "\n",
187 |     "CREATE VIEW taxi.zones AS\n",
188 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
189 |     "'''\n",
190 |     "\n",
191 |     "con.execute(create_view_sql)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "sql = f'''\n",
201 |     "select \n",
202 |     "    count(*)\n",
203 |     "from taxi.trips\n",
204 |     "'''\n",
205 |     "\n",
206 |     "%time res = con.execute(sql)\n",
207 |     "res.fetchdf()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "sql = f'''\n",
217 |     "select \n",
218 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
219 |     "    avg(passenger_count) as avg_passenger_count,\n",
220 |     "    avg(trip_distance) as avg_trip_distance,\n",
221 |     "    sum(trip_distance) as total_trip_distance,\n",
222 |     "    avg(total_amount) as avg_total_amount,\n",
223 |     "    sum(total_amount) as total_amount,\n",
224 |     "    count(*) as total_trips\n",
225 |     "from taxi.trips\n",
226 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
227 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
228 |     "group by 1\n",
229 |     "order by 1\n",
230 |     "'''\n",
231 |     "\n",
232 |     "%time res = con.execute(sql)\n",
233 |     "res.fetchdf()"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "sql = f'''\n",
243 |     "select \n",
244 |     "    zones.Borough,\n",
245 |     "    count(*) as total_trips,\n",
246 |     "    sum(total_amount) as total_amount\n",
247 |     "from taxi.zones as zones\n",
248 |     "left join taxi.trips as trips\n",
249 |     "    on zones.LocationID = trips.DOLocationID\n",
250 |     "group by 1 \n",
251 |     "order by 2 desc\n",
252 |     "'''\n",
253 |     "\n",
254 |     "%time res = con.execute(sql)\n",
255 |     "res.fetchdf()"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "sql = f'''\n",
265 |     "select \n",
266 |     "    starting_zone.Borough as pickup_borough,\n",
267 |     "    ending_zone.Borough as dropoff_borough,\n",
268 |     "    count(*) as trip_count\n",
269 |     "from\n",
270 |     "taxi.trips as trips\n",
271 |     "left join taxi.zones as starting_zone\n",
272 |     "    on trips.PULocationID = starting_zone.LocationID\n",
273 |     "left join taxi.zones as ending_zone\n",
274 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
275 |     "group by 1, 2\n",
276 |     "order by 1 asc, 3 desc\n",
277 |     "'''\n",
278 |     "\n",
279 |     "%time res = con.execute(sql)\n",
280 |     "res.fetchdf().head(20)"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": null,
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": []
289 |   }
290 |  ],
291 |  "metadata": {
292 |   "kernelspec": {
293 |    "display_name": ".venv",
294 |    "language": "python",
295 |    "name": "python3"
296 |   },
297 |   "language_info": {
298 |    "codemirror_mode": {
299 |     "name": "ipython",
300 |     "version": 3
301 |    },
302 |    "file_extension": ".py",
303 |    "mimetype": "text/x-python",
304 |    "name": "python",
305 |    "nbconvert_exporter": "python",
306 |    "pygments_lexer": "ipython3",
307 |    "version": "3.10.11"
308 |   }
309 |  },
310 |  "nbformat": 4,
311 |  "nbformat_minor": 2
312 | }
313 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "duck-iceberg-demo"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Steven Wang "]
6 | readme = "README.md"
7 | packages = [{include = "duck_iceberg_demo"}]
8 | 
9 | [tool.poetry.dependencies]
10 | python = "^3.10"
11 | duckdb = "^1.0.0"
12 | pyiceberg = {extras = ["gcsfs"], version = "^0.6.1"}
13 | sqlalchemy = "^2.0.31"
14 | pyarrow = "^16.1.0"
15 | pandas = "^2.2.2"
16 | psycopg2 = "^2.9.9"
17 | flask = "^3.0.3"
18 | google-cloud-secret-manager = "^2.20.1"
19 | boto3 = "^1.34.157"
20 | 
21 | 
22 | [tool.poetry.group.dev.dependencies]
23 | ipykernel = "^6.29.5"
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
28 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steven-luabase/duckdb-iceberg-demo/e2d1751f000e2b6cb740f9d977538db9c40b27e0/tests/__init__.py
--------------------------------------------------------------------------------