├── .gitignore
├── README.md
├── duck_iceberg_demo
│   ├── __init__.py
│   ├── deploy_duckdb_aws
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── config.py
│   │   ├── main.py
│   │   └── requirements.txt
│   ├── deploy_duckdb_gcp
│   │   ├── README.md
│   │   ├── build_cloud_run.sh
│   │   ├── config.py
│   │   ├── main.py
│   │   └── requirements.txt
│   ├── duck_iceberg_demo.ipynb
│   ├── local_duck_cloud_aws_iceberg_demo.ipynb
│   └── local_duck_cloud_gcp_iceberg_demo.ipynb
├── poetry.lock
├── pyproject.toml
└── tests
    └── __init__.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .venv/
2 | /duck_iceberg_demo/deploy_duckdb_gcp/**.ipynb
3 | /duck_iceberg_demo/deploy_duckdb_aws/**.ipynb
4 | **/__pycache__/
5 | **/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # This repo contains code referenced in the following blog posts
2 | - [**How We Migrated Our Data Warehouse from Snowflake to DuckDB**](https://www.definite.app/blog/duckdb-datawarehouse)
3 | - [**Why Databricks paid $1B for a 40 person startup (Tabular)**](https://www.definite.app/blog/databricks-tabular-acquisition)
4 | - [**Comparing Iceberg Query Engines**](https://www.definite.app/blog/iceberg-query-engine)
5 | - [**Running Iceberg and Serverless DuckDB in Google Cloud**](https://www.definite.app/blog/cloud-iceberg-duckdb)
6 | - [**Running Iceberg and Serverless DuckDB in AWS**](https://www.definite.app/blog/cloud-iceberg-duckdb-aws)
7 | 
8 | 
9 | # Installation
10 | To install all dependencies, run:
11 | ```
12 | poetry install
13 | ```
14 | 
15 | # DuckDB Flask App
16 | A simple Flask app for running DuckDB in GCP and AWS can be found in `duck_iceberg_demo/deploy_duckdb_gcp` and `duck_iceberg_demo/deploy_duckdb_aws`.
--------------------------------------------------------------------------------
/duck_iceberg_demo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steven-luabase/duckdb-iceberg-demo/e2d1751f000e2b6cb740f9d977538db9c40b27e0/duck_iceberg_demo/__init__.py
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | WORKDIR /project
4 | 
5 | COPY . .
6 | 
7 | RUN pip install -r requirements.txt
8 | 
9 | # Expose the Flask app port from the container to the host
10 | EXPOSE 5000
11 | 
12 | # Start the application
13 | CMD ["python", "main.py"]
14 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/README.md:
--------------------------------------------------------------------------------
1 | # DuckDB AWS Flask App
2 | This directory contains a Flask app that runs SQL queries with DuckDB against files in AWS S3.
3 | 
4 | 
5 | ## Configuration Setup
6 | To set up, edit `config.py`:
7 | ```
8 | ACCESS_KEY = ""
9 | SECRET_KEY_NAME = ""
10 | S3_BUCKET_REGION = ""
11 | SECRET_REGION = ""
12 | ```
13 | Get your AWS Access and Secret Keys. Store your Secret Key in AWS [Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/create_secret.html) and put the name of the secret in `config.py` for the `SECRET_KEY_NAME` variable.
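`main.py` parses the stored secret string as JSON and reads its `secret_key` field, so the secret's value should be saved in a shape like this (placeholder value):
```
{"secret_key": "<YOUR_AWS_SECRET_ACCESS_KEY>"}
```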
14 | 
15 | ## Running The App Locally
16 | You may need to authenticate with the AWS CLI first:
17 | ```
18 | aws sso login
19 | ```
20 | #### Using Flask
21 | `python main.py`
22 | #### In Docker
23 | Build with the linux/amd64 architecture to be compatible with AWS ECS.
24 | ```
25 | docker buildx build --platform=linux/amd64 -t duckdb-deploy .
26 | ```
27 | ```
28 | docker run -p 5000:5000 -v ~/.aws:/root/.aws -it duckdb-deploy
29 | ```
30 | 
31 | ## Deploy the app to AWS ECS
32 | See blog post: [**Running Iceberg and Serverless DuckDB in AWS**](https://www.definite.app/blog/cloud-iceberg-duckdb-aws)
33 | 
34 | ## Query Files in S3
35 | ```
36 | import requests
37 | 
38 | sql = f'''
39 | select
40 |     count(*)
41 | from read_parquet('s3://<your_bucket>/some_data.parquet');
42 | '''
43 | 
44 | url = 'http://127.0.0.1:5000/query'  # if running locally
45 | query = {
46 |     "query": sql
47 | }
48 | response = requests.post(url, params=query)
49 | response.json()
50 | ```
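The endpoint wraps the rows from DuckDB's `fetchall()` in JSON, so a successful response to the query above has a shape like:
```
{"result": [[41994806]]}
```
Errors are returned as `{"error": "..."}` with a 400 or 500 status code.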
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/config.py:
--------------------------------------------------------------------------------
1 | ACCESS_KEY = ""
2 | SECRET_KEY_NAME = ""
3 | S3_BUCKET_REGION = ""
4 | SECRET_REGION = ""
5 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/main.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | from flask import Flask, request, jsonify
3 | import boto3
4 | from botocore.exceptions import ClientError
5 | import config
6 | import json
7 | 
8 | 
9 | 
10 | app = Flask(__name__)
11 | 
12 | # get a secret from AWS Secrets Manager
13 | def get_secret(secret_name, region_name):
14 |     session = boto3.session.Session()
15 |     client = session.client(
16 |         service_name='secretsmanager',
17 |         region_name=region_name
18 |     )
19 | 
20 |     try:
21 |         get_secret_value_response = client.get_secret_value(
22 |             SecretId=secret_name
23 |         )
24 |     except ClientError as e:
25 |         raise e
26 | 
27 |     secret = json.loads(get_secret_value_response['SecretString'])
28 |     return secret.get('secret_key')
29 | 
30 | 
31 | def init_duckdb_connection():
32 |     access_key = config.ACCESS_KEY
33 |     secret_key = get_secret(config.SECRET_KEY_NAME, config.SECRET_REGION)
34 |     con = duckdb.connect()
35 |     setup_sql = f"""
36 |     INSTALL iceberg;
37 |     LOAD iceberg;
38 | 
39 |     INSTALL httpfs;
40 |     LOAD httpfs;
41 | 
42 |     CREATE SECRET (
43 |         TYPE S3,
44 |         KEY_ID '{access_key}',
45 |         SECRET '{secret_key}',
46 |         REGION '{config.S3_BUCKET_REGION}'
47 |     );
48 |     """
49 |     con.execute(setup_sql)
50 |     return con
51 | 
52 | 
53 | # global duckdb connection, initialized once at startup
54 | duckdb_conn = init_duckdb_connection()
55 | 
56 | 
57 | @app.route("/query", methods=["POST"])
58 | def query_iceberg():
59 |     try:
60 |         query = request.args.get("query")
61 |         if not query:
62 |             return jsonify({"error": "Query parameter 'query' is required"}), 400
63 |         result = duckdb_conn.execute(query).fetchall()
64 |         return jsonify({"result": result}), 200
65 |     except Exception as e:
66 |         return jsonify({"error": str(e)}), 500
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     app.run(host='0.0.0.0', port=5000, debug=True)
71 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_aws/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==3.0.3
2 | duckdb==1.0.0
3 | boto3==1.34.157
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/README.md:
--------------------------------------------------------------------------------
1 | # DuckDB Flask App
2 | This directory contains a Flask app that runs SQL queries with DuckDB against files in Google Cloud Storage.
3 | 
4 | 
5 | ## Configuration Setup
6 | To set up, edit `config.py`:
7 | ```
8 | PROJECT_ID = ""
9 | HMAC_KEY = ""
10 | HMAC_SECRET_KEY_NAME = ""
11 | ```
12 | Create an HMAC key and secret for your Google Cloud Storage buckets [here](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). Store your HMAC secret in Google Secret Manager and put the name of the secret as `HMAC_SECRET_KEY_NAME` and the HMAC key as `HMAC_KEY` in `config.py`.
13 | 
14 | ## Running Locally
15 | You may need to authenticate with the `gcloud` CLI first:
16 | ```
17 | gcloud auth login
18 | ```
19 | Then run:
20 | ```
21 | python main.py
22 | ```
23 | 
24 | ## Deploy the app to Cloud Run
25 | ```
26 | bash build_cloud_run.sh
27 | ```
28 | 
29 | ## Query Files in GCS
30 | ```
31 | import requests
32 | 
33 | sql = f'''
34 | select
35 |     count(*)
36 | from read_parquet('gs://<your_bucket>/some_data.parquet');
37 | '''
38 | 
39 | url = '<your_cloud_run_url>/query'  # replace with your Cloud Run service URL
40 | query = {
41 |     "query": sql
42 | }
43 | response = requests.post(url, params=query)
44 | response.json()
45 | ```
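Because the app reads the SQL statement from the URL query string (`request.args`), you can also call the endpoint with curl; a minimal sketch with a placeholder service URL:
```
curl -G -X POST '<your_cloud_run_url>/query' --data-urlencode 'query=select 42'
```
Here `-G` moves the url-encoded data into the query string, while `-X POST` keeps the POST method the Flask route expects.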
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/build_cloud_run.sh:
--------------------------------------------------------------------------------
1 | SERVICE_NAME=duck-iceberg-demo
2 | REGION=us-east1
3 | PROJECT_ID=YOUR_PROJECT_ID
4 | gcloud config set project ${PROJECT_ID}
5 | gcloud run deploy ${SERVICE_NAME} \
6 |     --source . \
7 |     --platform managed \
8 |     --region ${REGION} \
9 |     --allow-unauthenticated \
10 |     --memory 8Gi \
11 |     --cpu 2
12 | 
13 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/config.py:
--------------------------------------------------------------------------------
1 | PROJECT_ID = ""
2 | HMAC_KEY = ""
3 | HMAC_SECRET_KEY_NAME = ""
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/main.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | from flask import Flask, request, jsonify
3 | from google.cloud import secretmanager
4 | import config
5 | 
6 | 
7 | app = Flask(__name__)
8 | 
9 | # get a secret from Google Secret Manager
10 | def get_secret(secret_name: str, project_id: str):
11 |     client = secretmanager.SecretManagerServiceClient()
12 |     secret = client.access_secret_version(
13 |         name=f"projects/{project_id}/secrets/{secret_name}/versions/latest"
14 |     )
15 |     return secret.payload.data.decode("utf-8")
16 | 
17 | 
18 | def init_duckdb_connection():
19 |     hmac_key = config.HMAC_KEY
20 |     hmac_secret = get_secret(config.HMAC_SECRET_KEY_NAME, config.PROJECT_ID)
21 |     con = duckdb.connect()
22 |     setup_sql = f"""
23 |     INSTALL iceberg;
24 |     LOAD iceberg;
25 | 
26 |     INSTALL httpfs;
27 |     LOAD httpfs;
28 | 
29 |     CREATE SECRET (
30 |         TYPE GCS,
31 |         KEY_ID '{hmac_key}',
32 |         SECRET '{hmac_secret}'
33 |     );
34 |     """
35 |     con.execute(setup_sql)
36 |     return con
37 | 
38 | 
39 | # global duckdb connection, initialized once at startup
40 | duckdb_conn = init_duckdb_connection()
41 | 
42 | 
43 | @app.route("/query", methods=["POST"])
44 | def query_iceberg():
45 |     try:
46 |         query = request.args.get("query")
47 |         if not query:
48 |             return jsonify({"error": "Query parameter 'query' is required"}), 400
49 |         result = duckdb_conn.execute(query).fetchall()
50 |         return jsonify({"result": result}), 200
51 |     except Exception as e:
52 |         return jsonify({"error": str(e)}), 500
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     app.run(debug=True)
--------------------------------------------------------------------------------
/duck_iceberg_demo/deploy_duckdb_gcp/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==3.0.3
2 | duckdb==1.0.0
3 | google-cloud-secret-manager==2.20.1
--------------------------------------------------------------------------------
/duck_iceberg_demo/duck_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import sqlite3"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 2,
20 |    "metadata": {},
21 |    "outputs": [
22 |     {
23 |      "name": "stdout",
24 |      "output_type": "stream",
25 |      "text": [
26 |       "41994806\n"
27 |      ]
28 |     }
29 |    ],
30 |    "source": [
31 |     "# get Q2 2023 through April 2024 (latest available data)\n",
32 |     "trips_ls = []\n",
33 |     "months = [\n",
34 |     "    '2023-04',\n",
35 |     "    '2023-05', \n",
36 |     "    '2023-06', \n",
37 |     "    '2023-07', \n",
38 |     "    '2023-08', \n",
39 |     "    '2023-09', \n",
40 |     "    '2023-10', \n",
41 |     "    '2023-11', \n",
42 |     "    '2023-12', \n",
43 |     "    '2024-01', \n",
44 |     "    '2024-02', \n",
45 |     "    '2024-03', \n",
46 |     "    '2024-04'\n",
47 |     "    ]\n",
48 |     "for month in months:\n",
49 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
50 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
51 |     "    trips_ls.append(table)\n",
52 |     "\n",
53 |     "# concatenate all tables\n",
54 |     "trips = pa.concat_tables(trips_ls)\n",
55 |     "print(trips.num_rows)"
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": 3,
61 |    "metadata": {},
62 |    "outputs": [
63 |     {
64 |      "data": {
65 |       "text/plain": [
66 |        "VendorID: int32\n",
67 |        "tpep_pickup_datetime: timestamp[us]\n",
68 |        "tpep_dropoff_datetime: timestamp[us]\n",
69 |        "passenger_count: int64\n",
70 |        "trip_distance: double\n",
71 |        "RatecodeID: int64\n",
72 |        "store_and_fwd_flag: string\n",
73 |        "PULocationID: int32\n",
74 |        "DOLocationID: int32\n",
75 |        "payment_type: int64\n",
76 |        "fare_amount: double\n",
77 |        "extra: double\n",
78 |        "mta_tax: double\n",
79 |        "tip_amount: double\n",
80 |        "tolls_amount: double\n",
81 |        "improvement_surcharge: double\n",
82 |        "total_amount: double\n",
83 |        "congestion_surcharge: double\n",
84 |        "Airport_fee: double"
85 |       ]
86 |      },
87 |      "execution_count": 3,
88 |      "metadata": {},
89 |      "output_type": "execute_result"
90 |     }
91 |    ],
92 |    "source": [
93 |     "trips.schema"
94 |    ]
95 |   },
96 |   {
97 |    "cell_type": "code",
98 |    "execution_count": 4,
99 |    "metadata": {},
100 |    "outputs": [
101 |     {
102 |      "name": "stdout",
103 |      "output_type": "stream",
104 |      "text": [
105 |       "265\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "# get location zone mapping\n",
111 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
112 |     "print(zones.num_rows)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 5,
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "data": {
122 |       "text/plain": [
123 |        "LocationID: int64\n",
124 |        "Borough: string\n",
125 |        "Zone: string\n",
126 |        "service_zone: string"
127 |       ]
128 |      },
129 |      "execution_count": 5,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     }
133 |    ],
134 |    "source": [
135 |     "zones.schema"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 6,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# create iceberg catalog using sqlite\n",
145 |     "warehouse_path = \"/duck_iceberg_demo/\"\n",
146 |     "name_space = 'demo_db'\n",
147 |     "# create iceberg catalog using sqlite\n",
148 |     "catalog = SqlCatalog(\n",
149 |     "    name_space,\n",
150 |     "    **{\n",
151 |     "        \"uri\": f\"sqlite:///{warehouse_path}/pyiceberg_catalog.db\",\n",
152 |     "        \"warehouse\": f\"file://{warehouse_path}\",\n",
153 |     "    },\n",
154 |     ")\n",
155 |     "\n",
156 |     "# create a namespace for Iceberg\n",
157 |     "catalog.create_namespace(name_space)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "def get_iceberg_tables(database_path, table_namespace=None, table_name=None):\n",
167 |     "    \"\"\"\n",
168 |     "    Connect to the SQLite database and retrieve the list of Iceberg tables.\n",
169 |     "    Optionally filter by namespace and table name.\n",
170 |     "\n",
171 |     "    Parameters:\n",
172 |     "    database_path (str): The path to the SQLite database file.\n",
173 |     "    table_namespace (str, optional): The namespace of the table to search for.\n",
174 |     "    table_name (str, optional): The name of the table to search for.\n",
175 |     "\n",
176 |     "    Returns:\n",
177 |     "    list: A list of dictionaries, each representing an Iceberg table.\n",
178 |     "\n",
179 |     "    Raises:\n",
180 |     "    ValueError: If only one of table_namespace or table_name is provided.\n",
181 |     "    \"\"\"\n",
182 |     "    # Check if both namespace and table name are provided together\n",
183 |     "    if (table_namespace and not table_name) or (table_name and not table_namespace):\n",
184 |     "        raise ValueError(\"Both table_namespace and table_name must be provided together.\")\n",
185 |     "    \n",
186 |     "    # Connect to the SQLite database\n",
187 |     "    con_meta = sqlite3.connect(database_path)\n",
188 |     "    con_meta.row_factory = sqlite3.Row\n",
189 |     "\n",
190 |     "    # Create a cursor object to execute SQL queries\n",
191 |     "    cursor = con_meta.cursor()\n",
192 |     "\n",
193 |     "    # Base query to list tables in the database\n",
194 |     "    query = 'SELECT * FROM \"iceberg_tables\" WHERE 1=1'\n",
195 |     "    params = []\n",
196 |     "\n",
197 |     "    # Add conditions to the query based on provided namespace and table name\n",
198 |     "    if table_namespace and table_name:\n",
199 |     "        query += ' AND \"table_namespace\" = ? AND \"table_name\" = ?'\n",
200 |     "        params.append(table_namespace)\n",
201 |     "        params.append(table_name)\n",
202 |     "\n",
203 |     "    # Execute the query with parameters\n",
204 |     "    cursor.execute(query, params)\n",
205 |     "\n",
206 |     "    # Fetch all results\n",
207 |     "    results = cursor.fetchall()\n",
208 |     "\n",
209 |     "    # Convert results to list of dictionaries\n",
210 |     "    table_list = []\n",
211 |     "    for row in results:\n",
212 |     "        row_dict = {key: row[key] for key in row.keys()}\n",
213 |     "        table_list.append(row_dict)\n",
214 |     "\n",
215 |     "    # Close the connection\n",
216 |     "    con_meta.close()\n",
217 |     "\n",
218 |     "    return table_list\n",
219 |     "\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 8,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "def create_metadata_for_tables(tables):\n",
229 |     "    \"\"\"\n",
230 |     "    Iterate through all tables and create metadata files.\n",
231 |     "\n",
232 |     "    Parameters:\n",
233 |     "    tables (list): A list of dictionaries, each representing an Iceberg table with a 'metadata_location'.\n",
234 |     "    \"\"\"\n",
235 |     "    for table in tables:\n",
236 |     "        metadata_location = table['metadata_location'].replace('file://', '')\n",
237 |     "        metadata_dir = os.path.dirname(metadata_location)\n",
238 |     "        new_metadata_file = os.path.join(metadata_dir, 'v1.metadata.json')\n",
239 |     "        version_hint_file = os.path.join(metadata_dir, 'version-hint.text')\n",
240 |     "\n",
241 |     "        # Ensure the metadata directory exists\n",
242 |     "        os.makedirs(metadata_dir, exist_ok=True)\n",
243 |     "\n",
244 |     "        # Copy the metadata file to v1.metadata.json\n",
245 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
246 |     "        print(f\"Copied metadata file to {new_metadata_file}\")\n",
247 |     "\n",
248 |     "        # Create the version-hint.text file with content \"1\"\n",
249 |     "        with open(version_hint_file, 'w') as f:\n",
250 |     "            f.write('1')\n",
251 |     "        print(f\"Created {version_hint_file} with content '1'\")"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 13,
257 |    "metadata": {},
258 |    "outputs": [],
259 |    "source": [
260 |     "# add tables to iceberg catalog\n",
261 |     "for table, table_name in [\n",
262 |     "    (trips, \"trips\"),\n",
263 |     "    (zones, \"zones\"),\n",
264 |     "]: \n",
265 |     "\t# create the iceberg table\n",
266 |     "    iceberg_table = catalog.create_table(\n",
267 |     "        f\"{name_space}.{table_name}\",\n",
268 |     "        schema=table.schema,\n",
269 |     "    )\n",
270 |     "\n",
271 |     "    # add data to iceberg table\n",
272 |     "    iceberg_table.append(table)\n",
273 |     "\n",
274 |     "    # copy catalog metadata to iceberg table\n",
275 |     "    catalog_records = get_iceberg_tables(f\"{warehouse_path}/pyiceberg_catalog.db\", name_space, table_name)\n",
276 |     "    create_metadata_for_tables(catalog_records)\n",
277 |     "    \n",
278 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 12,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "## uncomment to append more data to iceberg tables, to simulate new data coming in\n",
288 |     "# for table, table_name in [\n",
289 |     "#     (trips, \"trips\"),\n",
290 |     "#     (zones, \"zones\"),\n",
291 |     "# ]: \n",
292 |     "#     iceberg_table = catalog.load_table(f\"{name_space}.{table_name}\")\n",
293 |     "#     # add data to iceberg table\n",
294 |     "#     iceberg_table.append(table)\n",
295 |     "\n",
296 |     "#     # copy catalog metadata to iceberg table\n",
297 |     "#     catalog_records = get_iceberg_tables(f\"{warehouse_path}/pyiceberg_catalog.db\", name_space, table_name)\n",
298 |     "#     create_metadata_for_tables(catalog_records)\n",
299 |     "    \n",
300 |     "#     print(f\"Loaded {table_name}, {table.num_rows} rows\")"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 10,
306 |    "metadata": {},
307 |    "outputs": [
308 |     {
309 |      "data": {
310 |       "text/plain": [
311 |        "<duckdb.duckdb.DuckDBPyConnection object at 0x...>"
312 |       ]
313 |      },
314 |      "execution_count": 10,
315 |      "metadata": {},
316 |      "output_type": "execute_result"
317 |     }
318 |    ],
319 |    "source": [
320 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
321 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
322 |     "setup_sql = '''\n",
323 |     "INSTALL iceberg;\n",
324 |     "LOAD iceberg;\n",
325 |     "'''\n",
326 |     "res = con.execute(setup_sql)\n",
327 |     "\n",
328 |     "# create the schema and views of iceberg tables in duckdb\n",
329 |     "database_path = f'{warehouse_path}/demo_db.db'\n",
330 |     "\n",
331 |     "create_view_sql = f'''\n",
332 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
333 |     "\n",
334 |     "CREATE VIEW taxi.trips AS\n",
335 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
336 |     "\n",
337 |     "CREATE VIEW taxi.zones AS\n",
338 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
339 |     "'''\n",
340 |     "\n",
341 |     "con.execute(create_view_sql)\n"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 11,
347 |    "metadata": {},
348 |    "outputs": [
349 |     {
350 |      "data": {
382 |       "text/plain": [
383 |        "   count_star()\n",
384 |        "0      41994806"
385 |       ]
386 |      },
387 |      "execution_count": 11,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "sql = f'''\n",
394 |     "select \n",
395 |     "    count(*)\n",
396 |     "from taxi.trips\n",
397 |     "'''\n",
398 |     "\n",
399 |     "res = con.execute(sql)\n",
400 |     "res.fetchdf()"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": 107,
406 |    "metadata": {},
407 |    "outputs": [
408 |     {
409 |      "name": "stdout",
410 |      "output_type": "stream",
411 |      "text": [
412 |       "CPU times: user 6.63 s, sys: 170 ms, total: 6.8 s\n",
413 |       "Wall time: 3.59 s\n"
414 |      ]
415 |     },
416 |     {
417 |      "data": {
581 |       "text/plain": [
582 |        "         month  avg_passenger_count  avg_trip_distance  total_trip_distance  \\\n",
583 |        "0   2023-04-01             1.382822           4.096190         2.693788e+07   \n",
584 |        "1   2023-05-01             1.358801           4.345793         3.053931e+07   \n",
585 |        "2   2023-06-01             1.369012           4.368754         2.889720e+07   \n",
586 |        "3   2023-07-01             1.401961           4.489437         2.610242e+07   \n",
587 |        "4   2023-08-01             1.386979           4.782777         2.701505e+07   \n",
588 |        "5   2023-09-01             1.356404           4.274258         2.433541e+07   \n",
589 |        "6   2023-10-01             1.359725           3.926687         2.766170e+07   \n",
590 |        "7   2023-11-01             1.358013           3.632733         2.426470e+07   \n",
591 |        "8   2023-12-01             1.408160           3.676252         2.482600e+07   \n",
592 |        "9   2024-01-01             1.339277           3.652175         2.165464e+07   \n",
593 |        "10  2024-02-01             1.325943           3.860858         2.322331e+07   \n",
594 |        "11  2024-03-01             1.337624           4.517421         3.236832e+07   \n",
595 |        "12  2024-04-01             1.334142           5.283850         3.713788e+07   \n",
596 |        "\n",
597 |        "    avg_total_amount  total_amount  total_trips  \n",
598 |        "0          28.269478  1.859093e+08      6576326  \n",
599 |        "1          28.962935  2.035320e+08      7027328  \n",
600 |        "2          29.068591  1.922747e+08      6614518  \n",
601 |        "3          28.568068  1.661001e+08      5814186  \n",
602 |        "4          28.628030  1.617026e+08      5648402  \n",
603 |        "5          29.781914  1.695628e+08      5693482  \n",
604 |        "6          29.171275  2.054982e+08      7044538  \n",
605 |        "7          28.695792  1.916725e+08      6679462  \n",
606 |        "8          28.541505  1.927429e+08      6753074  \n",
607 |        "9          26.801600  1.589133e+08      5929246  \n",
608 |        "10         26.624412  1.601476e+08      6015066  \n",
609 |        "11         27.120594  1.943251e+08      7165222  \n",
610 |        "12         27.493425  1.932393e+08      7028564  "
611 |       ]
612 |      },
613 |      "execution_count": 107,
614 |      "metadata": {},
615 |      "output_type": "execute_result"
616 |     }
617 |    ],
618 |    "source": [
619 |     "sql = f'''\n",
620 |     "select \n",
621 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
622 |     "    avg(passenger_count) as avg_passenger_count,\n",
623 |     "    avg(trip_distance) as avg_trip_distance,\n",
624 |     "    sum(trip_distance) as total_trip_distance,\n",
625 |     "    avg(total_amount) as avg_total_amount,\n",
626 |     "    sum(total_amount) as total_amount,\n",
627 |     "    count(*) as total_trips\n",
628 |     "from taxi.trips\n",
629 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
630 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
631 |     "group by 1\n",
632 |     "order by 1\n",
633 |     "'''\n",
634 |     "\n",
635 |     "%time res = con.execute(sql)\n",
636 |     "res.fetchdf()"
637 |    ]
638 |   },
639 |   {
640 |    "cell_type": "code",
641 |    "execution_count": 109,
642 |    "metadata": {},
643 |    "outputs": [
644 |     {
645 |      "name": "stdout",
646 |      "output_type": "stream",
647 |      "text": [
648 |       "CPU times: user 5.79 s, sys: 63.4 ms, total: 5.86 s\n",
649 |       "Wall time: 2.99 s\n"
650 |      ]
651 |     },
652 |     {
653 |      "data": {
731 |       "text/plain": [
732 |        "         Borough  total_trips  total_amount\n",
733 |        "0      Manhattan    148707792  3.690928e+09\n",
734 |        "1         Queens      9018616  4.853421e+08\n",
735 |        "2       Brooklyn      6494860  3.254173e+08\n",
736 |        "3        Unknown      1469164  4.296070e+07\n",
737 |        "4          Bronx      1013362  5.287546e+07\n",
738 |        "5            N/A       729832  8.671968e+07\n",
739 |        "6            EWR       500656  6.250183e+07\n",
740 |        "7  Staten Island        44948  4.510061e+06"
741 |       ]
742 |      },
743 |      "execution_count": 109,
744 |      "metadata": {},
745 |      "output_type": "execute_result"
746 |     }
747 |    ],
748 |    "source": [
749 |     "sql = f'''\n",
750 |     "select \n",
751 |     "    zones.Borough,\n",
752 |     "    count(*) as total_trips,\n",
753 |     "    sum(total_amount) as total_amount\n",
754 |     "from taxi.zones as zones\n",
755 |     "left join taxi.trips as trips\n",
756 |     "    on zones.LocationID = trips.DOLocationID\n",
757 |     "group by 1 \n",
758 |     "order by 2 desc\n",
759 |     "'''\n",
760 |     "\n",
761 |     "%time res = con.execute(sql)\n",
762 |     "res.fetchdf()"
763 |    ]
764 |   },
765 |   {
766 |    "cell_type": "code",
767 |    "execution_count": 111,
768 |    "metadata": {},
769 |    "outputs": [
770 |     {
771 |      "name": "stdout",
772 |      "output_type": "stream",
773 |      "text": [
774 |       "CPU times: user 43.2 s, sys: 9.8 s, total: 53 s\n",
775 |       "Wall time: 26.2 s\n"
776 |      ]
777 |     },
778 |     {
779 |      "data": {
929 |       "text/plain": [
930 |        "   pickup_borough dropoff_borough  trip_count\n",
931 |        "0           Bronx           Bronx      311200\n",
932 |        "1           Bronx       Manhattan      270232\n",
933 |        "2           Bronx          Queens       57432\n",
934 |        "3           Bronx        Brooklyn       55544\n",
935 |        "4           Bronx             N/A        4176\n",
936 |        "5           Bronx         Unknown        1848\n",
937 |        "6           Bronx   Staten Island        1304\n",
938 |        "7           Bronx             EWR         280\n",
939 |        "8        Brooklyn        Brooklyn     1452112\n",
940 |        "9        Brooklyn       Manhattan     1045936\n",
941 |        "10       Brooklyn          Queens      345792\n",
942 |        "11       Brooklyn           Bronx       55736\n",
943 |        "12       Brooklyn             N/A        8016\n",
944 |        "13       Brooklyn             EWR        7376\n",
945 |        "14       Brooklyn         Unknown        7168\n",
946 |        "15       Brooklyn   Staten Island        5144\n",
947 |        "16            EWR             EWR       37680\n",
948 |        "17            EWR         Unknown        1896\n",
949 |        "18            EWR             N/A        1640\n",
950 |        "19            EWR       Manhattan        1032"
951 |       ]
952 |      },
953 |      "execution_count": 111,
954 |      "metadata": {},
955 |      "output_type": "execute_result"
956 |     }
957 |    ],
958 |    "source": [
959 |     "sql = f'''\n",
960 |     "select \n",
961 |     "    starting_zone.Borough as pickup_borough,\n",
962 |     "    ending_zone.Borough as dropoff_borough,\n",
963 |     "    count(*) as trip_count\n",
964 |     "from\n",
965 |     "taxi.trips as trips\n",
966 |     "left join taxi.zones as starting_zone\n",
967 |     "    on trips.PULocationID = starting_zone.LocationID\n",
968 |     "left join taxi.zones as ending_zone\n",
969 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
970 |     "group by 1, 2\n",
971 |     "order by 1 asc, 3 desc\n",
972 |     "'''\n",
973 |     "\n",
974 |     "%time res = con.execute(sql)\n",
975 |     "res.fetchdf().head(20)"
976 |    ]
977 |   },
978 |   {
979 |    "cell_type": "code",
980 |    "execution_count": null,
981 |    "metadata": {},
982 |    "outputs": [],
983 |    "source": []
984 |   }
985 |  ],
986 |  "metadata": {
987 |   "kernelspec": {
988 |    "display_name": ".venv",
989 |    "language": "python",
990 |    "name": "python3"
991 |   },
992 |   "language_info": {
993 |    "codemirror_mode": {
994 |     "name": "ipython",
995 |     "version": 3
996 |    },
997 |    "file_extension": ".py",
998 |    "mimetype": "text/x-python",
999 |    "name": "python",
1000 |    "nbconvert_exporter": "python",
1001 |    "pygments_lexer": "ipython3",
1002 |    "version": "3.10.11"
1003 |   }
1004 |  },
1005 |  "nbformat": 4,
1006 |  "nbformat_minor": 2
1007 | }
1008 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/local_duck_cloud_aws_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import gcsfs\n",
15 |     "import boto3\n",
16 |     "\n",
17 |     "import os\n",
18 |     "os.environ['AWS_DEFAULT_REGION'] = 'us-east-2' # set the region for where your s3 bucket is if different from your default region"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": [
27 |     "# get Q2 2023 through April 2024 (latest available data)\n",
28 |     "trips_ls = []\n",
29 |     "months = [\n",
30 |     "    '2023-04',\n",
31 |     "    '2023-05', \n",
32 |     "    '2023-06', \n",
33 |     "    '2023-07', \n",
34 |     "    '2023-08', \n",
35 |     "    '2023-09', \n",
36 |     "    '2023-10', \n",
37 |     "    '2023-11', \n",
38 |     "    '2023-12', \n",
39 |     "    '2024-01', \n",
40 |     "    '2024-02', \n",
41 |     "    '2024-03', \n",
42 |     "    '2024-04'\n",
43 |     "    ]\n",
44 |     "for month in months:\n",
45 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
46 |     "    # NOTE: this initial data read doesn't require DuckDB, something like pandas works as well\n",
47 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
48 |     "    trips_ls.append(table)\n",
49 |     "\n",
50 |     "# concatenate all tables\n",
51 |     "trips = pa.concat_tables(trips_ls)\n",
52 |     "print(\"Rows in trips: \",trips.num_rows)\n",
53 |     "\n",
54 |     "# get location zone mapping\n",
55 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
56 |     "print(\"Rows in zones: \",zones.num_rows)"
57 |    ]
58 |   },
59 |   {
60 |    "cell_type": "code",
61 |    "execution_count": null,
62 |    "metadata": {},
63 |    "outputs": [],
64 |    "source": [
65 |     "# create Iceberg catalog using Postgres and S3\n",
66 |     "catalog_name = \"demo_iceberg\"\n",
67 |     "catalog_uri = \"\" #replace with Postgres URI\n",
68 |     "warehouse_path = \"s3://\" #replace with bucket name you created in S3\n",
69 |     "\n",
70 |     "catalog = SqlCatalog(\n",
71 |     "    catalog_name,\n",
72 |     "    **{\n",
73 |     "        \"uri\": catalog_uri,\n",
74 |     "        \"warehouse\": warehouse_path,\n",
75 |     "    },\n",
76 |     ")\n",
77 |     "\n",
78 |     "# create a namespace for Iceberg\n",
79 |     "name_space = 'taxi'\n",
80 |     "try:\n",
81 |     "    catalog.create_namespace(name_space)\n",
82 |     "except Exception as e:\n",
83 |     "    print(e)"
84 |    ]
85 |   },
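  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`SqlCatalog` takes a SQLAlchemy-style connection string, so `catalog_uri` looks like `postgresql://user:password@host:5432/dbname` (placeholder values)."
   ]
  },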
86 |   {
87 |    "cell_type": "code",
88 |    "execution_count": null,
89 |    "metadata": {},
90 |    "outputs": [],
91 |    "source": [
92 |     "region_name='us-east-2' # replace with your s3 region\n",
93 |     "\n",
94 |     "def add_version_hint(iceberg_table):\n",
95 |     "    \"\"\"\n",
96 |     "    Adds version hint file to Iceberg table metadata\n",
97 |     "    Addresses issue mentioned here: https://github.com/duckdb/duckdb_iceberg/issues/29\n",
98 |     "    Determines if Iceberg table is in local file system or in GCS/S3\n",
99 |     "    \"\"\"\n",
100 |     "    metadata_location = iceberg_table.metadata_location\n",
101 |     "    protocol = metadata_location.split(\":\")[0]\n",
102 |     "\n",
103 |     "    if protocol == \"file\":\n",
104 |     "        metadata_location = metadata_location[7:]\n",
105 |     "    elif protocol == \"gs\" or protocol == \"s3\":\n",
106 |     "        metadata_location = metadata_location[5:]\n",
107 |     "    else:\n",
108 |     "        print(f\"Unsupported metadata location: {metadata_location}\")\n",
109 |     "        return\n",
110 |     "\n",
111 |     "    metadata_dir = os.path.dirname(metadata_location)\n",
112 |     "    new_metadata_file = os.path.join(metadata_dir, \"v1.metadata.json\")\n",
113 |     "    version_hint_file = os.path.join(metadata_dir, \"version-hint.text\")\n",
114 |     "\n",
115 |     "    if protocol == \"file\":\n",
116 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
117 |     "        with open(version_hint_file, \"w\") as f:\n",
118 |     "            f.write(\"1\")\n",
119 |     "    elif protocol == \"gs\":\n",
120 |     "        fs = gcsfs.GCSFileSystem()\n",
121 |     "        fs.copy(metadata_location, new_metadata_file)\n",
122 |     "        with fs.open(version_hint_file, \"w\") as f:\n",
123 |     "            f.write(\"1\")\n",
124 |     "    elif protocol == \"s3\":\n",
125 |     "        s3 = boto3.client('s3')\n",
126 |     "        bucket_name = metadata_location.split('/')[0]\n",
127 |     "        s3_file_key = '/'.join(metadata_location.split('/')[1:])\n",
128 |     "        new_s3_file_key = os.path.join(os.path.dirname(s3_file_key), \"v1.metadata.json\")\n",
129 |     "        version_hint_key = os.path.join(os.path.dirname(s3_file_key), \"version-hint.text\")\n",
130 |     "\n",
131 |     "        s3.copy({'Bucket': bucket_name, 'Key': s3_file_key}, bucket_name, new_s3_file_key)\n",
132 |     "        s3.put_object(Bucket=bucket_name, Key=version_hint_key, Body='1')\n",
133 |     "\n",
134 |     "    print(f\"Copied metadata file to {new_metadata_file}\")\n",
135 |     "    print(f\"Created {version_hint_file} with content '1'\")"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# add tables to iceberg catalog and load data into S3\n",
145 |     "for table, table_name in [\n",
146 |     "    (trips, \"trips\"),\n",
147 |     "    (zones, \"zones\"),\n",
148 |     "]: \n",
149 |     "\t# create the iceberg table\n",
150 |     "    iceberg_table = catalog.create_table(\n",
151 |     "        f\"{name_space}.{table_name}\",\n",
152 |     "        schema=table.schema,\n",
153 |     "    )\n",
154 |     "\n",
155 |     "    # add data to iceberg table in S3\n",
156 |     "    iceberg_table.append(table)\n",
157 |     "\n",
158 |     "    # copy catalog version hint metadata into S3\n",
159 |     "    add_version_hint(iceberg_table)\n",
160 |     "    \n",
161 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
171 |     "import duckdb\n",
172 |     "\n",
173 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
174 |     "setup_sql = '''\n",
175 |     "INSTALL iceberg;\n",
176 |     "LOAD iceberg;\n",
177 |     "\n",
178 |     "CREATE SECRET (\n",
179 |     "    TYPE S3,\n",
180 |     "    KEY_ID '',\n",
181 |     "    SECRET '',\n",
182 |     "    REGION ''\n",
183 |     ");\n",
184 |     "'''\n",
185 |     "res = con.execute(setup_sql)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "# create the schema and views of iceberg tables in duckdb\n",
195 |     "database_path = f'{warehouse_path}{name_space}.db'\n",
196 |     "\n",
197 |     "create_view_sql = f'''\n",
198 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
199 |     "\n",
200 |     "CREATE VIEW taxi.trips AS\n",
201 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
202 |     "\n",
203 |     "CREATE VIEW taxi.zones AS\n",
204 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
205 |     "'''\n",
206 |     "\n",
207 |     "con.execute(create_view_sql)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "sql = f'''\n",
217 |     "select \n",
218 |     "    count(*)\n",
219 |     "from taxi.trips\n",
220 |     "'''\n",
221 |     "\n",
222 |     "%time res = con.execute(sql)\n",
223 |     "res.fetchdf()"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "sql = f'''\n",
233 |     "select \n",
234 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
235 |     "    avg(passenger_count) as avg_passenger_count,\n",
236 |     "    avg(trip_distance) as avg_trip_distance,\n",
237 |     "    sum(trip_distance) as total_trip_distance,\n",
238 |     "    avg(total_amount) as avg_total_amount,\n",
239 |     "    sum(total_amount) as total_amount,\n",
240 |     "    count(*) as total_trips\n",
241 |     "from taxi.trips\n",
242 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
243 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
244 |     "group by 1\n",
245 |     "order by 1\n",
246 |     "'''\n",
247 |     "\n",
248 |     "%time res = con.execute(sql)\n",
249 |     "res.fetchdf()"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "sql = f'''\n",
259 |     "select \n",
260 |     "    zones.Borough,\n",
261 |     "    count(*) as total_trips,\n",
262 |     "    sum(total_amount) as total_amount\n",
263 |     "from taxi.zones as zones\n",
264 |     "left join taxi.trips as trips\n",
265 |     "    on zones.LocationID = trips.DOLocationID\n",
266 |     "group by 1 \n",
267 |     "order by 2 desc\n",
268 |     "'''\n",
269 |     "\n",
270 |     "%time res = con.execute(sql)\n",
271 |     "res.fetchdf()"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "sql = f'''\n",
281 |     "select \n",
282 |     "    starting_zone.Borough as pickup_borough,\n",
283 |     "    ending_zone.Borough as dropoff_borough,\n",
284 |     "    count(*) as trip_count\n",
285 |     "from\n",
286 |     "taxi.trips as trips\n",
287 |     "left join taxi.zones as starting_zone\n",
288 |     "    on trips.PULocationID = starting_zone.LocationID\n",
289 |     "left join taxi.zones as ending_zone\n",
290 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
291 |     "group by 1, 2\n",
292 |     "order by 1 asc, 3 desc\n",
293 |     "'''\n",
294 |     "\n",
295 |     "%time res = con.execute(sql)\n",
296 |     "res.fetchdf().head(20)"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": null,
302 |    "metadata": {},
303 |    "outputs": [],
304 |    "source": []
305 |   }
306 |  ],
307 |  "metadata": {
308 |   "kernelspec": {
309 |    "display_name": ".venv",
310 |    "language": "python",
311 |    "name": "python3"
312 |   },
313 |   "language_info": {
314 |    "codemirror_mode": {
315 |     "name": "ipython",
316 |     "version": 3
317 |    },
318 |    "file_extension": ".py",
319 |    "mimetype": "text/x-python",
320 |    "name": "python",
321 |    "nbconvert_exporter": "python",
322 |    "pygments_lexer": "ipython3",
323 |    "version": "3.10.11"
324 |   }
325 |  },
326 |  "nbformat": 4,
327 |  "nbformat_minor": 2
328 | }
329 | 
--------------------------------------------------------------------------------
/duck_iceberg_demo/local_duck_cloud_gcp_iceberg_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import duckdb\n",
10 |     "from pyiceberg.catalog.sql import SqlCatalog\n",
11 |     "import pyarrow as pa\n",
12 |     "import os\n",
13 |     "import shutil\n",
14 |     "import gcsfs"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": null,
20 |    "metadata": {},
21 |    "outputs": [],
22 |    "source": [
23 |     "# get Q2 2023 through April 2024 (latest available data)\n",
24 |     "trips_ls = []\n",
25 |     "months = [\n",
26 |     "    '2023-04',\n",
27 |     "    '2023-05', \n",
28 |     "    '2023-06', \n",
29 |     "    '2023-07', \n",
30 |     "    '2023-08', \n",
31 |     "    '2023-09', \n",
32 |     "    '2023-10', \n",
33 |     "    '2023-11', \n",
34 |     "    '2023-12', \n",
35 |     "    '2024-01', \n",
36 |     "    '2024-02', \n",
37 |     "    '2024-03', \n",
38 |     "    '2024-04'\n",
39 |     "    ]\n",
40 |     "for month in months:\n",
41 |     "    table_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'\n",
42 |     "    # NOTE: this initial data read doesn't require DuckDB, something like pandas works as well\n",
43 |     "    table = duckdb.sql(f\"SELECT * FROM '{table_path}'\").arrow()\n",
44 |     "    trips_ls.append(table)\n",
45 |     "\n",
46 |     "# concatenate all tables\n",
47 |     "trips = pa.concat_tables(trips_ls)\n",
48 |     "print(\"Rows in trips: \",trips.num_rows)\n",
49 |     "\n",
50 |     "# get location zone mapping\n",
51 |     "zones = duckdb.sql(\"SELECT * FROM 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'\").arrow()\n",
52 |     "print(\"Rows in zones: \",zones.num_rows)"
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": null,
58 |    "metadata": {},
59 |    "outputs": [],
60 |    "source": [
61 |     "# create Iceberg catalog using Postgres and GCS\n",
62 |     "catalog_name = \"demo_iceberg\"\n",
63 |     "catalog_uri = \"\" #replace with Postgres URI\n",
64 |     "warehouse_path = \"gs://\" #replace with bucket name you created in GCS\n",
65 |     "\n",
66 |     "catalog = SqlCatalog(\n",
67 |     "    catalog_name,\n",
68 |     "    **{\n",
69 |     "        \"uri\": catalog_uri,\n",
70 |     "        \"warehouse\": warehouse_path,\n",
71 |     "    },\n",
72 |     ")\n",
73 |     "\n",
74 |     "# create a namespace for Iceberg\n",
75 |     "name_space = 'taxi'\n",
76 |     "try:\n",
77 |     "    catalog.create_namespace(name_space)\n",
78 |     "except Exception as e:\n",
79 |     "    print(e)"
80 |    ]
81 |   },
82 |   {
83 |    "cell_type": "code",
84 |    "execution_count": null,
85 |    "metadata": {},
86 |    "outputs": [],
87 |    "source": [
88 |     "def add_version_hint(iceberg_table):\n",
89 |     "    \"\"\"\n",
90 |     "    Adds version hint file to Iceberg table metadata\n",
91 |     "    Addresses issue mentioned here: https://github.com/duckdb/duckdb_iceberg/issues/29\n",
92 |     "    Determines if Iceberg table is in local file system or in GCS\n",
93 |     "    \"\"\"\n",
94 |     "    metadata_location = iceberg_table.metadata_location\n",
95 |     "    protocol = metadata_location.split(\":\")[0]\n",
96 |     "\n",
97 |     "    if protocol == \"file\":\n",
98 |     "        metadata_location = metadata_location[7:]\n",
99 |     "    elif protocol == \"gs\":\n",
100 |     "        metadata_location = metadata_location[5:]\n",
101 |     "    else:\n",
102 |     "        print(f\"Unsupported metadata location: {metadata_location}\")\n",
103 |     "        return\n",
104 |     "\n",
105 |     "    metadata_dir = os.path.dirname(metadata_location)\n",
106 |     "    new_metadata_file = os.path.join(metadata_dir, \"v1.metadata.json\")\n",
107 |     "    version_hint_file = os.path.join(metadata_dir, \"version-hint.text\")\n",
108 |     "\n",
109 |     "    if protocol == \"file\":\n",
110 |     "        shutil.copy(metadata_location, new_metadata_file)\n",
111 |     "        with open(version_hint_file, \"w\") as f:\n",
112 |     "            f.write(\"1\")\n",
113 |     "    elif protocol == \"gs\":\n",
114 |     "        fs = gcsfs.GCSFileSystem()\n",
115 |     "        fs.copy(metadata_location, new_metadata_file)\n",
116 |     "        with fs.open(version_hint_file, \"w\") as f:\n",
117 |     "            f.write(\"1\")\n",
118 |     "\n",
119 |     "    print(f\"Copied metadata file to {new_metadata_file}\")\n",
120 |     "    print(f\"Created {version_hint_file} with content '1'\")"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "# add tables to iceberg catalog and load data into GCS\n",
130 |     "for table, table_name in [\n",
131 |     "    (trips, \"trips\"),\n",
132 |     "    (zones, \"zones\"),\n",
133 |     "]: \n",
134 |     "\t# create the iceberg table\n",
135 |     "    iceberg_table = catalog.create_table(\n",
136 |     "        f\"{name_space}.{table_name}\",\n",
137 |     "        schema=table.schema,\n",
138 |     "    )\n",
139 |     "\n",
140 |     "    # add data to iceberg table in GCS\n",
141 |     "    iceberg_table.append(table)\n",
142 |     "\n",
143 |     "    # copy catalog version hint metadata into GCS\n",
144 |     "    add_version_hint(iceberg_table)\n",
145 |     "    \n",
146 |     "    print(f\"Created {table_name}, {table.num_rows} rows\")"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "# initiate a duckdb connection which we will use as the query engine for iceberg\n",
156 |     "import duckdb\n",
157 |     "\n",
158 |     "con = duckdb.connect(database=':memory:', read_only=False)\n",
159 |     "setup_sql = '''\n",
160 |     "INSTALL iceberg;\n",
161 |     "LOAD iceberg;\n",
162 |     "\n",
163 |     "CREATE SECRET (\n",
164 |     "    TYPE GCS,\n",
165 |     "    KEY_ID '',\n",
166 |     "    SECRET ''\n",
167 |     ");\n",
168 |     "'''\n",
169 |     "res = con.execute(setup_sql)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "# create the schema and views of iceberg tables in duckdb\n",
179 |     "database_path = f'{warehouse_path}/{name_space}.db'\n",
180 |     "\n",
181 |     "create_view_sql = f'''\n",
182 |     "CREATE SCHEMA IF NOT EXISTS taxi;\n",
183 |     "\n",
184 |     "CREATE VIEW taxi.trips AS\n",
185 |     "SELECT * FROM iceberg_scan('{database_path}/trips', allow_moved_paths = true);\n",
186 |     "\n",
187 |     "CREATE VIEW taxi.zones AS\n",
188 |     "SELECT * FROM iceberg_scan('{database_path}/zones', allow_moved_paths = true);\n",
189 |     "'''\n",
190 |     "\n",
191 |     "con.execute(create_view_sql)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "sql = f'''\n",
201 |     "select \n",
202 |     "    count(*)\n",
203 |     "from taxi.trips\n",
204 |     "'''\n",
205 |     "\n",
206 |     "%time res = con.execute(sql)\n",
207 |     "res.fetchdf()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "sql = f'''\n",
217 |     "select \n",
218 |     "    date_trunc('month', tpep_pickup_datetime) as month,\n",
219 |     "    avg(passenger_count) as avg_passenger_count,\n",
220 |     "    avg(trip_distance) as avg_trip_distance,\n",
221 |     "    sum(trip_distance) as total_trip_distance,\n",
222 |     "    avg(total_amount) as avg_total_amount,\n",
223 |     "    sum(total_amount) as total_amount,\n",
224 |     "    count(*) as total_trips\n",
225 |     "from taxi.trips\n",
226 |     "-- some data pre and post our target date range is in the dataset, so we filter it out\n",
227 |     "where tpep_pickup_datetime between '2023-04-01' and '2024-05-01'\n",
228 |     "group by 1\n",
229 |     "order by 1\n",
230 |     "'''\n",
231 |     "\n",
232 |     "%time res = con.execute(sql)\n",
233 |     "res.fetchdf()"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "sql = f'''\n",
243 |     "select \n",
244 |     "    zones.Borough,\n",
245 |     "    count(*) as total_trips,\n",
246 |     "    sum(total_amount) as total_amount\n",
247 |     "from taxi.zones as zones\n",
248 |     "left join taxi.trips as trips\n",
249 |     "    on zones.LocationID = trips.DOLocationID\n",
250 |     "group by 1 \n",
251 |     "order by 2 desc\n",
252 |     "'''\n",
253 |     "\n",
254 |     "%time res = con.execute(sql)\n",
255 |     "res.fetchdf()"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "sql = f'''\n",
265 |     "select \n",
266 |     "    starting_zone.Borough as pickup_borough,\n",
267 |     "    ending_zone.Borough as dropoff_borough,\n",
268 |     "    count(*) as trip_count\n",
269 |     "from\n",
270 |     "taxi.trips as trips\n",
271 |     "left join taxi.zones as starting_zone\n",
272 |     "    on trips.PULocationID = starting_zone.LocationID\n",
273 |     "left join taxi.zones as ending_zone\n",
274 |     "    on trips.DOLocationID = ending_zone.LocationID\n",
275 |     "group by 1, 2\n",
276 |     "order by 1 asc, 3 desc\n",
277 |     "'''\n",
278 |     "\n",
279 |     "%time res = con.execute(sql)\n",
280 |     "res.fetchdf().head(20)"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": null,
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": []
289 |   }
290 |  ],
291 |  "metadata": {
292 |   "kernelspec": {
293 |    "display_name": ".venv",
294 |    "language": "python",
295 |    "name": "python3"
296 |   },
297 |   "language_info": {
298 |    "codemirror_mode": {
299 |     "name": "ipython",
300 |     "version": 3
301 |    },
302 |    "file_extension": ".py",
303 |    "mimetype": "text/x-python",
304 |    "name": "python",
305 |    "nbconvert_exporter": "python",
306 |    "pygments_lexer": "ipython3",
307 |    "version": "3.10.11"
308 |   }
309 |  },
310 |  "nbformat": 4,
311 |  "nbformat_minor": 2
312 | }
313 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "duck-iceberg-demo"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Steven Wang "]
6 | readme = "README.md"
7 | packages = [{include = "duck_iceberg_demo"}]
8 | 
9 | [tool.poetry.dependencies]
10 | python = "^3.10"
11 | duckdb = "^1.0.0"
12 | pyiceberg = {extras = ["gcsfs"], version = "^0.6.1"}
13 | sqlalchemy = "^2.0.31"
14 | pyarrow = "^16.1.0"
15 | pandas = "^2.2.2"
16 | psycopg2 = "^2.9.9"
17 | flask = "^3.0.3"
18 | google-cloud-secret-manager = "^2.20.1"
19 | boto3 = "^1.34.157"
20 | 
21 | 
22 | [tool.poetry.group.dev.dependencies]
23 | ipykernel = "^6.29.5"
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
28 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steven-luabase/duckdb-iceberg-demo/e2d1751f000e2b6cb740f9d977538db9c40b27e0/tests/__init__.py
--------------------------------------------------------------------------------