├── .gitignore ├── LICENSE ├── README.md ├── cache_middleware.py ├── data-proxy-arch.png ├── gitcommit.sh ├── main.py ├── requirements.txt ├── sample.env ├── tickit.duckdb └── vercel.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Senthilnathan Karuppaiah 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DuckDB Data API and Micro ORM 2 | 3 | Ever wondered if MotherDuck offers a REST API? This project steps in to bridge that gap, enabling you to interact with DuckDB databases through a RESTful interface without writing a single line of backend code. Designed for deployment at the edge, it's a cost-effective solution for hobbyists and small projects. 4 | 5 | The DuckDB Data API facilitates direct access to your DuckDB database, allowing for CRUD operations via HTTP. 6 | 7 | ## Motivation 8 | 9 | - **Simplicity and Rapid Prototyping**: Offers a dynamic, RESTful interface to cater to various project requirements with minimal or no coding. 10 | - **Adaptation to Serverless and Edge Computing**: Aims to provide a fitting database solution for applications operating at the internet's edge. 
11 | - **Leveraging DuckDB’s Speed**: While DuckDB excels in performance, integrating it directly into web applications posed challenges.
12 | - **Filling the Gap Left by MotherDuck**: Although MotherDuck allows DuckDB to run serverlessly, it lacked a REST interface. This project aims to bridge that gap.
13 | - **Inspiration from MongoDB's Atlas Data Proxy**: The convenience and developer-friendly nature of MongoDB's Atlas Data Proxy influenced the development of this project, promoting quick backend support for rapid prototyping.
14 | - **Cost-Effective Deployment on Vercel**: The project prioritizes a deployment strategy that incurs minimal to no costs on platforms like Vercel.
15 | 
16 | ## Design Goals
17 | 
18 | - **Adopting the ActiveRecord Pattern**: Mimics the ActiveRecord design pattern to offer an intuitive way of interacting with database records.
19 | - **Automated CRUD Operations**: Streamlines Create, Read, Update, and Delete operations to save development time and effort.
20 | - **Flexibility and Dynamic Nature**: Ensures the proxy can adapt to varying project needs without rigid constraints.
21 | - **Single-File Implementation**: Strives for a lightweight solution, ideally encapsulated within a single file for ease of use and integration.
22 | 
23 | 
24 | ## Getting Started
25 | 
26 | ### Python Environment Setup
27 | 
28 | 1. **Create a Virtual Environment**:
29 | 
30 |    ```bash
31 |    python3 -m venv env
32 |    source env/bin/activate
33 |    pip install -r requirements.txt
34 |    ```
35 | 
36 | 2. Create a `.env` file at the root of your project to configure the environment settings:
37 | 
38 |    ```env
39 |    # .env file
40 |    DUCKDB_DATABASE_URL=duckdb:///tickit.duckdb
41 |    DUCKDB_SCHEMA_NAME=main
42 |    QUERY_BLACKLIST=DELETE,DROP,TRUNCATE,ALTER
43 |    ```
44 | 
45 |    - `DUCKDB_DATABASE_URL`: Specifies the connection URL to your local DuckDB database file or to a MotherDuck connection string. Alternatively, you can set this to `:memory:` to use an in-memory database.
46 |    - `DUCKDB_SCHEMA_NAME`: Sets the default schema for database operations within the data API. If left unset, it defaults to the "main" schema.
47 |    - `QUERY_BLACKLIST`: Defines a comma-separated list of SQL keywords that the data API will reject to prevent potentially destructive operations. If this list is empty or not set, no commands will be blocked, and all types of queries will be permitted.
48 | 
49 | 3. **Install Packages** (skip this step if you already installed the requirements in step 1):
50 |    ```bash
51 |    pip install -r requirements.txt
52 |    ```
53 | 
54 | 4. **Run the project** (a quick client sketch for trying the running API follows these steps):
55 | 
56 |    ```bash
57 |    uvicorn main:app --reload
58 |    ```
59 | 
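With the server from step 4 running, the sketch below exercises the API end to end. It is an illustrative example rather than part of the repository: it assumes the bundled `tickit.duckdb` database, the `/entity/{table_name}` route shape defined in `main.py`, and standard TICKIT column names such as `eventid` and `eventname`; adjust the table, columns, and filter values to your own data.

```python
# try_api.py -- minimal, illustrative client sketch (not part of the repo).
# Assumes the API is running locally on http://localhost:8000 against tickit.duckdb.
import requests

BASE_URL = "http://localhost:8000"

# 1. Health check
print(requests.get(f"{BASE_URL}/health").json())

# 2. List rows with selection, filtering, sorting, and pagination.
#    The pagination parameter is "offset" (main.py aliases skip -> offset),
#    and ".like" filters are executed as case-insensitive ILIKE matches.
params = {
    "select": "eventid,eventname,starttime",   # assumed TICKIT columns
    "eventname.like": "%opera%",               # example filter value
    "order": "starttime asc",
    "limit": 5,
    "offset": 0,
}
resp = requests.get(f"{BASE_URL}/entity/event", params=params)
resp.raise_for_status()
payload = resp.json()
print(f"{payload['total_rows']} matching rows, page {payload['current_page']} of {payload['total_pages']}")
for row in payload["data"]:
    print(row)

# 3. Single-record routes (GET/PUT/PATCH/DELETE /entity/<table>/<id>) expect the
#    key column to be named "id"; the TICKIT tables use names like eventid,
#    so those calls are omitted from this sketch.
```

The same query-parameter conventions (`select`, `order`, `limit`, `offset`, and the filter operators described below) apply to any table exposed by the proxy.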
60 | ### Examples of `DUCKDB_DATABASE_URL` Configurations
61 | 
62 | Physical DuckDB file:
63 | 
64 | ```env
65 | DUCKDB_DATABASE_URL=duckdb:///path/to/your/database.duckdb
66 | ```
67 | In-memory DuckDB instance:
68 | 
69 | ```env
70 | DUCKDB_DATABASE_URL=duckdb:///:memory:
71 | ```
72 | 
73 | MotherDuck:
74 | 
75 | ```env
76 | DUCKDB_DATABASE_URL=duckdb:///md:[motherduck-token]@[db-name]
77 | ```
78 | 
79 | ## RESTful Routes and Actions
80 | 
81 | Interact with your DuckDB database through the following RESTful routes by replacing `entity` with your table name:
82 | 
83 | | Method | Route | Description | Query Parameter Examples |
84 | |--------|-------------------|-------------------------------------------|-------------------------------------------------------|
85 | | GET | `/entity` | List entities | `?limit=10&offset=20&select=field1,field2&order=field1 asc&field1.eq=value` |
86 | | POST | `/entity` | Create a new entity | N/A |
87 | | GET | `/entity/:id` | Get a single entity by ID | N/A |
88 | | PUT | `/entity/:id` | Replace an entity by ID (full update) | N/A |
89 | | PATCH | `/entity/:id` | Update an entity by ID (partial update) | N/A |
90 | | DELETE | `/entity/:id` | Delete an entity by ID | N/A |
91 | 
92 | ### Query Parameter Examples
93 | 
94 | - **Filtering**: `?field1.eq=value` filters the list by `field1` equal to `value`.
95 | - **Sorting**: `?order=field1 asc` sorts the list by `field1` in ascending order.
96 | - **Pagination**: `?limit=10&offset=20` limits the list to 10 entities, skipping the first 20.
97 | - **Selecting Fields**: `?select=field1,field2` selects only `field1` and `field2` to be returned in each entity in the list.
98 | 
99 | ### Supported Filter Operators
100 | 
101 | The DuckDB Data Proxy supports a range of filter operators for querying data, allowing for precise data retrieval based on specific criteria:
102 | 
103 | - `.eq`: Equals
104 | - `.neq`: Not equals
105 | - `.gt`: Greater than
106 | - `.gte`: Greater than or equal to
107 | - `.lt`: Less than
108 | - `.lte`: Less than or equal to
109 | - `.like`: Pattern matching (the current implementation maps this to DuckDB's case-insensitive `ILIKE`)
110 | - `.ilike`: Case-insensitive pattern matching (not yet parsed by `prepare_where_clauses` in `main.py`; use `.like` instead)
111 | 
112 | These operators can be used in query parameters to filter the data retrieved from the database. For example, `?name.like=%john%` would filter records where the `name` field contains "john".
113 | 
114 | The `POST /execute/sql` route is documented under "Additional Endpoints" below rather than alongside the CRUD routes, because it is a utility endpoint with a different purpose from the standard CRUD operations.
115 | 
116 | 
117 | ## Additional Endpoints
118 | 
119 | In addition to the core RESTful routes, the DuckDB Data Proxy provides several utility endpoints for diagnostics, metadata, and system health checks:
121 | 
121 | | Method | Route | Description | Query Parameter Examples |
122 | |--------|------------------------|---------------------------------------------------------------|--------------------------|
123 | | GET | `/` | Root endpoint returning a welcome message. | N/A |
124 | | GET | `/health` | Health check endpoint. | N/A |
125 | | GET | `/debug/connection` | Tests database connection. | N/A |
126 | | POST | `/execute/sql` | Execute a custom SQL query (SELECT or DDL statement). 
| N/A | 127 | 128 | The `POST /execute/sql` endpoint is for advanced users who need to execute custom SQL queries or DDL statements that are not covered by the standard CRUD operations. Please use this endpoint with caution, as improper use can affect database integrity and security. 129 | 130 | 131 | ### Complete Documentation for Metadata and Profiler Endpoints 132 | 133 | --- 134 | 135 | ### Metadata Endpoints 136 | 137 | | Method | Route | Description | 138 | |--------|----------------------------|-----------------------------------------------| 139 | | GET | `/metadata/databases` | Lists all databases in the current instance. | 140 | | GET | `/metadata/schemas` | Lists all schemas in the current database. | 141 | | GET | `/metadata/tables` | Lists all tables in the current schema. | 142 | | GET | `/metadata/columns` | Lists all columns in the current schema. | 143 | | GET | `/metadata/views` | Lists all views in the current schema. | 144 | | GET | `/metadata/constraints` | Lists all constraints in the current schema. | 145 | 146 | --- 147 | 148 | ### `/metadata/{path:path}` Combinations 149 | 150 | | Path Format | Description | 151 | |---------------------------------|-------------------------------------------------------------------| 152 | | `/metadata/{database}` | Lists all schemas in the specified database. | 153 | | `/metadata/{database}/{schema}` | Lists all tables in the specified schema of a database. | 154 | | `/metadata/{database}/{schema}/{table}` | Lists all columns in the specified table of a schema. | 155 | | `/metadata/{database}/{schema}/{table}/{column}` | Fetches metadata for the specific column in the specified table. | 156 | 157 | --- 158 | 159 | ### Table Info and Column Profiler Endpoints 160 | 161 | | Method | Route | Description | 162 | |--------|------------------------------------------|---------------------------------------------------------------| 163 | | GET | `/metadata/{catalog}/{schema}/{table}/summarize` | Fetch statistical summaries for all columns in the table. | 164 | | GET | `/metadata/{catalog}/{schema}/{table}/column/{column}/summarize` | Fetch statistical summaries for a specific column in a table. | 165 | | GET | `/profile` | Profiles a table or a specific column. Use `object=db.schema.table` or `object=db.schema.table.column`. | 166 | | GET | `/describe` | Fetches metadata for a specific object using `object=db.schema.table`. | 167 | --- 168 | 169 | ## Playground 170 | Interact with the following tables from **tickit** db: `sale`, `event`, `data`, `category`, `user`, `listing`, `venue` 171 | 172 | ### User Table API Endpoints 173 | 174 | ```plaintext 175 | GET https://duckdb-data-api.vercel.app/user?state.eq=NL&liketheatre=true&limit=10&offset=5&order=firstname%20asc - List, filter, sort and paginate users 176 | POST https://duckdb-data-api.vercel.app/user - Create a new user 177 | GET https://duckdb-data-api.vercel.app/user/{id} - Get a single user by ID 178 | PUT https://duckdb-data-api.vercel.app/user/{id} - Replace a user by ID (full update) 179 | PATCH https://duckdb-data-api.vercel.app/user/{id} - Update a user by ID (partial update) 180 | DELETE https://duckdb-data-api.vercel.app/user/{id} - Delete a user by ID 181 | ``` 182 | 183 | ## Heads Up on Limitations 184 | 185 | - **Performance Considerations**: Because of the way we jump from the edge to MotherDuck and back, and how we fetch data, especially when counting items for pagination, there might be a slight delay. 
186 | - **Primary Key Expectations**: Right now, we expect the primary key in your tables to be named "id". We know that’s not always the case, so we’re thinking of ways to work around this in future updates. 187 | 188 | ## Technology Stack 189 | 190 | 1. **FastAPI**: 191 | 2. **DuckDB and MotherDuck**: 192 | 3. **Python 3.9**: 193 | 4. **SQLAlchemy**: 194 | 195 | ## Architecture 196 | 197 | ```mermaid 198 | flowchart LR 199 | Client -->|REST API Call| FastAPI 200 | FastAPI -->|SQLAlchemy ORM| DuckDB 201 | DuckDB -->|Process Query| MotherDuck 202 | MotherDuck -->|Return Results| DuckDB 203 | DuckDB -->|ORM| FastAPI 204 | FastAPI -->|JSON Response| Client 205 | 206 | style FastAPI fill:#f9f,stroke:#333,stroke-width:2px 207 | style DuckDB fill:#bbf,stroke:#333,stroke-width:2px 208 | style MotherDuck fill:#fbf,stroke:#333,stroke-width:2px 209 | style Client fill:#dfd,stroke:#333,stroke-width:2px 210 | ``` 211 | 212 | ## Deployment in Vercel 213 | 214 | Deploying your application to Vercel can significantly simplify the process, thanks to its support for serverless functions. For a detailed guide on deploying FastAPI applications to Vercel, check out this insightful [blog post](https://dev.to/mihaiandrei97/building-a-fastapi-application-and-deploying-it-with-vercel-ho7). It walks you through the steps to ensure your application runs smoothly in a serverless environment, making your DuckDB Data Proxy accessible from anywhere. 215 | 216 | 217 | ## From Goduck to DuckDB Data Proxy 218 | 219 | ### Transitioning from Golang to Python 220 | Before diving into this Python project, I launched [Goduck](https://github.com/senthilsweb/goduck), a similar initiative built with Golang. It aimed to provide REST API interaction with DuckDB and MotherDuck, much like what we're doing here but in the Go ecosystem. 221 | 222 | ### Shifting Gears to Python 223 | While trying to deploy Goduck across various environments, including serverless platforms, I faced hurdles due to the C-go dependency of the Go duckDB driver, which made the build process tricky for different Linux systems. This challenge highlighted the benefits of Python's straightforwardness and the extensive support from its community. Here are the main insights: 224 | 225 | - **Simpler Python Driver**: Python's approach to DuckDB felt more straightforward and developer-friendly. 226 | - **Larger Python Community**: The vast Python community meant more potential users and contributors for this project. 227 | - **Inspiration from MongoDB**: MongoDB's Atlas Data Proxy, which simplifies database operations, inspired me to offer a similar experience for DuckDB users, facilitating quick backend setups for rapid prototyping. -------------------------------------------------------------------------------- /cache_middleware.py: -------------------------------------------------------------------------------- 1 | """ 2 | cache_middleware.py 3 | 4 | Implements caching for FastAPI requests to enhance performance and response times. 5 | The middleware utilizes Upstash Redis due to limitations encountered with vercel-kv 6 | in the Python environment. Vercel-kv offers native support for JavaScript but not for Python. 7 | 8 | Caching is particularly important for this application as it uses MotherDuck, a serverless 9 | version of DuckDB. While DuckDB is performant, deploying it at the edge with Vercel's hobby 10 | plan introduces constraints such as memory and RAM limitations, and a function timeout of 10 seconds. 
11 | To mitigate these limitations and improve response times, a Redis cache was implemented. 12 | Upstash Redis was chosen for its ease of integration with Vercel, offering 500 MB of free cache 13 | storage under the hobby plan. 14 | 15 | Cache keys are consistently lowercased and prefixed with 'duckdb-data-api:' to ensure 16 | uniformity and to avoid case-sensitive cache misses. Only GET requests and a specific POST 17 | request for executing SQL are cached. 18 | 19 | For the POST method specific to the "/execute/sql" route, the caching strategy involves generating 20 | a unique cache key based on the content of the request body. This is achieved by computing an MD5 checksum 21 | of the POST body, ensuring that different contents produce different cache keys, thereby accurately 22 | caching responses based on the actual query being executed. This method addresses the challenge of caching 23 | dynamic content that could vary significantly with each request. 24 | """ 25 | 26 | 27 | from fastapi import Request 28 | from starlette.middleware.base import BaseHTTPMiddleware 29 | from starlette.responses import Response 30 | from upstash_redis.asyncio import Redis 31 | import hashlib 32 | 33 | # Load environment variables 34 | from dotenv import load_dotenv 35 | load_dotenv() 36 | 37 | # Initialize Upstash Redis using environment variables 38 | redis = Redis.from_env() 39 | 40 | class CacheMiddleware(BaseHTTPMiddleware): 41 | """ 42 | Middleware to cache GET and specific POST request responses using Upstash Redis. 43 | Generates unique cache keys based on the request method, path, query parameters, and 44 | for POST requests, the content of the request body. 45 | """ 46 | 47 | async def dispatch(self, request: Request, call_next): 48 | """ 49 | Process an incoming request by checking if it's cached. If not, call the next 50 | request handler and cache the response if applicable. 
51 | """ 52 | # Construct the base cache key from the method and path 53 | base_key = f"{request.method}-{request.url.path}" 54 | 55 | # Special handling for POST to "/execute/sql" 56 | if request.method == "POST" and request.url.path == "/execute/sql": 57 | # Read and then reset the request body for hashing and processing 58 | body = await request.body() 59 | request._body = body # Reset body after reading 60 | 61 | # Create a checksum of the body to use in the cache key 62 | checksum = hashlib.md5(body).hexdigest() 63 | cache_key = f"duckdb-data-api:{base_key}?{checksum}".lower() 64 | elif request.method == "GET": 65 | # Use query parameters to distinguish GET requests 66 | cache_key = f"duckdb-data-api:{base_key}?{request.query_params}".lower() 67 | else: 68 | cache_key = None 69 | 70 | # Try to retrieve the cached response 71 | if cache_key: 72 | cached_response = await redis.get(cache_key) 73 | if cached_response: 74 | print(f"Cache hit for key: {cache_key}") 75 | return Response(content=cached_response, status_code=200, media_type='application/json') 76 | print(f"Cache miss for key: {cache_key}") 77 | 78 | # Proceed with the actual request handling if no cache is found 79 | response = await call_next(request) 80 | 81 | # Cache the response if the status code is 200 and we have a cache key 82 | if response.status_code == 200 and cache_key: 83 | body = b''.join([chunk async for chunk in response.body_iterator]) 84 | cache_content = body.decode() 85 | headers = {"Content-Length": str(len(cache_content))} 86 | await redis.set(cache_key, cache_content) 87 | print(f"Cached response for key: {cache_key}") 88 | return Response(content=cache_content, status_code=200, media_type='application/json', headers=headers) 89 | 90 | return response -------------------------------------------------------------------------------- /data-proxy-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/senthilsweb/duckdb-data-api/1da02c1ee06ae17265798ea30b2896db8fd1e1c3/data-proxy-arch.png -------------------------------------------------------------------------------- /gitcommit.sh: -------------------------------------------------------------------------------- 1 | str="no comments" 2 | git ls-files --modified | xargs git add 3 | git ls-files --deleted | xargs git rm 4 | git add -A 5 | if [ ! -z "$1" -a "$1" != " " ]; then 6 | str=$1 7 | fi 8 | git commit -m "$str" 9 | 10 | git push -u origin main -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | File Name: main.py 3 | Author(s): Sernthilnathan Karuppaiah and ChatGPT4 :-) 4 | Date: 14-Mar-2024 5 | Description: This FastAPI application serves as a data proxy to DuckDB, offering endpoints for basic database 6 | operations such as listing tables, reading table data with optional filtering, sorting, and pagination, 7 | and a debug endpoint to check database connectivity. It is designed for dynamic usage, following 8 | the ActiveRecord design pattern akin to a Rails-type microORM, and utilizes SQLAlchemy for 9 | database interaction. 
10 | """ 11 | 12 | from fastapi import FastAPI, Depends, HTTPException, Request, Query, Path, Body 13 | from fastapi.encoders import jsonable_encoder 14 | from fastapi.middleware.cors import CORSMiddleware 15 | from fastapi.responses import JSONResponse 16 | import sqlglot 17 | from sqlglot import parse_one, exp 18 | from sqlglot.optimizer import optimize 19 | from sqlalchemy import create_engine, text 20 | from sqlalchemy.orm import sessionmaker, Session 21 | from typing import List, Dict, Any 22 | from pydantic import BaseModel 23 | from datetime import datetime 24 | from cache_middleware import CacheMiddleware 25 | import os 26 | from dotenv import load_dotenv 27 | import math 28 | from decimal import Decimal 29 | from sqlalchemy.sql import text 30 | 31 | 32 | 33 | # Initialize environment variables and set HOME for duckDB compatibility in serverless environments. 34 | # Only load .env file if running locally and not in Vercel 35 | if os.environ.get('VERCEL', None) != '1': 36 | # Clear all environment variables 37 | os.environ.clear() 38 | load_dotenv() 39 | 40 | os.environ['HOME'] = '/tmp' 41 | # Initialize environment variables and set HOME for duckDB compatibility in serverless environments. 42 | # Only load .env file if running locally and not in Vercel 43 | if os.environ.get('VERCEL', None) != '1': 44 | load_dotenv() 45 | 46 | # Configuration variables 47 | DATABASE_URL = os.getenv("DUCKDB_DATABASE_URL", default="duckdb:///tickit.duckdb") 48 | print(f"DATABASE_URL = [{DATABASE_URL}]") 49 | SCHEMA_NAME = os.getenv("DUCKDB_SCHEMA_NAME", default="main") 50 | print(f"SCHEMA_NAME = [{SCHEMA_NAME}]") 51 | BLACKLIST_KEYWORDS = [keyword for keyword in os.getenv("QUERY_BLACKLIST", "").split(",") if keyword] 52 | 53 | 54 | # Database engine setup 55 | engine = create_engine(DATABASE_URL) 56 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 57 | 58 | app = FastAPI() 59 | #app.add_middleware(CacheMiddleware) 60 | 61 | # Dependency to get the database session 62 | def get_db(): 63 | db = SessionLocal() 64 | try: 65 | yield db 66 | finally: 67 | db.close() 68 | 69 | app.add_middleware( 70 | CORSMiddleware, 71 | allow_origins=["*"], # Allow all origins 72 | allow_credentials=True, 73 | allow_methods=["*"], # Allow all methods 74 | allow_headers=["*"], # Allow all headers 75 | ) 76 | 77 | @app.get("/") 78 | async def root(): 79 | """Root endpoint returning welcome message.""" 80 | return {"message": "Welcome to DuckDB Data Proxy!"} 81 | 82 | @app.get("/health") 83 | async def health_check(): 84 | """Health check endpoint.""" 85 | return {"message": "I am doing great!"} 86 | 87 | @app.get("/debug/connection") 88 | def debug_connection(db: Session = Depends(get_db)): 89 | """ 90 | Debug endpoint to test database connection. 91 | 92 | Attempts a simple query to verify database connectivity. 93 | """ 94 | try: 95 | result = db.execute(text("SELECT 1")) 96 | return {"status": "success", "message": "Database connection established successfully."} 97 | except Exception as e: 98 | return {"status": "error", "message": str(e)} 99 | 100 | 101 | 102 | def prepare_where_clauses(request: Request): 103 | """ 104 | Prepares WHERE clauses for SQL queries based on request query parameters. 105 | 106 | Supports various operators like .eq, .gt, .gte, .lt, .lte, .neq, and .like. 
107 | """ 108 | where_clauses = [] 109 | params = {} 110 | for key, value in request.query_params.items(): 111 | if key not in ["select", "limit", "offset", "order"]: 112 | operator = "=" # Default operator 113 | if key.endswith(".eq"): 114 | operator = "=" 115 | key = key[:-3] 116 | elif key.endswith(".gt"): 117 | operator = ">" 118 | key = key[:-3] 119 | elif key.endswith(".gte"): 120 | operator = ">=" 121 | key = key[:-4] 122 | elif key.endswith(".lt"): 123 | operator = "<" 124 | key = key[:-3] 125 | elif key.endswith(".lte"): 126 | operator = "<=" 127 | key = key[:-4] 128 | elif key.endswith(".neq"): 129 | operator = "<>" 130 | key = key[:-4] 131 | elif key.endswith(".like"): 132 | operator = "ILIKE" 133 | key = key[:-5] 134 | where_clauses.append(f"{key} {operator} :{key}") 135 | params[key] = value 136 | return " AND ".join(where_clauses), params 137 | 138 | @app.get("/entity/{table_name}", response_model=List[Dict[str, Any]]) 139 | def get_entities(table_name: str, request: Request, select: str = Query("*"), 140 | order: str = Query(None), skip: int = Query(0, alias="offset"), 141 | limit: int = Query(100), db: Session = Depends(get_db)): 142 | """ 143 | Endpoint to read data from a specified table with optional filtering, sorting, and pagination. 144 | 145 | Validates table name against existing tables to prevent SQL injection. 146 | """ 147 | # Validate table name 148 | if table_name not in list_tables(db): 149 | raise HTTPException(status_code=404, detail="Table not found") 150 | 151 | # Construct query with optional WHERE, ORDER BY, and pagination 152 | base_query = f"SELECT {select} FROM {SCHEMA_NAME}.{table_name}" 153 | where_clauses, params = prepare_where_clauses(request) 154 | if where_clauses: 155 | base_query += f" WHERE {where_clauses}" 156 | count_query = f"SELECT COUNT(*) FROM {SCHEMA_NAME}.{table_name} WHERE {where_clauses}" 157 | 158 | else: 159 | count_query = f"SELECT COUNT(*) FROM {SCHEMA_NAME}.{table_name}" 160 | 161 | if order: 162 | base_query += f" ORDER BY {order}" 163 | base_query += " LIMIT :limit OFFSET :offset" 164 | print(f"base_query = {base_query}") 165 | params.update({"limit": limit, "offset": skip}) 166 | print(f"params = {params}") 167 | # Execute query and handle results 168 | try: 169 | result_proxy = db.execute(text(base_query), params) 170 | results = result_proxy.fetchall() 171 | # Use params for count query as well to respect WHERE conditions 172 | total_count = db.execute(text(count_query), params).scalar() 173 | page_number = math.ceil(skip / limit) + 1 174 | total_pages = math.ceil(total_count / limit) 175 | response_data = { 176 | "total_rows": total_count, 177 | "total_pages": total_pages, 178 | "limit": limit, 179 | "offset": skip, 180 | "current_page": page_number, 181 | "data": [{key: (value.isoformat() if isinstance(value, datetime) else value) 182 | for key, value in dict(zip(result_proxy.keys(), row)).items()} for row in results] 183 | } 184 | return JSONResponse(content=response_data) 185 | except Exception as e: 186 | raise HTTPException(status_code=500, detail=str(e)) 187 | 188 | @app.get("/entity/{table_name}/{id}", response_model=Dict[str, Any]) 189 | def get_entity(table_name: str, id: int = Path(..., description="The ID of the entity to retrieve"), 190 | db: Session = Depends(get_db)): 191 | """ 192 | Dynamically fetches a single entity by its ID from a specified table. 193 | 194 | Parameters: 195 | - table_name: str - The name of the table from which to retrieve the entity. 
196 | - id: int - The unique identifier of the entity to retrieve. 197 | 198 | Returns a single entity matching the given ID from the specified table, with datetime fields properly serialized. 199 | """ 200 | # Validate table name 201 | if table_name not in list_tables(db): 202 | raise HTTPException(status_code=404, detail="Table not found") 203 | 204 | query = text(f"SELECT * FROM {SCHEMA_NAME}.{table_name} WHERE id = :id") 205 | result = db.execute(query, {"id": id}).fetchone() 206 | 207 | if result is None: 208 | raise HTTPException(status_code=404, detail=f"Record [{id}] not found in [{SCHEMA_NAME}.{table_name}]") 209 | 210 | # Convert the RowProxy object to a dictionary 211 | result_dict = {key: value for key, value in result._mapping.items()} 212 | 213 | # Serialize using jsonable_encoder to handle datetime and other complex types 214 | return jsonable_encoder(result_dict) 215 | 216 | @app.delete("/entity/{table_name}/{id}", response_model=Dict[str, Any]) 217 | def delete_entity(table_name: str, id: int = Path(..., description="The ID of the entity to delete"), 218 | db: Session = Depends(get_db)): 219 | """ 220 | Deletes a single entity by its ID from a specified table. 221 | """ 222 | # Validate table name 223 | if table_name not in list_tables(db): 224 | raise HTTPException(status_code=404, detail="Table not found") 225 | 226 | # Check if the entity exists 227 | exists_query = text(f"SELECT EXISTS(SELECT 1 FROM {SCHEMA_NAME}.{table_name} WHERE id = :id)") 228 | exists = db.execute(exists_query, {"id": id}).scalar() 229 | 230 | if not exists: 231 | raise HTTPException(status_code=404, detail=f"Record [{id}] not found in [{SCHEMA_NAME}.{table_name}]") 232 | 233 | # Delete the entity 234 | delete_query = text(f"DELETE FROM {SCHEMA_NAME}.{table_name} WHERE id = :id") 235 | db.execute(delete_query, {"id": id}) 236 | db.commit() 237 | 238 | return {"message": f"Record [{id}] deleted successfully from [{SCHEMA_NAME}.{table_name}]"} 239 | 240 | @app.post("/entity/{table_name}", response_model=Dict[str, Any]) 241 | def create_entity(table_name: str, entity_data: Dict[str, Any] = Body(...), 242 | db: Session = Depends(get_db)): 243 | """ 244 | Creates a new entity in the specified table with the provided data. 245 | """ 246 | # Validate table name 247 | if table_name not in list_tables(db): 248 | raise HTTPException(status_code=404, detail="Table not found") 249 | 250 | # Constructing SQL INSERT statement dynamically based on entity_data 251 | columns = ', '.join(entity_data.keys()) 252 | values = ', '.join([f":{key}" for key in entity_data.keys()]) 253 | insert_query = text(f"INSERT INTO {SCHEMA_NAME}.{table_name} ({columns}) VALUES ({values}) RETURNING *") 254 | 255 | # Execute the query and fetch the newly created entity 256 | result = db.execute(insert_query, entity_data).fetchone() 257 | db.commit() 258 | 259 | if result is None: 260 | raise HTTPException(status_code=500, detail="Failed to create record") 261 | 262 | # Convert the RowProxy object to a dictionary 263 | result_dict = {key: value for key, value in result._mapping.items()} 264 | 265 | # Serialize using jsonable_encoder to handle datetime and other complex types 266 | return jsonable_encoder(result_dict) 267 | 268 | @app.patch("/entity/{table_name}/{id}", response_model=Dict[str, Any]) 269 | def update_entity(table_name: str, id: int, update_data: Dict[str, Any] = Body(...), 270 | db: Session = Depends(get_db)): 271 | """ 272 | Updates an existing entity in the specified table with the provided data. 
273 | """ 274 | # Validate table name 275 | if table_name not in list_tables(db): 276 | raise HTTPException(status_code=404, detail="Table not found") 277 | 278 | # First, check if the entity exists 279 | exists_query = text(f"SELECT EXISTS(SELECT 1 FROM {SCHEMA_NAME}.{table_name} WHERE id = :id)") 280 | exists = db.execute(exists_query, {"id": id}).scalar() 281 | if not exists: 282 | raise HTTPException(status_code=404, detail="Entity not found") 283 | 284 | # Constructing SQL UPDATE statement dynamically based on update_data 285 | set_clauses = ', '.join([f"{key} = :{key}" for key in update_data.keys()]) 286 | update_query = text(f"UPDATE {SCHEMA_NAME}.{table_name} SET {set_clauses} WHERE id = :id RETURNING *") 287 | 288 | # Execute the query and fetch the updated entity 289 | result = db.execute(update_query, {**update_data, "id": id}).fetchone() 290 | db.commit() 291 | 292 | if result is None: 293 | raise HTTPException(status_code=500, detail="Failed to update record [{id}] in [{table_name}]") 294 | 295 | # Convert the result row to a dict to ensure compatibility with FastAPI's response_model 296 | updated_entity = {column: value for column, value in result._mapping.items()} 297 | return updated_entity 298 | 299 | @app.put("/entity/{table_name}/{id}", response_model=Dict[str, Any]) 300 | def replace_entity(table_name: str, id: int, new_data: Dict[str, Any] = Body(...), 301 | db: Session = Depends(get_db)): 302 | 303 | if table_name not in list_tables(db): 304 | raise HTTPException(status_code=404, detail="Table not found") 305 | 306 | # First, check if the entity exists 307 | exists_query = text(f"SELECT EXISTS(SELECT 1 FROM {SCHEMA_NAME}.{table_name} WHERE id = :id)") 308 | exists = db.execute(exists_query, {"id": id}).scalar() 309 | if not exists: 310 | raise HTTPException(status_code=404, detail="Table not found") 311 | 312 | # Assuming all fields must be provided for a PUT operation, construct a dynamic UPDATE statement 313 | set_clauses = ', '.join([f"{key} = :{key}" for key in new_data.keys()]) 314 | update_query = text(f"UPDATE {SCHEMA_NAME}.{table_name} SET {set_clauses} WHERE id = :id RETURNING *") 315 | 316 | # Execute the query and fetch the updated entity 317 | result = db.execute(update_query, {**new_data, "id": id}).fetchone() 318 | db.commit() 319 | 320 | if result is None: 321 | raise HTTPException(status_code=500, detail="Failed to replace record [{id}] in [{table_name}]") 322 | 323 | # Convert the result row to a dict to ensure compatibility with FastAPI's response_model 324 | replaced_entity = {column: value for column, value in result._mapping.items()} 325 | return replaced_entity 326 | 327 | def is_query_blacklisted(query: str) -> bool: 328 | # Check if BLACKLIST_KEYWORDS is actually empty or contains only an empty string 329 | if not BLACKLIST_KEYWORDS or BLACKLIST_KEYWORDS == ['']: 330 | return False 331 | 332 | query_lower = query.lower() 333 | for keyword in BLACKLIST_KEYWORDS: 334 | # Skip empty strings which might be a result of splitting an empty environment variable 335 | if keyword and keyword in query_lower: 336 | return True 337 | return False 338 | 339 | @app.post("/execute/sql") 340 | def execute_custom_query(query: str = Body(..., embed=True), db: Session = Depends(get_db)): 341 | """ 342 | Executes a custom SQL query, which can be a SELECT statement or a DDL statement. 343 | Checks against a blacklist for prohibited keywords. 344 | 345 | Parameters: 346 | - query: str - The SQL query to execute. 
347 | 348 | If the query is a SELECT statement, returns the fetched data. 349 | For DDL statements, returns a confirmation message. 350 | """ 351 | #query = query.strip().lower() 352 | query = query.strip() 353 | if is_query_blacklisted(query): 354 | raise HTTPException(status_code=403, detail="The query contains prohibited keywords.") 355 | 356 | if query.startswith("select") or query.startswith("SELECT"): 357 | # It's a select query 358 | return execute_select_query(query, db) 359 | else: 360 | # It's a DDL query 361 | return execute_ddl_query(query, db) 362 | 363 | 364 | @app.get("/metadata/databases", response_model=List[Dict[str, Any]]) 365 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 366 | return execute_metadata_query("SELECT * FROM duckdb_databases", db) 367 | 368 | @app.get("/metadata/schemas", response_model=List[Dict[str, Any]]) 369 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 370 | return execute_metadata_query("SELECT * FROM duckdb_schemas", db) 371 | 372 | @app.get("/metadata/tables", response_model=List[Dict[str, Any]]) 373 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 374 | return execute_metadata_query("SELECT * FROM duckdb_columns", db) 375 | 376 | @app.get("/metadata/columns", response_model=List[Dict[str, Any]]) 377 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 378 | return execute_metadata_query("SELECT * FROM duckdb_columns", db) 379 | 380 | @app.get("/metadata/views", response_model=List[Dict[str, Any]]) 381 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 382 | return execute_metadata_query("SELECT * FROM duckdb_views", db) 383 | 384 | @app.get("/metadata/constraints", response_model=List[Dict[str, Any]]) 385 | def get_md_duckdb_databases(db: Session = Depends(get_db)): 386 | return execute_metadata_query("SELECT * FROM duckdb_constraints", db) 387 | 388 | @app.get("/metadata/{path:path}", response_model=List[Dict[str, Any]]) 389 | def handle_metadata_routes(path: str, db: Session = Depends(get_db)): 390 | """ 391 | Handles metadata routes dynamically for DuckDB catalogs, schemas, tables, and columns. 392 | Retrieves all available fields from the information schema. 393 | """ 394 | parts = path.split("/") # Split the path into components 395 | 396 | if len(parts) == 1: # Matches /metadata/{catalog} 397 | catalog = parts[0] 398 | query = f""" 399 | SELECT * 400 | FROM information_schema.schemata 401 | WHERE catalog_name = '{catalog}'; 402 | """ 403 | 404 | elif len(parts) == 2: # Matches /metadata/{catalog}/{schema} 405 | catalog, schema = parts 406 | query = f""" 407 | SELECT * 408 | FROM information_schema.tables 409 | WHERE table_catalog = '{catalog}' AND table_schema = '{schema}'; 410 | """ 411 | 412 | elif len(parts) == 3: # Matches /metadata/{catalog}/{schema}/{table} 413 | catalog, schema, table = parts 414 | query = f""" 415 | SELECT * 416 | FROM information_schema.columns 417 | WHERE table_catalog = '{catalog}' AND table_schema = '{schema}' AND table_name = '{table}'; 418 | """ 419 | 420 | elif len(parts) == 4: # Matches /metadata/{catalog}/{schema}/{table}/{column} 421 | catalog, schema, table, column = parts 422 | query = f""" 423 | SELECT * 424 | FROM information_schema.columns 425 | WHERE table_catalog = '{catalog}' AND table_schema = '{schema}' AND table_name = '{table}' AND column_name = '{column}'; 426 | """ 427 | 428 | else: 429 | # Return a 400 error if the path format is invalid 430 | raise HTTPException(status_code=400, detail="Invalid route format. 
Check the number of parts.") 431 | 432 | # Execute the query and return results 433 | return execute_metadata_query(query, db) 434 | 435 | @app.get("/describe", response_model=List[Dict[str, Any]]) 436 | def describe_object(object: str = Query(..., description="The object to describe, in the format 'db.schema.table'"), 437 | db: Session = Depends(get_db)): 438 | """ 439 | Fetches metadata for the specified object (table). 440 | Query parameter format: 'db.schema.table'. 441 | """ 442 | # Split the object into components 443 | try: 444 | catalog, schema, table = object.split(".") 445 | except ValueError: 446 | raise HTTPException(status_code=400, detail="Invalid object format. Use 'db.schema.table'.") 447 | 448 | # Construct the query 449 | query = f"DESCRIBE TABLE {catalog}.{schema}.{table}" 450 | 451 | # Execute and return the result 452 | return execute_metadata_query(query, db) 453 | 454 | 455 | @app.get("/profile", response_model=List[Dict[str, Any]]) 456 | def profile_object(object: str = Query(..., description="The object to profile, in the format 'db.schema.table' or 'db.schema.table.column'"), 457 | db: Session = Depends(get_db)): 458 | """ 459 | Fetches profile metadata for the specified object. 460 | Query parameter format: 'db.schema.table' (for table) or 'db.schema.table.column' (for specific column). 461 | """ 462 | parts = object.split(".") 463 | if len(parts) == 3: 464 | # Table-level profile 465 | catalog, schema, table = parts 466 | query = f"SUMMARIZE TABLE {catalog}.{schema}.{table}" 467 | return execute_profile_query(query, db) 468 | elif len(parts) == 4: 469 | # Column-level profile 470 | catalog, schema, table, column = parts 471 | query = f"SUMMARIZE TABLE {catalog}.{schema}.{table}" 472 | all_columns = execute_profile_query(query, db) 473 | 474 | # Filter for the specific column 475 | column_summary = [col for col in all_columns if col["column_name"] == column] 476 | if not column_summary: 477 | raise HTTPException(status_code=404, detail=f"Column '{column}' not found in table '{table}'.") 478 | return column_summary 479 | else: 480 | raise HTTPException(status_code=400, detail="Invalid object format. Use 'db.schema.table' or 'db.schema.table.column'.") 481 | 482 | 483 | def execute_profile_query(query: str, db: Session) -> List[Dict[str, Any]]: 484 | """ 485 | Executes a profile-specific query (e.g., SUMMARIZE TABLE) and handles Decimal objects for JSON serialization. 486 | """ 487 | try: 488 | # Use SQLAlchemy's text() to wrap raw SQL queries 489 | result_proxy = db.execute(text(query)) 490 | results = result_proxy.fetchall() 491 | 492 | # Convert results to JSON-serializable format 493 | serialized_results = [] 494 | for row in results: 495 | serialized_row = {} 496 | for key, value in zip(result_proxy.keys(), row): 497 | # Handle Decimal conversion for SUMMARIZE TABLE results 498 | if isinstance(value, Decimal): 499 | serialized_row[key] = float(value) 500 | else: 501 | serialized_row[key] = value 502 | serialized_results.append(serialized_row) 503 | 504 | return serialized_results 505 | except Exception as e: 506 | # Log and raise an HTTP exception for errors 507 | raise HTTPException(status_code=500, detail=f"Error executing profile query: {str(e)}") 508 | 509 | def execute_metadata_query(query: str, db: Session) -> List[Dict[str, Any]]: 510 | """ 511 | Executes a metadata query and formats the results. 512 | 513 | Parameters: 514 | - query: str - The SQL query to execute. 515 | - db: Session - The database session to use for query execution. 
516 | 517 | Returns: 518 | - A list of dictionaries where each dictionary represents a row of query results. 519 | """ 520 | print(query) # Log the query for debugging purposes 521 | try: 522 | # Execute the query using the database session 523 | result_proxy = db.execute(text(query)) 524 | results = result_proxy.fetchall() 525 | 526 | # Convert query results into a structured format 527 | response_data = { 528 | "data": [ 529 | {key: (value.isoformat() if isinstance(value, datetime) else value) 530 | for key, value in dict(zip(result_proxy.keys(), row)).items()} 531 | for row in results 532 | ] 533 | } 534 | # Return the formatted response data as JSON 535 | return JSONResponse(content=response_data) 536 | except Exception as e: 537 | # Handle any exceptions that occur during query execution 538 | raise HTTPException(status_code=500, detail=str(e)) 539 | 540 | 541 | def execute_select_query(query: str, db: Session): 542 | 543 | print(query) 544 | 545 | try: 546 | result_proxy = db.execute(text(query)) 547 | results = result_proxy.mappings().all() # Convert to list of dictionaries 548 | # Serialize the results using jsonable_encoder to handle special data types like datetime 549 | json_compatible_data = jsonable_encoder(results) 550 | return JSONResponse(content={"data": json_compatible_data, "total_rows": len(results)}) 551 | except Exception as e: 552 | raise HTTPException(status_code=400, detail=str(e)) 553 | 554 | def execute_ddl_query(query: str, db: Session): 555 | try: 556 | db.execute(text(query)) 557 | db.commit() # Make sure to commit the transaction for DDL operations 558 | return JSONResponse(content={"message": "Query executed successfully"}) 559 | except Exception as e: 560 | db.rollback() # Rollback the transaction in case of failure 561 | raise HTTPException(status_code=400, detail=str(e)) 562 | 563 | @app.post("/sqlglot/transpile") 564 | async def sqlglot_transpile_sql(request: Request): 565 | try: 566 | # Parse JSON dynamically without a Pydantic model 567 | body = await request.json() 568 | sql = body.get("sql") 569 | transpile_to = body.get("transpile_to") 570 | 571 | if not sql: 572 | raise ValueError("No SQL provided for transpilation.") 573 | if not transpile_to: 574 | raise ValueError("No target language provided for transpilation.") 575 | 576 | # Transpile the provided SQL to the specified target language 577 | transpiled_sql = sqlglot.transpile(sql, write=transpile_to, identify=True, pretty=True)[0] 578 | return {"result_sql": transpiled_sql} 579 | except ValueError as e: 580 | raise HTTPException(status_code=400, detail=str(e)) 581 | except Exception as e: 582 | raise HTTPException(status_code=500, detail=f"An error occurred while transpiling: {e}") 583 | 584 | @app.post("/sqlglot/prettify") 585 | async def sqlglot_prettify_sql(request: Request): 586 | try: 587 | # Parse JSON dynamically without a Pydantic model 588 | body = await request.json() 589 | sql = body.get("sql") 590 | 591 | if not sql: 592 | raise ValueError("No SQL provided for prettify.") 593 | 594 | # Transpile the provided SQL to the specified target language 595 | prettified_sql = sqlglot.optimizer.optimize(sql).sql(pretty=True) 596 | return {"result_sql": prettified_sql} 597 | except ValueError as e: 598 | raise HTTPException(status_code=400, detail=str(e)) 599 | except Exception as e: 600 | raise HTTPException(status_code=500, detail=f"An error occurred while prettify: {e}") 601 | 602 | @app.post("/sqlglot/extract/column") 603 | async def sqlglot_extract_columns(request: Request): 604 | try: 605 
| body = await request.json() 606 | sql = body.get("sql") 607 | 608 | if not sql: 609 | raise ValueError("No SQL provided.") 610 | 611 | parsed_sql = parse_one(sql) 612 | 613 | # Extract columns 614 | columns = [column.alias_or_name for column in parsed_sql.find_all(exp.Column)] 615 | 616 | return {"data": columns} 617 | except ValueError as e: 618 | raise HTTPException(status_code=400, detail=str(e)) 619 | except Exception as e: 620 | raise HTTPException(status_code=500, detail=f"An error occurred while extracting columns: {e}") 621 | 622 | @app.post("/sqlglot/extract/table") 623 | async def sqlglot_extract_tables(request: Request): 624 | try: 625 | body = await request.json() 626 | sql = body.get("sql") 627 | 628 | if not sql: 629 | raise ValueError("No SQL provided.") 630 | 631 | parsed_sql = parse_one(sql) 632 | 633 | # Extract tables 634 | tables = [table.name for table in parsed_sql.find_all(exp.Table)] 635 | 636 | return {"data": tables} 637 | except ValueError as e: 638 | raise HTTPException(status_code=400, detail=str(e)) 639 | except Exception as e: 640 | raise HTTPException(status_code=500, detail=f"An error occurred while extracting tables: {e}") 641 | 642 | @app.post("/sqlglot/extract/projection") 643 | async def sqlglot_extract_projections(request: Request): 644 | try: 645 | body = await request.json() 646 | sql = body.get("sql") 647 | 648 | if not sql: 649 | raise ValueError("No SQL provided.") 650 | 651 | parsed_sql = parse_one(sql) 652 | 653 | # Extract projections 654 | projections = [] 655 | for select in parsed_sql.find_all(exp.Select): 656 | projections.extend([projection.alias_or_name for projection in select.expressions]) 657 | 658 | return {"data": projections} 659 | except ValueError as e: 660 | raise HTTPException(status_code=400, detail=str(e)) 661 | except Exception as e: 662 | raise HTTPException(status_code=500, detail=f"An error occurred while extracting projections: {e}") -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.4 2 | aiosignal==1.3.1 3 | annotated-types==0.6.0 4 | anyio==4.3.0 5 | async-timeout==4.0.3 6 | attrs==23.2.0 7 | certifi==2024.2.2 8 | charset-normalizer==3.3.2 9 | click==8.1.7 10 | duckdb==1.1.3 11 | duckdb_engine==0.14.0 12 | exceptiongroup==1.2.0 13 | fastapi==0.110.0 14 | frozenlist==1.4.1 15 | h11==0.14.0 16 | httpcore==1.0.5 17 | httpx==0.27.0 18 | idna==3.6 19 | multidict==6.0.5 20 | packaging==24.2 21 | pydantic==2.7.0 22 | pydantic_core==2.18.1 23 | python-dotenv==1.0.0 24 | pytz==2024.1 25 | requests==2.31.0 26 | sniffio==1.3.1 27 | SQLAlchemy==2.0.36 28 | sqlglot==26.0.1 29 | sqlglotrs==0.3.0 30 | starlette==0.36.3 31 | typing_extensions==4.12.2 32 | upstash-redis==1.0.0 33 | urllib3==2.2.1 34 | uvicorn==0.28.0 35 | yarl==1.9.4 36 | -------------------------------------------------------------------------------- /sample.env: -------------------------------------------------------------------------------- 1 | DUCKDB_DATABASE_URL=duckdb:///tickit.duckdb 2 | DUCKDB_SCHEMA_NAME= 3 | QUERY_BLACKLIST=DELETE,DROP,TRUNCATE,ALTER -------------------------------------------------------------------------------- /tickit.duckdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/senthilsweb/duckdb-data-api/1da02c1ee06ae17265798ea30b2896db8fd1e1c3/tickit.duckdb 
-------------------------------------------------------------------------------- /vercel.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "builds": [ 4 | { "src": "main.py", "use": "@vercel/python" } 5 | ], 6 | "routes": [ 7 | { "src": "/(.*)", "dest": "/main.py" } 8 | ], 9 | "env": { 10 | "APP_MODULE": "main:app" 11 | } 12 | } 13 | --------------------------------------------------------------------------------