├── backend ├── app │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ ├── tables.py │ │ ├── auth_queries.py │ │ └── queries.py │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── response.py │ │ ├── auth.py │ │ └── admin.py │ ├── routers │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── admin.py │ │ └── share.py │ ├── core │ │ ├── delta │ │ │ ├── __init__.py │ │ │ ├── models.py │ │ │ ├── share.py │ │ │ └── utils.py │ │ ├── iceberg │ │ │ ├── __init__.py │ │ │ ├── share.py │ │ │ └── models.py │ │ ├── cloud │ │ │ ├── base.py │ │ │ ├── __init__.py │ │ │ ├── aws.py │ │ │ ├── azure.py │ │ │ └── gcs.py │ │ ├── __init__.py │ │ └── base.py │ ├── securities │ │ ├── __init__.py │ │ ├── user_auth.py │ │ └── jwt_utils.py │ ├── utilities │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── exceptions.py │ │ ├── responses.py │ │ ├── validators.py │ │ └── pagination.py │ ├── conf.py │ ├── main.py │ ├── serverconf.yaml │ └── README.md ├── tests │ ├── __init__.py │ ├── test_database.sqlite │ ├── mock_results.py │ └── test_share_apis.py ├── Dockerfile └── requirements.txt ├── frontend ├── app │ ├── __init__.py │ ├── core │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── jwt_auth.py │ │ │ └── rest.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── auth.py │ │ │ ├── layout.py │ │ │ └── client.py │ │ ├── __init__.py │ │ ├── share.py │ │ ├── table.py │ │ ├── user.py │ │ ├── login.py │ │ ├── schema.py │ │ ├── table_format.py │ │ └── link.py │ └── main.py ├── requirements.txt ├── config.yaml ├── Dockerfile └── README.md ├── images └── lakehouse-sharing-arch.png ├── notebooks ├── profile.json └── client-example.ipynb ├── .github └── workflows │ ├── style.yaml │ └── ci.yaml ├── .pre-commit-config.yaml ├── .env.example ├── Makefile ├── docker-compose.yaml ├── .gitignore ├── sqls └── prepopulate_data.py └── README.md /backend/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/routers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/app/core/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/core/delta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/core/iceberg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backend/app/securities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/app/core/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/app/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .base.auth import BaseAuth 2 | -------------------------------------------------------------------------------- /frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | requests 3 | PyYAML 4 | pyjwt 5 | -------------------------------------------------------------------------------- /frontend/config.yaml: -------------------------------------------------------------------------------- 1 | lakehouse-sharing: 2 | host: localhost 3 | port: 8001 4 | prefix: delta-sharing 5 | -------------------------------------------------------------------------------- /images/lakehouse-sharing-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajagurunath/lakehouse-sharing/HEAD/images/lakehouse-sharing-arch.png -------------------------------------------------------------------------------- /backend/tests/test_database.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajagurunath/lakehouse-sharing/HEAD/backend/tests/test_database.sqlite -------------------------------------------------------------------------------- /backend/app/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | 4 | def get_random_uuid(): 5 | return str(uuid.uuid4()) 6 | 7 | 8 | def get_random_uuid_hex(): 9 | return str(uuid.uuid4().hex) 10 | -------------------------------------------------------------------------------- /backend/app/core/cloud/base.py: -------------------------------------------------------------------------------- 1 | class BaseCloudSigner(object): 2 | def __init__(self, bucket: str, path: str, expiration: int) -> None: 3 | self.bucket = bucket 4 | self.path = path 5 | self.expiration = expiration 6 | 7 | def sign(self): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /notebooks/profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 1, 3 | "endpoint": "http://localhost:8001/delta-sharing", 4 | "bearerToken": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJzdXNhbiIsImV4cCI6MTY3NDkyOTkzNn0.-FjXuA3jOJAa8L9x42EfrWeDMNdUwOyWmOPoY3XxGgI", 5 | "expirationTime": "2023-01-28T18:18:56Z" 6 | } 7 | -------------------------------------------------------------------------------- /frontend/app/core/base/auth.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | import requests 5 | 6 | 7 | class BaseAuth(ABC): 8 | def __init__(self, url) -> None: 9 | self.url = url 10 | self.prefix = "auth" 11 | super().__init__() 12 | 13 | def get_headers(self): 14 | ... 15 | 16 | def get_token(self, path): 17 | ... 18 | -------------------------------------------------------------------------------- /frontend/app/core/base/layout.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | import requests 5 | from core.base.client import BaseClient 6 | 7 | 8 | class BaseLayout(ABC): 9 | client: BaseClient 10 | 11 | def __init__(self) -> None: 12 | self.layouts = {} 13 | 14 | @abstractmethod 15 | def get_layout(self): 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /backend/app/utilities/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | defaults = dict( 4 | SECRET_KEY="6c14d4be3699a9f66c6d3003bd62f8e14c36f08c631237cc7c88083080b9bb78", 5 | ALGORITHM="HS256", 6 | ACCESS_TOKEN_EXPIRE_MINUTES=60, 7 | ) 8 | 9 | 10 | def get_defaults(key): 11 | value = os.environ.get(key, "") 12 | if value == "": 13 | value = defaults[key.upper()] 14 | return value 15 | -------------------------------------------------------------------------------- /backend/app/models/common.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Share(BaseModel): 7 | id: Optional[str] 8 | name: str 9 | 10 | 11 | class Schema(BaseModel): 12 | name: str 13 | share: str 14 | 15 | 16 | class QueryModel(BaseModel): 17 | predicateHints: Optional[List[str]] = "" 18 | limitHint: Optional[int] 19 | version: Optional[int] 20 | -------------------------------------------------------------------------------- /backend/app/conf.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | class Config: 5 | def __init__(self, path="app/serverconf.yaml") -> None: 6 | self.path = path 7 | self.config = self.load_server_config() 8 | 9 | def load_server_config(self): 10 | with open(self.path, "r") as file: 11 | config = yaml.safe_load(file) 12 | return config 13 | 14 | def get(self, key): 15 | return self.config[key] 16 | -------------------------------------------------------------------------------- /backend/app/core/__init__.py: -------------------------------------------------------------------------------- 1 | from app.core.base import BaseTableFormat 2 | from core.delta.share import DeltaFormat 3 | from core.iceberg.share import IcebergFormat 4 | 5 | 6 | def get_table_format_client(name): 7 | if name.lower() == "iceberg": 8 | client = IcebergFormat 9 | elif name.lower() == "delta": 10 | client = DeltaFormat 11 | else: 12 | print("No Table format available for given name {}".format(name)) 13 | return client 14 | -------------------------------------------------------------------------------- /backend/app/core/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class BaseTableFormat(object): 5 | def __init__(self) -> None: 6 | pass 7 | 8 | def load_table(self, *args, **kwargs): 9 | pass 10 | 11 | @abstractmethod 12 | def table_version(self, *args, **kwargs): 13 | pass 14 | 15 | @abstractmethod 16 | def metadata(self, *args, **kwargs): 17 | pass 18 | 19 | @abstractmethod 20 | def files(self, *args, **kwargs): 21 | pass 22 | -------------------------------------------------------------------------------- /frontend/app/core/base/client.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class BaseClient(ABC): 6 | def __init__(self, *args, **kwargs) -> None: 7 | super().__init__() 8 | 9 | @abstractmethod 10 | def client(self): 11 | ... 12 | 13 | @abstractmethod 14 | def auth(self, **kwargs): 15 | ... 16 | 17 | @abstractmethod 18 | def get(self, **kwargs): 19 | ... 20 | 21 | @abstractmethod 22 | def post(self, **kwargs): 23 | ... 24 | -------------------------------------------------------------------------------- /backend/app/models/response.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | 3 | from .common import Schema, Share 4 | 5 | 6 | class GetShareResponse(BaseModel): 7 | share: Share 8 | 9 | 10 | class SchemaResponse(BaseModel): 11 | name: str 12 | share: str 13 | 14 | 15 | class TableResponse(BaseModel): 16 | name: str 17 | schemaName: str = Field(alias="schema") 18 | share: str 19 | shareId: str 20 | id: str 21 | 22 | 23 | class CommonErrorResponse(BaseModel): 24 | errorCode: str 25 | message: str 26 | -------------------------------------------------------------------------------- /.github/workflows/style.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Python style check 3 | on: [push,pull_request] 4 | 5 | # When this workflow is queued, automatically cancel any previous running 6 | # or pending jobs from the same branch 7 | concurrency: 8 | group: style-${{ github.head_ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | pre-commit: 13 | name: Run pre-commit hooks 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - uses: actions/setup-python@v2 18 | - uses: pre-commit/action@v2.0.0 19 | -------------------------------------------------------------------------------- /frontend/app/core/api/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | from toolz.dicttoolz import get_in 5 | 6 | 7 | class Config: 8 | def __init__(self, path="config.yaml") -> None: 9 | print(os.getcwd()) 10 | self.path = path 11 | self.config = self.load_server_config() 12 | 13 | def load_server_config(self): 14 | with open(self.path, "r") as file: 15 | config = yaml.safe_load(file) 16 | return config 17 | 18 | def get(self, key): 19 | keys = key.split(".") 20 | return get_in(keys, self.config) 21 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # app/Dockerfile 2 | 3 | FROM python:3.9-slim 4 | 5 | WORKDIR /frontend 6 | 7 | RUN apt-get update && apt-get install -y \ 8 | build-essential \ 9 | curl \ 10 | software-properties-common \ 11 | git \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN pip install --upgrade pip 15 | COPY ./requirements.txt ./ 16 | RUN pip3 install -r requirements.txt 17 | 18 | COPY . . 19 | EXPOSE 8501 20 | 21 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health 22 | 23 | ENTRYPOINT ["python","-m","streamlit", "run", "app/main.py", "--server.port=8501", "--server.address=0.0.0.0"] 24 | -------------------------------------------------------------------------------- /backend/app/models/auth.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Token(BaseModel): 7 | access_token: str 8 | token_type: str 9 | 10 | 11 | class TokenData(BaseModel): 12 | username: Union[str, None] = None 13 | 14 | 15 | class User(BaseModel): 16 | name: str 17 | email: Union[str, None] = None 18 | disabled: Union[bool, None] = None 19 | 20 | 21 | class UserInDB(User): 22 | id: str 23 | encrypted_password: str 24 | 25 | 26 | class NewUser(BaseModel): 27 | name: str 28 | password: str 29 | email: str 30 | namespace: Optional[str] 31 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | # - repo: https://gitlab.com/pycqa/flake8 8 | # rev: 3.9.2 9 | # hooks: 10 | # - id: flake8 11 | # language_version: python3 12 | - repo: https://github.com/pycqa/isort 13 | rev: 5.7.0 14 | hooks: 15 | - id: isort 16 | args: 17 | - "--profile" 18 | - "black" 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.2.0 21 | hooks: 22 | - id: trailing-whitespace 23 | - id: end-of-file-fixer 24 | # - id: check-yaml 25 | - id: check-added-large-files 26 | -------------------------------------------------------------------------------- /backend/app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from app.utilities.exceptions import add_exception_handler 4 | from db.queries import create_db_and_tables 5 | from fastapi import FastAPI 6 | from routers.admin import admin 7 | from routers.auth import auth_router 8 | from routers.share import share_router 9 | 10 | TABLE_FORMAT = os.environ.get("TABLE_FORMAT", "delta") 11 | server = FastAPI(title=f"Lakehouse-sharing - ({TABLE_FORMAT})") 12 | 13 | 14 | @server.on_event("startup") 15 | async def startup_event(): 16 | print("starting db ...") 17 | create_db_and_tables() 18 | 19 | 20 | add_exception_handler(server=server) 21 | server.include_router(auth_router) 22 | server.include_router(admin) 23 | server.include_router(share_router) 24 | -------------------------------------------------------------------------------- /backend/app/utilities/exceptions.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request 2 | from fastapi.responses import JSONResponse 3 | 4 | 5 | class LakehouseShareException(Exception): 6 | def __init__(self, status_code: int, message: str): 7 | self.status_code = status_code 8 | self.message = message 9 | 10 | 11 | def add_exception_handler(server): 12 | print("Registering the exception handler") 13 | 14 | @server.exception_handler(LakehouseShareException) 15 | async def lakehouse_share_exception_handler( 16 | request: Request, exc: LakehouseShareException 17 | ): 18 | return JSONResponse( 19 | status_code=exc.status_code, 20 | content={"errorCode": exc.status_code, "message": exc.message}, 21 | ) 22 | 23 | return server 24 | -------------------------------------------------------------------------------- /backend/app/models/admin.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class ShareModel(BaseModel): 7 | name: str 8 | 9 | 10 | class SchemaNameModel(BaseModel): 11 | name: str 12 | 13 | 14 | class TableModel(BaseModel): 15 | table_name: str 16 | table_location: str 17 | 18 | 19 | class SchemaModel(BaseModel): 20 | name: str 21 | table_id: str 22 | share_id: str 23 | 24 | 25 | class AllDetails(BaseModel): 26 | share: ShareModel 27 | schema_: SchemaNameModel 28 | table: TableModel 29 | 30 | 31 | class PermissionModel(BaseModel): 32 | user_id: str 33 | share_id: str 34 | schema_name: str 35 | table_id: str 36 | 37 | 38 | class TokenLifetime(BaseModel): 39 | username: str 40 | expiry: int 41 | -------------------------------------------------------------------------------- /backend/app/serverconf.yaml: -------------------------------------------------------------------------------- 1 | # Set the host name that the server will use 2 | host: "localhost" 3 | # Set the port that the server will listen on. Note: using ports below 1024 4 | # may require a privileged user in some operating systems. 5 | port: 8081 6 | # Set the url prefix for the REST APIs 7 | endpoint: "/delta-sharing" 8 | 9 | db: 10 | POSTGRES_USER: ${POSTGRES_USERNAME} 11 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:password} 12 | POSTGRES_HOST: ${POSTGRES_HOST:localhost} 13 | POSTGRES_PORT: ${POSTGRES_PORT:5432} 14 | POSTGRES_DB: ${POSTGRES_DB:postgres} 15 | # db_url: "postgresql+psycopg2://${POSTGRES_USER:root}:${POSTGRES_PASSWORD:password}@${POSTGRES_HOST:localhost}:${POSTGRES_PORT:5432}/postgres" 16 | 17 | # db_url: "sqlite:////Users/cb-it-01-1834/chargebee/research/opensource/lakehouse-sharing/database.sqlite" 18 | -------------------------------------------------------------------------------- /backend/app/core/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from .aws import AWSCloudSigner 4 | from .azure import AzureCloudSigner 5 | from .gcs import GCSCloudSigner 6 | 7 | 8 | def get_presigned_url(s3_path, expiration=3600): 9 | path = urlparse(s3_path) 10 | bucket = path.netloc 11 | objpath = path.path.lstrip("/") 12 | if path.scheme == "s3" or path.scheme == "s3a": 13 | s3 = AWSCloudSigner(bucket=bucket, path=objpath, expiration=expiration) 14 | signed_url = s3.sign() 15 | elif path.scheme == "gs": 16 | gcs = GCSCloudSigner(bucket=bucket, path=objpath, expiration=expiration) 17 | signed_url = gcs.sign() 18 | elif path.scheme == "adfs": 19 | adfs = AzureCloudSigner(bucket=bucket, path=path, expiration=expiration) 20 | signed_url = adfs.sign() 21 | return signed_url 22 | -------------------------------------------------------------------------------- /frontend/app/core/share.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from app.core.api.rest import RestClient 3 | 4 | client = RestClient(token=st.session_state["token"]) 5 | 6 | 7 | def create_share_in_db(shareDetails): 8 | client.set_token(st.session_state["token"]) 9 | response = client.post("/admin/share", data=None, json=shareDetails) 10 | print(response.content) 11 | if response.status_code == 200: 12 | st.markdown(f"## Share {shareDetails['name']} created in the lakehouse") 13 | st.balloons() 14 | 15 | 16 | def create_share_form_layout(): 17 | create_share_form = st.form("create_share") 18 | sharename = create_share_form.text_input("sharename") 19 | submit = create_share_form.form_submit_button("create") 20 | shareDetails = {} 21 | shareDetails["name"] = sharename 22 | if submit: 23 | create_share_in_db(shareDetails) 24 | return create_share_form 25 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | - env=${env:-PROD} 2 | # set table format : delta(default) or iceberg 3 | - TABLE_FORMAT=${TABLE_FORMAT:-delta} 4 | # iceberg specific setting ensure glue related permissions 5 | - PYICEBERG_CATALOG__DEFAULT__TYPE=glue 6 | - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} 7 | - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} 8 | - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION} 9 | # need session token if you are using MFA 10 | - AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} 11 | 12 | # configure postgres related information - here 13 | POSTGRES_USERNAME=root 14 | POSTGRES_PASSWORD=password 15 | POSTGRES_PORT=5432 16 | POSTGRES_HOST=db 17 | TABLE_FORMAT=iceberg 18 | AWS_REGION=us-east-1 19 | AWS_DEFAULT_REGION=us-east-1 20 | 21 | # optional , consider this if you are productionizing this app 22 | # generate random secret key used by jwt auth 23 | SECRET_KEY=`openssl rand -hex 32` 24 | ALGORITHM=HS256 25 | ACCESS_TOKEN_EXPIRE_MINUTES=60 26 | -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Pull official latest Python Docker image (Pulished with version 3.11.0) 2 | FROM --platform=linux/amd64 python:latest 3 | 4 | # Set the working directory 5 | WORKDIR /usr/app 6 | 7 | # Set up Python behaviour 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | ENV VIRTUAL_ENV=/opt/venv 11 | 12 | # Switch on virtual environment 13 | RUN python3 -m venv $VIRTUAL_ENV 14 | ENV PATH="$VIRTUAL_ENV/bin:$PATH" 15 | 16 | # Set the server port 17 | EXPOSE 8000 18 | 19 | # Install system dependencies 20 | RUN apt-get update \ 21 | && apt-get -y install netcat gcc postgresql \ 22 | && apt-get clean 23 | 24 | # Install Python dependencies 25 | RUN pip install --upgrade pip 26 | COPY ./requirements.txt ./ 27 | RUN pip3 install -r requirements.txt 28 | RUN pip3 install deltalake pyiceberg 29 | # Copy all files 30 | ARG CACHEBUST=1 31 | 32 | COPY . . 33 | 34 | # Start up the backend server 35 | CMD python -m uvicorn main:server --app-dir app/ --reload --workers 4 --host 0.0.0.0 --port 8000 36 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Lakehouse-Sharing Testing 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | Unit-and-Integration-testing: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.9"] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python --version 22 | cd backend/ 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | if [ -f dev_requirements.txt ]; then pip install -r dev_requirements.txt; fi 25 | python -m pip install --upgrade pip 26 | pip install flake8 pytest pytest-cov deltalake pyiceberg 27 | - name: Test Backend APIs 28 | run: | 29 | cd backend/ 30 | python -m pytest --cov-report=term --cov=app tests -v 31 | -------------------------------------------------------------------------------- /frontend/app/core/table.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from app.core.api.rest import RestClient 3 | 4 | client = RestClient(token=st.session_state["token"]) 5 | 6 | 7 | def create_table_in_db(tableDetails): 8 | client.set_token(st.session_state["token"]) 9 | response = client.post("/admin/table", data=None, json=tableDetails) 10 | print(response.content) 11 | if response.status_code == 200: 12 | st.markdown(f"## Table {tableDetails['table_name']} created in the lakehouse") 13 | st.balloons() 14 | 15 | 16 | def create_table_form_layout(): 17 | create_table_form = st.form("create_table") 18 | tablename = create_table_form.text_input("tablename") 19 | tablelocation = create_table_form.text_input("table_location") 20 | submit = create_table_form.form_submit_button("create") 21 | tableDetails = {} 22 | tableDetails["table_name"] = tablename 23 | tableDetails["table_location"] = tablelocation 24 | if submit: 25 | create_table_in_db(tableDetails) 26 | return create_table_form 27 | -------------------------------------------------------------------------------- /backend/app/README.md: -------------------------------------------------------------------------------- 1 | ## Backend Details 2 | 3 | 4 | ### The backend app consists of the following modules: 5 | 6 | **core**: 7 | 8 | This module is the core part of the backend app containing cloud, delta, and iceberg modules. Cloud modules contain functions to sign the cloud storage files (GCFs, S3,adls) with the specified expiration. Delta and Iceberg modules contain functions and classes to get the metadata and data files from lakehouse table formats and sign the data files using the cloud module and return the data in a protocol-compliant way. 9 | 10 | **DB**: 11 | 12 | This module contains tables, and queries needed for storing & querying the Framework’s metadata-RDS of the lakehouse-sharing server. 13 | 14 | **routers**: 15 | 16 | This module contains FastAPI routers for /admin, /auth,/delta-sharing. 17 | 18 | **Securities**: 19 | 20 | This module implements JWT token generations. 21 | 22 | **Utilities**: 23 | 24 | This module contains validators, exceptions, and pagination utility 25 | 26 | **models**: 27 | 28 | Contains request and response Pydantic models which fast API loves and integrates well. 29 | -------------------------------------------------------------------------------- /frontend/app/core/user.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from app.core.api.rest import RestClient 3 | 4 | client = RestClient(token=st.session_state["token"]) 5 | 6 | 7 | def create_user_in_db(userDetails): 8 | client.set_token(st.session_state["token"]) 9 | response = client.post("/auth/add_user", data=None, json=userDetails) 10 | if response.status_code == 200: 11 | st.markdown(f"## User {userDetails['name']} added to the lakehouse") 12 | st.balloons() 13 | 14 | 15 | def create_user_form_layout(): 16 | create_user_form = st.form("create_user") 17 | username = create_user_form.text_input("username") 18 | password = create_user_form.text_input("password", type="password") 19 | email = create_user_form.text_input("email") 20 | team = create_user_form.text_input("team") 21 | submit = create_user_form.form_submit_button("create") 22 | userDetails = {} 23 | userDetails["name"] = username 24 | userDetails["password"] = password 25 | userDetails["email"] = email 26 | userDetails["namespace"] = team 27 | if submit: 28 | create_user_in_db(userDetails) 29 | 30 | return create_user_form 31 | -------------------------------------------------------------------------------- /backend/app/securities/user_auth.py: -------------------------------------------------------------------------------- 1 | from app.utilities.exceptions import LakehouseShareException 2 | from fastapi import Depends, HTTPException, status 3 | from fastapi.security import OAuth2PasswordBearer 4 | from securities.jwt_utils import * 5 | 6 | auth = UserCatalog() 7 | oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") 8 | 9 | 10 | async def get_current_user(token: str = Depends(oauth2_scheme)): 11 | credentials_exception = LakehouseShareException( 12 | status_code=status.HTTP_401_UNAUTHORIZED, 13 | message="Unauthorized User", 14 | ) 15 | try: 16 | payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) 17 | username: str = payload.get("sub") 18 | if username is None: 19 | raise credentials_exception 20 | token_data = TokenData(username=username) 21 | except JWTError: 22 | raise credentials_exception 23 | user = auth.get_user(username=token_data.username) 24 | if user is None: 25 | raise credentials_exception 26 | return user 27 | 28 | 29 | async def get_current_active_user(current_user: User = Depends(get_current_user)): 30 | if current_user.disabled: 31 | raise LakehouseShareException(status_code=403, message="Inactive user") 32 | return current_user 33 | -------------------------------------------------------------------------------- /backend/app/utilities/responses.py: -------------------------------------------------------------------------------- 1 | from app.models.response import CommonErrorResponse, Share 2 | from fastapi import status 3 | 4 | 5 | def get_response_dict(model, message): 6 | response = { 7 | "model": model, # custom pydantic model for 200 response 8 | "description": message, 9 | } 10 | return response 11 | 12 | 13 | def get_200_ok(model): 14 | success_response = get_response_dict(model=model, message="Success message") 15 | return success_response 16 | 17 | 18 | common_responses = { 19 | status.HTTP_200_OK: get_200_ok(Share), 20 | status.HTTP_400_BAD_REQUEST: get_response_dict( 21 | CommonErrorResponse, "The request is malformed." 22 | ), 23 | status.HTTP_401_UNAUTHORIZED: get_response_dict( 24 | CommonErrorResponse, 25 | "The request is unauthenticated. The bearer token is missing or incorrect.", 26 | ), 27 | status.HTTP_403_FORBIDDEN: get_response_dict( 28 | CommonErrorResponse, "The request is forbidden from being fulfilled." 29 | ), 30 | status.HTTP_404_NOT_FOUND: get_response_dict( 31 | CommonErrorResponse, "The requested resource does not exist." 32 | ), 33 | status.HTTP_500_INTERNAL_SERVER_ERROR: get_response_dict( 34 | CommonErrorResponse, 35 | "The request is not handled correctly due to a server error.", 36 | ), 37 | } 38 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | ## Frontend for Lakehouse Sharing 2 | 3 | ### Frontend setup and walkthrough 4 | 5 | - To show the value of the backend RestAPI built a quick and simple Streamlit APP. which is just a client for this Lakehouse-sharing, offloading all the authentication, admin related activities to the backend server. 6 | 7 | #### As discussed above, the setup should straight forward. 8 | 9 | - Step 1: if you are using docker, docker-compose up (recommended approach for quick setup). otherwise, if you are adventurous set up the local server based on the installation and readme guide. 10 | 11 | - Step 2: once the server and DB instance are ready, Run the python script to create and populate the database tables. Sample entry population script was present inside the sqls folder of the root directory 12 | 13 | - Step 3: Start the backend server after installing the required dependencies (Skip if you are using docker-compose which will take care of this step already) 14 | 15 | - Step 4: Start the Frontend APP. using the appropriate makefile command. (Skip if you are using docker-compose which will take care of this step already) 16 | 17 | - Step 5: Log in to the Frontend APP using superuser credentials: (username: admin, password: admin@123) 18 | 19 | - Step 6: Once Superuser was logged in, he/she can create n number of users and create a logical grouping under one namespace and share that with other teams or organizations. 20 | -------------------------------------------------------------------------------- /backend/tests/mock_results.py: -------------------------------------------------------------------------------- 1 | ListShareResult = [ 2 | {"id": "01460171-de53-4eb6-8d99-a8ed491bac1d", "name": "iceberg_share"}, 3 | {"id": "2b3b8cc2-cc46-4af5-9f59-92e7176e855a", "name": "delta_share1"}, 4 | {"id": "78a3a3d6-afb0-49c9-a45d-c89777f3b885", "name": "delta_share2"}, 5 | {"id": "97c9eeb6-2c18-481c-8690-40f0c819a2f2", "name": "delta_share3"}, 6 | ] 7 | 8 | ShareResult = {"id": "01460171-de53-4eb6-8d99-a8ed491bac1d", "name": "iceberg_share"} 9 | 10 | ListSchemaResult = [ 11 | {"name": "delta_schema", "share": "delta_share1"}, 12 | {"name": "delta_schema1", "share": "delta_share1"}, 13 | ] 14 | 15 | ListTableResult = [ 16 | { 17 | "name": "iceberg_benchmark_nyc_taxi_trips_v2", 18 | "schema": "tripsdb", 19 | "share": "iceberg_share", 20 | "shareId": "01460171-de53-4eb6-8d99-a8ed491bac1d", 21 | "id": "b22e0a03-7236-4482-9c6c-aed926073384", 22 | } 23 | ] 24 | 25 | ListAllTableResult = [ 26 | { 27 | "name": "test_hm", 28 | "schema": "schema2", 29 | "share": "delta_share2", 30 | "shareId": "78a3a3d6-afb0-49c9-a45d-c89777f3b885", 31 | "id": "c098e038-d032-4da5-b5f8-e7401073b04c", 32 | }, 33 | { 34 | "name": "test_hm", 35 | "schema": "schema2", 36 | "share": "delta_share2", 37 | "shareId": "78a3a3d6-afb0-49c9-a45d-c89777f3b885", 38 | "id": "c098e038-d032-4da5-b5f8-e7401073b04c", 39 | }, 40 | ] 41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make 2 | 3 | .DEFAULT_GOAL := help 4 | # COLORS 5 | 6 | BOLD := $(shell tput bold) 7 | GREEN := $(shell tput -Txterm setaf 2) 8 | YELLOW := $(shell tput -Txterm setaf 3) 9 | WHITE := $(shell tput -Txterm setaf 7) 10 | RESET := $(shell tput -Txterm sgr0) 11 | TARGET_MAX_CHAR_NUM=30 12 | 13 | .PHONY: venv 14 | ## create a virtual environment for development 15 | venv: 16 | virtualenv -p python3 env 17 | source env/bin/activate && pip install pip --upgrade && pip install -r requirements.txt 18 | 19 | 20 | 21 | .PHONY: start_backend_server 22 | ## starts prefect server 23 | start_backend_server: 24 | ## make sure you have docker installed and docker-deamon was up and running 25 | cd backend/ && python -m uvicorn main:server --reload --app-dir app/ 26 | 27 | 28 | .PHONY: start_frontend_server 29 | ## starts prefect agent 30 | start_frontend_server: 31 | cd frontend/ && python -m streamlit run app/main.py 32 | 33 | 34 | 35 | .PHONY: help 36 | ## Show help 37 | help: 38 | @echo '' 39 | @echo 'Usage:' 40 | @echo ' ${YELLOW}make${RESET} ${GREEN}${RESET}' 41 | @echo '' 42 | @echo 'Targets:' 43 | @grep "^# help\:" Makefile | grep -v grep | sed 's/\# help\: //' | sed 's/\# help\://' 44 | 45 | @awk '/^[a-zA-Z\-\_0-9]+:/ { \ 46 | helpMessage = match(lastLine, /^## (.*)/); \ 47 | if (helpMessage) { \ 48 | helpCommand = substr($$1, 0, index($$1, ":")-1); \ 49 | helpMessage = substr(lastLine, RSTART + 3, RLENGTH); \ 50 | printf " ${YELLOW}%-$(TARGET_MAX_CHAR_NUM)s${RESET} ${GREEN}%s${RESET}\n", helpCommand, helpMessage; \ 51 | } \ 52 | } \ 53 | { lastLine = $$0 }' $(MAKEFILE_LIST) 54 | -------------------------------------------------------------------------------- /backend/app/core/cloud/aws.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .base import BaseCloudSigner 4 | 5 | try: 6 | import boto3 7 | from botocore.client import Config 8 | from botocore.exceptions import ClientError 9 | except ImportError as ie: 10 | print("Please install AWS related libraries to get presigned url for S3 bucket", ie) 11 | 12 | 13 | def create_presigned_s3_url(bucket_name, object_name, expiration=3600): 14 | """Generate a presigned URL to share an S3 object 15 | 16 | :param bucket_name: string 17 | :param object_name: string 18 | :param expiration: Time in seconds for the presigned URL to remain valid 19 | :return: Presigned URL as string. If error, returns None. 20 | """ 21 | 22 | # Generate a presigned URL for the S3 object 23 | s3_client = boto3.client("s3", config=Config(signature_version="s3v4")) 24 | try: 25 | response = s3_client.generate_presigned_url( 26 | "get_object", 27 | Params={"Bucket": bucket_name, "Key": object_name}, 28 | HttpMethod="GET", 29 | ExpiresIn=expiration, 30 | ) 31 | except ClientError as e: 32 | logging.error(e) 33 | return None 34 | 35 | # The response contains the presigned URL 36 | return response 37 | 38 | 39 | class AWSCloudSigner(BaseCloudSigner): 40 | def __init__(self, bucket: str, path: str, expiration: int = 3600) -> None: 41 | super().__init__(bucket, path, expiration) 42 | 43 | def sign(self): 44 | print(self.bucket, self.path) 45 | signed_url = create_presigned_s3_url(self.bucket, self.path, self.expiration) 46 | return signed_url 47 | -------------------------------------------------------------------------------- /frontend/app/core/api/jwt_auth.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | from typing import Dict 4 | from urllib.parse import urljoin 5 | 6 | from app.core.api.config import Config 7 | from core.base.auth import BaseAuth 8 | from requests import Request 9 | 10 | URL = os.environ.get("delta-sharing-url", "http://localhost:8000") 11 | 12 | 13 | class JWTAuth(BaseAuth): 14 | def __init__( 15 | self, client: Request, url, config: Config, prefix="auth", user_details=None 16 | ) -> None: 17 | self.client = client 18 | self.url = url 19 | self.prefix = prefix 20 | self.config = config 21 | if user_details is None: 22 | self.load_user_details() 23 | else: 24 | self.user_details = user_details 25 | super().__init__(url=url) 26 | 27 | def load_user_details(self): 28 | self.user_details = { 29 | "username": self.config.get("lakehouse-sharing.username"), 30 | "password": self.config.get("lakehouse-sharing.password"), 31 | } 32 | 33 | def path_join(self, *args): 34 | url = "{base}{}".format("/".join(args), base=self.url) 35 | return url 36 | 37 | def get_token(self, path="token"): 38 | url = self.path_join(self.prefix, path) 39 | print("get_token", url, self.user_details) 40 | response = self.client.post(url, headers={}, data=self.user_details) 41 | print(response.content) 42 | if response.status_code != 200: 43 | raise Exception(response.content) 44 | token = response.json()["access_token"] 45 | return token 46 | 47 | 48 | if __name__ == "__main__": 49 | jauth = JWTAuth() 50 | jauth.get_headers({"username": "admin", "password": "admin@123"}) 51 | -------------------------------------------------------------------------------- /backend/app/db/tables.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sqlmodel import Field, SQLModel 4 | 5 | 6 | class Share(SQLModel, table=True): 7 | # __table_args__ = {"extend_existing": True} 8 | 9 | id: str = Field(primary_key=True) 10 | name: str = Field(unique=True) 11 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 12 | 13 | 14 | class Table(SQLModel, table=True): 15 | # __table_args__ = {"extend_existing": True} 16 | id: str = Field(primary_key=True) 17 | table_name: str = Field(unique=True) 18 | table_location: str 19 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 20 | 21 | 22 | class Schema(SQLModel, table=True): 23 | # __table_args__ = {"extend_existing": True} 24 | 25 | id: str = Field(primary_key=True) 26 | name: str = Field(unique=True) 27 | table_id: str = Field(default=None, foreign_key="table.id") 28 | share_id: str = Field(default=None, foreign_key="share.id") 29 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 30 | 31 | 32 | class User(SQLModel, table=True): 33 | # __table_args__ = {"extend_existing": True} 34 | 35 | id: str = Field(primary_key=True) 36 | name: str = Field(unique=True) 37 | email: str 38 | encrypted_password: str 39 | namespace: str 40 | 41 | 42 | class TokenLifetime(SQLModel, table=True): 43 | # __table_args__ = {"extend_existing": True} 44 | 45 | id: str = Field(primary_key=True) 46 | user_id: str = Field(default=None, unique=True, foreign_key="user.id") 47 | expiry: int 48 | 49 | 50 | class Permission(SQLModel, table=True): 51 | # __table_args__ = {"extend_existing": True} 52 | id: str = Field(primary_key=True) 53 | user_id: str = Field(default=None, nullable=False, foreign_key="user.id") 54 | share_id: str = Field(default=None, foreign_key="share.id") 55 | schema_id: str = Field(default=None, foreign_key="schema.id") 56 | table_id: str = Field(default=None, foreign_key="table.id") 57 | -------------------------------------------------------------------------------- /backend/app/core/cloud/azure.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from .base import BaseCloudSigner 4 | 5 | try: 6 | from azure.storage.blob import BlobSasPermissions, generate_blob_sas 7 | except ImportError as ie: 8 | print( 9 | "Please install Azure related libraries to get presigned url for Azure container ", 10 | ie, 11 | ) 12 | 13 | 14 | def generate_download_signed_url( 15 | azure_account_name, azure_container, azure_blob, expiration, azure_primary_key 16 | ): 17 | sas_blob = generate_blob_sas( 18 | account_name=azure_account_name, 19 | container_name=azure_container, 20 | blob_name=azure_blob, 21 | account_key=azure_primary_key, 22 | # For writing back to the Azure Blob set write and create to True 23 | permission=BlobSasPermissions(read=True, write=False, create=False), 24 | # This URL will be valid for 1 hour 25 | expiry=datetime.utcnow() + timedelta(hours=1), 26 | ) 27 | url = ( 28 | "https://" 29 | + azure_account_name 30 | + ".blob.core.windows.net/" 31 | + azure_container 32 | + "/" 33 | + azure_blob 34 | + "?" 35 | + sas_blob 36 | ) 37 | 38 | print("Generated GET signed URL:") 39 | print(url) 40 | print("You can use this URL with any user agent, for example:") 41 | print("curl '{}'".format(url)) 42 | return url 43 | 44 | 45 | class AzureCloudSigner(BaseCloudSigner): 46 | def __init__(self, bucket: str, path: str, expiration: int = 3600) -> None: 47 | self.azure_account_name = "" 48 | self.azure_primary_key = "" 49 | super().__init__(bucket, path, expiration) 50 | 51 | def sign(self): 52 | signed_url = generate_download_signed_url( 53 | azure_account_name=self.azure_account_name, 54 | azure_container=self.bucket, 55 | azure_blob=self.path, 56 | expiration=self.expiration, 57 | azure_primary_key=self.azure_primary_key, 58 | ) 59 | return signed_url 60 | -------------------------------------------------------------------------------- /backend/app/routers/auth.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from typing import Union 3 | 4 | from app.db.auth_queries import AuthQueries 5 | from app.models.auth import * 6 | from app.securities.user_auth import * 7 | from app.utilities.exceptions import LakehouseShareException 8 | from fastapi import Depends, FastAPI, status 9 | from fastapi.routing import APIRouter 10 | from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm 11 | from jose import JWTError, jwt 12 | from passlib.context import CryptContext 13 | 14 | auth_router = APIRouter(prefix="/auth", tags=["auth"]) 15 | 16 | 17 | @auth_router.post("/token", response_model=Token) 18 | async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): 19 | user = auth.authenticate_user(form_data.username, form_data.password) 20 | if not user: 21 | raise LakehouseShareException( 22 | status_code=status.HTTP_401_UNAUTHORIZED, 23 | message="Unauthorized User", 24 | ) 25 | # token_lifetime = auth.get_token_lifetime(user) 26 | token_expiry = ACCESS_TOKEN_EXPIRE_MINUTES 27 | access_token_expires = timedelta(minutes=token_expiry) 28 | access_token = auth.create_access_token( 29 | data={"sub": user.name}, expires_delta=access_token_expires 30 | ) 31 | return {"access_token": access_token, "token_type": "bearer"} 32 | 33 | 34 | @auth_router.post("/add_user") 35 | async def create_user( 36 | user: NewUser, current_user: User = Depends(get_current_active_user) 37 | ): 38 | added = auth.create_user(user) 39 | if added: 40 | return "user added" 41 | else: 42 | raise LakehouseShareException( 43 | status_code=409, 44 | message="Conflict: User already exists please add unique user", 45 | ) 46 | 47 | 48 | @auth_router.get("/users/me/", response_model=User) 49 | async def read_users_me(current_user: User = Depends(get_current_active_user)): 50 | return current_user 51 | 52 | 53 | @auth_router.get("/users") 54 | async def list_users(current_user: User = Depends(get_current_active_user)): 55 | userslist = auth.list_users() 56 | return userslist 57 | -------------------------------------------------------------------------------- /backend/tests/test_share_apis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from fastapi.testclient import TestClient 5 | 6 | os.environ["db_url"] = "sqlite:///tests/test_database.sqlite" 7 | sys.path.append("app/") 8 | print(sys.path) 9 | from unittest import mock 10 | 11 | import app.db.queries 12 | import pytest 13 | from app.models.auth import UserInDB 14 | from app.routers.share import Query, get_current_active_user 15 | from sqlmodel import Session, SQLModel, create_engine 16 | 17 | from .mock_results import * 18 | 19 | 20 | def skip_auth(): 21 | user = UserInDB( 22 | name="test_users", 23 | id="a1ca3334052e4a079145df1e13c4cd41", 24 | email="abc@oss.com", 25 | encrypted_password="encrypted###***", 26 | ) 27 | return user 28 | 29 | 30 | PREFIX = "/delta-sharing" 31 | 32 | 33 | @pytest.fixture() 34 | def client(): 35 | from app.main import server 36 | 37 | server.dependency_overrides[get_current_active_user] = skip_auth 38 | with TestClient(server) as test_client: 39 | yield test_client 40 | 41 | 42 | def test_list_share(client): 43 | response = client.get(f"{PREFIX}/shares") 44 | print(response.text) 45 | assert response.status_code == 200 46 | assert response.json()["items"] == ListShareResult 47 | 48 | 49 | def test_get_share(client): 50 | response = client.get(f"{PREFIX}/shares/delta_share1") 51 | print(response.text) 52 | assert response.status_code == 200 53 | assert response.json()["share"] == ShareResult 54 | 55 | 56 | def test_list_schema(client): 57 | response = client.get(f"{PREFIX}/shares/delta_share1/schemas") 58 | print(response.text) 59 | assert response.status_code == 200 60 | assert response.json()["items"] == ListSchemaResult 61 | 62 | 63 | def test_list_tables(client): 64 | response = client.get(f"{PREFIX}/shares/iceberg_share/schemas/tripsdb/tables") 65 | print(response.text) 66 | assert response.status_code == 200 67 | assert response.json()["items"] == ListTableResult 68 | 69 | 70 | def test_list_all_tables(client): 71 | response = client.get(f"{PREFIX}/shares/delta_share2/schemas/all-tables") 72 | print(response.text) 73 | assert response.status_code == 200 74 | assert response.json()["items"] == ListAllTableResult 75 | -------------------------------------------------------------------------------- /frontend/app/core/login.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from app.core.api.rest import RestClient 3 | 4 | 5 | class Authenticator(object): 6 | def __init__(self) -> None: 7 | if "logged_in" not in st.session_state: 8 | st.session_state["logged_in"] = None 9 | if "username" not in st.session_state: 10 | st.session_state["username"] = None 11 | if "logout" not in st.session_state: 12 | st.session_state["logout"] = None 13 | if "token" not in st.session_state: 14 | st.session_state["token"] = None 15 | 16 | def verify_user(self, username, password): 17 | user_details = {"username": username, "password": password} 18 | client = RestClient(user_details=user_details) 19 | token = client.jauth.get_token() 20 | return token 21 | 22 | def login_screen(self): 23 | print("logged_in", st.session_state["logged_in"]) 24 | if not st.session_state["logged_in"]: 25 | st.header("Lakehouse-Sharing") 26 | st.text("( A table format agnostic sharing app )") 27 | login_form = st.form("Lakehouse-Sharing") 28 | username = login_form.text_input("username") 29 | password = login_form.text_input("password", type="password") 30 | login_form.form_submit_button("login") 31 | try: 32 | token = self.verify_user(username=username, password=password) 33 | st.session_state["username"] = username 34 | st.session_state["password"] = password 35 | st.session_state["token"] = token 36 | st.session_state["logged_in"] = True 37 | st.experimental_rerun() 38 | return True 39 | except Exception as e: 40 | st.session_state["logged_in"] = False 41 | st.warning("Logging failed") 42 | st.error(e) 43 | 44 | def logout(self): 45 | logout = st.sidebar.button("logout") 46 | if logout: 47 | st.session_state["username"] = None 48 | st.session_state["password"] = None 49 | st.session_state["token"] = None 50 | st.session_state["logged_in"] = False 51 | st.experimental_rerun() 52 | -------------------------------------------------------------------------------- /backend/app/db/auth_queries.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from app.db.tables import TokenLifetime, User 4 | from app.utilities import get_random_uuid_hex 5 | from db.queries import create_db_connection 6 | from sqlmodel import Session, select 7 | 8 | 9 | class AuthQueries: 10 | def __init__(self) -> None: 11 | self.engine = create_db_connection() 12 | 13 | def execute_sql(self, stmt): 14 | with Session(self.engine) as session: 15 | results = session.exec(statement=stmt).all() 16 | print(results) 17 | return results 18 | 19 | def check_user_exist(self, name): 20 | stmt = select(User).where(User.name == name) 21 | print(stmt) 22 | rows = self.execute_sql(stmt) 23 | print(rows) 24 | if len(rows) > 0: 25 | res = { 26 | "name": rows[0].name, 27 | "id": rows[0].id, 28 | "email": rows[0].email, 29 | "encrypted_password": rows[0].encrypted_password, 30 | } 31 | else: 32 | res = None 33 | return res 34 | 35 | def create_user(self, user_details: Dict): 36 | user = User( 37 | id=get_random_uuid_hex(), 38 | name=user_details["name"], 39 | email=user_details["email"], 40 | encrypted_password=user_details["encrypted_password"], 41 | namespace=user_details.get("namespace", "EDP"), 42 | ) 43 | session = Session(self.engine) 44 | session.add(user) 45 | session.commit() 46 | return True 47 | 48 | def get_token_lifetime(self, user): 49 | stmt = ( 50 | select(User, TokenLifetime) 51 | .where(User.id == TokenLifetime.user_id) 52 | .where(User.id == user) 53 | ) 54 | rows = self.execute_sql(stmt) 55 | if rows: 56 | user, token = rows[0] 57 | return int(token.expiry) 58 | 59 | def list_users(self): 60 | stmt = select(User.name, User.id) 61 | rows = self.execute_sql(stmt) 62 | return rows 63 | 64 | def get_username_by_id(self, user_id): 65 | stmt = select(User.name).where(User.id == user_id) 66 | rows = self.execute_sql(stmt) 67 | if rows: 68 | return rows[0] 69 | -------------------------------------------------------------------------------- /frontend/app/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from core.login import Authenticator 3 | 4 | app_auth = Authenticator() 5 | from core.link import create_link_form_layout 6 | from core.schema import create_complete_share_layout 7 | from core.share import create_share_form_layout 8 | from core.table import create_table_form_layout 9 | from core.table_format import table_format_exploration 10 | from core.user import create_user_form_layout 11 | 12 | st.set_page_config( 13 | page_title="LakeHouse Sharing", 14 | layout="wide", 15 | page_icon="https://uxwing.com/wp-content/themes/uxwing/download/web-app-development/data-lake-icon.png", 16 | initial_sidebar_state="collapsed", 17 | menu_items={ 18 | "About": "# LakeHouse Sharing", 19 | "Report a bug": "https://github.com/rajagurunath/Lakehouse-Sharing", 20 | }, 21 | ) 22 | 23 | 24 | def create_user_layout(): 25 | st.header("Create a New User") 26 | create_user_form_layout() 27 | 28 | 29 | def create_components_layout(): 30 | """ 31 | share form 32 | schema form 33 | table form 34 | """ 35 | st.title("Lakehouse Share Components") 36 | # st.header("create a share") 37 | # # create_share_form_layout() 38 | # st.header("create a table") 39 | # # create_table_form_layout() 40 | # st.header("create a schema") 41 | create_complete_share_layout() 42 | 43 | 44 | def user_link_layout(): 45 | """ 46 | user-> share - > schema -> table 47 | """ 48 | # download creds 49 | create_link_form_layout() 50 | 51 | 52 | def main_layout(): 53 | st.sidebar.title(f"Hi {st.session_state['username']}") 54 | tab = st.sidebar.radio( 55 | "Pages", 56 | ["Add User", "Create a Share", "Define Permissions", "Explore Table format"], 57 | ) 58 | if tab == "Add User": 59 | create_user_layout() 60 | elif tab == "Create a Share": 61 | create_components_layout() 62 | elif tab == "Define Permissions": 63 | user_link_layout() 64 | elif tab == "Explore Table format": 65 | table_format_exploration() 66 | 67 | 68 | if __name__ == "__main__": 69 | app_auth.login_screen() 70 | if st.session_state["logged_in"]: 71 | 72 | main_layout() 73 | app_auth.logout() 74 | else: 75 | st.write("Please Enter correct username and Password") 76 | -------------------------------------------------------------------------------- /backend/app/utilities/validators.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from app.db.queries import Query 4 | from app.models.auth import UserInDB 5 | from app.securities.user_auth import get_current_active_user 6 | from app.utilities.exceptions import LakehouseShareException 7 | from fastapi import Depends, status 8 | 9 | query = Query() 10 | 11 | 12 | def validate_share(share, current_user: UserInDB = Depends(get_current_active_user)): 13 | # share = kwargs.get("share",None) 14 | # schema = kwargs.get("schema",None) 15 | # table = kwargs.get("table",None) 16 | exist = query.check_schema_and_table_existance(share) 17 | if exist: 18 | authorized = query.check_user_permission(current_user.id, share=share) 19 | if authorized: 20 | return current_user, share 21 | else: 22 | raise LakehouseShareException( 23 | status_code=status.HTTP_403_FORBIDDEN, 24 | message=f"User {current_user.name} does not have permission to access {share}", 25 | ) 26 | else: 27 | raise LakehouseShareException( 28 | status_code=status.HTTP_404_NOT_FOUND, 29 | message=f"Required share {share} was not available in backend", 30 | ) 31 | 32 | 33 | def validate_share_and_schema( 34 | share, schema, current_user: UserInDB = Depends(get_current_active_user) 35 | ): 36 | # share = kwargs.get("share",None) 37 | # schema = kwargs.get("schema",None) 38 | # table = kwargs.get("table",None) 39 | exist = query.check_schema_and_table_existance(share, schema) 40 | if exist: 41 | return current_user, share, schema 42 | else: 43 | raise LakehouseShareException( 44 | status_code=status.HTTP_404_NOT_FOUND, 45 | message=f"Required share {share} and {schema} was not available in backend", 46 | ) 47 | 48 | 49 | def validate_share_and_schema_and_table( 50 | share, schema, table, current_user: UserInDB = Depends(get_current_active_user) 51 | ): 52 | # share = kwargs.get("share",None) 53 | # schema = kwargs.get("schema",None) 54 | # table = kwargs.get("table",None) 55 | exist = query.check_schema_and_table_existance(share, schema, table) 56 | if exist: 57 | return current_user, share, schema, table 58 | else: 59 | raise LakehouseShareException( 60 | status_code=status.HTTP_404_NOT_FOUND, 61 | message=f"Required share {share} and {schema} was not available in backend", 62 | ) 63 | -------------------------------------------------------------------------------- /frontend/app/core/schema.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from app.core.api.rest import RestClient 3 | 4 | client = RestClient(token=st.session_state["token"]) 5 | 6 | 7 | def create_schema_in_db(completeDetails): 8 | client.set_token(st.session_state["token"]) 9 | response = client.post("/admin/complete", data=None, json=completeDetails) 10 | print(response.content) 11 | if response.status_code == 200: 12 | st.markdown( 13 | f"## Share {completeDetails['share']['name']} created in the lakehouse" 14 | ) 15 | st.balloons() 16 | 17 | 18 | def list_shares(): 19 | client.set_token(st.session_state["token"]) 20 | response = client.get("/delta-sharing/shares") 21 | if response.status_code == 200: 22 | print(response.content) 23 | items = response.json()["items"] 24 | _list_shares = [] 25 | for r in items: 26 | _list_shares.append(f"{r['name']} ({r['id']})") 27 | print(_list_shares) 28 | return _list_shares 29 | 30 | 31 | def list_tables(share): 32 | client.set_token(st.session_state["token"]) 33 | st.write(share) 34 | sharename, id = share.split(" ") 35 | response = client.get(f"/delta-sharing/shares/{sharename}/schemas/all-tables") 36 | if response.status_code == 200: 37 | print(response.content) 38 | items = response.json()["items"] 39 | _list_tables = [] 40 | for r in items: 41 | _list_tables.append(f"{r['name']} ({r['id']})") 42 | else: 43 | raise Exception(response.content) 44 | return _list_tables 45 | 46 | 47 | def create_complete_share_layout(): 48 | create_complete_share = st.container() 49 | create_complete_share.header("Create a Share") 50 | sharename = create_complete_share.text_input("sharename") 51 | create_complete_share.header("Create a Table") 52 | tablename = create_complete_share.text_input("tablename") 53 | table_location = create_complete_share.text_input("tablelocation") 54 | create_complete_share.header("Create a Schema") 55 | schemaname = create_complete_share.text_input("schemaname") 56 | submit = create_complete_share.button("create") 57 | completeDetails = {} 58 | completeDetails["share"] = {"name": sharename} 59 | completeDetails["schema_"] = {"name": schemaname} 60 | completeDetails["table"] = { 61 | "table_name": tablename, 62 | "table_location": table_location, 63 | } 64 | if submit: 65 | create_schema_in_db(completeDetails) 66 | return create_complete_share 67 | -------------------------------------------------------------------------------- /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | aiobotocore==2.4.2 2 | aiohttp==3.8.3 3 | aioitertools==0.11.0 4 | aiosignal==1.3.1 5 | aiosqlite==0.17.0 6 | altair==4.2.0 7 | anyio==3.6.2 8 | asgiref==3.6.0 9 | async-timeout==4.0.2 10 | asyncpg==0.27.0 11 | attrs==22.2.0 12 | bcrypt==4.0.1 13 | beanie==1.16.4 14 | black==22.12.0 15 | blinker==1.5 16 | boto3==1.24.59 17 | botocore==1.27.59 18 | cachetools==5.2.0 19 | certifi==2022.12.7 20 | charset-normalizer==2.1.1 21 | click==8.1.3 22 | colorama==0.4.6 23 | commonmark==0.9.1 24 | databases==0.6.1 25 | decorator==5.1.1 26 | Django==4.1.4 27 | dnspython==2.2.1 28 | docstring-parser==0.12 29 | ecdsa==0.18.0 30 | email-validator==1.3.0 31 | entrypoints==0.4 32 | fastapi==0.88.0 33 | fastapi-pagination==0.11.1 34 | frozenlist==1.3.3 35 | fsspec==2022.10.0 36 | geomet==0.2.1.post1 37 | gitdb==4.0.10 38 | GitPython==3.1.30 39 | greenlet==2.0.1 40 | h11==0.14.0 41 | httpcore==0.16.3 42 | httptools==0.5.0 43 | httpx==0.23.1 44 | idna==3.4 45 | importlib-metadata==6.0.0 46 | inflection==0.5.1 47 | iso8601==1.1.0 48 | itsdangerous==2.1.2 49 | Jinja2==3.1.2 50 | jmespath==1.0.1 51 | jsonschema==4.17.3 52 | lazy-model==0.0.5 53 | MarkupSafe==2.1.1 54 | mmhash3==3.0.1 55 | mongoengine==0.24.2 56 | motor==3.1.1 57 | multidict==6.0.4 58 | mypy-extensions==0.4.3 59 | numpy==1.24.1 60 | orjson==3.8.3 61 | orm==0.3.1 62 | ormar==0.12.0 63 | packaging==22.0 64 | pandas==1.5.2 65 | passlib==1.7.4 66 | pathspec==0.10.3 67 | piccolo==0.101.0 68 | Pillow==9.4.0 69 | platformdirs==2.6.0 70 | pony==0.7.16 71 | protobuf==3.20.3 72 | pyarrow==10.0.1 73 | pyasn1==0.4.8 74 | pydantic==1.10.2 75 | pydeck==0.8.0 76 | Pygments==2.14.0 77 | pymongo==4.3.3 78 | Pympler==1.0.1 79 | pyparsing==3.0.9 80 | pypika-tortoise==0.1.6 81 | pyrsistent==0.19.3 82 | python-dateutil==2.8.2 83 | python-dotenv==0.21.0 84 | python-jose==3.3.0 85 | python-multipart==0.0.5 86 | pytz==2022.7 87 | pytz-deprecation-shim==0.1.0.post0 88 | PyYAML==6.0 89 | requests==2.28.1 90 | rfc3986==1.5.0 91 | rich==12.6.0 92 | rsa==4.9 93 | s3fs==2022.10.0 94 | s3transfer==0.6.0 95 | scylla-driver==3.25.10 96 | semver==2.13.0 97 | six==1.16.0 98 | smmap==5.0.0 99 | sniffio==1.3.0 100 | sqlakeyset==1.0.1659142803 101 | SQLAlchemy==1.4.41 102 | sqlalchemy2-stubs==0.0.2a31 103 | sqlmodel==0.0.8 104 | sqlparse==0.4.3 105 | starlette==0.22.0 106 | streamlit==1.16.0 107 | targ==0.3.7 108 | toml==0.10.2 109 | tomli==2.0.1 110 | toolz==0.12.0 111 | tornado==6.2 112 | tortoise-orm==0.19.2 113 | typesystem==0.3.1 114 | typing_extensions==4.4.0 115 | tzdata==2022.7 116 | tzlocal==4.2 117 | ujson==5.6.0 118 | urllib3==1.26.13 119 | uvicorn==0.20.0 120 | uvloop==0.17.0 121 | validators==0.20.0 122 | watchfiles==0.18.1 123 | websockets==10.4 124 | wrapt==1.14.1 125 | yarl==1.8.2 126 | zipp==3.11.0 127 | zstandard==0.19.0 128 | psycopg2-binary 129 | -------------------------------------------------------------------------------- /backend/app/core/cloud/gcs.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import Optional 3 | 4 | from .base import BaseCloudSigner 5 | 6 | try: 7 | from google import auth 8 | from google.auth.transport import requests 9 | from google.cloud.storage import Client 10 | except ImportError as ie: 11 | print( 12 | "Please install GCS related libraries to get presigned url for gcs bucket", ie 13 | ) 14 | 15 | 16 | def get_presigned_gcs_url( 17 | bucket: str, 18 | blob: str, 19 | *, 20 | exp: Optional[timedelta] = None, 21 | content_type="application/octet-stream", 22 | min_size=1, 23 | max_size=int(1e6), 24 | ): 25 | """ 26 | Compute a GCS signed upload URL without needing a private key file. 27 | Can only be called when a service account is used as the application 28 | default credentials, and when that service account has the proper IAM 29 | roles, like `roles/storage.objectCreator` for the bucket, and 30 | `roles/iam.serviceAccountTokenCreator`. 31 | Source: https://stackoverflow.com/a/64245028 32 | Parameters 33 | ---------- 34 | bucket : str 35 | Name of the GCS bucket the signed URL will reference. 36 | blob : str 37 | Name of the GCS blob (in `bucket`) the signed URL will reference. 38 | exp : timedelta, optional 39 | Time from now when the signed url will expire. 40 | content_type : str, optional 41 | The required mime type of the data that is uploaded to the generated 42 | signed url. 43 | min_size : int, optional 44 | The minimum size the uploaded file can be, in bytes (inclusive). 45 | If the file is smaller than this, GCS will return a 400 code on upload. 46 | max_size : int, optional 47 | The maximum size the uploaded file can be, in bytes (inclusive). 48 | If the file is larger than this, GCS will return a 400 code on upload. 49 | """ 50 | if exp is None: 51 | exp = timedelta(hours=1) 52 | credentials, project_id = auth.default() 53 | if credentials.token is None: 54 | # Perform a refresh request to populate the access token of the 55 | # current credentials. 56 | credentials.refresh(requests.Request()) 57 | client = Client() 58 | bucket = client.get_bucket(bucket) 59 | blob = bucket.blob(blob) 60 | return blob.generate_signed_url( 61 | version="v4", 62 | expiration=exp, 63 | service_account_email=credentials.service_account_email, 64 | access_token=credentials.token, 65 | method="PUT", 66 | content_type=content_type, 67 | headers={"X-Goog-Content-Length-Range": f"{min_size},{max_size}"}, 68 | ) 69 | 70 | 71 | class GCSCloudSigner(BaseCloudSigner): 72 | def __init__(self, bucket: str, path: str, expiration: int = 3600) -> None: 73 | super().__init__(bucket, path, expiration) 74 | 75 | # TODO: pass timedelta 76 | def sign(self): 77 | signed_url = get_presigned_gcs_url( 78 | self.bucket, self.path, exp=timedelta(seconds=self.expiration) 79 | ) 80 | return signed_url 81 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | db: 5 | image: postgres:latest 6 | container_name: db 7 | restart: always 8 | environment: 9 | - POSTGRES_USER=${POSTGRES_USERNAME} 10 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 11 | - POSTGRES_DB=${POSTGRES_DB} 12 | - PGDATA=/var/lib/postgresql/data/ 13 | volumes: 14 | - postgresql_db_data:/var/lib/postgresql/data/ 15 | expose: 16 | - 5432 17 | ports: 18 | - 5433:5432 19 | 20 | db_editor: 21 | image: adminer 22 | container_name: db_editor 23 | restart: always 24 | environment: 25 | - POSTGRES_USER=${POSTGRES_USERNAME} 26 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 27 | - POSTGRES_HOST=${POSTGRES_HOST} 28 | - POSTGRES_PORT=${POSTGRES_PORT} 29 | - POSTGRES_DB=${POSTGRES_DB} 30 | expose: 31 | - 8080 32 | ports: 33 | - 8081:8080 34 | depends_on: 35 | - db 36 | 37 | lakehouse_share_backend_app: 38 | container_name: lakehouse_share_backend 39 | restart: always 40 | build: 41 | dockerfile: Dockerfile 42 | context: ./backend/ 43 | # command: > 44 | # sh -c "python manage.py wait_for_db && 45 | # python manage.py migrate && 46 | # python manage.py runserver 0.0.0.0:8000" 47 | environment: 48 | - env=${env:-PROD} 49 | - TABLE_FORMAT=${TABLE_FORMAT:-delta} 50 | - PYICEBERG_CATALOG__DEFAULT__TYPE=glue 51 | - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} 52 | - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} 53 | - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION} 54 | - AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} 55 | - POSTGRES_USER=${POSTGRES_USERNAME} 56 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 57 | - POSTGRES_HOST=${POSTGRES_HOST} 58 | - POSTGRES_PORT=${POSTGRES_PORT} 59 | - POSTGRES_DB=${POSTGRES_DB} 60 | - SECRET_KEY=${SECRET_KEY} 61 | - ALGORITHM=${ALGORITHM} 62 | - ACCESS_TOKEN_EXPIRE_MINUTES=${ACCESS_TOKEN_EXPIRE_MINUTES} 63 | volumes: 64 | - ./backend/:/usr/backend/ 65 | expose: 66 | - 8000 67 | ports: 68 | - 8001:8000 69 | depends_on: 70 | - db 71 | lakehouse_share_frontend_app: 72 | container_name: lakehouse_share_frontend 73 | restart: always 74 | build: 75 | dockerfile: Dockerfile 76 | context: ./frontend/ 77 | # command: > 78 | # sh -c "python manage.py wait_for_db && 79 | # python manage.py migrate && 80 | # python manage.py runserver 0.0.0.0:8000" 81 | environment: 82 | - env=${env:-PROD} 83 | - BACKEND_HOST=lakehouse_share_backend_app 84 | - BACKEND_PORT=8000 85 | - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} 86 | - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} 87 | volumes: 88 | - ./frontend/:/usr/frontend/ 89 | expose: 90 | - 8501 91 | ports: 92 | - 8501:8501 93 | depends_on: 94 | - db 95 | - lakehouse_share_backend_app 96 | 97 | volumes: 98 | postgresql_db_data: 99 | -------------------------------------------------------------------------------- /backend/app/securities/jwt_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from app.db.auth_queries import AuthQueries 5 | from app.models.auth import * 6 | from app.utilities.defaults import get_defaults 7 | from jose import JWTError, jwt 8 | from passlib.context import CryptContext 9 | 10 | SECRET_KEY = get_defaults("SECRET_KEY") 11 | ALGORITHM = get_defaults("ALGORITHM") 12 | print("ACCESS_TOKEN_EXPIRE_MINUTES", os.environ.get("ACCESS_TOKEN_EXPIRE_MINUTES")) 13 | ACCESS_TOKEN_EXPIRE_MINUTES = int(get_defaults("ACCESS_TOKEN_EXPIRE_MINUTES")) 14 | 15 | 16 | class UserCatalog(object): 17 | def __init__(self) -> None: 18 | self.secret = self.load_secret() 19 | self.pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") 20 | self.auth_db = AuthQueries() 21 | 22 | def load_secret(self): 23 | return SECRET_KEY 24 | 25 | def get_password_hash(self, password: str): 26 | return self.pwd_context.hash(password) 27 | 28 | def verify_password(self, plain_password, hashed_password): 29 | return self.pwd_context.verify(plain_password, hashed_password) 30 | 31 | def create_user(self, user: NewUser): 32 | userDetails = self.auth_db.check_user_exist(user.name) 33 | if userDetails is None: 34 | encrypted_password = self.get_password_hash(user.password) 35 | user_details = {} 36 | user_details["name"] = user.name 37 | user_details["encrypted_password"] = encrypted_password 38 | user_details["email"] = user.email 39 | user_details["namespace"] = user.namespace 40 | self.auth_db.create_user(user_details) 41 | return True 42 | else: 43 | return False 44 | 45 | def get_user(self, username: str): 46 | userDetails = self.auth_db.check_user_exist(username) 47 | if userDetails is not None: 48 | return UserInDB(**userDetails) 49 | else: 50 | return None 51 | 52 | def get_token_lifetime(self, user): 53 | token = self.auth_db.get_token_lifetime(user) 54 | print(token) 55 | return token 56 | 57 | def get_username_by_id(self, user_id): 58 | username = self.auth_db.get_username_by_id(user_id) 59 | return username 60 | 61 | def authenticate_user(self, username: str, password: str): 62 | print("authenticate_user", username) 63 | user = self.get_user(username) 64 | if not user: 65 | return False 66 | if not self.verify_password(password, user.encrypted_password): 67 | return False 68 | return user 69 | 70 | def create_access_token( 71 | self, data: dict, expires_delta: Union[timedelta, None] = None 72 | ): 73 | to_encode = data.copy() 74 | if expires_delta: 75 | expire = datetime.utcnow() + expires_delta 76 | else: 77 | expire = datetime.utcnow() + timedelta(minutes=15) 78 | to_encode.update({"exp": expire}) 79 | encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM) 80 | return encoded_jwt 81 | 82 | def list_users(self): 83 | users_list = self.auth_db.list_users() 84 | return users_list 85 | -------------------------------------------------------------------------------- /backend/app/core/iceberg/share.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from core.base import BaseTableFormat 4 | from core.iceberg.models import SharingFile, SharingFileStats, SharingMetaData 5 | from pyiceberg.catalog import Catalog, load_catalog 6 | from pyiceberg.expressions import parser 7 | from pyiceberg.manifest import DataFile 8 | from pyiceberg.table import FileScanTask, Table 9 | 10 | 11 | class IcebergFormat(BaseTableFormat): 12 | def __init__(self) -> None: 13 | self.catalog = self.load_catalog() 14 | self.meta = SharingMetaData() 15 | self.file = SharingFile() 16 | self.table = None 17 | super().__init__() 18 | 19 | def _get_required_args(self, **kwargs): 20 | schema = kwargs.get("schema") 21 | table_name = kwargs.get("table_name") 22 | return schema, table_name 23 | 24 | def load_catalog(self) -> Catalog: 25 | return load_catalog("default", properties={"type": "glue"}) 26 | 27 | def load_table(self, share, schema, table_name) -> Table: 28 | self.table = self.catalog.load_table("{}.{}".format(schema, table_name)) 29 | 30 | def table_version(self, share, schema, table_name): 31 | self.load_table(share, schema=schema, table_name=table_name) 32 | # return table.metadata.table_uuid 33 | return str(self.table.current_snapshot().snapshot_id) 34 | 35 | def get_protocol(self): 36 | """ 37 | dummy as of now for iceberg 38 | """ 39 | protocol_json = {"protocol": {"minReaderVersion": 1}} 40 | return protocol_json 41 | 42 | def _metadata(self, share, schema, table_name): 43 | if self.table is None: 44 | self.load_table(share, schema=schema, table_name=table_name) 45 | self.meta.setTable(self.table) 46 | metadata = self.meta.get_metadata() 47 | return metadata 48 | 49 | def table_metadata(self, share, schema, table_name): 50 | yield json.dumps(self.get_protocol()) 51 | yield "\n" 52 | yield json.dumps(self._metadata(share, schema=schema, table_name=table_name)) 53 | 54 | def file_details( 55 | self, 56 | share, 57 | schema, 58 | table_name, 59 | predicateHints=None, 60 | limitHint=None, 61 | version=None, 62 | file_expiry=None, 63 | ): 64 | if predicateHints == "": 65 | parsed_filters = [] 66 | else: 67 | parsed_filters = [] 68 | for hints in predicateHints: 69 | parsed_filters.append(parser.parse(hints)) 70 | 71 | yield json.dumps(self.get_protocol()) 72 | yield "\n" 73 | yield json.dumps(self._metadata(share, schema=schema, table_name=table_name)) 74 | yield "\n" 75 | 76 | tableScan = self.table.scan(row_filter=parsed_filters) 77 | sf = SharingFile() 78 | 79 | total_records = 0 80 | print(sf.stats) 81 | for f in tableScan.plan_files(): 82 | sf.setFile(file=f) 83 | sf.prepare_file_details(file_expiry) 84 | print("total_records", total_records) 85 | yield json.dumps(sf.get_file_details(file_expiry)) 86 | yield "\n" 87 | if limitHint is not None: 88 | total_records += sf.stats.numRecords 89 | if total_records >= limitHint: 90 | break 91 | -------------------------------------------------------------------------------- /backend/app/routers/admin.py: -------------------------------------------------------------------------------- 1 | from app.conf import Config 2 | from app.db.queries import AdminQuery 3 | from app.models.admin import ( 4 | AllDetails, 5 | PermissionModel, 6 | SchemaModel, 7 | ShareModel, 8 | TableModel, 9 | TokenLifetime, 10 | ) 11 | from app.securities.user_auth import * 12 | from app.utilities import get_random_uuid 13 | from fastapi import APIRouter, Depends 14 | 15 | admin = APIRouter(prefix="/admin", tags=["admin"]) 16 | 17 | query = AdminQuery() 18 | 19 | 20 | @admin.post("/share", deprecated=True) 21 | def create_share(share: ShareModel, current_user=Depends(get_current_user)): 22 | """ 23 | This endpoint was deprecated, please use /complete api endpoint to create a share 24 | """ 25 | # shares = config.get("shares") 26 | query.create_share(share, user_id=current_user.id) 27 | return f"Share {share.name} was created successfully" 28 | 29 | 30 | @admin.post("/schema", deprecated=True) 31 | def create_schema(schema: SchemaModel, current_user=Depends(get_current_user)): 32 | """ 33 | This endpoint was deprecated, please use /complete api endpoint to create a Schema 34 | """ 35 | # shares = config.get("shares") 36 | query.create_schema(schema, user_id=current_user.id) 37 | return f"Schema {schema.name} was created successfully" 38 | 39 | 40 | @admin.post("/table", deprecated=True) 41 | def create_table(table: TableModel, current_user=Depends(get_current_user)): 42 | """ 43 | This endpoint was deprecated, please use /complete api endpoint to create a Table 44 | """ 45 | # shares = config.get("shares") 46 | query.create_table(table, user_id=current_user.id) 47 | return f"Table {table.table_name} was created successfully" 48 | 49 | 50 | @admin.post("/complete") 51 | def create_complete_share( 52 | all_details: AllDetails, current_user=Depends(get_current_user) 53 | ): 54 | """ 55 | Creates share, Table and Schema (combined version all three apis) 56 | """ 57 | query.create_complete_share(all_details=all_details, user_id=current_user.id) 58 | return f"share:{all_details.share.name}, table: {all_details.table.table_name},schema:{all_details.schema_.name} was created successfully" 59 | 60 | 61 | @admin.post("/link") 62 | def link_resources(resources: PermissionModel): 63 | query.link_resources(resources) 64 | return f"Resources linked successfully" 65 | 66 | 67 | @admin.post("/lifetime") 68 | def update_token_lifetime(lifetime: TokenLifetime): 69 | user_id = query.get_id_by_user(lifetime.username) 70 | print("user_id", user_id) 71 | if user_id: 72 | query.add_lifetime(user_id=user_id, expiry=lifetime.expiry) 73 | return "expiry updated successfully" 74 | else: 75 | raise LakehouseShareException(status_code=402, message="Failed") 76 | 77 | 78 | @admin.get("/token/{user_id}", response_model=Token) 79 | async def sharing_token(user_id: str, current_user=Depends(get_current_user)): 80 | token_lifetime = auth.get_token_lifetime(user_id) 81 | 82 | username = auth.get_username_by_id(user_id=user_id) 83 | if username is None: 84 | raise LakehouseShareException( 85 | status_code=404, message=f"User ID {user_id} not found" 86 | ) 87 | print("token_lifetime", token_lifetime) 88 | token_expiry = token_lifetime if token_lifetime else ACCESS_TOKEN_EXPIRE_MINUTES 89 | access_token_expires = timedelta(seconds=token_expiry) 90 | access_token = auth.create_access_token( 91 | data={"sub": username}, expires_delta=access_token_expires 92 | ) 93 | return {"access_token": access_token, "token_type": "bearer"} 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | *.sqlite 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | -------------------------------------------------------------------------------- /backend/app/core/delta/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass, field 3 | from typing import Any, Dict, List 4 | 5 | from app.core.cloud import get_presigned_url 6 | from deltalake import DeltaTable 7 | 8 | 9 | @dataclass 10 | class SharingMetaData(object): 11 | id: str = "" 12 | format: Dict[str, str] = field(default_factory=dict) 13 | schemaString: str = "" 14 | partitionColumns: List[str] = field(default_factory=list) 15 | table: DeltaTable = None 16 | 17 | def setTable(self, table: DeltaTable): 18 | print("Table was set {}".format(table)) 19 | self.table = table 20 | 21 | def prepare_metadata(self): 22 | schemaDict = self.table.schema().to_json() 23 | self.id = str(self.table.metadata().id) 24 | self.format = {"provider": "parquet"} 25 | self.schemaString = schemaDict 26 | self.partitionColumns = self.table.metadata().partition_columns 27 | 28 | def get_metadata(self): 29 | self.prepare_metadata() 30 | return { 31 | "metaData": { 32 | "id": self.id, 33 | "format": self.format, 34 | "schemaString": self.schemaString, 35 | "partitionColumns": self.partitionColumns, 36 | } 37 | } 38 | 39 | def get_version(self): 40 | self.prepare_metadata() 41 | return self.table.version() 42 | 43 | def __repr__(self): 44 | return json.dumps(self.get_metadata()) 45 | 46 | 47 | @dataclass 48 | class SharingFileStats: 49 | numRecords: int = 0 50 | minValues: Dict[str, str] = field(default_factory=dict) 51 | maxValues: Dict[str, str] = field(default_factory=dict) 52 | nullCount: Dict[str, str] = field(default_factory=dict) 53 | file: Dict[str, Any] = None 54 | 55 | def setDataFile(self, file: Dict[str, Any]): 56 | self.file = file 57 | 58 | def prepare_fie_stats(self): 59 | self.numRecords = self.file["numRecords"] 60 | self.minValues = self.file["minValues"] 61 | self.maxValues = self.file["maxValues"] 62 | self.nullCount = self.file["nullCount"] 63 | 64 | def get_stats(self): 65 | self.prepare_fie_stats() 66 | stats = { 67 | "numRecords": self.numRecords, 68 | "minValues": self.minValues, 69 | "maxValues": self.maxValues, 70 | "nullCount": self.nullCount, 71 | } 72 | return stats 73 | 74 | def __repr__(self): 75 | return json.dumps(self.get_stats()) 76 | 77 | 78 | @dataclass 79 | class SharingFile: 80 | url: str = "" 81 | id: str = "" 82 | partitionValues: Dict[str, str] = field(default_factory=dict) 83 | size: float = 0 84 | file: Dict[str, Any] = None 85 | stats: SharingFileStats = None 86 | 87 | def setFile(self, file: Dict[str, Any]): 88 | self.file = file 89 | 90 | def prepare_file_details(self, file_expiry: int): 91 | stats = SharingFileStats() 92 | # key will be `add`` or `remove` 93 | key = list(self.file.keys())[0] 94 | stats.setDataFile(json.loads(self.file[key]["stats"])) 95 | stats.prepare_fie_stats() 96 | self.url = get_presigned_url(self.file[key]["path"], expiration=file_expiry) 97 | self.partitionValues = self.file[key]["partitionValues"] 98 | self.size = self.file[key]["size"] 99 | self.stats = stats 100 | 101 | def get_file_details(self, file_expiry): 102 | self.prepare_file_details(file_expiry) 103 | file_details = { 104 | "file": { 105 | "url": self.url, 106 | "id": "123", 107 | "partitionValues": self.partitionValues, 108 | "size": self.size, 109 | "stats": self.stats.get_stats(), 110 | } 111 | } 112 | return file_details 113 | -------------------------------------------------------------------------------- /frontend/app/core/api/rest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from abc import ABC, abstractmethod 4 | 5 | import requests 6 | 7 | print(sys.path) 8 | print(os.path.abspath(".")) 9 | sys.path.insert(0, os.path.abspath(".")) 10 | from typing import Dict 11 | from urllib.parse import urljoin 12 | 13 | from core.api.config import Config 14 | from core.api.jwt_auth import JWTAuth 15 | from core.base.client import BaseClient 16 | 17 | 18 | class RestClient(BaseClient): 19 | def __init__( 20 | self, 21 | baseurl: str = None, 22 | prefix: str = None, 23 | user_details: Dict[str, str] = None, 24 | token=None, 25 | ) -> None: 26 | self.config = Config() 27 | print(self.config.config) 28 | self.baseurl = baseurl if baseurl else self._frame_base_url() 29 | print(self.baseurl) 30 | self.prefix = prefix if prefix else self.config.get("lakehouse-sharing.prefix") 31 | self.token = token 32 | self.jauth = JWTAuth( 33 | self.client(), 34 | url=self.baseurl, 35 | config=self.config, 36 | prefix="auth", 37 | user_details=user_details, 38 | ) 39 | super().__init__() 40 | 41 | def _frame_base_url(self): 42 | if os.environ.get("env", "local") == "local": 43 | host = self.config.get("lakehouse-sharing.host") 44 | port = self.config.get("lakehouse-sharing.port") 45 | else: 46 | host = os.environ.get("BACKEND_HOST", "0.0.0.0") 47 | port = os.environ.get("BACKEND_PORT", 8000) 48 | baseurl = f"http://{host}:{port}/" 49 | return baseurl 50 | 51 | def client(self): 52 | return requests 53 | 54 | def get_headers(self): 55 | if self.token is None: 56 | token = self.jauth.get_token() 57 | else: 58 | print("Using existing user's token") 59 | token = self.token 60 | headers = {"Authorization": f"Bearer {token}"} 61 | return headers 62 | 63 | def set_token(self, token): 64 | self.token = token 65 | 66 | def auth(self, **kwargs) -> JWTAuth: 67 | return self.jauth 68 | 69 | def form_path(self, path): 70 | print("form_path") 71 | print(self.baseurl) 72 | print(urljoin(self.baseurl, path)) 73 | print("===========") 74 | return urljoin(self.baseurl, path) 75 | 76 | def post(self, path, data, **kwargs): 77 | url = self.form_path(path) 78 | print(url) 79 | headers = self.get_headers() 80 | print("headers", headers) 81 | res = self.client().post(url, data=data, headers=headers, **kwargs) 82 | return res 83 | 84 | def get(self, path, **kwargs): 85 | url = self.form_path(path) 86 | print(url) 87 | headers = self.get_headers() 88 | print("headers", headers) 89 | res = self.client().get(url, headers=headers, **kwargs) 90 | return res 91 | 92 | 93 | if __name__ == "__main__": 94 | 95 | user_details = {"username": "admin", "password": "admin@123"} 96 | c = RestClient("http://localhost:8000", user_details=user_details) 97 | headers = c.get_headers() 98 | print(headers) 99 | 100 | list_shares = c.get("delta-sharing/shares", params={"maxResults": 2}) 101 | print(list_shares.json()) 102 | 103 | list_schemas = c.get( 104 | "delta-sharing/shares/share1/schemas", params={"maxResults": 2} 105 | ) 106 | print(list_schemas.json()) 107 | 108 | list_tables = c.get( 109 | "delta-sharing/shares/share1/schemas/iceberg_benchmark_db/tables", 110 | params={"maxResults": 2}, 111 | ) 112 | print(list_tables.json()) 113 | 114 | create_share = c.post("admin/share", data={"name": "share-test##"}) 115 | print(create_share.content) 116 | 117 | create_schema = c.post( 118 | "admin/schema", 119 | data={"name": "schema-test##", "table_id": 123, "share_id": "123##"}, 120 | ) 121 | print(create_schema.content) 122 | 123 | create_table = c.post( 124 | "admin/table", 125 | data={"table_name": "table-test529", "table_location": "s3://bucket/object1/"}, 126 | ) 127 | print(create_table.content) 128 | -------------------------------------------------------------------------------- /backend/app/core/delta/share.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from app.db.queries import Query 4 | from core.base import BaseTableFormat 5 | from core.delta.models import SharingFile, SharingFileStats, SharingMetaData 6 | from core.delta.utils import CustomDeltaMetaReader 7 | from deltalake.data_catalog import DataCatalog 8 | from deltalake.table import DeltaTable 9 | 10 | 11 | class DeltaFormat(BaseTableFormat): 12 | def __init__(self) -> None: 13 | self.catalog = self.load_catalog() 14 | self.meta = SharingMetaData() 15 | self.file = SharingFile() 16 | self.table: DeltaTable = None 17 | self.meta_db = Query() 18 | self.path = None 19 | self.__post_init__() 20 | super().__init__() 21 | 22 | def __post_init__(self): 23 | """To pass creds to deltalake underlying rust lib""" 24 | import os 25 | 26 | from boto3.session import Session 27 | 28 | session = Session() 29 | credentials = session.get_credentials() 30 | if credentials is not None: 31 | current_credentials = credentials.get_frozen_credentials() 32 | os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key 33 | os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key 34 | 35 | def load_catalog(self) -> DataCatalog: 36 | return DataCatalog.AWS 37 | 38 | def get_path(self, share: str, schema: str, table_name: str): 39 | self.path = self.meta_db.get_path(share=share, schema=schema, table=table_name) 40 | if self.path is None: 41 | raise ValueError(f"Table {table_name} doesn't exist") 42 | 43 | def load_table(self, share: str, schema: str, table_name: str): 44 | self.get_path(share=share, schema=schema, table_name=table_name) 45 | self.table = DeltaTable(self.path) 46 | 47 | def table_version(self, share: str, schema: str, table_name: str): 48 | self.get_path(share=share, schema=schema, table_name=table_name) 49 | self.load_table(share=share, schema=schema, table_name=table_name) 50 | # return table.metadata.table_uuid 51 | return str(self.table.version()) 52 | 53 | def get_protocol(self): 54 | """ 55 | dummy as of now for iceberg 56 | """ 57 | protocol_json = { 58 | "protocol": { 59 | "minReaderVersion": self.table.protocol().min_reader_version, 60 | "minWriterVersion": self.table.protocol().min_writer_version, 61 | } 62 | } 63 | return protocol_json 64 | 65 | def _metadata(self, share: str, schema: str, table_name: str): 66 | self.get_path(share=share, schema=schema, table_name=table_name) 67 | if self.table is None: 68 | self.load_table(self.path) 69 | self.meta.setTable(self.table) 70 | metadata = self.meta.get_metadata() 71 | return metadata 72 | 73 | def table_metadata(self, share: str, schema: str, table_name: str): 74 | yield json.dumps(self.get_protocol()) 75 | yield "\n" 76 | yield json.dumps( 77 | self._metadata(share=share, schema=schema, table_name=table_name) 78 | ) 79 | 80 | def file_details( 81 | self, 82 | share: str, 83 | schema: str, 84 | table_name: str, 85 | predicateHints=None, 86 | limitHint=None, 87 | version=None, 88 | file_expiry=3600, 89 | ): 90 | if predicateHints == "": 91 | predicateHints = [] 92 | self.get_path(share=share, schema=schema, table_name=table_name) 93 | yield json.dumps(self.get_protocol()) 94 | yield "\n" 95 | yield json.dumps( 96 | self._metadata(share=share, schema=schema, table_name=table_name) 97 | ) 98 | yield "\n" 99 | cdr = CustomDeltaMetaReader(path=self.path, version=version) 100 | files = cdr.get_metafiles() 101 | sf = SharingFile() 102 | 103 | total_records = 0 104 | print(sf.stats) 105 | for f in files: 106 | sf.setFile(file=f) 107 | sf.prepare_file_details(file_expiry) 108 | print("total_records", total_records) 109 | yield json.dumps(sf.get_file_details(file_expiry)) 110 | yield "\n" 111 | if limitHint is not None: 112 | total_records += sf.stats.numRecords 113 | if total_records >= limitHint: 114 | break 115 | -------------------------------------------------------------------------------- /backend/app/utilities/pagination.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import math 3 | from typing import Any, Callable, Generic, Optional, Sequence, TypeVar 4 | 5 | from fastapi import Query 6 | from fastapi.exceptions import HTTPException 7 | from fastapi_pagination import Params 8 | from fastapi_pagination.api import _ctx_var_with_reset, _items_val 9 | from fastapi_pagination.bases import AbstractPage, AbstractParams, RawParams 10 | from fastapi_pagination.types import AdditionalData 11 | from fastapi_pagination.utils import verify_params 12 | from pydantic import BaseModel, conint 13 | from starlette.requests import Request 14 | 15 | T = TypeVar("T") 16 | 17 | 18 | def encode_token(offsetParams, otherparams): 19 | next_token = "{},{}".format(offsetParams.offset + 1, ",".join(otherparams)) 20 | print("encode_token", next_token) 21 | next_token = base64.urlsafe_b64encode(next_token.encode()).decode() 22 | return next_token 23 | 24 | 25 | def decode_token(token_string): 26 | page_id = base64.urlsafe_b64decode(token_string).decode().split(",")[0] 27 | if page_id: 28 | page_id = int(page_id) 29 | else: 30 | page_id = 0 31 | return page_id 32 | 33 | 34 | def validate_params(params, other_params: str): 35 | params_from_token = base64.urlsafe_b64decode(params).decode().split(",")[1:] 36 | if len(params_from_token) == len(other_params): 37 | for l, r in zip(params_from_token, other_params): 38 | if l == r: 39 | continue 40 | else: 41 | raise HTTPException( 42 | status_code=400, 43 | detail="Token Params and Params \ 44 | from APIs was not equal {} ! = {}".format( 45 | l, r 46 | ), 47 | ) 48 | else: 49 | raise HTTPException( 50 | status_code=400, 51 | detail="Number of decoded token params and \ 52 | given params length was not equal {} ! = {}".format( 53 | len(params_from_token), len(other_params) 54 | ), 55 | ) 56 | 57 | 58 | class SingleTokenParams(BaseModel, AbstractParams): 59 | maxResults: int = Query(50, ge=1, le=100, description="maxResults") 60 | next_token: str = Query("", description="Next Token", include_in_schema=False) 61 | 62 | def to_raw_params(self) -> RawParams: 63 | page_id = decode_token(self.next_token) 64 | return RawParams(limit=self.maxResults, offset=page_id) 65 | 66 | 67 | class SingleTokenPagination(AbstractPage[T], Generic[T]): 68 | items: Sequence[T] 69 | next_token: Optional[str] = None 70 | __params_type__ = SingleTokenParams 71 | 72 | @classmethod 73 | def create( 74 | cls, 75 | items: Sequence[T], 76 | total: int, 77 | params: AbstractParams, 78 | # request: Request 79 | **kwargs: Any, 80 | ): 81 | if not isinstance(params, SingleTokenParams): 82 | raise ValueError("Page should be used with Params") 83 | 84 | other_params = kwargs.get("other_params", []) 85 | if params.next_token: 86 | validate_params(params.next_token, other_params) 87 | offsetParams = params.to_raw_params() 88 | if offsetParams.offset + offsetParams.limit < total: 89 | next_token = encode_token(offsetParams, other_params) 90 | else: 91 | next_token = None 92 | 93 | return cls( 94 | items=items, 95 | total=total, 96 | params=params, 97 | next_token=next_token, 98 | **kwargs, 99 | ) 100 | 101 | 102 | def create_page( 103 | items: Sequence[T], 104 | total: Optional[int] = None, 105 | params: Optional[AbstractParams] = None, 106 | **kwargs: Any, 107 | ) -> AbstractPage[T]: 108 | kwargs["params"] = params 109 | 110 | if total is not None: # temporary to support old signature 111 | kwargs["total"] = total 112 | 113 | with _ctx_var_with_reset(_items_val, items): 114 | return SingleTokenPagination.create(items=items, **kwargs) 115 | 116 | 117 | def paginate( 118 | sequence: Sequence[T], 119 | params: Optional[AbstractParams] = None, 120 | length_function: Callable[[Sequence[T]], int] = len, 121 | *, 122 | additional_data: AdditionalData = {}, 123 | ) -> AbstractPage[T]: 124 | params, raw_params = verify_params(params, "limit-offset") 125 | return create_page( 126 | sequence[raw_params.offset : raw_params.offset + raw_params.limit], 127 | length_function(sequence), 128 | params, 129 | **(additional_data or {}), 130 | ) 131 | -------------------------------------------------------------------------------- /frontend/app/core/table_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from app.core.api.rest import RestClient 6 | 7 | client = RestClient(token=st.session_state["token"]) 8 | 9 | days_to_seconds = lambda days: days * 24 * 60 * 60 10 | 11 | 12 | def list_shares(): 13 | client.set_token(st.session_state["token"]) 14 | response = client.get("/delta-sharing/shares") 15 | if response.status_code == 200: 16 | print(response.content) 17 | items = response.json()["items"] 18 | _list_shares = [] 19 | for r in items: 20 | _list_shares.append(f"{r['name']} ({r['id']})") 21 | print(_list_shares) 22 | return _list_shares 23 | 24 | 25 | def list_schema(share): 26 | sharename, id = share.split(" ") 27 | response = client.get(f"/delta-sharing/shares/{sharename}/schemas") 28 | if response.status_code == 200: 29 | print(response.content) 30 | items = response.json()["items"] 31 | _list_tables = [] 32 | for r in items: 33 | _list_tables.append(f"{r['name']}") 34 | else: 35 | raise Exception(response.content) 36 | return _list_tables 37 | 38 | 39 | def list_tables(share, schema): 40 | sharename, id = share.split(" ") 41 | response = client.get(f"/delta-sharing/shares/{sharename}/schemas/{schema}/tables") 42 | if response.status_code == 200: 43 | print(response.content) 44 | items = response.json()["items"] 45 | _list_tables = [] 46 | for r in items: 47 | _list_tables.append(f"{r['name']} ({r['id']})") 48 | else: 49 | raise Exception(response.content) 50 | return _list_tables 51 | 52 | 53 | def get_metadata(share, schema, table): 54 | response = client.get( 55 | f"/delta-sharing/shares/{share}/schemas/{schema}/tables/{table}/metadata" 56 | ) 57 | if response.status_code == 200: 58 | lines = response.iter_lines() 59 | header = json.loads(next(lines)) 60 | metadata = json.loads(next(lines)) 61 | else: 62 | raise Exception(response.content) 63 | return header, metadata 64 | 65 | 66 | def get_table_data(share, schema, table, version, limitHint): 67 | response = client.post( 68 | f"/delta-sharing/shares/{share}/schemas/{schema}/tables/{table}/query", 69 | data=None, 70 | json={"predicateHints": [], "limitHint": limitHint, "version": version}, 71 | ) 72 | if response.status_code == 200: 73 | lines = response.iter_lines() 74 | header = json.loads(next(lines)) 75 | metadata = json.loads(next(lines)) 76 | else: 77 | raise Exception(response.content) 78 | return header, metadata, lines 79 | 80 | 81 | def table_format_exploration(): 82 | st.header("Explore Table formats") 83 | create_link_form = st.container() 84 | col1, col2, col3 = create_link_form.columns(3) 85 | share = col1.selectbox("shares", list_shares()) 86 | print(share) 87 | schema = col2.selectbox("schema", list_schema(share)) 88 | table = col3.selectbox("table", list_tables(share, schema=schema)) 89 | col1, col2 = create_link_form.columns(2) 90 | # metadata_bt = col1.button("get Metadata") 91 | # table_bt = col2.button("get Table Files") 92 | with st.expander("Metadata"): 93 | get_meta = st.button("Get Metadata") 94 | if get_meta: 95 | st.header("Metadata") 96 | share = share.split(" ")[0].split("(")[-1].replace(")", "") 97 | table = table.split(" ")[0].split("(")[-1].replace(")", "") 98 | header, metadata = get_metadata(share, schema, table) 99 | st.markdown("### Header") 100 | st.write(header) 101 | st.markdown("### Metadata") 102 | st.write(metadata) 103 | 104 | with st.expander("Table Details"): 105 | get_table_meta = st.button("Get Table Details") 106 | version = st.text_input("version") 107 | limitHint = st.text_input("limit hint") 108 | if get_table_meta: 109 | st.header("Table Details") 110 | share = share.split(" ")[0].split("(")[-1].replace(")", "") 111 | table = table.split(" ")[0].split("(")[-1].replace(")", "") 112 | header, metadata, table_iter = get_table_data( 113 | share, schema, table, version, limitHint 114 | ) 115 | st.markdown("### Header") 116 | st.write(header) 117 | st.markdown("### Metadata") 118 | st.write(metadata) 119 | st.markdown("### Table") 120 | file_details = next(table_iter) 121 | df = pd.DataFrame(json.loads(file_details)["file"]) 122 | st.write(df) 123 | -------------------------------------------------------------------------------- /backend/app/core/iceberg/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass, field 3 | from typing import Dict, List 4 | 5 | from app.core.cloud import get_presigned_url 6 | from pyiceberg.manifest import DataFile 7 | from pyiceberg.table import FileScanTask, Table 8 | 9 | 10 | @dataclass 11 | class SharingMetaData(object): 12 | id: str = "" 13 | format: Dict[str, str] = field(default_factory=dict) 14 | schemaString: str = "" 15 | partitionColumns: List[str] = field(default_factory=list) 16 | table: Table = None 17 | field_map = { 18 | "timestamptz": "timestamp", 19 | "int": "integer", 20 | "decimal(19, 9)": "double", 21 | } 22 | 23 | def _map_field(self, field): 24 | return self.field_map.get(field, field) 25 | 26 | def format_schema_dict(self, iceberg_dict): 27 | all_fields = [] 28 | for f in iceberg_dict["fields"]: 29 | n = {} 30 | n["name"] = f.get("name") 31 | n["type"] = self._map_field(f.get("field_type")) 32 | n["nullable"] = f.get("required") 33 | all_fields.append(n) 34 | schema = { 35 | "type": iceberg_dict["type"], 36 | "fields": all_fields, 37 | "metadata": { 38 | "schema_id": iceberg_dict["schema_id"], 39 | "identifier_field_ids": iceberg_dict["identifier_field_ids"], 40 | }, 41 | } 42 | return schema 43 | 44 | def setTable(self, table: Table): 45 | print("Table was set {}".format(table)) 46 | self.table = table 47 | 48 | def prepare_metadata(self): 49 | schemaDict = self.table.metadata.schemas[0].dict() 50 | self.id = str(self.table.metadata.table_uuid) 51 | self.format = {"provider": self.table.io.properties["format"]} 52 | self.schemaString = json.dumps(self.format_schema_dict(schemaDict)) 53 | self.partitionColumns = [ 54 | field.name 55 | for key in self.table.specs() 56 | for field in self.table.specs()[key].fields 57 | ] 58 | 59 | def get_metadata(self): 60 | self.prepare_metadata() 61 | return { 62 | "metaData": { 63 | "id": self.id, 64 | "format": self.format, 65 | "schemaString": self.schemaString, 66 | "partitionColumns": self.partitionColumns, 67 | } 68 | } 69 | 70 | def get_version(self): 71 | self.prepare_metadata() 72 | return self.table.current_snapshot() 73 | 74 | def __repr__(self): 75 | return json.dumps(self.get_metadata()) 76 | 77 | 78 | @dataclass 79 | class SharingFileStats: 80 | numRecords: int = 0 81 | minValues: Dict[str, str] = field(default_factory=dict) 82 | maxValues: Dict[str, str] = field(default_factory=dict) 83 | nullCount: Dict[str, str] = field(default_factory=dict) 84 | value_counts: Dict[str, str] = field(default_factory=dict) 85 | file: DataFile = None 86 | 87 | def setDataFile(self, file: DataFile): 88 | self.file = file 89 | 90 | def prepare_fie_stats(self): 91 | self.numRecords = self.file.dict()["record_count"] 92 | self.minValues = { 93 | k: v.decode("ISO-8859-1") 94 | for k, v in self.file.dict()["lower_bounds"].items() 95 | } 96 | self.maxValues = { 97 | k: v.decode("ISO-8859-1") 98 | for k, v in self.file.dict()["upper_bounds"].items() 99 | } 100 | self.nullCount = self.file.dict()["null_value_counts"] 101 | self.value_counts = self.file.dict()["value_counts"] 102 | 103 | def get_stats(self): 104 | self.prepare_fie_stats() 105 | stats = { 106 | "numRecords": self.numRecords, 107 | "minValues": self.minValues, 108 | "maxValues": self.maxValues, 109 | "nullCount": self.nullCount, 110 | "value_counts": self.value_counts, 111 | } 112 | return stats 113 | 114 | def __repr__(self): 115 | return json.dumps(self.get_stats()) 116 | 117 | 118 | @dataclass 119 | class SharingFile: 120 | url: str = "" 121 | id: str = "" 122 | partitionValues: Dict[str, str] = field(default_factory=dict) 123 | size: float = 0 124 | file: FileScanTask = None 125 | stats: SharingFileStats = None 126 | 127 | def setFile(self, file: FileScanTask): 128 | self.file = file 129 | 130 | def prepare_file_details(self, file_expiry): 131 | stats = SharingFileStats() 132 | stats.setDataFile(self.file.file) 133 | stats.prepare_fie_stats() 134 | self.url = get_presigned_url(self.file.file.file_path, expiration=file_expiry) 135 | self.partitionValues = self.file.file.partition 136 | self.size = self.file.length 137 | self.stats = stats 138 | 139 | def get_file_details(self, file_expiry): 140 | self.prepare_file_details(file_expiry) 141 | file_details = { 142 | "file": { 143 | "url": self.url, 144 | "id": "123", 145 | "partitionValues": self.partitionValues, 146 | "size": self.size, 147 | "stats": self.stats.get_stats(), 148 | } 149 | } 150 | return file_details 151 | -------------------------------------------------------------------------------- /backend/app/core/delta/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import Any, Dict 4 | 5 | from fsspec.core import get_fs_token_paths 6 | 7 | PYARROW_CHECKPOINT_SCHEMA = [ 8 | "txn", 9 | "add", 10 | "remove", 11 | "metaData", 12 | "protocol", 13 | "commitInfo", 14 | ] 15 | 16 | 17 | class CustomDeltaMetaReader: 18 | def __init__( 19 | self, 20 | path: str, 21 | version: int = 0, 22 | checkpoint=None, 23 | storage_options=None, 24 | ): 25 | self.path = str(path).rstrip("/") 26 | self.version = version 27 | self.pq_files = set() 28 | self.delta_log_path = f"{self.path}/_delta_log" 29 | self.fs, self.fs_token, _ = get_fs_token_paths( 30 | path, storage_options=storage_options 31 | ) 32 | self.checkpoint = ( 33 | checkpoint if checkpoint is not None else self.get_checkpoint_id() 34 | ) 35 | self.storage_options = storage_options 36 | self.schema = None 37 | 38 | def get_checkpoint_id(self): 39 | """ 40 | if _last_checkpoint file exists, returns checkpoint_id else zero 41 | """ 42 | try: 43 | last_checkpoint_version = json.loads( 44 | self.fs.cat(f"{self.delta_log_path}/_last_checkpoint") 45 | )["version"] 46 | except FileNotFoundError: 47 | last_checkpoint_version = 0 48 | return last_checkpoint_version 49 | 50 | def get_pq_files_from_checkpoint_parquet(self): 51 | """ 52 | use checkpoint_id to get logs from parquet files 53 | """ 54 | if self.checkpoint == 0: 55 | return 56 | checkpoint_path = ( 57 | f"{self.delta_log_path}/{self.checkpoint:020}.checkpoint.parquet" 58 | ) 59 | if not self.fs.exists(checkpoint_path): 60 | raise ValueError( 61 | f"Parquet file with the given checkpoint {self.checkpoint} does not exists: " 62 | f"File {checkpoint_path} not found" 63 | ) 64 | parquet_checkpoint = self.engine.read_partition( 65 | fs=self.fs, 66 | pieces=[(checkpoint_path, None, None)], 67 | columns=PYARROW_CHECKPOINT_SCHEMA, 68 | index=None, 69 | ) 70 | mm = [] 71 | for i, row in parquet_checkpoint.iterrows(): 72 | if row["add"] is not None or row["remove"] is not None: 73 | self.pq_files.add(f"{self.path}/{row['add']['path']}") 74 | mm.append(row) 75 | return mm 76 | 77 | def get_pq_files_from_delta_json_logs(self): 78 | """ 79 | start from checkpoint id, collect logs from every json file until the 80 | given version 81 | example: 82 | checkpoint 10, version 16 83 | 1. read the logs from 10th checkpoint parquet ( using above func) 84 | 2. read logs from json files until version 16 85 | log Collection: 86 | for reading the particular version of delta table, We are concerned 87 | about `add` and `remove` Operation (transaction) only.(which involves 88 | adding and removing respective parquet file transaction) 89 | """ 90 | log_files = self.fs.glob( 91 | f"{self.delta_log_path}/{self.checkpoint // 10:019}*.json" 92 | ) 93 | if len(log_files) == 0: 94 | raise RuntimeError( 95 | f"No Json files found at _delta_log_path:- {self.delta_log_path}" 96 | ) 97 | log_files = sorted(log_files) 98 | log_versions = [ 99 | int(re.findall(r"(\d{20})", log_file_name)[0]) 100 | for log_file_name in log_files 101 | ] 102 | if (self.version is not None) and (self.version not in log_versions): 103 | raise ValueError( 104 | f"Cannot time travel Delta table to version {self.version}, Available versions for given " 105 | f"checkpoint {self.checkpoint} are {log_versions}" 106 | ) 107 | mm = [] 108 | for log_file_name, log_version in zip(log_files, log_versions): 109 | print(log_file_name) 110 | log = self.fs.cat(log_file_name).decode().split("\n") 111 | for line in log: 112 | if line: # for last empty line 113 | meta_data = json.loads(line) 114 | 115 | if "add" in meta_data.keys(): 116 | file = f"{self.path}/{meta_data['add']['path']}" 117 | meta_data["add"]["path"] = file 118 | mm.append(meta_data) 119 | elif "remove" in meta_data.keys(): 120 | remove_file = f"{self.path}/{meta_data['remove']['path']}" 121 | meta_data["remove"]["path"] = remove_file 122 | if self.version == int(log_version): 123 | break 124 | return mm 125 | 126 | def get_metafiles(self) -> Dict[str, Any]: 127 | files1 = self.get_pq_files_from_checkpoint_parquet() 128 | files2 = self.get_pq_files_from_delta_json_logs() 129 | files = [] 130 | if files1 is None: 131 | files = files2 132 | else: 133 | files += files1 134 | files += files2 135 | return files 136 | -------------------------------------------------------------------------------- /frontend/app/core/link.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | import jwt 5 | import streamlit as st 6 | from app.core.api.rest import RestClient 7 | 8 | client = RestClient(token=st.session_state["token"]) 9 | 10 | days_to_seconds = lambda days: days * 24 * 60 * 60 11 | 12 | 13 | def create_link_in_db(linkDetails): 14 | client.set_token(st.session_state["token"]) 15 | response = client.post("/admin/link", data=None, json=linkDetails) 16 | print(response.content) 17 | if response.status_code == 200: 18 | st.markdown(response.text) 19 | st.balloons() 20 | else: 21 | st.error(response.text) 22 | 23 | 24 | def list_shares(): 25 | client.set_token(st.session_state["token"]) 26 | response = client.get("/delta-sharing/shares") 27 | if response.status_code == 200: 28 | print(response.content) 29 | items = response.json()["items"] 30 | _list_shares = [] 31 | for r in items: 32 | _list_shares.append(f"{r['name']} ({r['id']})") 33 | print(_list_shares) 34 | return _list_shares 35 | 36 | 37 | def list_schema(share): 38 | client.set_token(st.session_state["token"]) 39 | sharename, id = share.split(" ") 40 | response = client.get(f"/delta-sharing/shares/{sharename}/schemas") 41 | if response.status_code == 200: 42 | print(response.content) 43 | items = response.json()["items"] 44 | _list_tables = [] 45 | for r in items: 46 | _list_tables.append(f"{r['name']}") 47 | else: 48 | raise Exception(response.content) 49 | return _list_tables 50 | 51 | 52 | def list_tables(share, schema): 53 | client.set_token(st.session_state["token"]) 54 | sharename, id = share.split(" ") 55 | response = client.get(f"/delta-sharing/shares/{sharename}/schemas/{schema}/tables") 56 | if response.status_code == 200: 57 | print(response.content) 58 | items = response.json()["items"] 59 | _list_tables = [] 60 | for r in items: 61 | _list_tables.append(f"{r['name']} ({r['id']})") 62 | else: 63 | raise Exception(response.content) 64 | return _list_tables 65 | 66 | 67 | def get_token(user_id): 68 | response = client.get(f"/admin/token/{user_id}") 69 | if response.status_code == 200: 70 | print(response.content) 71 | return response.json()["access_token"] 72 | 73 | 74 | def get_expiry_from_token(token): 75 | expiry_timestamp = jwt.decode( 76 | token, algorithms="HS256", options={"verify_signature": False} 77 | )["exp"] 78 | return datetime.fromtimestamp(expiry_timestamp).strftime("%Y-%m-%dT%H:%M:%SZ") 79 | 80 | 81 | def download_creds_layout(user_id, container): 82 | button = container.button("Download Credentials") 83 | if button: 84 | token = get_token(user_id=user_id) 85 | sharing_profile = { 86 | "shareCredentialsVersion": 1, 87 | "endpoint": f"{client.baseurl}{client.prefix}", 88 | "bearerToken": token, 89 | "expirationTime": get_expiry_from_token(token), 90 | } 91 | container.write(sharing_profile) 92 | download_button = container.download_button( 93 | "Download Credentials", 94 | data=json.dumps(sharing_profile), 95 | file_name="profile.json", 96 | ) 97 | if download_button: 98 | st.write(user_id) 99 | 100 | else: 101 | st.error("Error generating token") 102 | 103 | 104 | def list_users(): 105 | client.set_token(st.session_state["token"]) 106 | response = client.get("/auth/users") 107 | if response.status_code == 200: 108 | print(response.content) 109 | items = response.json() 110 | _list_users = [] 111 | for r in items: 112 | _list_users.append(f"{r['name']} ({r['id']})") 113 | print(_list_users) 114 | else: 115 | _list_users = [] 116 | return _list_users 117 | 118 | 119 | def update_token_lifetime(username, seconds): 120 | data = {"username": username, "expiry": seconds} 121 | response = client.post("/admin/lifetime", data=None, json=data) 122 | if response.status_code == 200: 123 | st.success(f"User {username} Token lifetime updated") 124 | else: 125 | st.write(response.text) 126 | st.error(f"Error updating lifetime of the token for User {username}") 127 | 128 | 129 | def create_link_form_layout(): 130 | st.header("Give Required Permissions to the User") 131 | create_link_form = st.container() 132 | user_id = create_link_form.selectbox("users", list_users()) 133 | col1, col2, col3 = create_link_form.columns(3) 134 | share = col1.selectbox("shares", list_shares()) 135 | print(share) 136 | schema = col2.selectbox("schema", list_schema(share)) 137 | table = col3.selectbox("table", list_tables(share, schema=schema)) 138 | 139 | submit = create_link_form.button("Give permission") 140 | linkDetails = {} 141 | linkDetails["user_id"] = user_id.split(" ")[1].split("(")[-1].replace(")", "") 142 | linkDetails["share_id"] = share.split(" ")[1].split("(")[-1].replace(")", "") 143 | linkDetails["schema_name"] = schema 144 | linkDetails["table_id"] = table.split(" ")[1].split("(")[-1].replace(")", "") 145 | if submit: 146 | create_link_in_db(linkDetails) 147 | create_link_form.header("Define Token/Credential Lifetime for the User Token") 148 | col1, col3, _ = create_link_form.columns(3, gap="medium") 149 | expiry = col1.slider("Expiry in days", min_value=1, max_value=7) 150 | update_lifetime = col3.button("Update Token Lifetime") 151 | if update_lifetime: 152 | seconds = days_to_seconds(expiry) 153 | create_link_form.write(seconds) 154 | update_token_lifetime(username=user_id.split(" ")[0], seconds=seconds) 155 | create_link_form.header("Download Sharing Profile") 156 | download_creds_layout(linkDetails["user_id"], create_link_form) 157 | return create_link_form 158 | -------------------------------------------------------------------------------- /backend/app/routers/share.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from app.conf import Config 4 | from app.core import get_table_format_client 5 | from app.core.base import BaseTableFormat 6 | from app.db.queries import Query 7 | from app.models.auth import User 8 | from app.models.common import QueryModel, Share 9 | from app.models.response import GetShareResponse, SchemaResponse, TableResponse 10 | from app.securities.user_auth import * 11 | from app.utilities.exceptions import LakehouseShareException 12 | from app.utilities.pagination import ( 13 | SingleTokenPagination, 14 | SingleTokenParams, 15 | encode_token, 16 | ) 17 | from app.utilities.pagination import paginate as custom_paginate 18 | from app.utilities.responses import common_responses 19 | from app.utilities.validators import ( 20 | validate_share, 21 | validate_share_and_schema, 22 | validate_share_and_schema_and_table, 23 | ) 24 | from fastapi import APIRouter, Depends 25 | from fastapi.exceptions import HTTPException 26 | from fastapi_pagination import Page, Params, add_pagination, paginate 27 | from starlette.responses import JSONResponse, StreamingResponse 28 | 29 | config = Config() 30 | PREFIX = config.get("endpoint") 31 | share_router = APIRouter(prefix=PREFIX, responses=common_responses, tags=["sharing"]) 32 | 33 | query = Query() 34 | LakeHouse: BaseTableFormat = get_table_format_client( 35 | os.environ.get("TABLE_FORMAT", "delta") 36 | )() 37 | 38 | 39 | @share_router.get("/shares", response_model=SingleTokenPagination[Share], responses={}) 40 | def list_shares( 41 | params: SingleTokenParams = Depends(), 42 | current_user: User = Depends(get_current_active_user), 43 | ): 44 | # shares = config.get("shares") 45 | # print(shares) 46 | shares = query.list_shares(current_user.id) 47 | print(shares) 48 | return custom_paginate(shares, params) 49 | 50 | 51 | @share_router.get("/shares/{share}", response_model=GetShareResponse) 52 | def get_share(user_share=Depends(validate_share)): 53 | # shares = config.get("shares") 54 | user, share = user_share 55 | shares = query.get_share(share=share, user_id=user.id) 56 | return {"share": shares} 57 | 58 | 59 | @share_router.get( 60 | "/shares/{share}/schemas", response_model=SingleTokenPagination[SchemaResponse] 61 | ) 62 | def list_schema( 63 | user_share=Depends(validate_share), params: SingleTokenParams = Depends() 64 | ): 65 | user, share = user_share 66 | schemas = query.list_schemas(share, user_id=user.id) 67 | return custom_paginate( 68 | schemas, params=params, additional_data={"other_params": [share]} 69 | ) 70 | 71 | 72 | # list(filter(lambda s: s['name']==share,shares))[0] 73 | 74 | 75 | @share_router.get( 76 | "/shares/{share}/schemas/{schema}/tables", 77 | response_model=SingleTokenPagination[TableResponse], 78 | ) 79 | def list_tables( 80 | share_and_schema=Depends(validate_share_and_schema), 81 | params: SingleTokenParams = Depends(), 82 | ): 83 | user, share, schema = share_and_schema 84 | schemas = query.list_tables(share, schema=schema, user_id=user.id) 85 | return custom_paginate( 86 | schemas, params=params, additional_data={"other_params": [share, schema]} 87 | ) 88 | 89 | 90 | @share_router.get( 91 | "/shares/{share}/schemas/all-tables", 92 | response_model=SingleTokenPagination[TableResponse], 93 | ) 94 | def list_all_tables( 95 | user_share=Depends(validate_share), 96 | params: SingleTokenParams = Depends(), 97 | ): 98 | user, share = user_share 99 | schemas = query.list_all_tables(share, user.id) 100 | return custom_paginate( 101 | schemas, params=params, additional_data={"other_params": [share]} 102 | ) 103 | 104 | 105 | @share_router.head("/shares/{share}/schemas/{schema}/tables/{table}") 106 | def get_table_version(share_schema_table=Depends(validate_share_and_schema_and_table)): 107 | try: 108 | user, share, schema, table = share_schema_table 109 | return JSONResponse( 110 | {}, 111 | headers={ 112 | "delta-table-version": LakeHouse.table_version( 113 | share, schema=schema, table_name=table 114 | ) 115 | }, 116 | ) 117 | except Exception as e: 118 | raise LakehouseShareException( 119 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, message=str(e) 120 | ) 121 | 122 | 123 | @share_router.get("/shares/{share}/schemas/{schema}/tables/{table}/metadata") 124 | async def get_table_metadata( 125 | share_schema_table_user=Depends(validate_share_and_schema_and_table), 126 | ): 127 | 128 | try: 129 | user, share, schema, table = share_schema_table_user 130 | 131 | response = StreamingResponse( 132 | LakeHouse.table_metadata(share=share, schema=schema, table_name=table), 133 | headers={ 134 | "delta-table-version": LakeHouse.table_version( 135 | share, schema=schema, table_name=table 136 | ) 137 | }, 138 | media_type="application/json", 139 | ) 140 | return response 141 | except Exception as e: 142 | raise LakehouseShareException( 143 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, message=str(e) 144 | ) 145 | 146 | 147 | @share_router.post("/shares/{share}/schemas/{schema}/tables/{table}/query") 148 | async def query_table_files( 149 | queryM: QueryModel, 150 | share_schema_table_user=Depends(validate_share_and_schema_and_table), 151 | ): 152 | try: 153 | user, share, schema, table = share_schema_table_user 154 | token_lifetime = auth.get_token_lifetime(user.id) 155 | response = StreamingResponse( 156 | LakeHouse.file_details( 157 | share, 158 | schema=schema, 159 | table_name=table, 160 | predicateHints=queryM.predicateHints, 161 | limitHint=queryM.limitHint, 162 | version=queryM.version, 163 | file_expiry=token_lifetime, 164 | ), 165 | headers={ 166 | "delta-table-version": LakeHouse.table_version( 167 | share, schema=schema, table_name=table 168 | ) 169 | }, 170 | media_type="application/json", 171 | ) 172 | return response 173 | except Exception as e: 174 | raise LakehouseShareException( 175 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, message=str(e) 176 | ) 177 | -------------------------------------------------------------------------------- /sqls/prepopulate_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | from typing import Optional 4 | 5 | from passlib.context import CryptContext 6 | from sqlmodel import Field, Session, SQLModel, create_engine, select 7 | 8 | 9 | class Share(SQLModel, table=True): 10 | # __table_args__ = {"extend_existing": True} 11 | 12 | id: str = Field(primary_key=True) 13 | name: str = Field(unique=True) 14 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 15 | 16 | 17 | class Table(SQLModel, table=True): 18 | # __table_args__ = {"extend_existing": True} 19 | id: str = Field(primary_key=True) 20 | table_name: str = Field(unique=True) 21 | table_location: str 22 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 23 | 24 | 25 | class Schema(SQLModel, table=True): 26 | # __table_args__ = {"extend_existing": True} 27 | 28 | id: str = Field(primary_key=True) 29 | name: str = Field(unique=True) 30 | table_id: str = Field(default=None, foreign_key="table.id") 31 | share_id: str = Field(default=None, foreign_key="share.id") 32 | created_by: str = Field(default=None, nullable=False, foreign_key="user.id") 33 | 34 | 35 | class User(SQLModel, table=True): 36 | # __table_args__ = {"extend_existing": True} 37 | 38 | id: str = Field(primary_key=True) 39 | name: str = Field(unique=True) 40 | email: str 41 | encrypted_password: str 42 | namespace: str 43 | 44 | 45 | class TokenLifetime(SQLModel, table=True): 46 | # __table_args__ = {"extend_existing": True} 47 | 48 | id: str = Field(primary_key=True) 49 | user_id: str = Field(default=None, unique=True, foreign_key="user.id") 50 | expiry: int 51 | 52 | 53 | class Permission(SQLModel, table=True): 54 | # __table_args__ = {"extend_existing": True} 55 | id: str = Field(primary_key=True) 56 | user_id: str = Field(default=None, nullable=False, foreign_key="user.id") 57 | share_id: str = Field(default=None, foreign_key="share.id") 58 | schema_id: str = Field(default=None, foreign_key="schema.id") 59 | table_id: str = Field(default=None, foreign_key="table.id") 60 | 61 | 62 | def create_db_connection(): 63 | if os.environ.get("env", "local") == "local": 64 | # for manual setup 65 | sqlite_url = os.environ.get("db_url") 66 | print("local", sqlite_url) 67 | engine = create_engine( 68 | sqlite_url, 69 | echo=True, 70 | ) 71 | # SQLModel.metadata.create_all(engine, checkfirst=True) 72 | return engine 73 | else: 74 | # for docker setup 75 | pg_url = "postgresql+psycopg2://root:password@localhost:5433/postgres" 76 | engine = create_engine(pg_url, echo=True) 77 | SQLModel.metadata.create_all(engine) 78 | return engine 79 | 80 | 81 | pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") 82 | get_hashed_passw = lambda password: pwd_context.hash(password) 83 | get_user_id = lambda: uuid.uuid4().hex 84 | password = get_hashed_passw("admin@123") 85 | 86 | engine = create_db_connection() 87 | session = Session(engine) 88 | SQLModel.metadata.create_all(engine) 89 | session.commit() 90 | 91 | user5 = User( 92 | id=get_user_id(), 93 | name="admin", 94 | email="admin@lakehouse.com", 95 | team="open-source", 96 | encrypted_password=password, 97 | namespace="data-team", 98 | ) 99 | 100 | session.add(user5) 101 | session.commit() 102 | 103 | delta_share1 = Share(name="delta_share1", id=str(uuid.uuid4()), created_by=user5.id) 104 | iceberg_share = Share(name="iceberg_share", id=str(uuid.uuid4()), created_by=user5.id) 105 | delta_share2 = Share(name="delta_share2", id=str(uuid.uuid4()), created_by=user5.id) 106 | delta_share3 = Share(name="delta_share3", id=str(uuid.uuid4()), created_by=user5.id) 107 | 108 | 109 | delta_table1 = Table( 110 | table_name="test_hm", 111 | id=str(uuid.uuid4()), 112 | table_location="s3://tf-benchmarking/delta_2/dwh/test_hm", 113 | created_by=user5.id, 114 | ) 115 | delta_table2 = Table( 116 | table_name="test_student", 117 | id=str(uuid.uuid4()), 118 | table_location="s3://tf-benchmarking/delta_2/dwh/test_student", 119 | created_by=user5.id, 120 | ) 121 | 122 | delta_table3 = Table( 123 | table_name="test_teacher", 124 | id=str(uuid.uuid4()), 125 | table_location="s3://tf-benchmarking/delta_2/dwh/test_teacher", 126 | created_by=user5.id, 127 | ) 128 | 129 | iceberg_table4 = Table( 130 | table_name="iceberg_benchmark_nyc_taxi_trips_v2", 131 | id=str(uuid.uuid4()), 132 | table_location="s3://dummy-bucket/iceberg_benchmark_nyc_taxi_trips_v2", 133 | created_by=user5.id, 134 | ) 135 | 136 | schema1tb1 = Schema( 137 | name="delta_schema", 138 | share_id=delta_share1.id, 139 | id=str(uuid.uuid4()), 140 | table_id=delta_table1.id, 141 | created_by=user5.id, 142 | ) 143 | 144 | schema1tb2 = Schema( 145 | name="delta_schema1", 146 | share_id=delta_share1.id, 147 | id=str(uuid.uuid4()), 148 | table_id=delta_table2.id, 149 | created_by=user5.id, 150 | ) 151 | 152 | 153 | schema2tb1 = Schema( 154 | name="schema2", 155 | share_id=delta_share2.id, 156 | id=str(uuid.uuid4()), 157 | table_id=delta_table1.id, 158 | created_by=user5.id, 159 | ) 160 | 161 | schema2tb2 = Schema( 162 | name="schema2", 163 | share_id=delta_share2.id, 164 | id=str(uuid.uuid4()), 165 | table_id=delta_table2.id, 166 | created_by=user5.id, 167 | ) 168 | 169 | schema3tb3 = Schema( 170 | name="delta_schema2", 171 | share_id=delta_share3.id, 172 | id=str(uuid.uuid4()), 173 | table_id=delta_table3.id, 174 | created_by=user5.id, 175 | ) 176 | 177 | schema4tb3 = Schema( 178 | name="tripsdb", 179 | share_id=iceberg_share.id, 180 | id=str(uuid.uuid4()), 181 | table_id=iceberg_table4.id, 182 | created_by=user5.id, 183 | ) 184 | 185 | 186 | permission1 = Permission( 187 | id=str(uuid.uuid4()), 188 | user_id=user5.id, 189 | share_id=schema1tb1.share_id, 190 | schema_id=schema1tb1.id, 191 | table_id=schema1tb1.table_id, 192 | ) 193 | permission2 = Permission( 194 | id=str(uuid.uuid4()), 195 | user_id=user5.id, 196 | schema_id=schema2tb1.id, 197 | share_id=schema2tb1.share_id, 198 | table_id=schema2tb1.table_id, 199 | ) 200 | permission3 = Permission( 201 | id=str(uuid.uuid4()), 202 | user_id=user5.id, 203 | schema_id=schema2tb1.id, 204 | share_id=schema2tb1.share_id, 205 | table_id=schema2tb1.table_id, 206 | ) 207 | permission4 = Permission( 208 | id=str(uuid.uuid4()), 209 | user_id=user5.id, 210 | schema_id=schema3tb3.id, 211 | share_id=schema3tb3.share_id, 212 | table_id=schema3tb3.table_id, 213 | ) 214 | permission5 = Permission( 215 | id=str(uuid.uuid4()), 216 | user_id=user5.id, 217 | schema_id=schema4tb3.id, 218 | share_id=schema4tb3.share_id, 219 | table_id=schema4tb3.table_id, 220 | ) 221 | 222 | tlf = TokenLifetime(id=str(uuid.uuid4()), user_id=user5.id, expiry="604800") 223 | 224 | session.add(delta_share1) 225 | session.add(delta_share2) 226 | session.add(delta_share3) 227 | session.add(iceberg_share) 228 | session.commit() 229 | 230 | session.add(delta_table1) 231 | session.add(delta_table2) 232 | session.add(delta_table3) 233 | session.add(iceberg_table4) 234 | session.commit() 235 | 236 | session.add(schema1tb1) 237 | session.add(schema1tb2) 238 | 239 | session.add(schema2tb1) 240 | # session.add(schema2tb2) 241 | 242 | session.add(schema3tb3) 243 | session.add(schema4tb3) 244 | session.commit() 245 | 246 | 247 | session.add(tlf) 248 | 249 | session.add(permission1) 250 | session.add(permission2) 251 | session.add(permission3) 252 | session.add(permission4) 253 | session.add(permission5) 254 | session.commit() 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LakeHouse Sharing 2 | 3 | Lakehouse is getting more and more popular nowadays, and many Oragnization seeing massive value in building and maintaining the lakehouse instead of maintaining the Warehouse technologies for various reasons. There are lot of great articles covering this topic in the internet or ask ChatGPT to know more about this area. 4 | 5 | This repo in particular belongs to one such part of lakehouse ecosystem, where the concern is how to share data securely, within the org and outside of the organization. Heavily inspired, infact the complete idea is from Delta-sharing protocol. delta-sharing solves the data-sharing problem for the people using Delta Table format and Databricks (company behind Delta-lake) provides excellent self-service tools on top of open source delta-sharig 6 | 7 | 8 | ### Motivation of this Repo: 9 | - Delta-sharing Framework is a protocol which can be adopted to other Table formats (Hudi and iceberg) apart from Delta 10 | - A Quick POC (Super alpha stage) to prove that protocol can be implemented easily and adopted to other table formats as well 11 | - Original Delta-sharing server was written in Scala, This repo inspired that server implementation and written in Python language which can be good starting point for python developers, and can be adopted by the organizations where python deployment stack (infra stack) is already available. 12 | - This Repo provides Alpha implementation of Sharing protocol for Iceberg table format using Pyiceberg. This can be enhanced/improved based on the interest. 13 | - This Repo rewrites the sharing protocol for Delta table format as well using delta-rs (actually this is super to implement this protocol on top of this python package) 14 | 15 | ### Difference between normal querying and querying via delta-sharing 16 | - Lot of Engines like Spark,Trino,Presto,Dremio queries Table format efficiently why we need delta-sharing. 17 | - In Delta sharing, you can logically group required few tables and send them as shares to the other teams or other org. 18 | - you need to share the token to the user, they can use that token to authenticate with delta-sharing server, once authenticated 19 | they can read the data from lakehouse without worrying about s3 authentication IAM cross account configurations etc.. 20 | 21 | ### Installation 22 | 23 | Run this commands in the root folder of this project 24 | 25 | #### help: 26 | 27 | ```sh 28 | 29 | Usage: 30 | make 31 | 32 | Targets: 33 | venv create a virtual environment for development 34 | start_backend_server starts prefect server 35 | start_frontend_server starts prefect agent 36 | help Show help 37 | ``` 38 | #### install requirements 39 | 40 | ```sh 41 | make venv 42 | ``` 43 | To share iceberg table format install following extra package 44 | and setup catalog like AWS Glue or Hive, refer [PyIceberg documentation](https://py.iceberg.apache.org/configuration/) 45 | 46 | #### for iceberg 47 | 48 | ```sh 49 | # install iceberg 50 | pip install pyiceberg 51 | ``` 52 | 53 | To share delta-lake table format, install delta-lake package and delta-lake doesn't need any catalog it will directly fetch the metadata from table formats metadata in cloud storage files. 54 | 55 | #### for delta-lake 56 | 57 | ```sh 58 | pip install deltalake 59 | 60 | ``` 61 | 62 | ### start backend server 63 | 64 | ```sh 65 | make start_backend_server 66 | ``` 67 | 68 | ### start Frontend server 69 | In another termianl start frontend streamlit APP. 70 | 71 | ```sh 72 | make start_frontend_server 73 | ``` 74 | 75 | ### Use docker setup 76 | 77 | use docker setup to quickly setup the app 78 | 79 | ```sh 80 | docker-compose up 81 | ``` 82 | 83 | Set few of the Environment variables before starting the docker-compose up 84 | refer `.env.example` file for setting the variables 85 | 86 | ### APP urls: 87 | 88 | Once docker-compose was up and running successfully, we can expect following urls 89 | 90 | - RDS Admin : http://localhost:8081/ 91 | - Postgres : host: localhost, port:5433 92 | - Backend (FastAPI): http://localhost:8001/docs 93 | - Frontend (streamlit): http://localhost:8501 94 | 95 | - Set the following env variables accordingly and Run `sqls/prepopulate_data.py` for creating superuser and few test tables 96 | 97 | - for local setup set `export env=local` 98 | - for docker setup set `export env=docker` 99 | 100 | #### Frontend: 101 | - Login Username : `admin` 102 | - Login password : `admin@123` 103 | 104 | ### Lakehouse- sharing Architecture 105 | ![architecture](images/lakehouse-sharing-arch.png) 106 | 107 | ### Blog post 108 | Refer the accompanied blog post for more details : 109 | https://guruengineering.substack.com/p/lakehouse-sharing 110 | 111 | 112 | ### Video setup instructions 113 | https://youtu.be/6H0qv-thogY 114 | 115 | ### Code structure 116 | 117 | - For more details about the backend and frondend refer respective directories 118 | 119 | ``` 120 | . 121 | ├── Makefile 122 | ├── README.md 123 | ├── backend 124 | │   ├── Dockerfile 125 | │   ├── app 126 | │   │   ├── README.md 127 | │   │   ├── __init__.py 128 | │   │   ├── __pycache__ 129 | │   │   ├── conf.py 130 | │   │   ├── core 131 | │   │   │   ├── __init__.py 132 | │   │   │   ├── __pycache__ 133 | │   │   │   ├── base.py 134 | │   │   │   ├── cloud 135 | │   │   │   │   ├── __init__.py 136 | │   │   │   │   ├── __pycache__ 137 | │   │   │   │   ├── aws.py 138 | │   │   │   │   ├── azure.py 139 | │   │   │   │   ├── base.py 140 | │   │   │   │   └── gcs.py 141 | │   │   │   ├── delta 142 | │   │   │   │   ├── __init__.py 143 | │   │   │   │   ├── __pycache__ 144 | │   │   │   │   ├── models.py 145 | │   │   │   │   ├── share.py 146 | │   │   │   │   └── utils.py 147 | │   │   │   └── iceberg 148 | │   │   │   ├── __init__.py 149 | │   │   │   ├── __pycache__ 150 | │   │   │   ├── models.py 151 | │   │   │   └── share.py 152 | │   │   ├── db 153 | │   │   │   ├── __init__.py 154 | │   │   │   ├── __pycache__ 155 | │   │   │   ├── auth_queries.py 156 | │   │   │   ├── queries.py 157 | │   │   │   └── tables.py 158 | │   │   ├── main.py 159 | │   │   ├── models 160 | │   │   │   ├── __init__.py 161 | │   │   │   ├── __pycache__ 162 | │   │   │   ├── admin.py 163 | │   │   │   ├── auth.py 164 | │   │   │   ├── common.py 165 | │   │   │   └── response.py 166 | │   │   ├── routers 167 | │   │   │   ├── __init__.py 168 | │   │   │   ├── __pycache__ 169 | │   │   │   ├── admin.py 170 | │   │   │   ├── auth.py 171 | │   │   │   └── share.py 172 | │   │   ├── securities 173 | │   │   │   ├── __init__.py 174 | │   │   │   ├── __pycache__ 175 | │   │   │   ├── jwt_utils.py 176 | │   │   │   └── user_auth.py 177 | │   │   ├── serverconf.yaml 178 | │   │   └── utilities 179 | │   │   ├── __init__.py 180 | │   │   ├── __pycache__ 181 | │   │   ├── defaults.py 182 | │   │   ├── exceptions.py 183 | │   │   ├── pagination.py 184 | │   │   ├── responses.py 185 | │   │   └── validators.py 186 | │   ├── requirements.txt 187 | │   └── tests 188 | │   ├── __init__.py 189 | │   ├── __pycache__ 190 | │   ├── mock_results.py 191 | │   └── test_share_apis.py 192 | ├── docker-compose.yaml 193 | ├── frontend 194 | │   ├── Dockerfile 195 | │   ├── README.md 196 | │   ├── app 197 | │   │   ├── __init__.py 198 | │   │   ├── __pycache__ 199 | │   │   ├── core 200 | │   │   │   ├── __init__.py 201 | │   │   │   ├── __pycache__ 202 | │   │   │   ├── api 203 | │   │   │   │   ├── __init__.py 204 | │   │   │   │   ├── __pycache__ 205 | │   │   │   │   ├── config.py 206 | │   │   │   │   ├── jwt_auth.py 207 | │   │   │   │   └── rest.py 208 | │   │   │   ├── base 209 | │   │   │   │   ├── __init__.py 210 | │   │   │   │   ├── __pycache__ 211 | │   │   │   │   ├── auth.py 212 | │   │   │   │   ├── client.py 213 | │   │   │   │   └── layout.py 214 | │   │   │   ├── link.py 215 | │   │   │   ├── login.py 216 | │   │   │   ├── schema.py 217 | │   │   │   ├── share.py 218 | │   │   │   ├── table.py 219 | │   │   │   ├── table_format.py 220 | │   │   │   └── user.py 221 | │   │   └── main.py 222 | │   ├── config.yaml 223 | │   └── requirements.txt 224 | ├── images 225 | │   └── lakehouse-sharing-arch.png 226 | ├── notebooks 227 | │   ├── client-example.ipynb 228 | │   └── profile.json 229 | └── sqls 230 | └── prepopulate_data.py 231 | ``` 232 | 233 | ### Roadmap: 234 | - Improve Backend database Modeling 235 | - improve test cases and performance 236 | - Try adopting this protocol for Hudi table format 237 | - Try to capture change data feed (CDF) from delta and iceberg 238 | - PAckage this code in docker and wrap it up in Helm chart 239 | - For iceberg currently we are using glue catalog, 240 | load metadata directly from cloud storage without catalog 241 | 242 | 243 | ### Reference: 244 | - [delta-sharing-protocol-specification](https://github.com/delta-io/delta-sharing/blob/main/PROTOCOL.md) 245 | -------------------------------------------------------------------------------- /notebooks/client-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2532defd", 6 | "metadata": {}, 7 | "source": [ 8 | "## Lakehouse Sharing Example" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "3f90b473", 14 | "metadata": {}, 15 | "source": [ 16 | "### Notebook setup \n", 17 | "\n", 18 | "Since we already have virtualenvs setup , to use the delta-sharing client for this lakehouse-sharing server, follow below steps\n", 19 | "- install ipykernel\n", 20 | "\n", 21 | " ``` \n", 22 | " pip install ipykernel\n", 23 | " ```\n", 24 | "- register virtualenv in jupyter notebook\n", 25 | " ``` \n", 26 | " source venv/bin/activate\n", 27 | " python -m ipykernel install --user --name=lakehouse-sharing\n", 28 | " ```\n", 29 | "- Install deltasharing python client\n", 30 | "\n", 31 | " ```\n", 32 | " pip install delta-sharing\n", 33 | " ```" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 13, 39 | "id": "ff71954b", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import delta_sharing\n", 44 | "from delta_sharing import Share,Schema,Table\n", 45 | "# Point to the profile file. It can be a file on the local file system or a file on a remote storage.\n", 46 | "profile_file = \"profile.json\"\n", 47 | "\n", 48 | "# Create a SharingClient.\n", 49 | "client = delta_sharing.SharingClient(profile_file)\n", 50 | "\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "071ffd93", 57 | "metadata": {}, 58 | "source": [ 59 | "# List all shares\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 14, 65 | "id": "0416e4ab", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "[Share(name='delta_share1')]" 72 | ] 73 | }, 74 | "execution_count": 14, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "client.list_shares()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "1e6c2a49", 86 | "metadata": {}, 87 | "source": [ 88 | "## List Schemas" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 18, 94 | "id": "29a721b5", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "[Schema(name='delta_schema', share='delta_share1'),\n", 101 | " Schema(name='delta_schema1', share='delta_share1')]" 102 | ] 103 | }, 104 | "execution_count": 18, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "schemas = client.list_schemas(share=Share(\"delta_share1\"))\n", 111 | "schemas" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "f6006581", 117 | "metadata": {}, 118 | "source": [ 119 | "## List Tables" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 19, 125 | "id": "aaf18ea4", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "[Table(name='test_hm', share='delta_share1', schema='delta_schema')]" 132 | ] 133 | }, 134 | "execution_count": 19, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "client.list_tables(schemas[0])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "559d2d8b", 146 | "metadata": {}, 147 | "source": [ 148 | "## List all tables" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 20, 154 | "id": "4690a640", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "[Table(name='test_hm', share='delta_share1', schema='delta_schema'),\n", 161 | " Table(name='test_student', share='delta_share1', schema='delta_schema1')]" 162 | ] 163 | }, 164 | "execution_count": 20, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "client.list_all_tables()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "18a01017", 176 | "metadata": {}, 177 | "source": [ 178 | "## Iceberg Table" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 21, 184 | "id": "a951bf59", 185 | "metadata": { 186 | "scrolled": false 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "# table_url = profile_file + \"#iceberg_share.tripsdb.iceberg_benchmark_nyc_taxi_trips_v2\"\n", 191 | "\n", 192 | "# Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data \n", 193 | "# from a table that cannot fit in the memory.\n", 194 | "# delta_sharing.load_as_pandas(table_url, limit=100,version=0)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "59eb4ed2", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "84167764", 208 | "metadata": {}, 209 | "source": [ 210 | "## Delta Table" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 22, 216 | "id": "7e17a180", 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | "
idnamedepartment
01hm1maths
12hm2maths
23hm3maths
34hm4maths1
45hm5maths1
56hm6maths6
67hm7maths7
\n", 290 | "
" 291 | ], 292 | "text/plain": [ 293 | " id name department\n", 294 | "0 1 hm1 maths\n", 295 | "1 2 hm2 maths\n", 296 | "2 3 hm3 maths\n", 297 | "3 4 hm4 maths1\n", 298 | "4 5 hm5 maths1\n", 299 | "5 6 hm6 maths6\n", 300 | "6 7 hm7 maths7" 301 | ] 302 | }, 303 | "execution_count": 22, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "table_url = profile_file + \"#delta_share1.delta_schema.test_hm\"\n", 310 | "\n", 311 | "# Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data \n", 312 | "# from a table that cannot fit in the memory.\n", 313 | "delta_sharing.load_as_pandas(table_url, limit=10)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "9b8a9e54", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "lakehouse-sharing", 328 | "language": "python", 329 | "name": "lakehouse-sharing" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 3 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython3", 341 | "version": "3.11.1" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 5 346 | } 347 | -------------------------------------------------------------------------------- /backend/app/db/queries.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from app.conf import Config 4 | from app.db.tables import Permission, Schema, Share, Table, TokenLifetime, User 5 | from app.utilities import get_random_uuid 6 | 7 | # from .utils import create_db_connection 8 | from sqlmodel import Session, SQLModel, create_engine, select 9 | 10 | conf = Config() 11 | 12 | 13 | def create_db_connection(): 14 | if os.environ.get("env", "local") == "local": 15 | 16 | sqlite_url = os.environ.get("db_url") 17 | print("local", sqlite_url) 18 | engine = create_engine( 19 | sqlite_url, 20 | echo=True, 21 | ) 22 | # SQLModel.metadata.create_all(engine, checkfirst=True) 23 | return engine 24 | else: 25 | db_conf = conf.get("db") 26 | POSTGRES_USER = os.environ.get("POSTGRES_USER") 27 | POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD") 28 | POSTGRES_HOST = os.environ.get("POSTGRES_HOST") 29 | POSTGRES_PORT = os.environ.get("POSTGRES_PORT") 30 | POSTGRES_DB = os.environ.get("POSTGRES_DB") 31 | postgres_url = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}" 32 | engine = create_engine( 33 | postgres_url, 34 | echo=True, 35 | ) 36 | return engine 37 | 38 | 39 | def create_db_and_tables(): 40 | engine = create_db_connection() 41 | session = Session(engine) 42 | SQLModel.metadata.create_all(engine) 43 | session.commit() 44 | 45 | 46 | class Query: 47 | def __init__(self, engine=None) -> None: 48 | if engine is None: 49 | self.engine = create_db_connection() 50 | else: 51 | self.engine = engine 52 | 53 | def execute_sql(self, stmt): 54 | with Session(self.engine) as session: 55 | results = session.exec(statement=stmt).all() 56 | print(results) 57 | return results 58 | 59 | def list_shares(self, user_id): 60 | stmt = ( 61 | select(Share) 62 | .select_from(Permission) 63 | .where(Share.id == Permission.share_id) 64 | .where(Permission.user_id == user_id) 65 | .distinct(Share.name) 66 | ).union(select(Share).where(Share.created_by == user_id)) 67 | print("list_shares sql", stmt.compile()) 68 | results = self.execute_sql(stmt) 69 | return results 70 | 71 | def get_share(self, share, user_id): 72 | stmt = ( 73 | select(Share) 74 | .select_from(Permission) 75 | .where(Share.id == Permission.share_id) 76 | .where(Permission.user_id == user_id) 77 | .where(Share.name == share) 78 | ).union(select(Share).where(Share.created_by == user_id)) 79 | rows = self.execute_sql(stmt) 80 | if len(rows) > 0: 81 | res = {"id": rows[0].id, "name": rows[0].name} 82 | else: 83 | res = None 84 | return res 85 | 86 | def list_schemas(self, share, user_id): 87 | stmt = select(Share, Schema).join(Schema).where(Share.name == share) 88 | rows = self.execute_sql(stmt) 89 | res = [{"name": schema.name, "share": share.name} for (share, schema) in rows] 90 | return res 91 | 92 | def list_tables(self, share, schema, user_id): 93 | stmt = ( 94 | select(Share, Schema, Table) 95 | .where(Share.id == Schema.share_id, Schema.table_id == Table.id) 96 | .where(Share.name == share, Schema.name == schema) 97 | # .union(select(Table).where(Table.created_by==user_id)) 98 | ) 99 | rows = self.execute_sql(stmt) 100 | res = [ 101 | { 102 | "name": table.table_name, 103 | "schema": schema.name, 104 | "share": share.name, 105 | "shareId": share.id, 106 | "id": table.id, 107 | } 108 | for (share, schema, table) in rows 109 | ] 110 | return res 111 | 112 | def check_user_permission(self, user_id, share, schema=None, table=None): 113 | with Session(self.engine) as session: 114 | # statement = ( 115 | # session.query(Permission) 116 | # .select_from(Permission) 117 | # .join(Share) 118 | # .join(Schema) 119 | # .join(Table) 120 | # .where(Permission.user_id == user_id) 121 | # ) 122 | if share: 123 | statement = ( 124 | select(Share) 125 | .join(Permission) 126 | .where(Permission.user_id == user_id) 127 | .union(select(Share).where(Share.created_by == user_id)) 128 | ) 129 | # statement = statement.where(Share.name == share) 130 | if schema: 131 | statement = ( 132 | select(Schema) 133 | .join(Permission) 134 | .join(Share) 135 | .where(Permission.user_id == user_id) 136 | .union(select(Schema).where(Schema.created_by == user_id)) 137 | ) 138 | # statement = statement.where(Schema.name == schema) 139 | if table: 140 | statement = ( 141 | select(Table) 142 | .join(Permission) 143 | .join(Share) 144 | .join(Schema) 145 | .where(Permission.user_id == user_id) 146 | .union(select(Table).where(Table.created_by == user_id)) 147 | ) 148 | # statement = statement.where(Table.table_name == table) 149 | # statement = statement.with_entities( 150 | # Share.name, Table.table_name, Schema.name 151 | # ) 152 | # statement = select(Permission, Share.name,).join(Share).join(Schema).join(Table) 153 | # stmt = perm.query.join(Share) 154 | results = session.exec(statement).all() 155 | return True if results else False 156 | 157 | def list_all_tables(self, share, user_id): 158 | stmt = ( 159 | select(Share, Schema, Table) 160 | .select_from(Permission) 161 | .where(Share.id == Permission.share_id) 162 | .where(Permission.user_id == user_id) 163 | .where(Share.id == Schema.share_id, Schema.table_id == Table.id) 164 | .where(Share.name == share) 165 | ) 166 | rows = self.execute_sql(stmt) 167 | res = [ 168 | { 169 | "name": table.table_name, 170 | "schema": schema.name, 171 | "share": share.name, 172 | "shareId": share.id, 173 | "id": table.id, 174 | } 175 | for (share, schema, table) in rows 176 | ] 177 | return res 178 | 179 | def check_schema_and_table_existance(self, share, schema=None, table=None): 180 | 181 | if (share is not None) and (schema is None) and (table is None): 182 | stmt = select(Share).where(Share.name == share) 183 | if (share is not None) and (schema is not None) and (table is None): 184 | stmt = ( 185 | select(Share, Schema) 186 | .where(Share.id == Schema.share_id, Schema.table_id == Table.id) 187 | .where(Schema.name == schema) 188 | ) 189 | if (share is not None) and (schema is not None) and (table is not None): 190 | stmt = ( 191 | select(Share, Schema, Table) 192 | .where(Share.id == Schema.share_id, Schema.table_id == Table.id) 193 | .where(Table.table_name == table) 194 | ) 195 | sql = stmt.compile() 196 | print("compiled sql", sql) 197 | rows = self.execute_sql(stmt) 198 | return True if len(rows) != 0 else False 199 | 200 | def get_path(self, share, schema, table): 201 | stmt = ( 202 | select(Share, Schema, Table) 203 | .where(Share.id == Schema.share_id, Schema.table_id == Table.id) 204 | .where(Share.name == share) 205 | .where(Schema.name == schema) 206 | .where(Table.table_name == table) 207 | ) 208 | rows = self.execute_sql(stmt) 209 | if len(rows) > 0: 210 | _, _, table = rows[0] 211 | return table.table_location 212 | 213 | 214 | class AdminQuery: 215 | def __init__(self) -> None: 216 | self.engine = create_db_connection() 217 | 218 | def get_session(self): 219 | session = Session(self.engine) 220 | return session 221 | 222 | # def add(self,model:SQLModel): 223 | # with Session(self.engine) as session: 224 | # session.add(model) 225 | def add(self, model: SQLModel): 226 | session = Session(self.engine) 227 | session.add(model) 228 | session.commit() 229 | session.close() 230 | 231 | def execute_sql(self, stmt): 232 | with Session(self.engine) as session: 233 | results = session.exec(statement=stmt).all() 234 | print(results) 235 | return results 236 | 237 | def create_share(self, share: Share, user_id: str): 238 | shareTable = Share(id=get_random_uuid(), name=share.name, created_by=user_id) 239 | self.add(shareTable) 240 | 241 | def create_schema(self, schema: Schema, user_id: str): 242 | schemaTable = Schema( 243 | id=get_random_uuid(), 244 | name=schema.name, 245 | table_id=schema.table_id, 246 | share_id=schema.share_id, 247 | created_by=user_id, 248 | ) 249 | self.add(schemaTable) 250 | 251 | def create_table(self, table: Table, user_id: str): 252 | tableTable = Table( 253 | id=get_random_uuid(), 254 | table_name=table.table_name, 255 | table_location=table.table_location, 256 | created_by=user_id, 257 | ) 258 | self.add(tableTable) 259 | 260 | def create_complete_share(self, all_details, user_id: str): 261 | shareTable = Share( 262 | id=get_random_uuid(), name=all_details.share.name, created_by=user_id 263 | ) 264 | tableTable = Table( 265 | id=get_random_uuid(), 266 | table_name=all_details.table.table_name, 267 | table_location=all_details.table.table_location, 268 | created_by=user_id, 269 | ) 270 | schemaTable = Schema( 271 | id=get_random_uuid(), 272 | name=all_details.schema_.name, 273 | table_id=tableTable.id, 274 | share_id=shareTable.id, 275 | created_by=user_id, 276 | ) 277 | 278 | self.add(shareTable) 279 | self.add(tableTable) 280 | self.add(schemaTable) 281 | 282 | def get_schema_id_by_name(self, share_id, schemaname): 283 | stmt = select(Schema.id).where( 284 | Schema.share_id == share_id, Schema.name == schemaname 285 | ) 286 | rows = self.execute_sql(stmt=stmt) 287 | return rows 288 | 289 | def link_resources(self, resources: Permission): 290 | schema_id = self.get_schema_id_by_name( 291 | resources.share_id, resources.schema_name 292 | )[0] 293 | perm = Permission( 294 | id=get_random_uuid(), 295 | user_id=resources.user_id, 296 | share_id=resources.share_id, 297 | schema_id=schema_id, 298 | table_id=resources.table_id, 299 | ) 300 | self.add(perm) 301 | 302 | def get_id_by_user(self, user_name): 303 | stmt = select(User.id).where(User.name == user_name) 304 | rows = self.execute_sql(stmt) 305 | return rows[0] 306 | 307 | def add_lifetime(self, user_id, expiry): 308 | lifetime = TokenLifetime(id=get_random_uuid(), user_id=user_id, expiry=expiry) 309 | self.add(lifetime) 310 | --------------------------------------------------------------------------------