├── .env ├── test ├── config │ └── test_project │ │ ├── databricks │ │ └── Workflows │ │ │ └── workflow_template.yaml │ │ ├── pipelines │ │ ├── tables.xlsx │ │ ├── tables_invalid.yaml │ │ ├── test_tables.yaml │ │ ├── tables.yaml │ │ ├── json_schema │ │ │ ├── sibytes_yetl_project_schema.json │ │ │ ├── sibytes_yetl_pipeline_schema.json │ │ │ └── sibytes_yetl_tables_schema.json │ │ └── autoloader.yaml │ │ ├── dataNone │ │ └── _delta_log │ │ │ ├── .00000000000000000000.json.crc │ │ │ └── 00000000000000000000.json │ │ ├── logging.yaml │ │ ├── test_project.yaml │ │ ├── sql │ │ └── raw_dbx_patterns_control │ │ │ ├── header_footer.sql │ │ │ └── raw_audit.sql │ │ └── schema │ │ ├── customer_details_1.yaml │ │ └── customer_details_2.yaml ├── unit │ ├── test_validation.py │ ├── test_timeslice.py │ └── test_utils.py └── integration │ └── test_configuration_load.py ├── yetl ├── resource │ ├── tables.xlsx │ ├── logging.yaml │ ├── project.yaml │ ├── __init__.py │ ├── sibytes_yetl_project_schema.json │ ├── sibytes_yetl_pipeline_schema.json │ └── sibytes_yetl_tables_schema.json ├── cli │ ├── metadata_provider │ │ └── __init__.py │ └── _init.py ├── config │ ├── table │ │ ├── _table_type.py │ │ ├── _write.py │ │ ├── __init__.py │ │ ├── _factory.py │ │ ├── _table.py │ │ ├── _deltalake.py │ │ └── _read.py │ ├── _stage_type.py │ ├── _table_mapping.py │ ├── __init__.py │ ├── _spark_context.py │ ├── _project.py │ ├── _logging_config.py │ ├── _decorators.py │ ├── _config.py │ ├── _utils.py │ ├── _timeslice.py │ └── _tables.py ├── workflow │ ├── __init__.py │ ├── _notebook.py │ ├── _dlt.py │ └── _multi_threaded.py ├── validation │ ├── __init__.py │ └── _validate.py ├── __init__.py └── __main__.py ├── .flake8 ├── pytest.ini ├── typings └── __builtins__.pyi ├── local_cleanup.sh ├── .vscode ├── settings.json └── launch.json ├── README.md ├── requirements38.txt ├── requirements310.txt ├── main.py ├── setup.py ├── ci.yaml ├── schema_testing ├── tables.yaml ├── autoloader.yaml ├── sibytes_yetl_pipeline_schema.json └── sibytes_yetl_tables_schema.json └── .gitignore /.env: -------------------------------------------------------------------------------- 1 | YETL_CONFIG=./test/config 2 | YETL_ENVIRONMENT=local -------------------------------------------------------------------------------- /test/config/test_project/databricks/Workflows/workflow_template.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /yetl/resource/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/yetl/resource/tables.xlsx -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ; extend-ignore = D 3 | per-file-ignores = 4 | # line too long 5 | yetl/*.py: E501 6 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | env = 4 | YETL_CONFIG=./test/config 5 | YETL_ENVIRONMENT=local 6 | -------------------------------------------------------------------------------- /typings/__builtins__.pyi: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | from databricks.sdk.runtime import * 4 | except ModuleNotFoundError: 5 | pass 
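# Re-exports the Databricks runtime globals (e.g. `spark`, `dbutils`) so that local
# editors and type checkers can resolve them; if the databricks-sdk package is not
# installed the import above simply falls through to `pass`.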
6 | -------------------------------------------------------------------------------- /yetl/cli/metadata_provider/__init__.py: -------------------------------------------------------------------------------- 1 | from ._xlsx import XlsMetadata, ImportFormat 2 | 3 | __all__ = ["XlsMetadata", "ImportFormat"] 4 | -------------------------------------------------------------------------------- /local_cleanup.sh: -------------------------------------------------------------------------------- 1 | rm -r -f ./metastore_db 2 | rm -r -f ./spark-warehouse 3 | rm -f derby.log 4 | rm -r -f ./test/config/test_project/data 5 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/test/config/test_project/pipelines/tables.xlsx -------------------------------------------------------------------------------- /yetl/config/table/_table_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class TableType(str, Enum): 5 | read = "read" 6 | write = "write" 7 | delta_lake = "delta_lake" 8 | -------------------------------------------------------------------------------- /test/config/test_project/dataNone/_delta_log/.00000000000000000000.json.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/test/config/test_project/dataNone/_delta_log/.00000000000000000000.json.crc -------------------------------------------------------------------------------- /yetl/workflow/__init__.py: -------------------------------------------------------------------------------- 1 | from ._notebook import Notebook 2 | from ._multi_threaded import execute_notebooks 3 | from ._dlt import create_dlt 4 | 5 | __all__ = ["Notebook", "execute_notebooks", "create_dlt"] 6 | -------------------------------------------------------------------------------- /yetl/config/_stage_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class StageType(str, Enum): 5 | audit_control = "audit_control" 6 | source = "source" 7 | landing = "landing" 8 | raw = "raw" 9 | base = "base" 10 | curated = "curated" 11 | extract = "extract" 12 | -------------------------------------------------------------------------------- /yetl/config/table/_write.py: -------------------------------------------------------------------------------- 1 | from ._table import Table 2 | import logging 3 | from typing import Any 4 | 5 | 6 | class Write(Table): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | self._logger = logging.getLogger(self.__class__.__name__) 10 | self._render() 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.testing.pytestArgs": [ 4 | "test" 5 | ], 6 | "python.testing.unittestEnabled": false, 7 | "python.testing.pytestEnabled": true, 8 | "python.envFile": "${workspaceFolder}/.databricks/.databricks.env", 9 | "databricks.python.envFile": "${workspaceFolder}/.env" 10 | } -------------------------------------------------------------------------------- /yetl/config/_table_mapping.py: 
-------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Union, Any, Dict 3 | from .table import Table 4 | 5 | 6 | class TableMapping(BaseModel): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | 10 | destination: Table = Field(...) 11 | source: Union[Dict[str, Table], Table] = Field(...) 12 | -------------------------------------------------------------------------------- /yetl/resource/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | default: 4 | format: "%(levelname)s : %(asctime)s : %(name)s : %(filename)s.%(funcName)s: line(%(lineno)s) : %(message)s" 5 | handlers: 6 | console: 7 | class: logging.StreamHandler 8 | formatter: default 9 | stream: ext://sys.stdout 10 | root: 11 | level: INFO 12 | handlers: [console] 13 | -------------------------------------------------------------------------------- /test/config/test_project/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | default: 4 | format: "%(levelname)s : %(asctime)s : %(name)s : %(filename)s.%(funcName)s: line(%(lineno)s) : %(message)s" 5 | handlers: 6 | console: 7 | class: logging.StreamHandler 8 | formatter: default 9 | stream: ext://sys.stdout 10 | root: 11 | level: DEBUG 12 | handlers: [console] 13 | -------------------------------------------------------------------------------- /yetl/validation/__init__.py: -------------------------------------------------------------------------------- 1 | from ._validate import ( 2 | get_table_schema, 3 | get_pipeline_schema, 4 | get_project_schema, 5 | validate_tables, 6 | validate_pipeline, 7 | SchemaFiles, 8 | get_schema, 9 | ) 10 | 11 | 12 | __all__ = [ 13 | "get_table_schema", 14 | "get_pipeline_schema", 15 | "get_project_schema", 16 | "validate_tables", 17 | "validate_pipeline", 18 | "SchemaFiles", 19 | "get_schema", 20 | ] 21 | -------------------------------------------------------------------------------- /yetl/config/table/__init__.py: -------------------------------------------------------------------------------- 1 | from ._deltalake import DeltaLake 2 | from ._read import Read 3 | from ._table import Table, ValidationThreshold, ValidationThresholdType 4 | from ._factory import factory as table_factory 5 | from ._table_type import TableType 6 | 7 | 8 | __all__ = [ 9 | "DeltaLake", 10 | "Read", 11 | "table_factory", 12 | "DataSet", 13 | "Table", 14 | "ValidationThreshold", 15 | "TableType", 16 | "ValidationThresholdType", 17 | ] 18 | -------------------------------------------------------------------------------- /yetl/workflow/_notebook.py: -------------------------------------------------------------------------------- 1 | # used to carry notebook data 2 | from pydantic import BaseModel, Field 3 | from typing import Any 4 | 5 | 6 | class Notebook(BaseModel): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | # add the notebook path to parameters for error reporting. 10 | self.parameters["notebook"] = self.path 11 | 12 | path: str = Field(...) 
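# timeout is the per-run timeout (in seconds) handed to dbutils.notebook.run by
# execute_notebooks; retry is the number of additional attempts allowed when a run fails.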
13 | timeout: int = Field(default=3600) 14 | parameters: dict = Field(default={}) 15 | retry: int = Field(default=0) 16 | enabled: bool = Field(default=True) 17 | -------------------------------------------------------------------------------- /yetl/resource/project.yaml: -------------------------------------------------------------------------------- 1 | version: 0.0.0 2 | 3 | name: default 4 | sql: ./sql 5 | spark_schema: ./schema 6 | pipeline: ./pipelines 7 | databricks_notebooks: ./databricks/notebooks 8 | databricks_workflows: ./databricks/workflows 9 | databricks_queries: ./databricks/queries 10 | 11 | 12 | spark: 13 | logging_level: ERROR 14 | config: 15 | spark.master: local 16 | spark.databricks.delta.allowArbitraryProperties.enabled: true 17 | spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog 18 | spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension 19 | -------------------------------------------------------------------------------- /test/config/test_project/test_project.yaml: -------------------------------------------------------------------------------- 1 | version: 3.0.0 2 | 3 | name: test_project 4 | sql: ./sql 5 | spark_schema: ./schema 6 | pipeline: ./pipelines 7 | databricks_notebooks: ./databricks/notebooks 8 | databricks_workflows: ./databricks/workflows 9 | databricks_queries: ./databricks/queries 10 | 11 | 12 | spark: 13 | logging_level: ERROR 14 | config: 15 | spark.master: local 16 | spark.databricks.delta.allowArbitraryProperties.enabled: true 17 | spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog 18 | spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension -------------------------------------------------------------------------------- /test/config/test_project/sql/raw_dbx_patterns_control/header_footer.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE IF NOT EXISTS `raw_dbx_patterns_control`.`header_footer` 3 | ( 4 | header struct, 5 | raw_header string, 6 | footer struct, 7 | raw_footer string, 8 | _process_id bigint, 9 | _load_date timestamp, 10 | _metadata struct 11 | ) 12 | USING DELTA 13 | LOCATION '{{location}}' 14 | TBLPROPERTIES ( 15 | {{delta_properties}} 16 | ); -------------------------------------------------------------------------------- /yetl/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import ( 2 | Config, 3 | Timeslice, 4 | TimesliceNow, 5 | TimesliceUtcNow, 6 | Read, 7 | DeltaLake, 8 | TableMapping, 9 | Tables, 10 | StageType, 11 | yetl_flow, 12 | ValidationThreshold, 13 | ValidationThresholdType, 14 | ) 15 | 16 | __all__ = [ 17 | "Config", 18 | "Timeslice", 19 | "TimesliceNow", 20 | "TimesliceUtcNow", 21 | "Read", 22 | "DeltaLake", 23 | "TableMapping", 24 | "Tables", 25 | "StageType", 26 | "yetl_flow", 27 | "ValidationThreshold", 28 | "ValidationThresholdType", 29 | ] 30 | -------------------------------------------------------------------------------- /yetl/resource/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.resources import files as resources 3 | except Exception: 4 | from importlib import resources 5 | 6 | _PACKAGE = "yetl.resource" 7 | 8 | 9 | def get_resource_text(resource: str): 10 | try: 11 | data = resources(_PACKAGE).joinpath(resource).read_text() 12 | except Exception: 13 | data = resources.read_text(_PACKAGE, resource) 14 | return data 15 | 16 | 17 | def 
get_resource_binary(resource: str): 18 | try: 19 | schema = resources(_PACKAGE).joinpath(resource).read_bytes() 20 | except Exception: 21 | schema = resources.read_binary(_PACKAGE, resource) 22 | return schema 23 | -------------------------------------------------------------------------------- /yetl/config/__init__.py: -------------------------------------------------------------------------------- 1 | from ._config import Config 2 | from ._timeslice import Timeslice, TimesliceNow, TimesliceUtcNow 3 | from .table import ( 4 | DeltaLake, 5 | Read, 6 | ValidationThreshold, 7 | ValidationThresholdType, 8 | TableType, 9 | ) 10 | from ._tables import Tables 11 | from ._table_mapping import TableMapping 12 | from ._stage_type import StageType 13 | from ._decorators import yetl_flow 14 | 15 | 16 | __all__ = [ 17 | "Config", 18 | "Timeslice", 19 | "TimesliceNow", 20 | "TimesliceUtcNow", 21 | "Read", 22 | "DeltaLake", 23 | "TableMapping", 24 | "Tables", 25 | "StageType", 26 | "yetl_flow", 27 | "ValidationThreshold", 28 | "ValidationThresholdType", 29 | "TableType", 30 | ] 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What Is Yetl 2 | 3 | Website: https://www.yetl.io/ 4 | 5 | 6 | ## Development Setup 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | ## Unit Tests 13 | 14 | To run the unit tests with a coverage report. 15 | 16 | ``` 17 | pip install -e . 18 | pytest test/unit --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml --cov-report=html 19 | ``` 20 | 21 | ## Integration Tests 22 | 23 | To run the integration tests with a coverage report. 24 | 25 | ``` 26 | pip install -e . 27 | pytest test/integration --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml --cov-report=html 28 | ``` 29 | 30 | ## Build 31 | 32 | ``` 33 | python setup.py sdist bdist_wheel 34 | ``` 35 | 36 | ## Publish 37 | 38 | 39 | ``` 40 | twine upload dist/* 41 | ``` 42 | -------------------------------------------------------------------------------- /test/config/test_project/schema/customer_details_1.yaml: -------------------------------------------------------------------------------- 1 | fields: 2 | - metadata: {} 3 | name: flag 4 | nullable: true 5 | type: string 6 | - metadata: {} 7 | name: period 8 | nullable: true 9 | type: timestamp 10 | - metadata: {} 11 | name: id 12 | nullable: true 13 | type: integer 14 | - metadata: {} 15 | name: first_name 16 | nullable: true 17 | type: string 18 | - metadata: {} 19 | name: last_name 20 | nullable: true 21 | type: string 22 | - metadata: {} 23 | name: email 24 | nullable: true 25 | type: string 26 | - metadata: {} 27 | name: gender 28 | nullable: true 29 | type: string 30 | - metadata: {} 31 | name: job_title 32 | nullable: true 33 | type: string 34 | - metadata: {} 35 | name: amount 36 | nullable: true 37 | type: double 38 | type: struct 39 | -------------------------------------------------------------------------------- /test/config/test_project/schema/customer_details_2.yaml: -------------------------------------------------------------------------------- 1 | fields: 2 | - metadata: {} 3 | name: flag 4 | nullable: true 5 | type: string 6 | - metadata: {} 7 | name: period 8 | nullable: true 9 | type: timestamp 10 | - metadata: {} 11 | name: id 12 | nullable: true 13 | type: integer 14 | - metadata: {} 15 | name: first_name 16 | nullable: true 17 | type: string 18 | - metadata: {} 19 | name: last_name 20 
| nullable: true 21 | type: string 22 | - metadata: {} 23 | name: email 24 | nullable: true 25 | type: string 26 | - metadata: {} 27 | name: gender 28 | nullable: true 29 | type: string 30 | - metadata: {} 31 | name: job_title 32 | nullable: true 33 | type: string 34 | - metadata: {} 35 | name: amount 36 | nullable: true 37 | type: double 38 | type: struct 39 | -------------------------------------------------------------------------------- /requirements38.txt: -------------------------------------------------------------------------------- 1 | attrs==23.1.0 2 | black==23.3.0 3 | click==8.1.4 4 | coverage==7.2.7 5 | delta-spark==2.4.0 6 | et-xmlfile==1.1.0 7 | exceptiongroup==1.1.2 8 | flake8==6.0.0 9 | importlib-metadata==6.8.0 10 | iniconfig==2.0.0 11 | Jinja2==3.1.2 12 | jsonschema==4.16.0 13 | jsonschema-specifications==2023.6.1 14 | MarkupSafe==2.1.3 15 | mccabe==0.7.0 16 | mypy-extensions==1.0.0 17 | numpy==1.25.0 18 | openpyxl==3.1.2 19 | packaging==23.1 20 | pandas==2.0.3 21 | pathspec==0.11.1 22 | platformdirs==3.8.1 23 | pluggy==1.2.0 24 | py4j==0.10.9.7 25 | pyaml==23.7.0 26 | pycodestyle==2.10.0 27 | pydantic==1.10.6 28 | pyflakes==3.0.1 29 | pyrsistent==0.19.3 30 | pyspark==3.4.1 31 | pytest==7.4.0 32 | pytest-cov==4.1.0 33 | pytest-env==0.8.2 34 | python-dateutil==2.8.2 35 | pytz==2023.3 36 | PyYAML==6.0 37 | referencing==0.29.1 38 | rpds-py==0.8.10 39 | six==1.16.0 40 | tomli==2.0.1 41 | typer==0.9.0 42 | typing_extensions==4.7.1 43 | tzdata==2023.3 44 | zipp==3.15.0 45 | -------------------------------------------------------------------------------- /test/config/test_project/sql/raw_dbx_patterns_control/raw_audit.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `raw_dbx_patterns_control`.`raw_audit` 2 | ( 3 | `file_name` string, 4 | source_database string, 5 | source_table string, 6 | `database` string, 7 | `table` string, 8 | 9 | total_count bigint, 10 | valid_count bigint, 11 | invalid_count bigint, 12 | invalid_ratio double, 13 | expected_row_count bigint, 14 | warning_thresholds struct< 15 | invalid_ratio:double, 16 | invalid_rows:bigint, 17 | max_rows:bigint, 18 | min_rows:bigint 19 | >, 20 | exception_thresholds struct< 21 | invalid_ratio:double, 22 | invalid_rows:bigint, 23 | max_rows:bigint, 24 | min_rows:bigint 25 | >, 26 | file_path string, 27 | file_size bigint, 28 | file_modification_time timestamp, 29 | _process_id bigint, 30 | _load_date timestamp 31 | ) 32 | USING DELTA 33 | LOCATION '{{location}}' 34 | TBLPROPERTIES ( 35 | {{delta_properties}} 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /test/unit/test_validation.py: -------------------------------------------------------------------------------- 1 | from yetl.validation import _validate as v 2 | import yaml 3 | from jsonschema import SchemaError, ValidationError 4 | 5 | 6 | def test_get_table_schema(): 7 | 8 | schema = v.get_table_schema() 9 | 10 | assert isinstance(schema, dict) 11 | assert schema is not None 12 | 13 | def test_validate_valid_tables(): 14 | 15 | with open("./test/config/test_project/pipelines/tables.yaml", "r", encoding="utf-8") as f: 16 | tables = yaml.safe_load(f) 17 | 18 | try: 19 | v.validate_tables(tables) 20 | assert True 21 | except ValidationError: 22 | assert False 23 | 24 | def test_validate_invalid_tables(): 25 | 26 | with open("./test/config/test_project/pipelines/tables_invalid.yaml", "r", encoding="utf-8") as f: 27 | tables = yaml.safe_load(f) 28 | 29 | 
try: 30 | v.validate_tables(tables) 31 | assert False 32 | except Exception as e: 33 | assert isinstance(e, ValidationError) 34 | assert e.message == "'version' is a required property" 35 | 36 | -------------------------------------------------------------------------------- /requirements310.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.5.0 2 | attrs==23.1.0 3 | black==23.7.0 4 | click==8.1.5 5 | coverage==7.2.7 6 | delta-spark==2.4.0 7 | et-xmlfile==1.1.0 8 | exceptiongroup==1.1.2 9 | flake8==6.0.0 10 | importlib-metadata==6.8.0 11 | iniconfig==2.0.0 12 | Jinja2==3.1.2 13 | jsonschema==4.18.3 14 | jsonschema-specifications==2023.6.1 15 | MarkupSafe==2.1.3 16 | mccabe==0.7.0 17 | mypy-extensions==1.0.0 18 | numpy==1.25.1 19 | openpyxl==3.1.2 20 | packaging==23.1 21 | pandas==2.0.3 22 | pathspec==0.11.1 23 | platformdirs==3.9.0 24 | pluggy==1.2.0 25 | py4j==0.10.9.7 26 | pycodestyle==2.10.0 27 | pydantic==2.0.3 28 | pydantic_core==2.3.0 29 | pyflakes==3.0.1 30 | pyspark==3.4.1 31 | pytest==7.4.0 32 | pytest-cov==4.1.0 33 | pytest-env==0.8.2 34 | python-dateutil==2.8.2 35 | pytz==2023.3 36 | PyYAML==6.0 37 | referencing==0.29.1 38 | rpds-py==0.8.10 39 | six==1.16.0 40 | tomli==2.0.1 41 | typer==0.9.0 42 | typing_extensions==4.7.1 43 | tzdata==2023.3 44 | -e git+https://github.com/sibytes/yetl.git@30cce5c49e9b4673e4e398265ee5aca39c34b14d#egg=yetl_framework 45 | zipp==3.16.2 46 | -------------------------------------------------------------------------------- /yetl/config/table/_factory.py: -------------------------------------------------------------------------------- 1 | from ._deltalake import DeltaLake 2 | from ._read import Read 3 | from ._write import Write 4 | from ._table import Table 5 | import logging 6 | from ._table_type import TableType 7 | 8 | 9 | class TableFactory: 10 | def __init__(self) -> None: 11 | self._logger = logging.getLogger(self.__class__.__name__) 12 | self._dataset = {} 13 | self._table = {} 14 | 15 | def register_table_type(self, io_type: TableType, table_type: type): 16 | self._logger.debug(f"Register table type {table_type} as {type}") 17 | self._table[io_type] = table_type 18 | 19 | def make(self, table_type: TableType, config: dict) -> Table: 20 | self._logger.debug(f"Get {table_type.name} from factory dataset") 21 | table_class = self._table.get(table_type) 22 | 23 | if not table_class: 24 | self._logger.debug( 25 | f"TableType {table_type.name} not registered in the table factory" 26 | ) 27 | raise ValueError(table_type) 28 | 29 | return table_class( 30 | **config, 31 | ) 32 | 33 | 34 | factory = TableFactory() 35 | factory.register_table_type(TableType.read, Read) 36 | factory.register_table_type(TableType.delta_lake, DeltaLake) 37 | factory.register_table_type(TableType.write, Write) 38 | -------------------------------------------------------------------------------- /yetl/config/_spark_context.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pyspark.sql import SparkSession 4 | from delta import configure_spark_with_delta_pip 5 | from ._utils import is_databricks 6 | 7 | _logger = logging.getLogger(__name__) 8 | 9 | 10 | def get_spark_context(project: str, config: dict = None): 11 | if is_databricks(): 12 | _logger.debug("Getting databricks spark context") 13 | try: 14 | from databricks.sdk.runtime import spark 15 | 16 | return spark 17 | except Exception: 18 | _logger.info("cannot create spark context, spark not 
found.") 19 | return None 20 | 21 | else: 22 | _logger.debug("Getting local spark context") 23 | 24 | if config is None: 25 | config = { 26 | "spark.master": "local", 27 | "spark.databricks.delta.allowArbitraryProperties.enabled": True, 28 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 29 | "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", 30 | } 31 | 32 | msg = json.dumps(config, indent=4, default=str) 33 | _logger.debug(msg) 34 | 35 | builder = SparkSession.builder 36 | 37 | for k, v in config.items(): 38 | builder = builder.config(k, v) 39 | 40 | builder.appName(project) 41 | spark = configure_spark_with_delta_pip(builder).getOrCreate() 42 | return spark 43 | -------------------------------------------------------------------------------- /yetl/workflow/_dlt.py: -------------------------------------------------------------------------------- 1 | from ..config import StageType, Config 2 | from ..config.table import Table 3 | from typing import Callable 4 | import logging 5 | 6 | _logger = logging.getLogger(__name__) 7 | 8 | 9 | def create_dlt( 10 | config: Config, 11 | stage: StageType, 12 | dlt_funct: Callable[[Table, Table], None], 13 | debug: bool = False, 14 | **kwargs, 15 | ): 16 | tables = config.tables.lookup_table( 17 | stage=stage, 18 | first_match=False, 19 | # this will filter the tables on a custom property 20 | # in the tables parameter you can add whatever custom properties you want 21 | # either for filtering or to use in pipelines 22 | **kwargs, 23 | ) 24 | 25 | for t in tables: 26 | table_mapping = config.get_table_mapping( 27 | stage=stage, 28 | table=t.table, 29 | # dlt does this so yetl doesn't need to 30 | create_database=False, 31 | create_table=False, 32 | ) 33 | # TODO: not sure if we need checkpoints in DLT 34 | # config.set_checkpoint( 35 | # table_mapping.source, table_mapping.destination 36 | # ) 37 | src = table_mapping.source 38 | dst = table_mapping.destination 39 | if debug: 40 | msg = f"{src.database}.{src.table} => {dst.database}.{dst.table}" 41 | _logger.info(msg) 42 | else: 43 | dlt_funct(table_mapping.source, table_mapping.destination) 44 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "Python: Main", 10 | "type": "python", 11 | "request": "launch", 12 | "program": "main.py", 13 | "console": "integratedTerminal", 14 | "envFile": "${workspaceFolder}/.env", 15 | "args": [ 16 | // "validate", 17 | // "test_project", 18 | // "autoloader", 19 | // "./test/config" 20 | 21 | // "init", "test_yetl" 22 | 23 | // "import-tables", 24 | // "./test/config/test_project/pipelines/tables.xlsx", 25 | // "./test/config/test_project/pipelines/test_tables.yaml" 26 | 27 | "import-tables", 28 | "./test/config/fnz_pb/pipelines/tables.xlsx", 29 | "./test/config/fnz_pb/pipelines/tables.yaml" 30 | ], 31 | "env": { } 32 | }, 33 | { 34 | "name": "Test", 35 | "type": "python", 36 | "request": "launch", 37 | "console": "internalConsole", 38 | "envFile": "${workspaceFolder}/.env" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from yetl import * 2 | import os 3 | import shutil 4 | 5 | 6 | # from yetl import __main__ 7 | 8 | def tear_down(): 9 | shutil.rmtree("./test/config/test_project/data", ignore_errors=True) 10 | shutil.rmtree("./metastore_db", ignore_errors=True) 11 | shutil.rmtree("./spark-warehouse", ignore_errors=True) 12 | try: 13 | os.remove("./derby.log") 14 | except Exception: 15 | pass 16 | 17 | 18 | tear_down() 19 | pipeline = "autoloader" 20 | config_path = "./test/config" 21 | project = "test_project" 22 | timeslice = Timeslice(day="*", month="*", year="*") 23 | config = Config( 24 | project=project, 25 | pipeline=pipeline, 26 | config_path=config_path, 27 | timeslice=timeslice, 28 | ) 29 | 30 | # tables = config.tables.create_table( 31 | # stage=StageType.audit_control, 32 | # first_match=False, 33 | # catalog="development" 34 | # ) 35 | 36 | table_mapping = config.get_table_mapping( 37 | stage=StageType.raw, 38 | table="header_footer", 39 | catalog=None, 40 | create_table=False 41 | ) 42 | 43 | 44 | # source: Read = table_mapping.source["customer_details_1"] 45 | # destination: DeltaLake = table_mapping.destination 46 | # config.set_checkpoint(source=source, destination=destination) 47 | 48 | 49 | # t:Timeslice = Timeslice.parse_iso_date("*-*-") 50 | # print(t.strftime("%Y%m%d")) 51 | 52 | 53 | 54 | # @yetl_flow( 55 | # project="test_project", 56 | # stage=StageType.audit_control, 57 | # config_path="./test/config", 58 | # catalog=None 59 | # ) 60 | # def autoloader(table_mapping:TableMapping): 61 | # return table_mapping 62 | 63 | 64 | # result = autoloader(table="header_footer") 65 | # tear_down() 66 | 67 | -------------------------------------------------------------------------------- /yetl/validation/_validate.py: -------------------------------------------------------------------------------- 1 | import json 2 | from ..resource import get_resource_text 3 | import jsonschema as js 4 | from enum import Enum 5 | 6 | 7 | class SchemaFiles(Enum): 8 | tables_schema = "sibytes_yetl_tables_schema.json" 9 | pipeline_schema = "sibytes_yetl_pipeline_schema.json" 10 | project_schema = "sibytes_yetl_project_schema.json" 11 | 12 | 13 | def get_table_schema(): 14 | """Get the tables json schema from the package resource""" 15 | schema = get_resource_text(SchemaFiles.tables_schema.value) 16 | json_schema = json.loads(schema) 17 | 18 | return json_schema 19 | 20 | 21 | def 
get_pipeline_schema(): 22 | """Get the pipeline json schema from the package resource""" 23 | schema = get_resource_text(SchemaFiles.pipeline_schema.value) 24 | json_schema = json.loads(schema) 25 | 26 | return json_schema 27 | 28 | 29 | def get_project_schema(): 30 | """Get the project json schema from the package resource""" 31 | schema = get_resource_text(SchemaFiles.project_schema.value) 32 | json_schema = json.loads(schema) 33 | 34 | return json_schema 35 | 36 | 37 | def get_schema(schema_file: SchemaFiles): 38 | if schema_file == SchemaFiles.pipeline_schema: 39 | return get_pipeline_schema() 40 | if schema_file == SchemaFiles.project_schema: 41 | return get_project_schema() 42 | if schema_file == SchemaFiles.tables_schema: 43 | return get_table_schema() 44 | 45 | 46 | def validate_tables(data: dict) -> bool: 47 | schema = get_table_schema() 48 | js.validate(instance=data, schema=schema) 49 | 50 | 51 | def validate_pipeline(data: dict) -> bool: 52 | schema = get_pipeline_schema() 53 | js.validate(instance=data, schema=schema) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | # The directory containing this file 5 | HERE = pathlib.Path(__file__).parent 6 | 7 | # The text of the README file 8 | README = (HERE / "README.md").read_text() 9 | 10 | # This call to setup() does all the work 11 | setup( 12 | name="yetl-framework", 13 | version="3.0.0", 14 | description="yet (another spark) etl framework", 15 | long_description=README, 16 | long_description_content_type="text/markdown", 17 | url="https://www.yetl.io/", 18 | project_urls={ 19 | "GitHub": "https://github.com/sibytes/yetl", 20 | "Documentation": "https://www.yetl.io/", 21 | }, 22 | author="Shaun Ryan", 23 | author_email="shaun_chiburi@hotmail.com", 24 | license="MIT", 25 | classifiers=[ 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.8", 29 | "Programming Language :: Python :: 3.9", 30 | "Programming Language :: Python :: 3.10", 31 | ], 32 | include_package_data=True, 33 | package_dir={"": "."}, 34 | package_data={"yetl.resource": ["*.json", "*.yaml", "*.xlsx"]}, 35 | packages=[ 36 | "yetl", 37 | "yetl.resource", 38 | "yetl.validation", 39 | "yetl.cli", 40 | "yetl.cli.metadata_provider", 41 | "yetl.config", 42 | "yetl.config.table", 43 | "yetl.workflow", 44 | 45 | ], 46 | install_requires=[ 47 | 'PyYAML', 48 | 'jinja2', 49 | 'pydantic', 50 | 'jsonschema', 51 | 'typer', 52 | 'pandas', 53 | 'openpyxl', 54 | 'delta-spark', 55 | 'pyspark' 56 | ], 57 | zip_safe=False 58 | ) 59 | -------------------------------------------------------------------------------- /yetl/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | from .cli import _init 3 | from typing_extensions import Annotated 4 | from .cli.metadata_provider import XlsMetadata, ImportFormat 5 | from .config import Config 6 | from typing import Optional 7 | import logging 8 | 9 | app = typer.Typer() 10 | 11 | 12 | @app.command() 13 | def init(project: str, directory: str = "."): 14 | """Initialise the project directory with the suggested structure and start config files 15 | 16 | --directory:str - Where you want the project to be initialised 17 | """ 18 | _init.init(project, directory) 19 | 20 | 21 | @app.command() 22 | def import_tables( 23 | source: 
str, 24 | destination: str, 25 | format: Annotated[ 26 | ImportFormat, typer.Option(case_sensitive=False) 27 | ] = ImportFormat.excel, 28 | ): 29 | """Import tables configuration from an external source such as an Excel file. 30 | 31 | source:str - The URI of the table metadata e.g. the file path if importing a csv \n 32 | format:ImportFormat - The format of the table metadata to import e.g. excel 33 | """ 34 | metadata = XlsMetadata(source=source) 35 | metadata.write(path=destination) 36 | 37 | 38 | @app.command() 39 | def validate( 40 | project: Annotated[str, typer.Argument()], 41 | pipeline: Annotated[str, typer.Argument()], 42 | config_path: Annotated[Optional[str], typer.Argument()] = None, 43 | ): 44 | """Validate that configuration meets the schema and deserialises. 45 | 46 | project:str - Name of the project to validate \n 47 | pipeline:str - Name of the pipeline config to validate \n 48 | config_path:str - Path to the project configuration root \n 49 | 50 | """ 51 | _logger = logging.getLogger(__name__) 52 | 53 | _logger.info(f"validating project {project} {pipeline}") 54 | Config(project=project, pipeline=pipeline, config_path=config_path) 55 | _logger.info(f"{project} {pipeline} is Valid!") 56 | 57 | 58 | if __name__ in ["yetl.__main__", "__main__"]: 59 | app() 60 | -------------------------------------------------------------------------------- /yetl/config/_project.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from enum import Enum 3 | from typing import Union, Dict, Any 4 | import os 5 | 6 | 7 | class SparkLoggingLevel(Enum): 8 | INFO = "INFO" 9 | DEBUG = "DEBUG" 10 | WARNING = "WARNING" 11 | ERROR = "ERROR" 12 | 13 | 14 | class SparkConfig(BaseModel): 15 | logging_level: SparkLoggingLevel = Field(default=SparkLoggingLevel.INFO) 16 | config: Dict[str, Union[str, bool]] = Field( 17 | default={ 18 | "spark.master": "local", 19 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 20 | "spark.databricks.delta.merge.repartitionBeforeWrite.enabled": True, 21 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 22 | "spark.sql.catalogImplementation": "hive", 23 | } 24 | ) 25 | 26 | 27 | class Project(BaseModel): 28 | def __init__(self, **data: Any) -> None: 29 | super().__init__(**data) 30 | self.sql = os.path.join(self.config_path, self.sql.replace("./", "")) 31 | self.pipelines = os.path.join( 32 | self.config_path, self.pipelines.replace("./", "") 33 | ) 34 | self.databricks_notebooks = os.path.join( 35 | self.config_path, self.databricks_notebooks.replace("./", "") 36 | ) 37 | self.databricks_workflows = os.path.join( 38 | self.config_path, self.databricks_workflows.replace("./", "") 39 | ) 40 | self.databricks_queries = os.path.join( 41 | self.config_path, self.databricks_queries.replace("./", "") 42 | ) 43 | 44 | config_path: str = Field(...) 45 | name: str = Field(...)
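# note: the relative path defaults below are re-based onto config_path in __init__ above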
46 | sql: str = Field(default="./sql") 47 | pipelines: str = Field(default="./pipelines") 48 | databricks_notebooks: str = Field(default="./databricks/notebooks") 49 | databricks_workflows: str = Field(default="./databricks/workflows") 50 | databricks_queries: str = Field(default="./databricks/queries") 51 | 52 | spark: SparkConfig = Field(default=SparkConfig()) 53 | -------------------------------------------------------------------------------- /ci.yaml: -------------------------------------------------------------------------------- 1 | trigger: 2 | branches: 3 | include: 4 | - main 5 | 6 | # variables: 7 | # - group: data-platform-kv 8 | # - group: databricks 9 | 10 | pool: 11 | vmImage: 'ubuntu-latest' 12 | strategy: 13 | matrix: 14 | Python310: 15 | python.version: '3.10' 16 | 17 | steps: 18 | - task: UsePythonVersion@0 19 | inputs: 20 | versionSpec: '$(python.version)' 21 | displayName: 'Use Python $(python.version)' 22 | 23 | - script: | 24 | python -m pip install --upgrade pip setuptools wheel twine 25 | pip install -r requirements310.txt 26 | displayName: 'Install dependencies' 27 | 28 | - script: | 29 | python -m flake8 ./yetl 30 | displayName: 'Run lint tests' 31 | 32 | - script: | 33 | python setup.py sdist bdist_wheel 34 | ls dist/ 35 | displayName: 'Artifact creation' 36 | 37 | - script: | 38 | pip install . 39 | pytest test/ --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml 40 | displayName: 'Unit & Integration Tests' 41 | 42 | 43 | - task: PublishTestResults@2 44 | condition: succeededOrFailed() 45 | inputs: 46 | testResultsFiles: '**/test-*.xml' 47 | testRunTitle: 'Publish test results for Python $(python.version)' 48 | 49 | - task: PublishCodeCoverageResults@1 50 | inputs: 51 | codeCoverageTool: Cobertura 52 | summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 53 | reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov' 54 | 55 | - task: CopyFiles@2 56 | inputs: 57 | SourceFolder: '$(Build.SourcesDirectory)' 58 | Contents: | 59 | dist/** 60 | deployment/** 61 | TargetFolder: '$(Build.ArtifactStagingDirectory)' 62 | 63 | - task: PublishBuildArtifacts@1 64 | inputs: 65 | PathtoPublish: '$(Build.ArtifactStagingDirectory)' 66 | ArtifactName: 'drop' 67 | publishLocation: 'Container' 68 | displayName: 'Publish Build Artefacts' 69 | 70 | - task: TwineAuthenticate@0 71 | inputs: 72 | # artifactFeeds: 'sibytes' 73 | externalFeeds: 'pypi' 74 | displayName: 'Authenticate Twine' 75 | 76 | - script: | 77 | twine upload -r pypi --config-file $(PYPIRC_PATH) $(Build.SourcesDirectory)/dist/* 78 | continueOnError: true 79 | displayName: 'Publish to Artefact Store' 80 | -------------------------------------------------------------------------------- /yetl/config/_logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import yaml 4 | import os 5 | from yaml import YAMLError 6 | from ._utils import YETL_CONFIG 7 | 8 | 9 | def configure_logging(project: str = None, config_path=None): 10 | """Configures logging from a yaml logging configuration file 11 | 12 | The configuration file is called logging.yaml. Its directory 13 | location is taken from the YETL_CONFIG environment variable.
14 | If this does not exist then it is defaulted to ./config/ 15 | """ 16 | log_config_file = config_path 17 | if not log_config_file: 18 | log_config_file = os.getenv(YETL_CONFIG, "./config") 19 | 20 | if project: 21 | project_log_config_file = os.path.join(log_config_file, project) 22 | 23 | project_log_config_file = f"{project_log_config_file}/logging.yaml" 24 | project_log_config_file = os.path.abspath(project_log_config_file) 25 | 26 | log_config_file = f"{log_config_file}/logging.yaml" 27 | log_config_file = os.path.abspath(log_config_file) 28 | 29 | if os.path.exists(project_log_config_file): 30 | log_config_file = project_log_config_file 31 | 32 | # check that it exists 33 | if not os.path.exists(log_config_file): 34 | msg = f"Config logging file path does not exist {log_config_file}" 35 | raise Exception(msg) 36 | 37 | # load the logging configuration into the logger 38 | with open(log_config_file, "r") as f: 39 | try: 40 | config = yaml.safe_load(f.read()) 41 | logging.config.dictConfig(config) 42 | 43 | # if it errors because of invalid yaml format then 44 | # provide details so the users can easily find and correct 45 | # if it's a different exception just let it raise 46 | except YAMLError as e: 47 | location = "" 48 | if hasattr(e, "problem_mark"): 49 | mark = e.problem_mark 50 | location = f"Error position ({mark.line}, {mark.column})" 51 | 52 | if hasattr(e, "problem"): 53 | problem = f"{e.problem}." 54 | 55 | raise Exception( 56 | f"Invalid yaml format in {log_config_file}. {problem} {location}" 57 | ) 58 | -------------------------------------------------------------------------------- /test/config/test_project/dataNone/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1697012148607,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.autooptimize.autocompact\":\"true\",\"delta.autooptimize.optimizewrite\":\"true\",\"delta.appendOnly\":\"true\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.4.1 Delta-Lake/2.4.0","txnId":"ecefe756-4bcb-42bc-85dd-3d148e23d8f2"}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | 
{"metaData":{"id":"7ddc211e-d6d1-4928-849d-c513682fb508","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"header\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"row_count\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"period\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"batch\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"raw_header\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"footer\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"period\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"raw_footer\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_process_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_load_date\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_metadata\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"file_path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_modification_time\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.autooptimize.autocompact":"true","delta.autooptimize.optimizewrite":"true","delta.appendOnly":"true"},"createdTime":1697012148597}} 4 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables_invalid.yaml: -------------------------------------------------------------------------------- 1 | # version: 3.0.0 2 | 3 | audit_control: 4 | delta_lake: 5 | raw_dbx_patterns_control: 6 | header_footer: 7 | depends_on: 8 | - raw.raw_dbx_patterns.* 9 | partition_by: none 10 | sql: ../sql/{{database}}/{{table}}.sql 11 | vacuum: 30 12 | catalog: hive_metastore 13 | raw_audit: 14 | depends_on: 15 | - raw.raw_dbx_patterns.* 16 | - audit_control.raw_dbx_patterns_control.header_footer 17 | sql: ../sql/{{database}}/{{table}}.sql 18 | vacuum: 30 19 | 20 | landing: 21 | read: 22 | landing_dbx_patterns: 23 | catalog: hive_metastore 24 | customer_details_1: null 25 | customer_details_2: null 26 | 27 | raw: 28 | delta_lake: 29 | raw_dbx_patterns: 30 | catalog: hive_metastore 31 | customers: 32 | custom_properties: 33 | process_group: 1 34 | rentention_days: 365 35 | depends_on: 36 | - landing.landing_dbx_patterns.customer_details_1 37 | - landing.landing_dbx_patterns.customer_details_2 38 | exception_thresholds: 39 | invalid_ratio: 0.2 40 | invalid_rows: 2 41 | max_rows: 1000 42 | min_rows: 0 43 | id: id 44 | vacuum: 30 45 | warning_thresholds: 46 | invalid_ratio: 0.1 47 | invalid_rows: 0 48 | max_rows: 100 49 | min_rows: 5 50 | z_order_by: 51 | - _load_date_1 52 | - _load_date_2 53 | 54 | base: 55 | delta_lake: 56 | base_dbx_patterns: 57 | catalog: hive_metastore 58 | customer_details_1: 59 | delta_properties: 60 | delta.appendOnly: true 61 | delta.autoOptimize.autoCompact: true 62 | delta.autoOptimize.optimizeWrite: true 63 | delta.enableChangeDataFeed: false 64 | depends_on: 65 | - 
raw.raw_dbx_patterns.customers 66 | id: id 67 | vacuum: 30 68 | customer_details_2: 69 | delta_properties: 70 | delta.appendOnly: true 71 | delta.autoOptimize.autoCompact: true 72 | delta.autoOptimize.optimizeWrite: true 73 | delta.enableChangeDataFeed: false 74 | depends_on: 75 | - raw.raw_dbx_patterns.customers 76 | id: id 77 | vacuum: 30 78 | 79 | -------------------------------------------------------------------------------- /yetl/config/_decorators.py: -------------------------------------------------------------------------------- 1 | # implicit, not referenced - must be the 1st import 2 | from ._logging_config import configure_logging 3 | import logging 4 | from ._config import Config 5 | from ._timeslice import Timeslice 6 | from ._stage_type import StageType 7 | from .table import Table 8 | 9 | 10 | def yetl_flow( 11 | stage: StageType, 12 | project: str, 13 | pipeline: str = None, 14 | config_path: str = None, 15 | catalog: str = None, 16 | ): 17 | def decorate(function): 18 | def wrap_function(*args, **kwargs): 19 | configure_logging(project) 20 | _logger = logging.getLogger(__name__) 21 | 22 | _pipeline = pipeline 23 | if not _pipeline: 24 | _pipeline = function.__name__ 25 | 26 | _logger.info(f"Loading pipeline configuration {_pipeline}") 27 | 28 | timeslice = kwargs.get("timeslice", Timeslice(day="*", month="*", year="*")) 29 | if "timeslice" in kwargs.keys(): 30 | del kwargs["timeslice"] 31 | 32 | try: 33 | table = kwargs["table"] 34 | del kwargs["table"] 35 | except KeyError as e: 36 | raise Exception(f"{e} is a required argument for a yetl flow function") 37 | 38 | config = Config( 39 | project=project, 40 | pipeline=_pipeline, 41 | config_path=config_path, 42 | timeslice=timeslice, 43 | ) 44 | table_mapping = config.get_table_mapping( 45 | stage=stage, 46 | table=table, 47 | catalog=catalog, 48 | ) 49 | 50 | destination: Table = table_mapping.destination 51 | sources = table_mapping.source 52 | if isinstance(sources, dict): 53 | for _, source in sources.items(): 54 | config.set_checkpoint(source=source, destination=destination) 55 | else: 56 | config.set_checkpoint(source=sources, destination=destination) 57 | 58 | _logger.info(f"Calling function {function.__name__}") 59 | ret = function( 60 | *args, 61 | table_mapping=table_mapping, 62 | **kwargs, 63 | ) 64 | return ret 65 | 66 | return wrap_function 67 | 68 | return decorate 69 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/test_tables.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=./json_schema/sibytes_yetl_tables_schema.json 2 | 3 | version: 3.0.0 4 | 5 | audit_control: 6 | delta_lake: 7 | raw_dbx_patterns_control: 8 | catalog: hive_metastore 9 | header_footer: 10 | cluster_by: none 11 | depends_on: 12 | - raw.raw_dbx_patterns.* 13 | partition_by: none 14 | sql: ../sql/{{database}}/{{table}}.sql 15 | vacuum: 30 16 | raw_audit: 17 | depends_on: 18 | - raw.raw_dbx_patterns.* 19 | - audit_control.raw_dbx_patterns_control.header_footer 20 | sql: ../sql/{{database}}/{{table}}.sql 21 | vacuum: 30 22 | 23 | landing: 24 | read: 25 | landing_dbx_patterns: 26 | catalog: hive_metastore 27 | customer_details_1: null 28 | customer_details_2: null 29 | 30 | raw: 31 | delta_lake: 32 | raw_dbx_patterns: 33 | catalog: hive_metastore 34 | customers: 35 | custom_properties: 36 | process_group: 1 37 | rentention_days: 365 38 | depends_on: 39 | - 
landing.landing_dbx_patterns.customer_details_1 40 | - landing.landing_dbx_patterns.customer_details_2 41 | exception_thresholds: 42 | invalid_ratio: 0.2 43 | invalid_rows: 2 44 | max_rows: 1000 45 | min_rows: 0 46 | id: id 47 | vacuum: 30 48 | warning_thresholds: 49 | invalid_ratio: 0.1 50 | invalid_rows: 0 51 | max_rows: 100 52 | min_rows: 5 53 | z_order_by: 54 | - _load_date_1 55 | - _load_date_2 56 | 57 | base: 58 | delta_lake: 59 | base_dbx_patterns: 60 | catalog: hive_metastore 61 | customer_details_1: 62 | delta_properties: 63 | delta.appendOnly: true 64 | delta.autoOptimize.autoCompact: true 65 | delta.autoOptimize.optimizeWrite: true 66 | delta.enableChangeDataFeed: false 67 | depends_on: 68 | - raw.raw_dbx_patterns.customers 69 | id: id 70 | vacuum: 30 71 | customer_details_2: 72 | delta_properties: 73 | delta.appendOnly: true 74 | delta.autoOptimize.autoCompact: true 75 | delta.autoOptimize.optimizeWrite: true 76 | delta.enableChangeDataFeed: false 77 | depends_on: 78 | - raw.raw_dbx_patterns.customers 79 | id: id 80 | vacuum: 30 81 | 82 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables.yaml: -------------------------------------------------------------------------------- 1 | version: 3.0.0 2 | 3 | audit_control: 4 | delta_lake: 5 | raw_dbx_patterns_control: 6 | header_footer: 7 | depends_on: 8 | - raw.raw_dbx_patterns.* 9 | partition_by: none 10 | sql: ../sql/{{database}}/{{table}}.sql 11 | vacuum: 30 12 | catalog: null 13 | raw_audit: 14 | depends_on: 15 | - raw.raw_dbx_patterns.* 16 | - audit_control.raw_dbx_patterns_control.header_footer 17 | sql: ../sql/{{database}}/{{table}}.sql 18 | vacuum: 30 19 | 20 | source: 21 | delta_lake: 22 | source_dbx_patterns: 23 | catalog: hive_metastore 24 | customer_details_1: null 25 | customer_details_2: null 26 | 27 | landing: 28 | read: 29 | landing_dbx_patterns: 30 | catalog: hive_metastore 31 | customer_details_1: null 32 | customer_details_2: null 33 | 34 | raw: 35 | delta_lake: 36 | raw_dbx_patterns: 37 | catalog: null 38 | customers: 39 | custom_properties: 40 | process_group: 1 41 | rentention_days: 365 42 | depends_on: 43 | - landing.landing_dbx_patterns.customer_details_1 44 | - landing.landing_dbx_patterns.customer_details_2 45 | exception_thresholds: 46 | invalid_ratio: 0.2 47 | invalid_rows: 2 48 | max_rows: 1000 49 | min_rows: 0 50 | id: id 51 | vacuum: 30 52 | warning_thresholds: 53 | invalid_ratio: 0.1 54 | invalid_rows: 0 55 | max_rows: 100 56 | min_rows: 5 57 | z_order_by: 58 | - _load_date_1 59 | - _load_date_2 60 | 61 | base: 62 | delta_lake: 63 | base_dbx_patterns: 64 | catalog: null 65 | customer_details_1: 66 | delta_properties: 67 | delta.appendOnly: true 68 | delta.autoOptimize.autoCompact: true 69 | delta.autoOptimize.optimizeWrite: true 70 | delta.enableChangeDataFeed: false 71 | depends_on: 72 | - raw.raw_dbx_patterns.customers 73 | id: id 74 | vacuum: 30 75 | customer_details_2: 76 | delta_properties: 77 | delta.appendOnly: true 78 | delta.autoOptimize.autoCompact: true 79 | delta.autoOptimize.optimizeWrite: true 80 | delta.enableChangeDataFeed: false 81 | depends_on: 82 | - raw.raw_dbx_patterns.customers 83 | id: id 84 | vacuum: 30 85 | 86 | -------------------------------------------------------------------------------- /schema_testing/tables.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=sibytes_yetl_tables_schema.json 2 | 3 | version: 3.0.0 4 | 5 | 
audit_control: 6 | delta_lake: 7 | raw_dbx_patterns_control: 8 | header_footer: 9 | depends_on: 10 | - raw.raw_dbx_patterns.* 11 | partition_by: none 12 | sql: ../sql/{{database}}/{{table}}.sql 13 | vacuum: 30 14 | catalog: hive_metastore 15 | raw_audit: 16 | depends_on: 17 | - raw.raw_dbx_patterns.* 18 | - audit_control.raw_dbx_patterns_control.header_footer 19 | sql: ../sql/{{database}}/{{table}}.sql 20 | vacuum: 30 21 | 22 | source: 23 | delta_lake: 24 | source_dbx_patterns: 25 | catalog: hive_metastore 26 | customer_details_1: 27 | custom_properties: 28 | process_group: 1 29 | rentention_days: 365 30 | id: id 31 | customer_details_2: 32 | custom_properties: 33 | process_group: 1 34 | rentention_days: 365 35 | id: id 36 | 37 | 38 | landing: 39 | read: 40 | landing_dbx_patterns: 41 | catalog: hive_metastore 42 | customer_details_1: null 43 | customer_details_2: null 44 | 45 | 46 | raw: 47 | delta_lake: 48 | raw_dbx_patterns: 49 | catalog: hive_metastore 50 | customers: 51 | custom_properties: 52 | process_group: 1 53 | rentention_days: 365 54 | depends_on: 55 | - landing.landing_dbx_patterns.customer_details_1 56 | - landing.landing_dbx_patterns.customer_details_2 57 | exception_thresholds: 58 | invalid_ratio: 0.2 59 | invalid_rows: 2 60 | max_rows: 1000 61 | min_rows: 0 62 | id: id 63 | vacuum: 30 64 | warning_thresholds: 65 | invalid_ratio: 0.1 66 | invalid_rows: 0 67 | max_rows: 100 68 | min_rows: 5 69 | z_order_by: 70 | - _load_date_1 71 | - _load_date_2 72 | 73 | base: 74 | delta_lake: 75 | base_dbx_patterns: 76 | catalog: hive_metastore 77 | customer_details_1: 78 | delta_properties: 79 | delta.appendOnly: true 80 | delta.autoOptimize.autoCompact: true 81 | delta.autoOptimize.optimizeWrite: true 82 | delta.enableChangeDataFeed: false 83 | depends_on: 84 | - raw.raw_dbx_patterns.customers 85 | id: id 86 | vacuum: 30 87 | customer_details_2: 88 | delta_properties: 89 | delta.appendOnly: true 90 | delta.autoOptimize.autoCompact: true 91 | delta.autoOptimize.optimizeWrite: true 92 | delta.enableChangeDataFeed: false 93 | depends_on: 94 | - raw.raw_dbx_patterns.customers 95 | id: id 96 | vacuum: 30 97 | 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | # .env 106 | .venv 107 | env/ 108 | venv/ 109 | venv*/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | 133 | # .vscode 134 | .DS_Store 135 | 136 | 137 | # ignore database engines 138 | /spark-standalone/delta_lake/* 139 | /spark-standalone/apps/* 140 | /spark-warehouse/* 141 | /test/config/test_project/data/* 142 | # internal spark metastore 143 | /metastore_db/* 144 | derby.log 145 | 146 | 147 | /config/runs/* 148 | /scratch 149 | 150 | .databricks 151 | junit -------------------------------------------------------------------------------- /schema_testing/autoloader.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=sibytes_yetl_pipeline_schema.json 2 | 3 | version: 3.0.0 4 | tables: ./tables.yaml 5 | 6 | audit_control: 7 | delta_lake: 8 | # delta table properties can be set at stage level or table level 9 | delta_properties: 10 | delta.appendOnly: true 11 | delta.autoOptimize.autoCompact: true 12 | delta.autoOptimize.optimizeWrite: true 13 | managed: false 14 | container: datalake 15 | # location: /mnt/{{container}}/data/raw 16 | # path: "{{database}}/{{table}}" 17 | options: 18 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 19 | 20 | source: 21 | delta_lake: 22 | managed: false 23 | container: datalake 24 | location: /mnt/{{container}}/data/source 25 | path: "{{database}}/{{table}}" 26 | options: null 27 | 28 | landing: 29 | read: 30 | trigger: customerdetailscomplete-{{filename_date_format}}*.flg 31 | trigger_type: file 32 | container: datalake 33 | location: "/mnt/{{container}}/data/landing/dbx_patterns/{{table}}/{{path_date_format}}" 34 | filename: "{{table}}-{{filename_date_format}}*.csv" 35 | filename_date_format: "%Y%m%d" 36 | path_date_format: "%Y%m%d" 37 | # injects 
the time period column into the dataset 38 | # using either the path_date_format or the filename_date_format 39 | # as you specify 40 | slice_date: filename_date_format 41 | slice_date_column_name: _slice_date 42 | format: cloudFiles 43 | spark_schema: ../schema/{{table.lower()}}.yaml 44 | options: 45 | # autoloader 46 | cloudFiles.format: csv 47 | cloudFiles.schemaLocation: /mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}} 48 | cloudFiles.useIncrementalListing: auto 49 | # schema 50 | inferSchema: false 51 | enforceSchema: true 52 | columnNameOfCorruptRecord: _corrupt_record 53 | # csv 54 | header: false 55 | mode: PERMISSIVE 56 | encoding: windows-1252 57 | delimiter: "," 58 | escape: '"' 59 | nullValue: "" 60 | quote: '"' 61 | emptyValue: "" 62 | 63 | raw: 64 | delta_lake: 65 | # delta table properties can be set at stage level or table level 66 | delta_properties: 67 | delta.appendOnly: true 68 | delta.autoOptimize.autoCompact: true 69 | delta.autoOptimize.optimizeWrite: true 70 | delta.enableChangeDataFeed: false 71 | managed: false 72 | container: datalake 73 | location: /mnt/{{container}}/data/raw 74 | path: "{{database}}/{{table}}" 75 | options: 76 | mergeSchema: true 77 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 78 | 79 | base: 80 | delta_lake: 81 | container: datalake 82 | location: /mnt/{{container}}/data/base 83 | path: "{{database}}/{{table}}" 84 | options: null 85 | -------------------------------------------------------------------------------- /yetl/workflow/_multi_threaded.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, Future, as_completed 2 | from ._notebook import Notebook 3 | from typing import List 4 | 5 | 6 | # execute a notebook using databricks workflows 7 | def _execute_notebook(notebook: Notebook, dbutils): 8 | """Execute a notebookd using databricks workflows""" 9 | msg = { 10 | "_message": f"Executing notebook {notebook.path} parameters {notebook.parameters}", 11 | "status": "executing", 12 | "notebook": notebook.path, 13 | } 14 | print(msg["_message"], flush=True) 15 | 16 | try: 17 | result = dbutils.notebook.run( 18 | notebook.path, notebook.timeout, notebook.parameters 19 | ) 20 | msg = { 21 | "_message": f"Succeeded notebook {notebook.path}", 22 | "status": "succeeded", 23 | "notebook": notebook.path, 24 | } 25 | print(msg["_message"], flush=True) 26 | return result 27 | 28 | except Exception as e: 29 | if notebook.retry < 1: 30 | msg = { 31 | "_message": f"notebook {notebook.path} failed.", 32 | "status": "failed", 33 | "error": str(e), 34 | "notebook": notebook.path, 35 | } 36 | print(msg["_message"], flush=True) 37 | raise Exception(msg["_message"]) 38 | 39 | msg = { 40 | "_message": f"Retrying notebook {notebook.path}", 41 | "status": "executing", 42 | "notebook": notebook.path, 43 | } 44 | print(msg["_message"], flush=True) 45 | notebook.retry -= 1 46 | 47 | 48 | def _try_future(future: Future, catch=False): 49 | return future.result() 50 | 51 | 52 | def execute_notebooks(notebooks: List[Notebook], maxParallel: int, dbutils): 53 | msg = { 54 | "_message": f"Executing {len(notebooks)} with maxParallel of {maxParallel}", 55 | "notebooks": len(notebooks), 56 | "maxParallel": maxParallel, 57 | } 58 | print(msg["_message"], flush=True) 59 | 60 | with ThreadPoolExecutor(max_workers=maxParallel) as executor: 61 | results = [ 62 | executor.submit(_execute_notebook, notebook, dbutils) 63 | for notebook in notebooks 64 | if 
notebook.enabled 65 | ] 66 | 67 | # the individual notebooks handle their errors and pass back a packaged result 68 | # we will still need to handle the fact that the notebook execution call may fail 69 | # or a programmer missed the handling of an error in the notebook task 70 | # that's what tryFuture(future:Future) does 71 | results_list = [_try_future(r) for r in as_completed(results)] 72 | 73 | print( 74 | f"Finished executing {len(notebooks)} with maxParallel of {maxParallel}", 75 | flush=True, 76 | ) 77 | return results_list 78 | -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_project_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/project", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl project config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "name": { 13 | "type": "string", 14 | "description": "name of the project" 15 | }, 16 | "spark_schema": { 17 | "type": "string", 18 | "description": "relative project path to directory containing spark schema" 19 | }, 20 | "pipeline": { 21 | "type": "string", 22 | "description": "relative project path to directory containing yetl pipeline configuration" 23 | }, 24 | "databricks_notebooks": { 25 | "type": "string", 26 | "description": "relative project path to directory containing databricks notebooks" 27 | }, 28 | "databricks_workflows": { 29 | "type": "string", 30 | "description": "relative project path to directory containing databricks workflows" 31 | }, 32 | "databricks_queries": { 33 | "type": "string", 34 | "description": "relative project path to directory containing databricks queries" 35 | }, 36 | "spark": { 37 | "$ref": "#/$defs/spark" 38 | } 39 | 40 | }, 41 | "required": [ 42 | "version", 43 | "name", 44 | "spark_schema", 45 | "pipeline", 46 | "databricks_notebooks", 47 | "databricks_workflows", 48 | "databricks_queries" 49 | ], 50 | 51 | "$defs": { 52 | "spark": { 53 | "type": "object", 54 | "description": "defines spark logging and configuration for local execution", 55 | "properties": { 56 | "logging_level": { 57 | "type" : "string", 58 | "enum": [ 59 | "OFF", 60 | "FATAL", 61 | "ERROR", 62 | "WARN", 63 | "INFO", 64 | "DEBUG", 65 | "TRACE", 66 | "ALL" 67 | ] 68 | }, 69 | "config": { 70 | "type": "object", 71 | "description": "spark configuration key value pairs", 72 | "minProperties": 1, 73 | "patternProperties":{ 74 | "^\\S+$": { 75 | "type": ["string","number","boolean"], 76 | "description": "spark configuration properties" 77 | } 78 | } 79 | } 80 | 81 | } 82 | } 83 | } 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_project_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/project", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl project config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "name": { 13 | "type": "string", 14 | "description": "name of the project" 15 | }, 16 | "spark_schema": { 17 | "type": "string", 18 
| "description": "relative project path to directory containing spark schema" 19 | }, 20 | "pipeline": { 21 | "type": "string", 22 | "description": "relative project path to directory containing yetl pipeline configuration" 23 | }, 24 | "databricks_notebooks": { 25 | "type": "string", 26 | "description": "relative project path to directory containing databricks notebooks" 27 | }, 28 | "databricks_workflows": { 29 | "type": "string", 30 | "description": "relative project path to directory containing databricks workflows" 31 | }, 32 | "databricks_queries": { 33 | "type": "string", 34 | "description": "relative project path to directory containing databricks queries" 35 | }, 36 | "spark": { 37 | "$ref": "#/$defs/spark" 38 | } 39 | 40 | }, 41 | "required": [ 42 | "version", 43 | "name", 44 | "spark_schema", 45 | "pipeline", 46 | "databricks_notebooks", 47 | "databricks_workflows", 48 | "databricks_queries" 49 | ], 50 | 51 | "$defs": { 52 | "spark": { 53 | "type": "object", 54 | "description": "defines spark logging and configuration for local execution", 55 | "properties": { 56 | "logging_level": { 57 | "type" : "string", 58 | "enum": [ 59 | "OFF", 60 | "FATAL", 61 | "ERROR", 62 | "WARN", 63 | "INFO", 64 | "DEBUG", 65 | "TRACE", 66 | "ALL" 67 | ] 68 | }, 69 | "config": { 70 | "type": "object", 71 | "description": "spark configuration key value pairs", 72 | "minProperties": 1, 73 | "patternProperties":{ 74 | "^\\S+$": { 75 | "type": ["string","number","boolean"], 76 | "description": "spark configuration properties" 77 | } 78 | } 79 | } 80 | 81 | } 82 | } 83 | } 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /test/unit/test_timeslice.py: -------------------------------------------------------------------------------- 1 | from pydantic import ValidationError 2 | from yetl import Timeslice, TimesliceNow, TimesliceUtcNow 3 | from datetime import datetime 4 | 5 | 6 | def test_timeslice_all(): 7 | format = "%Y%m%d" 8 | timeslice = Timeslice(day="*", month="*", year="*") 9 | actual = timeslice.strftime(format) 10 | expected = "***" 11 | assert actual == expected 12 | 13 | 14 | def test_timeslice_year(): 15 | format = "%Y/%m/%d" 16 | timeslice = Timeslice(day="*", month="*", year=2023) 17 | actual = timeslice.strftime(format) 18 | expected = "2023/*/*" 19 | assert actual == expected 20 | 21 | 22 | def test_timeslice_month(): 23 | format = "%Y-%m-%d" 24 | timeslice = Timeslice(day="*", month=1, year=2023) 25 | actual = timeslice.strftime(format) 26 | expected = "2023-01-*" 27 | assert actual == expected 28 | 29 | 30 | def test_timeslice_day(): 31 | format = "%Y\\%m\\%d" 32 | timeslice = Timeslice(day=1, month=1, year=2023) 33 | actual = timeslice.strftime(format) 34 | expected = "2023\\01\\01" 35 | assert actual == expected 36 | 37 | 38 | def test_timeslice_invalid(): 39 | actual = None 40 | try: 41 | timeslice = Timeslice(day="s", month=1, year=2023) # noqa F841 42 | except ValidationError as e: 43 | actual = e 44 | 45 | assert isinstance(actual, ValidationError) 46 | 47 | 48 | def test_timeslice_invalid_date(): 49 | actual = None 50 | format = "%Y/%m/%d" 51 | try: 52 | timeslice = Timeslice(day=500, month=1, year=2023) 53 | actual = timeslice.strftime(format) 54 | except ValueError as e: 55 | actual = e 56 | 57 | assert isinstance(actual, ValueError) 58 | 59 | 60 | def test_timeslice_now(): 61 | now = datetime.now() 62 | timeslice = TimesliceNow() 63 | 64 | assert ( 65 | timeslice.day == now.day 66 | and timeslice.month == 
now.month 67 | and timeslice.year == now.year 68 | and timeslice.hour == now.hour 69 | and timeslice.minute == now.minute 70 | ) 71 | 72 | 73 | def test_timeslice_utcnow(): 74 | now = datetime.utcnow() 75 | timeslice = TimesliceUtcNow() 76 | 77 | assert ( 78 | timeslice.day == now.day 79 | and timeslice.month == now.month 80 | and timeslice.year == now.year 81 | and timeslice.hour == now.hour 82 | and timeslice.minute == now.minute 83 | ) 84 | 85 | 86 | def test_timeslice_invalid_format_code(): 87 | format = "%Y-%m-%d-%c" 88 | actual = "" 89 | try: 90 | timeslice = Timeslice(day="*", month=1, year=2023) 91 | actual = timeslice.strftime(format) 92 | except Exception as e: 93 | actual = str(e) 94 | 95 | expected = "The format contains the following unsupported format codes: %c" 96 | 97 | assert actual == expected 98 | 99 | 100 | def test_timeslice_str(): 101 | timeslice = Timeslice(day=1, month=1, year=2023) 102 | actual = str(timeslice) 103 | expected = "2023-01-01 00:00:00.000000" 104 | assert actual == expected 105 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/autoloader.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=./json_schema/sibytes_yetl_pipeline_schema.json 2 | 3 | version: 3.0.0 4 | tables: ./tables.yaml 5 | 6 | audit_control: 7 | delta_lake: 8 | # delta table properties can be set at stage level or table level 9 | delta_properties: 10 | delta.appendOnly: true 11 | delta.autoOptimize.autoCompact: true 12 | delta.autoOptimize.optimizeWrite: true 13 | managed: false 14 | container: datalake 15 | # location: /mnt/{{container}}/data/raw 16 | # path: "{{database}}/{{table}}" 17 | options: 18 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 19 | 20 | source: 21 | delta_lake: 22 | # delta table properties can be set at stage level or table level 23 | delta_properties: 24 | delta.appendOnly: true 25 | delta.autoOptimize.autoCompact: true 26 | delta.autoOptimize.optimizeWrite: true 27 | delta.enableChangeDataFeed: false 28 | managed: false 29 | container: datalake 30 | location: /mnt/{{container}}/data/source 31 | path: "{{database}}/{{table}}" 32 | options: null 33 | 34 | landing: 35 | read: 36 | trigger: customerdetailscomplete-{{filename_date_format}}*.flg 37 | trigger_type: file 38 | container: datalake 39 | location: "/mnt/{{container}}/data/landing/dbx_patterns/{{table}}/{{path_date_format}}" 40 | filename: "{{table}}-{{filename_date_format}}*.csv" 41 | filename_date_format: "%Y%m%d" 42 | path_date_format: "%Y%m%d" 43 | # injects the time period column into the dataset 44 | # using either the path_date_format or the filename_date_format 45 | # as you specify 46 | slice_date: filename_date_format 47 | slice_date_column_name: _slice_date 48 | format: cloudFiles 49 | spark_schema: ../schema/{{table.lower()}}.yaml 50 | options: 51 | # autoloader 52 | cloudFiles.format: csv 53 | cloudFiles.schemaLocation: /mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}} 54 | cloudFiles.useIncrementalListing: auto 55 | # schema 56 | inferSchema: false 57 | enforceSchema: true 58 | columnNameOfCorruptRecord: _corrupt_record 59 | # csv 60 | header: false 61 | mode: PERMISSIVE 62 | encoding: windows-1252 63 | delimiter: "," 64 | escape: '"' 65 | nullValue: "" 66 | quote: '"' 67 | emptyValue: "" 68 | 69 | raw: 70 | delta_lake: 71 | # delta table properties can be set at stage level or table level 72 | delta_properties: 73 | 
delta.appendOnly: true 74 | delta.autoOptimize.autoCompact: true 75 | delta.autoOptimize.optimizeWrite: true 76 | delta.enableChangeDataFeed: false 77 | managed: false 78 | container: datalake 79 | location: /mnt/{{container}}/data/raw 80 | path: "{{database}}/{{table}}" 81 | options: 82 | mergeSchema: true 83 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 84 | 85 | base: 86 | delta_lake: 87 | container: datalake 88 | location: /mnt/{{container}}/data/base 89 | path: "{{database}}/{{table}}" 90 | options: null 91 | -------------------------------------------------------------------------------- /test/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.types import ( 3 | StructType, 4 | StructField, 5 | StringType, 6 | IntegerType, 7 | DecimalType, 8 | LongType, 9 | ) 10 | from yetl.config import _utils as utils 11 | import re 12 | 13 | 14 | def remove_white_space(string: str): 15 | pattern = re.compile(r"\s+") 16 | string = re.sub(pattern, "", string) 17 | return string 18 | 19 | 20 | @pytest.fixture 21 | def spark_schema(): 22 | spark_schema = StructType( 23 | [ 24 | StructField("firstname", StringType(), True), 25 | StructField("middlename", StringType(), True), 26 | StructField("lastname", StringType(), True), 27 | StructField("id", LongType(), True), 28 | StructField("gender", StringType(), True), 29 | StructField("salary", DecimalType(4, 2), True), 30 | StructField("age", IntegerType(), True), 31 | ] 32 | ) 33 | 34 | return spark_schema 35 | 36 | 37 | @pytest.fixture 38 | def replacements(): 39 | replacements = { 40 | utils.JinjaVariables.DATABASE: "test_database", 41 | utils.JinjaVariables.TABLE: "test_table", 42 | utils.JinjaVariables.CHECKPOINT: "test_checkpoint", 43 | utils.JinjaVariables.FILENAME_DATE_FORMAT: "test_filename_date_format", 44 | utils.JinjaVariables.PATH_DATE_FORMAT: "test_path_date_format", 45 | utils.JinjaVariables.CONTAINER: "test_container", 46 | } 47 | return replacements 48 | 49 | 50 | def test_utils_get_dll_header(spark_schema): 51 | actual = utils.get_ddl(spark_schema=spark_schema, header=True) 52 | expected = [ 53 | "firstname string", 54 | "middlename string", 55 | "lastname string", 56 | "id bigint", 57 | "gender string", 58 | "salary decimal(4,2)", 59 | "age int", 60 | ] 61 | 62 | assert actual == expected 63 | 64 | 65 | def test_utils_get_dll_noheader(spark_schema): 66 | actual = utils.get_ddl(spark_schema=spark_schema, header=False) 67 | expected = [ 68 | "_c0 string", 69 | "_c1 string", 70 | "_c2 string", 71 | "_c3 bigint", 72 | "_c4 string", 73 | "_c5 decimal(4,2)", 74 | "_c6 int", 75 | ] 76 | 77 | assert actual == expected 78 | 79 | 80 | def test_render_jinja(replacements): 81 | data = """ 82 | {{database}} 83 | {{table}} 84 | {{checkpoint}} 85 | {{filename_date_format}} 86 | {{path_date_format}} 87 | {{container}} 88 | """ 89 | 90 | actual = utils.render_jinja(data, replacements) 91 | expected = """ 92 | test_database 93 | test_table 94 | test_checkpoint 95 | test_filename_date_format 96 | test_path_date_format 97 | test_container 98 | """ 99 | 100 | assert actual == expected 101 | 102 | 103 | def test_render_jinja_skip(): 104 | data = """ 105 | {{database}} 106 | """ 107 | 108 | replacements = {utils.JinjaVariables.DATABASE: None} 109 | 110 | actual = utils.render_jinja(data, replacements) 111 | expected = """ 112 | {{database}} 113 | """ 114 | 115 | assert actual == expected 116 | 117 | 118 | def test_get_html_table(): 119 | 
test_kv = {"test": "succeeded", "test1": {"test1_1": "also fine"}} 120 | actual = remove_white_space(utils.get_html_table(test_kv)) 121 | expected = """ 122 | 123 | 124 | 125 | 126 |
<tr><th>Name</th><th>Source</th><th>Destination</th></tr>
<tr><td>test</td><td>succeeded</td></tr>
<tr><td>test1.test1_1</td><td>also fine</td></tr>
127 | """ 128 | expected = remove_white_space(expected) 129 | 130 | assert actual == expected 131 | -------------------------------------------------------------------------------- /yetl/cli/_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import pkg_resources 4 | from ..validation import ( 5 | SchemaFiles, 6 | get_schema, 7 | ) 8 | import json 9 | from ..resource import get_resource_binary, get_resource_text 10 | 11 | 12 | def init(project: str, directory: str = "."): 13 | project = project.lower() 14 | project_path = os.path.abspath(directory) 15 | project_path = f"{project_path}/{project}" 16 | paths: dict = _make_project_dir(project_path, project) 17 | _create_log_file(project_path) 18 | _create_json_schema(project_path, paths["pipeline"]) 19 | _create_tables_excel(project_path, paths["pipeline"]) 20 | 21 | for _, p in paths.items(): 22 | _make_dirs(project_path, p) 23 | 24 | 25 | def _make_dirs(project_path: str, relative_path: str): 26 | relative_path.replace("./", "") 27 | path = f"{project_path}/{relative_path}" 28 | os.makedirs(path, exist_ok=True) 29 | 30 | 31 | def _create_json_schema(project_path: str, pipeline_dir: str): 32 | """Create json schema files to assist with vscode editing and validation""" 33 | 34 | json_schema_path = os.path.abspath(project_path) 35 | json_schema_path = os.path.join(json_schema_path, pipeline_dir, "json_schema") 36 | os.makedirs(json_schema_path, exist_ok=True) 37 | 38 | for f in SchemaFiles: 39 | schema = get_schema(f) 40 | schema_path = os.path.join(json_schema_path, f.value) 41 | with open(schema_path, "w", encoding="utf-8") as f: 42 | f.write(json.dumps(schema, indent=4)) 43 | 44 | 45 | def _get_default_config(name: str): 46 | """Get the default configuration""" 47 | config = get_resource_text(name) 48 | 49 | return config 50 | 51 | 52 | def _get_binary_template(name: str): 53 | """Get the binary template object""" 54 | data = get_resource_binary(name) 55 | 56 | return data 57 | 58 | 59 | def _create_log_file(project_path: str): 60 | config: dict = yaml.safe_load(_get_default_config("logging.yaml")) 61 | file_path = os.path.join(project_path, "logging.yaml") 62 | with open(file_path, "w", encoding="utf-8") as f: 63 | f.write(yaml.safe_dump(config, indent=4)) 64 | 65 | 66 | def _create_tables_excel(project_path: str, pipeline_dir: str): 67 | data: bytes = _get_binary_template("tables.xlsx") 68 | 69 | pipeline_path = os.path.abspath(project_path) 70 | pipeline_path = os.path.join(pipeline_path, pipeline_dir) 71 | file_path = os.path.join(pipeline_path, "tables.xlsx") 72 | with open(file_path, "wb") as f: 73 | f.write(data) 74 | 75 | 76 | def _make_project_dir(project_path: str, project: str): 77 | config: dict = yaml.safe_load(_get_default_config("project.yaml")) 78 | config["name"] = project 79 | config["version"] = pkg_resources.get_distribution("yetl-framework").version 80 | 81 | pipeline_path = config["pipeline"] 82 | paths = { 83 | "sql": config["sql"], 84 | "spark_schema": config["spark_schema"], 85 | "pipeline": pipeline_path, 86 | "databricks_notebooks": config["databricks_notebooks"], 87 | "databricks_workflows": config["databricks_workflows"], 88 | "databricks_queries": config["databricks_queries"], 89 | } 90 | 91 | try: 92 | os.makedirs(project_path, exist_ok=False) 93 | except Exception as e: 94 | raise Exception(f"project {project} already exists at this path") from e 95 | 96 | project_file_path = os.path.join(project_path, f"{project}.yaml") 97 | with 
open(project_file_path, "w", encoding="utf-8") as f: 98 | f.write( 99 | f"# yaml-language-server: $schema={pipeline_path}/json_schema/sibytes_yetl_project_schema.json\n\n" 100 | ) 101 | f.write(yaml.safe_dump(config, indent=4)) 102 | 103 | return paths 104 | -------------------------------------------------------------------------------- /yetl/config/table/_table.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import BaseModel, Field, PrivateAttr 3 | from .._utils import JinjaVariables 4 | from typing import Any, Dict, Union, List, Optional 5 | from .._timeslice import Timeslice 6 | from .._stage_type import StageType 7 | from ._table_type import TableType 8 | from .._project import Project 9 | from enum import Enum 10 | from .._utils import render_jinja 11 | 12 | 13 | class ValidationThresholdType(Enum): 14 | exception = ("exception",) 15 | warning = "warning" 16 | 17 | 18 | class ValidationThreshold(BaseModel): 19 | def __init__(self, **data: Any) -> None: 20 | super().__init__(**data) 21 | 22 | invalid_ratio: Optional[float] = Field(default=None) 23 | invalid_rows: Optional[int] = Field(default=None) 24 | max_rows: Optional[int] = Field(default=None) 25 | min_rows: Optional[int] = Field(default=None) 26 | 27 | @classmethod 28 | def default_select_sql(cls): 29 | sql = "null" 30 | return sql 31 | 32 | def select_sql(self): 33 | thresholds_sql = [] 34 | if self.invalid_ratio is not None: 35 | thresholds_sql.append( 36 | f"cast({self.invalid_ratio} as double) as invalid_ratio" 37 | ) 38 | else: 39 | thresholds_sql.append("null as invalid_ratio") 40 | 41 | if self.invalid_rows is not None: 42 | thresholds_sql.append(f"cast({self.invalid_rows} as long) as invalid_rows") 43 | else: 44 | thresholds_sql.append("null as invalid_rows") 45 | 46 | if self.max_rows is not None: 47 | thresholds_sql.append(f"cast({self.max_rows} as long) as max_rows") 48 | else: 49 | thresholds_sql.append("null as max_rows") 50 | 51 | if self.min_rows is not None: 52 | thresholds_sql.append(f"cast({self.min_rows} as long) as min_rows") 53 | else: 54 | thresholds_sql.append("null as min_rows") 55 | 56 | sql = f""" 57 | struct( 58 | {",".join(thresholds_sql)} 59 | ) 60 | """ 61 | 62 | return sql 63 | 64 | 65 | class Table(BaseModel): 66 | def __init__(self, **data: Any) -> None: 67 | super().__init__(**data) 68 | self._logger = logging.getLogger(self.__class__.__name__) 69 | 70 | _logger: Any = PrivateAttr(default=None) 71 | _rendered: bool = PrivateAttr(default=False) 72 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 73 | stage: StageType = Field(...) 74 | database: str = Field(...) 75 | table: str = Field(...) 76 | id: Union[str, List[str]] = Field(default=[]) 77 | custom_properties: Optional[Dict[str, Any]] = Field(default=None) 78 | table_type: TableType = Field(...) 79 | warning_thresholds: Optional[ValidationThreshold] = Field(default=None) 80 | exception_thresholds: Optional[ValidationThreshold] = Field(default=None) 81 | project: Project = Field(...) 82 | container: str = Field(...) 83 | location: str = Field(...) 84 | path: Optional[str] = Field(default=None) 85 | options: dict = Field(...) 86 | timeslice: Timeslice = Field(...) 87 | checkpoint: Optional[str] = Field(default=None) 88 | config_path: str = Field(...) 
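    # Illustrative sketch, not part of the original source: the warning/exception
    # thresholds above are populated from tables.yaml and rendered to SQL by
    # ValidationThreshold.select_sql(). Assuming the raw customers thresholds from
    # the test config:
    #   ValidationThreshold(invalid_ratio=0.2, invalid_rows=2, max_rows=1000, min_rows=0).select_sql()
    # returns, whitespace aside, roughly:
    #   struct(
    #       cast(0.2 as double) as invalid_ratio,
    #       cast(2 as long) as invalid_rows,
    #       cast(1000 as long) as max_rows,
    #       cast(0 as long) as min_rows
    #   )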
89 | catalog: Optional[str] = Field(default=None) 90 | 91 | def _render(self): 92 | self._replacements = { 93 | JinjaVariables.TABLE: self.table, 94 | JinjaVariables.DATABASE: self.database, 95 | JinjaVariables.CONTAINER: self.container, 96 | JinjaVariables.CHECKPOINT: self.checkpoint, 97 | JinjaVariables.PROJECT: self.project.name, 98 | JinjaVariables.CATALOG: self.catalog, 99 | } 100 | 101 | def render(self): 102 | self._replacements[JinjaVariables.CHECKPOINT] = self.checkpoint 103 | 104 | if self.options: 105 | for option, value in self.options.items(): 106 | if isinstance(value, str): 107 | self.options[option] = render_jinja(value, self._replacements) 108 | 109 | def thresholds_select_sql(self, threshold_type: ValidationThresholdType): 110 | if threshold_type == ValidationThresholdType.exception: 111 | if self.exception_thresholds: 112 | return self.exception_thresholds.select_sql() 113 | else: 114 | return ValidationThreshold.default_select_sql() 115 | 116 | if threshold_type == ValidationThresholdType.warning: 117 | if self.warning_thresholds: 118 | return self.warning_thresholds.select_sql() 119 | else: 120 | return ValidationThreshold.default_select_sql() 121 | 122 | def _set_catalog(self, catalog: str = None): 123 | if catalog: 124 | self.catalog = catalog 125 | 126 | def create_table(self, catalog: str = None): 127 | self._set_catalog(catalog) 128 | 129 | def create_database(self, catalog: str = None): 130 | self._set_catalog(catalog) 131 | 132 | def qualified_table_name(self): 133 | pass 134 | -------------------------------------------------------------------------------- /yetl/config/_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .table import Table 3 | from ._timeslice import Timeslice 4 | from ._tables import Tables, _INDEX_WILDCARD, KeyContants 5 | from ._stage_type import StageType 6 | from ._utils import abs_config_path, load_yaml, get_config_path, check_version 7 | from ._logging_config import configure_logging 8 | import logging 9 | from ._project import Project 10 | from ..validation import validate_tables, validate_pipeline 11 | from typing import Union 12 | 13 | 14 | class Config: 15 | def __init__( 16 | self, 17 | project: str, 18 | pipeline: str, 19 | timeslice: Timeslice = None, 20 | config_path: str = None, 21 | ): 22 | self.config_path = get_config_path(project, config_path) 23 | self._logger = logging.getLogger(self.__class__.__name__) 24 | configure_logging(project, self.config_path) 25 | if not timeslice: 26 | timeslice = Timeslice( 27 | year=_INDEX_WILDCARD, month=_INDEX_WILDCARD, day=_INDEX_WILDCARD 28 | ) 29 | self.project = self._load_project(project) 30 | self.pipeline = pipeline 31 | self.tables = self._load_tables(timeslice) 32 | 33 | def _load_project(self, project: str): 34 | project_file_path = os.path.join(self.config_path, f"{project}.yaml") 35 | project_config = load_yaml(project_file_path) 36 | check_version(project_config) 37 | project_config["config_path"] = self.config_path 38 | project = Project(**project_config) 39 | return project 40 | 41 | def _load_pipeline(self, pipeline: str): 42 | pipeline_file = f"{pipeline}.yaml" 43 | config_file_path = os.path.join(self.project.pipelines, pipeline_file) 44 | pipeline = load_yaml(config_file_path) 45 | validate_pipeline(pipeline) 46 | check_version(pipeline) 47 | return pipeline 48 | 49 | def _load_tables(self, timeslice: Timeslice): 50 | tables_config = self._load_pipeline(self.pipeline) 51 | tables_path = 
tables_config[KeyContants.TABLES.value] 52 | tables_path = abs_config_path(self.project.pipelines, tables_path) 53 | 54 | data: dict = load_yaml(tables_path) 55 | validate_tables(data) 56 | check_version(data) 57 | 58 | tables_config[KeyContants.TABLES.value] = data 59 | tables_config[KeyContants.TIMESLICE.value] = timeslice 60 | tables_config[KeyContants.CONFIG_PATH.value] = self.project.pipelines 61 | tables_config[KeyContants.PROJECT.value] = self.project 62 | 63 | tables = Tables(table_data=tables_config) 64 | return tables 65 | 66 | def create_tables( 67 | self, 68 | stage: Union[StageType, str] = _INDEX_WILDCARD, 69 | database=_INDEX_WILDCARD, 70 | catalog: str = None, 71 | **kwargs, 72 | ): 73 | return self.tables.create_table( 74 | stage=stage, 75 | database=database, 76 | first_match=False, 77 | catalog=catalog, 78 | **kwargs, 79 | ) 80 | 81 | def create_table( 82 | self, 83 | stage: Union[StageType, str] = _INDEX_WILDCARD, 84 | database=_INDEX_WILDCARD, 85 | table=_INDEX_WILDCARD, 86 | catalog: str = None, 87 | **kwargs, 88 | ): 89 | return self.tables.create_table( 90 | stage=stage, 91 | database=database, 92 | table=table, 93 | first_match=True, 94 | catalog=catalog, 95 | **kwargs, 96 | ) 97 | 98 | def get_table_mapping( 99 | self, 100 | stage: StageType, 101 | database: str = _INDEX_WILDCARD, 102 | table: str = _INDEX_WILDCARD, 103 | create_database: bool = False, 104 | create_table: bool = False, 105 | catalog: str = None, 106 | ): 107 | table_mapping = self.tables.get_table_mapping( 108 | stage=stage, 109 | table=table, 110 | database=database, 111 | create_database=create_database, 112 | create_table=create_table, 113 | catalog=catalog, 114 | ) 115 | 116 | return table_mapping 117 | 118 | def set_checkpoint( 119 | self, 120 | source: Table, 121 | destination: Table, 122 | checkpoint_name: str = None, 123 | ): 124 | if not checkpoint_name: 125 | checkpoint_name = f"{source.database}.{source.table}-{destination.database}.{destination.table}" 126 | 127 | source.checkpoint = checkpoint_name 128 | source.render() 129 | destination.checkpoint = checkpoint_name 130 | destination.render() 131 | 132 | def lookup_table( 133 | self, 134 | stage: Union[StageType, str] = _INDEX_WILDCARD, 135 | database=_INDEX_WILDCARD, 136 | table=_INDEX_WILDCARD, 137 | first_match: bool = True, 138 | create_database: bool = False, 139 | create_table: bool = False, 140 | catalog: str = None, 141 | **kwargs, 142 | ): 143 | return self.tables.lookup_table( 144 | stage=stage, 145 | database=database, 146 | table=table, 147 | first_match=first_match, 148 | create_database=create_database, 149 | create_table=create_table, 150 | catalog=catalog, 151 | **kwargs, 152 | ) 153 | -------------------------------------------------------------------------------- /test/integration/test_configuration_load.py: -------------------------------------------------------------------------------- 1 | from yetl import Config, Timeslice, StageType, Read, DeltaLake, yetl_flow, TableMapping, ValidationThreshold 2 | from yetl.config._project import SparkLoggingLevel 3 | from yetl.config.table import TableType 4 | from yetl.config.table._read import SliceDateFormat 5 | import pytest 6 | import os 7 | import shutil 8 | 9 | 10 | 11 | @pytest.fixture() 12 | def tear_down(): 13 | def tear_down_fn(): 14 | shutil.rmtree("./test/config/test_project/data", ignore_errors=True) 15 | shutil.rmtree("./metastore_db", ignore_errors=True) 16 | shutil.rmtree("./spark-warehouse", ignore_errors=True) 17 | try: 18 | os.remove("./derby.log") 19 | 
except Exception: 20 | pass 21 | return tear_down_fn 22 | 23 | @pytest.fixture() 24 | def root_path(): 25 | 26 | root = os.path.abspath(os.getcwd()) 27 | return root 28 | 29 | 30 | def test_configuration_load(tear_down, root_path): 31 | tear_down() 32 | pipeline = "autoloader" 33 | config_path = "./test/config" 34 | project = "test_project" 35 | timeslice = Timeslice(day="*", month="*", year="*") 36 | config = Config( 37 | project=project, 38 | pipeline=pipeline, 39 | config_path=config_path, 40 | timeslice=timeslice 41 | ) 42 | table_mapping = config.get_table_mapping( 43 | stage=StageType.raw, table="customers" 44 | ) 45 | 46 | source: Read = table_mapping.source["customer_details_1"] 47 | destination: DeltaLake = table_mapping.destination 48 | config.set_checkpoint(source=source, destination=destination) 49 | 50 | assert source.table == "customer_details_1" 51 | assert source.slice_date == SliceDateFormat.FILENAME_DATE_FORMAT 52 | assert source.slice_date_column_name == "_slice_date" 53 | 54 | assert destination.table == "customers" 55 | assert destination.stage == StageType.raw 56 | assert destination.database =='raw_dbx_patterns' 57 | assert destination.table=='customers' 58 | assert destination.id=='id' 59 | 60 | assert destination.custom_properties == {'process_group': 1,'rentention_days': 365} 61 | assert destination.table_type == TableType.delta_lake 62 | assert destination.warning_thresholds == ValidationThreshold(invalid_ratio=0.1, invalid_rows=0, max_rows=100, min_rows=5) 63 | assert destination.exception_thresholds == ValidationThreshold(invalid_ratio=0.2, invalid_rows=2, max_rows=1000, min_rows=0) 64 | assert destination.project.config_path == f'{root_path}/test/config/test_project' 65 | assert destination.project.name == 'test_project' 66 | assert destination.project.sql == f'{root_path}/test/config/test_project/sql' 67 | assert destination.project.pipelines == f'{root_path}/test/config/test_project/pipelines' 68 | assert destination.project.databricks_notebooks == f'{root_path}/test/config/test_project/databricks/notebooks' 69 | assert destination.project.databricks_workflows == f'{root_path}/test/config/test_project/databricks/workflows' 70 | assert destination.project.databricks_queries == f'{root_path}/test/config/test_project/databricks/queries' 71 | # assert destination.project.spark.config == { 72 | # 'spark.master': 'local', 'spark.databricks.delta.allowArbitraryProperties.enabled': True, 73 | # 'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog', 74 | # 'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension' 75 | # } 76 | assert destination.project.spark.logging_level == SparkLoggingLevel.ERROR 77 | assert destination.container == 'datalake' 78 | assert destination.location == f'{root_path}/test/config/test_project/data/mnt/datalake/data/raw/raw_dbx_patterns/customers' 79 | assert destination.path == 'raw_dbx_patterns/customers' 80 | assert destination.options == {'mergeSchema': True, 'checkpointLocation': '/mnt/datalake/checkpoint/test_project/landing_dbx_patterns.customer_details_1-raw_dbx_patterns.customers'} 81 | assert destination.timeslice == Timeslice(year='*', month='*', day='*', hour=0, minute=0, second=0, microsecond=0) 82 | assert destination.checkpoint == 'landing_dbx_patterns.customer_details_1-raw_dbx_patterns.customers' 83 | assert destination.delta_constraints == None 84 | assert destination.partition_by == None 85 | assert destination.z_order_by == ["_load_date_1", "_load_date_2"] 86 | assert 
destination.managed == False 87 | assert destination.sql == None 88 | assert destination.vacuum == 30 89 | assert destination.catalog == None 90 | 91 | 92 | 93 | def test_decorator_configuration_load(tear_down): 94 | @yetl_flow( 95 | project="test_project", 96 | stage=StageType.raw, 97 | config_path="./test/config", 98 | catalog=None 99 | ) 100 | def autoloader(table_mapping:TableMapping): 101 | return table_mapping 102 | 103 | 104 | result = autoloader(table="customers") 105 | tear_down() 106 | assert result.source["customer_details_1"].table == "customer_details_1" 107 | assert result.destination.table == "customers" 108 | 109 | 110 | def test_decorator_configuration_audit_load(tear_down): 111 | @yetl_flow( 112 | project="test_project", 113 | stage=StageType.audit_control, 114 | config_path="./test/config", 115 | catalog=None 116 | ) 117 | def autoloader(table_mapping:TableMapping): 118 | return table_mapping 119 | 120 | 121 | result = autoloader(table="header_footer") 122 | tear_down() 123 | assert result.source.table == "customers" 124 | assert result.destination.table == "header_footer" 125 | 126 | 127 | -------------------------------------------------------------------------------- /yetl/config/_utils.py: -------------------------------------------------------------------------------- 1 | import jinja2 2 | from enum import Enum 3 | import yaml 4 | from pyspark.sql.types import StructType 5 | from typing import Dict 6 | import os 7 | import pkg_resources 8 | import logging 9 | 10 | 11 | YETL_CONFIG = "YETL_CONFIG" 12 | _ENCODING = "utf-8" 13 | _DBX_WORKSPACE_PATH = "/Workspace" 14 | _DBX_REPO_PATH = "/Workspace/Repos" 15 | 16 | 17 | class VersionNotFoundException(Exception): 18 | def __init__(self, *args: object) -> None: 19 | super().__init__(*args) 20 | 21 | 22 | class JinjaVariables(Enum): 23 | DATABASE = "database" 24 | TABLE = "table" 25 | CHECKPOINT = "checkpoint" 26 | FILENAME_DATE_FORMAT = "filename_date_format" 27 | PATH_DATE_FORMAT = "path_date_format" 28 | CONTAINER = "container" 29 | DELTA_PROPERTIES = "delta_properties" 30 | LOCATION = "location" 31 | PROJECT = "project" 32 | CATALOG = "catalog" 33 | 34 | 35 | def is_databricks(): 36 | return "DATABRICKS_RUNTIME_VERSION" in os.environ 37 | 38 | 39 | def check_version(data: dict): 40 | _logger = logging.getLogger(__name__) 41 | version = data.get("version") 42 | 43 | if version is None: 44 | raise VersionNotFoundException() 45 | 46 | del data["version"] 47 | 48 | pkg_version = pkg_resources.get_distribution("yetl-framework").version 49 | pkg_version = pkg_version.split(".") 50 | 51 | try: 52 | version = version.split(".") 53 | if pkg_version[0] != version[0] or pkg_version[1] != version[1]: 54 | _logger.warning( 55 | f"Configuration and library shows that it's incompatible config version config_version={version} package_version={pkg_version}" 56 | ) 57 | 58 | except Exception as e: 59 | raise VersionNotFoundException from e 60 | 61 | 62 | def render_jinja(data: str, replacements: Dict[JinjaVariables, str]): 63 | _logger = logging.getLogger(__name__) 64 | _logger.debug(f"Rendering Jinja string {data}") 65 | if data and isinstance(data, str): 66 | replace = {k.value: v for (k, v) in replacements.items()} 67 | skip = False 68 | for k, v in replace.items(): 69 | if v is None and "{{" + k + "}}" in data.replace(" ", ""): 70 | skip = True 71 | break 72 | 73 | if not skip: 74 | template: jinja2.Template = jinja2.Template(data) 75 | data = template.render(replace) 76 | _logger.debug(f"Rendered Jinja string {data}") 77 | 78 | 
return data 79 | 80 | 81 | def abs_config_path(root: str, path: str): 82 | _logger = logging.getLogger(__name__) 83 | if not os.path.isabs(path): 84 | path = os.path.join(root, path) 85 | _logger.debug(f"Absolute config path {path}") 86 | return path 87 | 88 | 89 | def get_config_path(project: str, path: str): 90 | _logger = logging.getLogger(__name__) 91 | 92 | default_path = "." 93 | if is_databricks(): 94 | if os.path.exists(_DBX_WORKSPACE_PATH): 95 | default_path = f"{_DBX_WORKSPACE_PATH}" 96 | 97 | if os.getcwd().startswith(_DBX_REPO_PATH): 98 | repo_path = "/".join(os.getcwd().split("/")[0:5]) 99 | default_path = repo_path 100 | 101 | if not path: 102 | path = os.getenv(YETL_CONFIG, default_path) 103 | path = os.path.abspath(path) 104 | path = os.path.join(path, project) 105 | _logger.info(f"Absolute root config path {path}") 106 | return path 107 | 108 | 109 | def load_schema(path: str): 110 | _logger = logging.getLogger(__name__) 111 | schema = load_yaml(path) 112 | _logger.info(f"Loading schema {path}") 113 | schema = StructType.fromJson(schema) 114 | 115 | return schema 116 | 117 | 118 | def load_yaml(path: str): 119 | _logger = logging.getLogger(__name__) 120 | _logger.info(f"Loading yaml file {path}") 121 | with open(path, "r", encoding=_ENCODING) as f: 122 | try: 123 | data = yaml.safe_load(f) 124 | except yaml.YAMLError as e: 125 | location = "" 126 | if hasattr(e, "problem_mark"): 127 | mark = e.problem_mark 128 | location = f"Error position ({mark.line}, {mark.column})" 129 | 130 | if hasattr(e, "problem"): 131 | problem = f"{e.problem}." 132 | 133 | raise Exception(f"Invalid yaml format in {path}. {problem} {location}") 134 | _logger.debug(data) 135 | return data 136 | 137 | 138 | def load_text(path: str): 139 | _logger = logging.getLogger(__name__) 140 | _logger.info(f"Loading text file {path}") 141 | with open(path, "r", encoding=_ENCODING) as f: 142 | data = f.read() 143 | _logger.debug(data) 144 | return data 145 | 146 | 147 | def get_ddl(spark_schema: StructType, header: bool = True): 148 | _logger = logging.getLogger(__name__) 149 | _logger.debug(f"Converting spark schema to ddl with header={str(header)}") 150 | if header: 151 | ddl = [f"{f.name} {f.dataType.simpleString()}" for f in spark_schema.fields] 152 | _logger.debug(ddl) 153 | else: 154 | ddl = [ 155 | f"_c{i} {f.dataType.simpleString()}" 156 | for i, f in enumerate(spark_schema.fields) 157 | ] 158 | _logger.debug(ddl) 159 | 160 | return ddl 161 | 162 | 163 | def get_html_table(data: dict): 164 | rows = [] 165 | for k, v in data.items(): 166 | if isinstance(v, dict): 167 | for ki, vi in v.items(): 168 | row = f"{k}.{ki}{vi}" 169 | rows.append(row) 170 | else: 171 | row = f"{k}{v}" 172 | rows.append(row) 173 | 174 | html = "".join(rows) 175 | 176 | html = f""" 177 | 178 | 179 | {html} 180 |
<tr><th>Name</th><th>Source</th><th>Destination</th></tr>
181 | """ 182 | 183 | return html 184 | -------------------------------------------------------------------------------- /yetl/config/table/_deltalake.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import Field, PrivateAttr 3 | from .._utils import ( 4 | JinjaVariables, 5 | render_jinja, 6 | is_databricks, 7 | abs_config_path, 8 | load_text, 9 | ) 10 | from typing import Any, Dict, Union, List, Optional 11 | from .._timeslice import Timeslice 12 | import os 13 | from .._stage_type import StageType 14 | from ._table import Table 15 | from ..deltalake import DeltaLakeFn 16 | from pyspark.sql.types import StructType 17 | 18 | 19 | class DeltaLake(Table): 20 | def __init__(self, **data: Any) -> None: 21 | super().__init__(**data) 22 | self._logger = logging.getLogger(self.__class__.__name__) 23 | self._spark = DeltaLakeFn(project=self.project) 24 | self._render() 25 | 26 | @classmethod 27 | def in_allowed_stages(cls, stage: StageType): 28 | return stage in (stage.raw, stage.base, stage.curated) 29 | 30 | _logger: Any = PrivateAttr(default=None) 31 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 32 | _spark: DeltaLakeFn = PrivateAttr(default=None) 33 | depends_on: Optional[List[str]] = Field(default=[]) 34 | delta_properties: Optional[Dict[str, Union[str, bool, int, float]]] = Field( 35 | default=None 36 | ) 37 | delta_constraints: Optional[Dict[str, str]] = Field(default=None) 38 | partition_by: Optional[Union[List[str], str]] = Field(default=None) 39 | cluster_by: Optional[Union[List[str], str]] = Field(default=None) 40 | z_order_by: Optional[Union[List[str], str]] = Field(default=None) 41 | vacuum: Optional[int] = Field(default=31) 42 | options: Optional[Union[dict, None]] = Field(default=None) 43 | timeslice: Timeslice = Field(...) 44 | location: Optional[str] = Field(default=None) 45 | stage: StageType = Field(...) 
46 | managed: Optional[bool] = Field(default=False) 47 | 48 | sql: Optional[str] = Field(default=None) 49 | 50 | def _load_sql(self, path: str): 51 | path = abs_config_path(self.config_path, path) 52 | sql = load_text(path) 53 | return sql 54 | 55 | def _render(self): 56 | super()._render() 57 | if not self._rendered: 58 | if self.delta_properties: 59 | delta_properties_sql = self._spark.get_delta_properties_sql( 60 | self.delta_properties 61 | ) 62 | self._replacements[ 63 | JinjaVariables.DELTA_PROPERTIES 64 | ] = delta_properties_sql 65 | self.database = render_jinja(self.database, self._replacements) 66 | self.table = render_jinja(self.table, self._replacements) 67 | self.location = render_jinja(self.location, self._replacements) 68 | self.path = render_jinja(self.path, self._replacements) 69 | if self.location and self.path: 70 | self.location = os.path.join(self.location, self.path) 71 | if not is_databricks(): 72 | self.location = f"{self.config_path}/../data{self.location}" 73 | self.location = os.path.abspath(self.location) 74 | self._replacements[JinjaVariables.LOCATION] = self.location 75 | 76 | if self.sql: 77 | # render the path 78 | self.sql = render_jinja(self.sql, self._replacements) 79 | # load the file 80 | self.sql = self._load_sql(self.sql) 81 | # render the SQL 82 | self.sql = render_jinja(self.sql, self._replacements) 83 | 84 | if self.options: 85 | for option, value in self.options.items(): 86 | if isinstance(value, str): 87 | self.options[option] = render_jinja(value, self._replacements) 88 | 89 | self._rendered = True 90 | 91 | def create_database(self, catalog: str = None): 92 | super().create_database(catalog=catalog) 93 | self._spark.create_database(self.database, catalog=self.catalog) 94 | 95 | # TODO: alter table 96 | def create_table( 97 | self, catalog: str = None, schema: StructType = None, create_database=True 98 | ): 99 | if create_database: 100 | self.create_database(catalog=catalog) 101 | super().create_table(catalog=catalog) 102 | if self._spark.table_exists( 103 | database=self.database, table=self.table, catalog=self.catalog 104 | ): 105 | pass 106 | # TODO: alter table 107 | if self.catalog: 108 | msg = f"table `{self.catalog}`.{self.database}`.`{self.table}` already exists." 109 | else: 110 | msg = f"table {self.database}`.`{self.table}` already exists." 
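                # Illustrative note, not part of the original source: with the test
                # fixture (database=raw_dbx_patterns, table=customers, no catalog) the
                # message logged here reads roughly
                #   table raw_dbx_patterns.customers already exists.
                # and creation is skipped; otherwise the table is created below, either
                # managed or at the external location, depending on the managed flag.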
111 | self._logger.info(msg) 112 | else: 113 | if self.managed: 114 | self._spark.create_table( 115 | database=self.database, 116 | table=self.table, 117 | delta_properties=self.delta_properties, 118 | sql=self.sql, 119 | catalog=self.catalog, 120 | schema=schema, 121 | cluster_by=self.cluster_by, 122 | partition_by=self.partition_by, 123 | ) 124 | else: 125 | self._spark.create_table( 126 | database=self.database, 127 | table=self.table, 128 | delta_properties=self.delta_properties, 129 | path=self.location, 130 | sql=self.sql, 131 | catalog=self.catalog, 132 | schema=schema, 133 | cluster_by=self.cluster_by, 134 | partition_by=self.partition_by, 135 | ) 136 | 137 | def qualified_table_name(self): 138 | name = f"`{self.database}`.`{self.table}`" 139 | if self.catalog: 140 | name = f"`{self.catalog}`.{name}" 141 | return name 142 | -------------------------------------------------------------------------------- /yetl/config/_timeslice.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Literal, Union 3 | from pydantic import BaseModel, Field 4 | from typing import Any 5 | import re 6 | 7 | _WILDCARD = "*" 8 | Wildcard = Literal["*"] 9 | 10 | _UNSUPPORTED_FORMAT_CODES = [ 11 | "%c", 12 | "%x", 13 | "%X", 14 | "%G", 15 | "%u", 16 | "%V", 17 | "%z", 18 | "%Z", 19 | "%I", 20 | "%p", 21 | "%b", 22 | "%B", 23 | "%a", 24 | "%A", 25 | "%w", 26 | ] 27 | 28 | 29 | class Timeslice(BaseModel): 30 | def __init__(__pydantic_self__, **data: Any) -> None: 31 | super().__init__(**data) 32 | 33 | year: Union[int, Wildcard] = Field(...) 34 | month: Union[int, Wildcard] = Field(default=_WILDCARD) 35 | day: Union[int, Wildcard] = Field(default=_WILDCARD) 36 | hour: Union[int, Wildcard] = Field(default=0) 37 | minute: Union[int, Wildcard] = Field(default=0) 38 | second: Union[int, Wildcard] = Field(default=0) 39 | microsecond: Union[int, Wildcard] = Field(default=0) 40 | 41 | @classmethod 42 | def parse_iso_date(cls, iso_date: str): 43 | if iso_date == "*": 44 | iso_date = "*-*-*" 45 | pattern = "^(([12]\d{3}|[*])-(0[1-9]|1[0-2]|[*])-(0[1-9]|[12]\d|3[01]|[*]))$" # noqa W605 46 | result = re.match(pattern, iso_date) 47 | 48 | if result: 49 | parts = iso_date.split("-") 50 | args = {"year": parts[0], "month": parts[1], "day": parts[2]} 51 | return cls(**args) 52 | else: 53 | raise Exception( 54 | f"{iso_date} is an invalid iso date string. Must be the format YYYY-mm-dd" 55 | ) 56 | 57 | def strftime(self, format: str): 58 | """This will format and return the timeslice using python format codes. Only a subset of format codes are suppoered by design 59 | %d - Day of the month as a zero-padded decimal number. 60 | %m - Month as a zero-padded decimal number. 61 | %y - Year without century as a zero-padded decimal number. 62 | %Y - Year with century as a decimal number. 63 | %H - Hour (24-hour clock) as a zero-padded decimal number. 64 | %M - Minute as a zero-padded decimal number. 65 | %S - Second as a zero-padded decimal number. 66 | %f - Microsecond as a decimal number, zero-padded to 6 digits. 67 | %% - A literal '%' character. 68 | %j - Day of the year as a zero-padded decimal number. 69 | %U - Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. 70 | %W - Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. 
All days in a new year preceding the first Monday are considered to be in week 0. 71 | NOT SUPPORTED - %c - Locale’s appropriate date and time representation. 72 | NOT SUPPORTED - %x - Locale’s appropriate date representation. 73 | NOT SUPPORTED - %X - Locale’s appropriate time representation. 74 | NOT SUPPORTED - %G - ISO 8601 year with century representing the year that contains the greater part of the ISO week (%V). 75 | NOT SUPPORTED - %u - ISO 8601 weekday as a decimal number where 1 is Monday. 76 | NOT SUPPORTED - %V - ISO 8601 week as a decimal number with Monday as the first day of the week. Week 01 is the week containing Jan 4. 77 | NOT SUPPORTED - %z - UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). 78 | NOT SUPPORTED - %Z - Time zone name (empty string if the object is naive). 79 | NOT SUPPORTED - %I - Hour (12-hour clock) as a zero-padded decimal number. 80 | NOT SUPPORTED - %p - Locale’s equivalent of either AM or PM. 81 | NOT SUPPORTED - %b - Month as locale’s abbreviated name. 82 | NOT SUPPORTED - %B - Month as locale’s full name. 83 | NOT SUPPORTED - %a - Weekday as locale’s abbreviated name. 84 | NOT SUPPORTED - %A - Weekday as locale’s full name. 85 | NOT SUPPORTED - %w - Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. 86 | """ 87 | 88 | unsupported_codes = [c for c in _UNSUPPORTED_FORMAT_CODES if c in format] 89 | 90 | if unsupported_codes: 91 | unsupported_codes = ",".join(unsupported_codes) 92 | raise Exception( 93 | f"The format contains the following unsupported format codes: {unsupported_codes}" 94 | ) 95 | 96 | format, _year = self._format_wildcard(format, self.year, ["%y", "%Y"], 1900) 97 | format, _month = self._format_wildcard(format, self.month, "%m", 1) 98 | format, _day = self._format_wildcard(format, self.day, "%d", 1) 99 | 100 | format, _hour = self._format_wildcard(format, self.hour, "%H") 101 | format, _minutue = self._format_wildcard(format, self.minute, "%M") 102 | format, _second = self._format_wildcard(format, self.second, "%S") 103 | format, _microsecond = self._format_wildcard(format, self.microsecond, "%f") 104 | 105 | timeslice = datetime( 106 | _year, _month, _day, _hour, _minutue, _second, _microsecond 107 | ) 108 | 109 | formatted = timeslice.strftime(format) 110 | return formatted 111 | 112 | def _format_wildcard( 113 | self, 114 | format: str, 115 | datepart: Union[int, Wildcard], 116 | format_code: Union[list, str], 117 | default=0, 118 | ): 119 | if datepart == _WILDCARD: 120 | if isinstance(format_code, str): 121 | format = format.replace(format_code, f"{_WILDCARD}") 122 | elif isinstance(format_code, list): 123 | for f in format_code: 124 | format = format.replace(f, f"{_WILDCARD}") 125 | datepart = default 126 | 127 | return format, datepart 128 | 129 | def __str__(self) -> str: 130 | return self.strftime("%Y-%m-%d %H:%M:%S.%f") 131 | 132 | 133 | class TimesliceNow(Timeslice): 134 | def __init__(self) -> None: 135 | now = datetime.now() 136 | args = { 137 | "year": now.year, 138 | "month": now.month, 139 | "day": now.day, 140 | "hour": now.hour, 141 | "minute": now.minute, 142 | "second": now.second, 143 | "microsecond": now.microsecond, 144 | } 145 | super().__init__(**args) 146 | 147 | 148 | class TimesliceUtcNow(Timeslice): 149 | def __init__(self) -> None: 150 | now = datetime.utcnow() 151 | args = { 152 | "year": now.year, 153 | "month": now.month, 154 | "day": now.day, 155 | "hour": now.hour, 156 | "minute": now.minute, 157 | "second": now.second, 158 | "microsecond": 
now.microsecond, 159 | } 160 | super().__init__(**args) 161 | -------------------------------------------------------------------------------- /yetl/config/table/_read.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import Field, PrivateAttr 3 | from .._utils import JinjaVariables, render_jinja, get_ddl, load_schema, abs_config_path 4 | from typing import Any, Dict, List, Union, Optional 5 | from enum import Enum 6 | import os 7 | from pyspark.sql.types import StructType 8 | from pyspark.sql.streaming import StreamingQuery 9 | from pyspark.sql import DataFrame 10 | from pyspark.sql import functions as fn 11 | from .._stage_type import StageType 12 | from ._table import Table 13 | from ..deltalake import DeltaLakeFn 14 | 15 | 16 | class TriggerType(Enum): 17 | File = "file" 18 | 19 | 20 | class SliceDateFormat(Enum): 21 | FILENAME_DATE_FORMAT = JinjaVariables.FILENAME_DATE_FORMAT.value 22 | PATH_DATE_FORMAT = JinjaVariables.PATH_DATE_FORMAT.value 23 | 24 | 25 | class Read(Table): 26 | _OPTION_CF_SCHEMA_HINTS = "cloudFiles.schemaHints" 27 | _OPTION_CORRUPT_RECORD_NAME = "columnNameOfCorruptRecord" 28 | 29 | def add_timeslice( 30 | self, df: DataFrame, filepath_column_name: str = "_metadata.file_path" 31 | ): 32 | if self.slice_date == SliceDateFormat.FILENAME_DATE_FORMAT: 33 | date_format = self.path_date_format 34 | 35 | if self.slice_date == SliceDateFormat.PATH_DATE_FORMAT: 36 | date_format = self.filename_date_format 37 | 38 | pattern = DeltaLakeFn.to_regex_search_pattern(date_format) 39 | spark_format_string = DeltaLakeFn.to_spark_format_code(date_format) 40 | 41 | df = ( 42 | df.withColumn(self.slice_date_column_name, fn.col(filepath_column_name)) 43 | .withColumn( 44 | self.slice_date_column_name, 45 | fn.regexp_extract(fn.col(self.slice_date_column_name), pattern, 0), 46 | ) 47 | .withColumn( 48 | self.slice_date_column_name, 49 | fn.to_timestamp(self.slice_date_column_name, spark_format_string), 50 | ) 51 | ) 52 | 53 | return df 54 | 55 | def __init__(self, **data: Any) -> None: 56 | super().__init__(**data) 57 | self._logger = logging.getLogger(self.__class__.__name__) 58 | self._render() 59 | self.path = os.path.join(self.location, self.filename) 60 | 61 | _logger: Any = PrivateAttr(default=None) 62 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 63 | managed: Optional[bool] = Field(default=False) 64 | trigger: Optional[str] = Field(default=None) 65 | trigger_type: Optional[TriggerType] = Field(default=None) 66 | filename: str = Field(...) 67 | filename_date_format: str = Field(...) 68 | path_date_format: str = Field(...) 69 | format: str = Field(...) 70 | spark_schema: Optional[Union[StructType, str]] = Field(default=None) 71 | ddl: Optional[List[str]] = Field(default=None) 72 | headerless_ddl: Optional[List[str]] = Field(default=None) 73 | stage: StageType = Field(...) 
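    # Illustrative sketch, not part of the original source: the slice_date settings
    # below drive add_timeslice() above, which extracts a date from the file path and
    # adds it as a timestamp column. Assuming the test config (date format %Y%m%d,
    # slice_date_column_name=_slice_date), a hypothetical file such as
    #   customer_details_1-20230101.csv
    # would gain _slice_date = 2023-01-01 00:00:00, parsed out of _metadata.file_path.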
74 | slice_date: Optional[SliceDateFormat] = Field( 75 | default=SliceDateFormat.FILENAME_DATE_FORMAT 76 | ) 77 | slice_date_column_name: Optional[str] = Field(default="_slice_date") 78 | 79 | def _render(self): 80 | super()._render() 81 | self._replacements[ 82 | JinjaVariables.FILENAME_DATE_FORMAT 83 | ] = self.timeslice.strftime(self.filename_date_format) 84 | self._replacements[JinjaVariables.PATH_DATE_FORMAT] = self.timeslice.strftime( 85 | self.path_date_format 86 | ) 87 | if not self._rendered: 88 | self.location = render_jinja(self.location, self._replacements) 89 | self.filename = render_jinja(self.filename, self._replacements) 90 | self.database = render_jinja(self.database, self._replacements) 91 | self.table = render_jinja(self.table, self._replacements) 92 | self.trigger = render_jinja(self.trigger, self._replacements) 93 | 94 | if self.options: 95 | for option, value in self.options.items(): 96 | self.options[option] = render_jinja(value, self._replacements) 97 | 98 | self._config_schema_hints() 99 | 100 | if isinstance(self.spark_schema, str): 101 | path = self.spark_schema 102 | path = render_jinja(path, self._replacements) 103 | path = abs_config_path(self.config_path, path) 104 | if os.path.exists(path): 105 | self._load_schema(path) 106 | else: 107 | self.spark_schema = path 108 | self._logger.warning( 109 | f"Schema path doesn't exist, the schema has not been loaded and remains the path {path}." 110 | ) 111 | 112 | corrupt_record_name = self.options.get( 113 | self._OPTION_CORRUPT_RECORD_NAME, None 114 | ) 115 | if isinstance(self.spark_schema, StructType) and corrupt_record_name: 116 | if corrupt_record_name not in self.spark_schema.names: 117 | self.spark_schema.add(field=corrupt_record_name, data_type="string") 118 | 119 | if self.options: 120 | for option, value in self.options.items(): 121 | if isinstance(value, str): 122 | self.options[option] = render_jinja(value, self._replacements) 123 | 124 | self._rendered = True 125 | 126 | def _config_schema_hints(self): 127 | path = self.options.get(self._OPTION_CF_SCHEMA_HINTS, None) 128 | if path and "/" in path: 129 | self._load_schema(path) 130 | 131 | if self.options.get("header"): 132 | self.options[self._OPTION_CF_SCHEMA_HINTS] = ", ".join(self.ddl) 133 | else: 134 | self.options[self._OPTION_CF_SCHEMA_HINTS] = ", ".join( 135 | self.headerless_ddl 136 | ) 137 | 138 | def _load_schema(self, path: str): 139 | path = abs_config_path(self.config_path, path) 140 | if not self.spark_schema or isinstance(self.spark_schema, str): 141 | self.spark_schema = load_schema(path) 142 | if not self.ddl: 143 | self.ddl = get_ddl(self.spark_schema, header=True) 144 | if not self.headerless_ddl: 145 | self.headerless_ddl = get_ddl(self.spark_schema, header=False) 146 | 147 | def rename_headerless(self, df: Union[StreamingQuery, DataFrame]): 148 | columns = [c for c in df.columns if c not in ["_rescued_data"]] 149 | columns_cnt = len(columns) 150 | ddls = len(self.ddl) 151 | if columns_cnt != ddls: 152 | raise Exception( 153 | f"Headerless files with schema hints must have a fully hinted schema since it must work positionally. 
columns!=ddl ({columns_cnt}!={ddls})" 154 | ) 155 | 156 | for i, _ in enumerate(columns): 157 | from_name = f"_c{i}" 158 | to_name = self.ddl[i].split(" ")[0].strip() 159 | logging.info(f"rename {from_name} to {to_name}") 160 | df: Union[StreamingQuery, DataFrame] = df.withColumnRenamed( 161 | from_name, to_name 162 | ) 163 | 164 | return df 165 | 166 | def qualified_table_name(self): 167 | return self.path 168 | 169 | class Config: 170 | arbitrary_types_allowed = True 171 | -------------------------------------------------------------------------------- /yetl/config/_tables.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field, PrivateAttr 2 | from typing import Union, Any, Dict, List, Optional 3 | from ._stage_type import StageType 4 | import fnmatch 5 | from ._table_mapping import TableMapping 6 | from .table import TableType 7 | from .table import table_factory 8 | from .table import Table 9 | from enum import Enum 10 | import logging 11 | 12 | _INDEX_WILDCARD = "*" 13 | 14 | 15 | class KeyContants(Enum): 16 | DATABASE = "database" 17 | TABLE = "table" 18 | TABLES = "tables" 19 | STAGE = "stage" 20 | TABLE_TYPE = "table_type" 21 | PROJECT = "project" 22 | TIMESLICE = "timeslice" 23 | CONFIG_PATH = "config_path" 24 | 25 | 26 | class PushDownProperties(Enum): 27 | DELTA_PROPETIES = "delta_properties" 28 | CATALOG = "catalog" 29 | 30 | @classmethod 31 | def has_value(cls, value): 32 | return value in cls._value2member_map_ 33 | 34 | @classmethod 35 | def has_not_value(cls, value): 36 | return value not in cls._value2member_map_ 37 | 38 | 39 | class Tables(BaseModel): 40 | def __init__(self, **data: Any) -> None: 41 | super().__init__(**data) 42 | self._logger = logging.getLogger(self.__class__.__name__) 43 | self._parse_configuration() 44 | self._build_tables() 45 | 46 | def _parse_configuration(self): 47 | push_down_properties = {} 48 | for stage_name, table_type in self.table_data["tables"].items(): 49 | stage_type = StageType(stage_name) 50 | for table_type_name, database in table_type.items(): 51 | table_type = TableType(table_type_name) 52 | push_down_properties = {} 53 | for database_name, table in database.items(): 54 | if PushDownProperties.has_not_value(database_name): 55 | catalog = table.get(PushDownProperties.CATALOG.value) 56 | if PushDownProperties.CATALOG.value in table: 57 | del table[PushDownProperties.CATALOG.value] 58 | for table_name, table_properties in table.items(): 59 | table_config = { 60 | KeyContants.DATABASE.value: database_name, 61 | KeyContants.TABLE.value: table_name, 62 | KeyContants.STAGE.value: stage_type, 63 | KeyContants.TABLE_TYPE.value: table_type, 64 | KeyContants.PROJECT.value: self.table_data.get( 65 | KeyContants.PROJECT.value 66 | ), 67 | KeyContants.TIMESLICE.value: self.table_data.get( 68 | KeyContants.TIMESLICE.value 69 | ), 70 | KeyContants.CONFIG_PATH.value: self.table_data.get( 71 | KeyContants.CONFIG_PATH.value 72 | ), 73 | } 74 | if table_properties: 75 | table_config = {**table_config, **table_properties} 76 | table_config = {**push_down_properties, **table_config} 77 | table_config[PushDownProperties.CATALOG.value] = catalog 78 | for p, v in push_down_properties.items(): 79 | if isinstance(v, dict) and table_config.get(p): 80 | table_config[p] = {**v, **table_config[p]} 81 | else: 82 | table_config[p] = v 83 | stage_config = self.table_data.get(stage_type.value, {}) 84 | stage_config = stage_config.get(table_type.value, {}) 85 | table_config = {**stage_config, 
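# Illustrative note on the merge at this point, inferred from the surrounding code
# (the concrete names in the example below are hypothetical): stage/table-type level
# settings from table_data act as defaults and are overridden by the table-specific
# table_config assembled above; database-level push-down properties (e.g.
# delta_properties) were merged just before, with dict-valued push-downs combined one
# level deep. A resulting entry in tables_index might look roughly like:
#   "raw.raw_sales.customer" -> {"database": "raw_sales", "table": "customer",
#                                "stage": StageType.raw, "table_type": TableType.delta_lake,
#                                "catalog": "hub", "delta_properties": {...}, ...}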
**table_config} 86 | index = f"{stage_name}.{database_name}.{table_name}" 87 | self.tables_index[index] = table_config 88 | else: 89 | push_down_properties[database_name] = table 90 | 91 | table_data: dict = Field(...) 92 | tables_index: Dict[str, Table] = Field(default={}) 93 | delta_properties: Optional[Dict[str, str]] = Field(default=None) 94 | _logger: Any = PrivateAttr(default=None) 95 | 96 | @classmethod 97 | def get_index( 98 | cls, 99 | stage: Union[StageType, str] = _INDEX_WILDCARD, 100 | database=_INDEX_WILDCARD, 101 | table=_INDEX_WILDCARD, 102 | ): 103 | if isinstance(stage, StageType): 104 | return f"{stage.name}.{database}.{table}" 105 | else: 106 | return f"{stage}.{database}.{table}" 107 | 108 | @classmethod 109 | def parse_index( 110 | cls, 111 | index: str, 112 | ): 113 | try: 114 | parts = index.split(".") 115 | stage = StageType[parts[0]] 116 | database = parts[1] 117 | table = parts[2] 118 | except Exception as e: 119 | raise Exception( 120 | f"attempted to parse an invalid index {index}. It must be of the form 'stage.database.table'" 121 | ) from e 122 | 123 | return stage, database, table 124 | 125 | def _build_tables(self): 126 | """ 127 | Parse through the table definitions dictionary and deserialize it 128 | into Table objects. The Table objects are then placed in a dictionary for easy 129 | lookup with a key of stage.database.table and the value being the Table 130 | object itself. This dictionary index is held on self.tables_index 131 | """ 132 | for index, table_config in self.tables_index.items(): 133 | self.tables_index[index] = table_factory.make( 134 | table_config["table_type"], table_config 135 | ) 136 | 137 | def create_table( 138 | self, 139 | stage: Union[StageType, str] = _INDEX_WILDCARD, 140 | database=_INDEX_WILDCARD, 141 | table=_INDEX_WILDCARD, 142 | first_match: bool = True, 143 | catalog: str = None, 144 | **kwargs, 145 | ): 146 | return self.lookup_table( 147 | stage=stage, 148 | database=database, 149 | table=table, 150 | first_match=first_match, 151 | create_database=True, 152 | create_table=True, 153 | catalog=catalog, 154 | **kwargs, 155 | ) 156 | 157 | def lookup_table( 158 | self, 159 | stage: Union[StageType, str] = _INDEX_WILDCARD, 160 | database=_INDEX_WILDCARD, 161 | table=_INDEX_WILDCARD, 162 | first_match: bool = True, 163 | create_database: bool = False, 164 | create_table: bool = False, 165 | catalog: str = None, 166 | **kwargs, 167 | ): 168 | index = Tables.get_index(stage, database, table) 169 | matches = fnmatch.filter(list(self.tables_index.keys()), index) 170 | 171 | if not matches: 172 | raise Exception(f"index {index} not found in tables_index") 173 | 174 | def match_property( 175 | table: Table, properties: Dict[str, Any], matches: List[str] 176 | ): 177 | for p, v in properties.items(): 178 | if ( 179 | isinstance(table.custom_properties, dict) 180 | and table.custom_properties.get(p) == v 181 | ): 182 | return True 183 | else: 184 | index = Tables.get_index(table.stage, table.database, table.table) 185 | if index in matches: 186 | matches.remove( 187 | Tables.get_index(table.stage, table.database, table.table) 188 | ) 189 | return False 190 | 191 | tables_index = dict(self.tables_index) 192 | if kwargs: 193 | tables_index = { 194 | k: v 195 | for k, v in self.tables_index.items() 196 | if match_property(v, kwargs, matches) 197 | } 198 | 199 | if first_match: 200 | matches = matches[0] 201 | table = tables_index[matches] 202 | msg_tables = f"{table.database}.{table.table}" 203 | self._logger.info(f"Matched tables: 
{msg_tables}") 204 | if create_database: 205 | table.create_database(catalog=catalog) 206 | if create_table: 207 | table.create_table(catalog=catalog) 208 | return table 209 | else: 210 | tables = [tables_index[i] for i in matches] 211 | msg_tables = "\n".join([f"{t.database}.{t.table}" for t in tables]) 212 | self._logger.info(f"Matched tables: {msg_tables}") 213 | db = "" 214 | if create_table or create_database: 215 | for t in tables: 216 | if create_database and db != t.database: 217 | db = t.database 218 | t.create_database(catalog=catalog) 219 | if create_table: 220 | t.create_table(catalog=catalog) 221 | return tables 222 | 223 | def get_table_mapping( 224 | self, 225 | stage: StageType, 226 | table=_INDEX_WILDCARD, 227 | database=_INDEX_WILDCARD, 228 | create_database: bool = False, 229 | create_table: bool = False, 230 | catalog: str = None, 231 | ): 232 | destination = self.lookup_table( 233 | stage=stage, 234 | database=database, 235 | table=table, 236 | first_match=True, 237 | create_database=create_database, 238 | create_table=create_table, 239 | catalog=catalog, 240 | ) 241 | source = {} 242 | 243 | tables = [] 244 | try: 245 | for index in destination.depends_on: 246 | do_stage, do_database, do_table = Tables.parse_index(index) 247 | tables = tables + self.lookup_table( 248 | stage=do_stage, 249 | table=do_table, 250 | database=do_database, 251 | first_match=False, 252 | create_database=create_database, 253 | create_table=create_table, 254 | catalog=catalog, 255 | ) 256 | except Exception as e: 257 | raise Exception(f"Error looking up dependencies for table {table}") from e 258 | 259 | for tbl in tables: 260 | source[tbl.table] = tbl 261 | 262 | if len(list(source.values())) == 1: 263 | source = list(source.values())[0] 264 | 265 | return TableMapping(source=source, destination=destination) 266 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "landing": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { 24 | "type": "object", 25 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 26 | "properties": { 27 | "trigger": { 28 | "type": ["string", "null"], 29 | "description": "filemask patter to use as a trgger" 30 | }, 31 | "trigger_type": { 32 | "type": ["string", "null"], 33 | "description": "type of trgger" 34 | }, 35 | "container": { 36 | "type": ["string", "null"], 37 | "description": "type of trgger" 38 | }, 39 | "location": { 40 | "type": ["string", "null"], 41 | "description": "file directory location" 42 | }, 43 | "filename": { 44 | "type": ["string", "null"], 45 | "description": "filename mask" 46 | }, 47 | "filename_date_format": { 48 | "type": ["string", "null"], 49 | "description": "define a 
date format jinja variable for filename dates" 50 | }, 51 | "path_date_format": { 52 | "type": ["string", "null"], 53 | "description": "define a date format jinja variable for file paths" 54 | }, 55 | "slice_date": { 56 | "type": "string", 57 | "enum": ["filename_date_format", "path_date_format"], 58 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 59 | }, 60 | "format": { 61 | "type": "string", 62 | "enum": ["cloudFiles", "csv", "json", "parquet"], 63 | "description": "format of the landing file" 64 | }, 65 | "spark_schema": { 66 | "type": "string", 67 | "description": "relative path to where the spark definition is held" 68 | }, 69 | "options": { "$ref": "#/$defs/options" } 70 | } 71 | } 72 | }, 73 | "required":[ 74 | "read" 75 | ] 76 | }, 77 | 78 | "raw": { 79 | "type": "object", 80 | "description": "definition of the raw database and tables", 81 | "properties": { 82 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 83 | } 84 | }, 85 | "base": { 86 | "type": "object", 87 | "description": "definition of the base database and tables", 88 | "properties": { 89 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 90 | } 91 | } 92 | }, 93 | "required": [ 94 | "version", 95 | "landing" 96 | ], 97 | 98 | "$defs": { 99 | "delta_lake": { 100 | "type": "object", 101 | "description": "defines a stage as a delta lake table stage", 102 | "properties": { 103 | "managed": { 104 | "type": "boolean", 105 | "description": "whether it's a managed table or unmanaged that requires a location" 106 | }, 107 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 108 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 109 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 110 | "container": { 111 | "type": ["string", "null"], 112 | "description": "type of trgger" 113 | }, 114 | "location": { 115 | "type": ["string", "null"], 116 | "description": "file location of managed tables for the stage" 117 | }, 118 | "path": { 119 | "type": ["string", "null"], 120 | "description": "path of the table appended to the location of the stage" 121 | }, 122 | "options": { "$ref": "#/$defs/options" }, 123 | "database": { 124 | "type": "string", 125 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 126 | }, 127 | "table": { 128 | "type": "string", 129 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 130 | } 131 | } 132 | }, 133 | "options": { 134 | "type": ["object","null"], 135 | "description": "holds key value pairs of custom properties", 136 | "minProperties": 1, 137 | "patternProperties":{ 138 | "^\\S+$": { 139 | "type": ["string","number","boolean"], 140 | "description": "value kay pairs of the spark DSL read options" 141 | } 142 | } 143 | }, 144 | "thresholds": { 145 | "type": "object", 146 | "description": "table etl thresholds", 147 | "properties": { 148 | "invalid_ratio": { 149 | "type": "number", 150 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 151 | "exclusiveMinimum": 0, 152 | "maximum": 1 153 | }, 154 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 155 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 156 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 157 | } 158 | }, 159 | 
"delta_properties": { 160 | "type": "object", 161 | "description": "holds key value pairs of delta properties", 162 | "minProperties": 1, 163 | "properties":{ 164 | "delta.appendOnly": { 165 | "type": "boolean", 166 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 167 | }, 168 | "delta.autoOptimize.autoCompact": { 169 | "type": ["string","boolean"], 170 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 171 | }, 172 | "delta.autoOptimize.optimizeWrite": { 173 | "type": ["string","boolean"], 174 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 175 | }, 176 | "delta.checkpoint.writeStatsAsJson": { 177 | "type": ["string","boolean"], 178 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 179 | }, 180 | "delta.checkpoint.writeStatsAsStruct": { 181 | "type": ["string","boolean"], 182 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 183 | }, 184 | "delta.columnMapping.mode": { 185 | "type": "string", 186 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 187 | }, 188 | "delta.compatibility.symlinkFormatManifest.enabled": { 189 | "type": ["string","boolean"], 190 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 191 | }, 192 | "delta.dataSkippingNumIndexedCols": { 193 | "type": "integer", 194 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 195 | }, 196 | "delta.deletedFileRetentionDuration": { 197 | "type": "integer", 198 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 199 | }, 200 | "delta.enableChangeDataFeed": { 201 | "type": ["string","boolean"], 202 | "description": "true to enable change data feed." 203 | }, 204 | "delta.isolationLevel": { 205 | "type": "string", 206 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 207 | }, 208 | "delta.logRetentionDuration": { 209 | "type": "string", 210 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 211 | }, 212 | "delta.minReaderVersion": { 213 | "type": "integer", 214 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
215 | }, 216 | "delta.minWriterVersion": { 217 | "type": "integer", 218 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 219 | }, 220 | "delta.randomizeFilePrefixes": { 221 | "type": ["string","boolean"], 222 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 223 | }, 224 | "delta.randomPrefixLength": { 225 | "type": "integer", 226 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 227 | }, 228 | "delta.setTransactionRetentionDuration": { 229 | "type": "string", 230 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 231 | }, 232 | "delta.targetFileSize": { 233 | "type": "string", 234 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 235 | }, 236 | "delta.tuneFileSizesForRewrites": { 237 | "type": ["string","boolean"], 238 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 239 | } 240 | } 241 | } 242 | } 243 | } -------------------------------------------------------------------------------- /schema_testing/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required":[ 46 | "read" 47 | ] 48 | }, 49 | 50 | "raw": { 51 | "type": "object", 52 | "description": "definition of the raw database and tables", 53 | "properties": { 54 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 55 | } 56 | }, 57 | "base": { 58 | "type": "object", 59 | "description": "definition of the base database and tables", 60 | "properties": { 61 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 62 | } 63 | } 64 | }, 65 | "required": [ 66 | "version", 67 | "landing" 68 | ], 69 | 70 | "$defs": { 71 | "read": { 72 | "type": "object", 73 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 74 | "properties": { 75 | "trigger": { 76 | "type": 
["string", "null"], 77 | "description": "filemask patter to use as a trgger" 78 | }, 79 | "trigger_type": { 80 | "type": ["string", "null"], 81 | "description": "type of trgger" 82 | }, 83 | "container": { 84 | "type": ["string", "null"], 85 | "description": "type of trgger" 86 | }, 87 | "location": { 88 | "type": ["string", "null"], 89 | "description": "file directory location" 90 | }, 91 | "filename": { 92 | "type": ["string", "null"], 93 | "description": "filename mask" 94 | }, 95 | "filename_date_format": { 96 | "type": ["string", "null"], 97 | "description": "define a date format jinja variable for filename dates" 98 | }, 99 | "path_date_format": { 100 | "type": ["string", "null"], 101 | "description": "define a date format jinja variable for file paths" 102 | }, 103 | "slice_date": { 104 | "type": "string", 105 | "enum": ["filename_date_format", "path_date_format"], 106 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 107 | }, 108 | "format": { 109 | "type": "string", 110 | "enum": ["cloudFiles", "csv", "json", "parquet"], 111 | "description": "format of the landing file" 112 | }, 113 | "spark_schema": { 114 | "type": "string", 115 | "description": "relative path to where the spark definition is held" 116 | }, 117 | "options": { "$ref": "#/$defs/options" } 118 | } 119 | }, 120 | 121 | "delta_lake": { 122 | "type": "object", 123 | "description": "defines a stage as a delta lake table stage", 124 | "properties": { 125 | "managed": { 126 | "type": "boolean", 127 | "description": "whether it's a managed table or unmanaged that requires a location" 128 | }, 129 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 130 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 131 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 132 | "container": { 133 | "type": ["string", "null"], 134 | "description": "type of trgger" 135 | }, 136 | "location": { 137 | "type": ["string", "null"], 138 | "description": "file location of managed tables for the stage" 139 | }, 140 | "path": { 141 | "type": ["string", "null"], 142 | "description": "path of the table appended to the location of the stage" 143 | }, 144 | "options": { "$ref": "#/$defs/options" }, 145 | "database": { 146 | "type": "string", 147 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 148 | }, 149 | "table": { 150 | "type": "string", 151 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 152 | } 153 | } 154 | }, 155 | "options": { 156 | "type": ["object","null"], 157 | "description": "holds key value pairs of custom properties", 158 | "minProperties": 1, 159 | "patternProperties":{ 160 | "^\\S+$": { 161 | "type": ["string","number","boolean"], 162 | "description": "value kay pairs of the spark DSL read options" 163 | } 164 | } 165 | }, 166 | "thresholds": { 167 | "type": "object", 168 | "description": "table etl thresholds", 169 | "properties": { 170 | "invalid_ratio": { 171 | "type": "number", 172 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 173 | "exclusiveMinimum": 0, 174 | "maximum": 1 175 | }, 176 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 177 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 178 | "min_rows": {"type": 
"integer", "description": "integer specifying min rows threshold"} 179 | } 180 | }, 181 | "delta_properties": { 182 | "type": "object", 183 | "description": "holds key value pairs of delta properties", 184 | "minProperties": 1, 185 | "properties":{ 186 | "delta.appendOnly": { 187 | "type": "boolean", 188 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 189 | }, 190 | "delta.autoOptimize.autoCompact": { 191 | "type": ["string","boolean"], 192 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 193 | }, 194 | "delta.autoOptimize.optimizeWrite": { 195 | "type": ["string","boolean"], 196 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 197 | }, 198 | "delta.checkpoint.writeStatsAsJson": { 199 | "type": ["string","boolean"], 200 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 201 | }, 202 | "delta.checkpoint.writeStatsAsStruct": { 203 | "type": ["string","boolean"], 204 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 205 | }, 206 | "delta.columnMapping.mode": { 207 | "type": "string", 208 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 209 | }, 210 | "delta.compatibility.symlinkFormatManifest.enabled": { 211 | "type": ["string","boolean"], 212 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 213 | }, 214 | "delta.dataSkippingNumIndexedCols": { 215 | "type": "integer", 216 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 217 | }, 218 | "delta.deletedFileRetentionDuration": { 219 | "type": "integer", 220 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 221 | }, 222 | "delta.enableChangeDataFeed": { 223 | "type": ["string","boolean"], 224 | "description": "true to enable change data feed." 225 | }, 226 | "delta.isolationLevel": { 227 | "type": "string", 228 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 229 | }, 230 | "delta.logRetentionDuration": { 231 | "type": "string", 232 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 233 | }, 234 | "delta.minReaderVersion": { 235 | "type": "integer", 236 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
237 | }, 238 | "delta.minWriterVersion": { 239 | "type": "integer", 240 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 241 | }, 242 | "delta.randomizeFilePrefixes": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 245 | }, 246 | "delta.randomPrefixLength": { 247 | "type": "integer", 248 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 249 | }, 250 | "delta.setTransactionRetentionDuration": { 251 | "type": "string", 252 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 253 | }, 254 | "delta.targetFileSize": { 255 | "type": "string", 256 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 257 | }, 258 | "delta.tuneFileSizesForRewrites": { 259 | "type": ["string","boolean"], 260 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 261 | } 262 | } 263 | } 264 | } 265 | } -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "delta_lake": { "$ref": "#/$defs/delta_lake" }, 24 | "read": { "$ref": "#/$defs/read" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required":[ 46 | "read" 47 | ] 48 | }, 49 | 50 | "raw": { 51 | "type": "object", 52 | "description": "definition of the raw database and tables", 53 | "properties": { 54 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 55 | } 56 | }, 57 | "base": { 58 | "type": "object", 59 | "description": "definition of the base database and tables", 60 | "properties": { 61 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 62 | } 63 | } 64 | }, 65 | "required": [ 66 | "version", 67 | "landing" 68 | ], 69 | 70 | "$defs": { 71 | "read": { 72 | "type": "object", 73 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 74 | "properties": { 75 | "trigger": { 76 | "type": ["string", 
"null"], 77 | "description": "filemask patter to use as a trgger" 78 | }, 79 | "trigger_type": { 80 | "type": ["string", "null"], 81 | "description": "type of trgger" 82 | }, 83 | "container": { 84 | "type": ["string", "null"], 85 | "description": "type of trgger" 86 | }, 87 | "location": { 88 | "type": ["string", "null"], 89 | "description": "file directory location" 90 | }, 91 | "filename": { 92 | "type": ["string", "null"], 93 | "description": "filename mask" 94 | }, 95 | "filename_date_format": { 96 | "type": ["string", "null"], 97 | "description": "define a date format jinja variable for filename dates" 98 | }, 99 | "path_date_format": { 100 | "type": ["string", "null"], 101 | "description": "define a date format jinja variable for file paths" 102 | }, 103 | "slice_date": { 104 | "type": "string", 105 | "enum": ["filename_date_format", "path_date_format"], 106 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 107 | }, 108 | "format": { 109 | "type": "string", 110 | "enum": ["cloudFiles", "csv", "json", "parquet"], 111 | "description": "format of the landing file" 112 | }, 113 | "spark_schema": { 114 | "type": "string", 115 | "description": "relative path to where the spark definition is held" 116 | }, 117 | "options": { "$ref": "#/$defs/options" } 118 | } 119 | }, 120 | 121 | "delta_lake": { 122 | "type": "object", 123 | "description": "defines a stage as a delta lake table stage", 124 | "properties": { 125 | "managed": { 126 | "type": "boolean", 127 | "description": "whether it's a managed table or unmanaged that requires a location" 128 | }, 129 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 130 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 131 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 132 | "container": { 133 | "type": ["string", "null"], 134 | "description": "type of trgger" 135 | }, 136 | "location": { 137 | "type": ["string", "null"], 138 | "description": "file location of managed tables for the stage" 139 | }, 140 | "path": { 141 | "type": ["string", "null"], 142 | "description": "path of the table appended to the location of the stage" 143 | }, 144 | "options": { "$ref": "#/$defs/options" }, 145 | "database": { 146 | "type": "string", 147 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 148 | }, 149 | "table": { 150 | "type": "string", 151 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 152 | } 153 | } 154 | }, 155 | "options": { 156 | "type": ["object","null"], 157 | "description": "holds key value pairs of custom properties", 158 | "minProperties": 1, 159 | "patternProperties":{ 160 | "^\\S+$": { 161 | "type": ["string","number","boolean"], 162 | "description": "value kay pairs of the spark DSL read options" 163 | } 164 | } 165 | }, 166 | "thresholds": { 167 | "type": "object", 168 | "description": "table etl thresholds", 169 | "properties": { 170 | "invalid_ratio": { 171 | "type": "number", 172 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 173 | "exclusiveMinimum": 0, 174 | "maximum": 1 175 | }, 176 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 177 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 178 | "min_rows": {"type": "integer", 
"description": "integer specifying min rows threshold"} 179 | } 180 | }, 181 | "delta_properties": { 182 | "type": "object", 183 | "description": "holds key value pairs of delta properties", 184 | "minProperties": 1, 185 | "properties":{ 186 | "delta.appendOnly": { 187 | "type": "boolean", 188 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 189 | }, 190 | "delta.autoOptimize.autoCompact": { 191 | "type": ["string","boolean"], 192 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 193 | }, 194 | "delta.autoOptimize.optimizeWrite": { 195 | "type": ["string","boolean"], 196 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 197 | }, 198 | "delta.checkpoint.writeStatsAsJson": { 199 | "type": ["string","boolean"], 200 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 201 | }, 202 | "delta.checkpoint.writeStatsAsStruct": { 203 | "type": ["string","boolean"], 204 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 205 | }, 206 | "delta.columnMapping.mode": { 207 | "type": "string", 208 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 209 | }, 210 | "delta.compatibility.symlinkFormatManifest.enabled": { 211 | "type": ["string","boolean"], 212 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 213 | }, 214 | "delta.dataSkippingNumIndexedCols": { 215 | "type": "integer", 216 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 217 | }, 218 | "delta.deletedFileRetentionDuration": { 219 | "type": "integer", 220 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 221 | }, 222 | "delta.enableChangeDataFeed": { 223 | "type": ["string","boolean"], 224 | "description": "true to enable change data feed." 225 | }, 226 | "delta.isolationLevel": { 227 | "type": "string", 228 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 229 | }, 230 | "delta.logRetentionDuration": { 231 | "type": "string", 232 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 233 | }, 234 | "delta.minReaderVersion": { 235 | "type": "integer", 236 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
237 | }, 238 | "delta.minWriterVersion": { 239 | "type": "integer", 240 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 241 | }, 242 | "delta.randomizeFilePrefixes": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 245 | }, 246 | "delta.randomPrefixLength": { 247 | "type": "integer", 248 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 249 | }, 250 | "delta.setTransactionRetentionDuration": { 251 | "type": "string", 252 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 253 | }, 254 | "delta.targetFileSize": { 255 | "type": "string", 256 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 257 | }, 258 | "delta.tuneFileSizesForRewrites": { 259 | "type": ["string","boolean"], 260 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 261 | } 262 | } 263 | } 264 | } 265 | } -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "landing": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { 24 | "type": "object", 25 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 26 | "minProperties": 1, 27 | "maxProperties": 1, 28 | "patternProperties": { 29 | "^\\S+$": { 30 | "type": "object", 31 | "description": "name of the volume holding the files", 32 | "minProperties": 1, 33 | "patternProperties": { 34 | "^\\S+$": { 35 | "type": ["string", "null"], 36 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 37 | } 38 | } 39 | } 40 | } 41 | } 42 | }, 43 | "required":[ 44 | "read" 45 | ] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version", 64 | "landing", 65 | "raw" 66 | ], 67 | 68 | "$defs": { 69 | "delta_lake": { 70 | "type": "object", 71 | "description": "defines a table object as a delta lake table", 72 | "properties": { 73 | "delta_properties": { "$ref": "#/$defs/delta_properties" } 74 | 75 | }, 76 | "patternProperties": { 77 | "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 78 | } 79 | 80 | }, 81 | "delta_lake_database": { 82 | "type": "object", 83 | "description": "database containing tables", 84 | "minProperties": 1, 85 | "properties": { 86 | "catalog": { 87 | "type": "string", 88 | "description": "the default catalog name for the database" 89 | } 90 | }, 91 | "patternProperties": { 92 | "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 93 | } 94 | }, 95 | "delta_lake_table": { 96 | "type": "object", 97 | "description": "defines a deltalake table and it's properties", 98 | "properties":{ 99 | "managed": { 100 | "type": "boolean", 101 | "description": "whether it's a managed table or unmanaged that requires a location" 102 | }, 103 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 104 | "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 105 | "custom_properties": { "$ref": "#/$defs/custom_properties" }, 106 | "depends_on": { 107 | "type": "array", 108 | "items": { 109 | "type": "string" 110 | }, 111 | "uniqueItems": true, 112 | "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)", 113 | "description": "use to denote dependency on other tables referenced their index this is the stage.database.table. 
The form stage.database.* can also be used to reference all tables" 114 | }, 115 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 116 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 117 | "partition_by": { 118 | "oneOf": [ 119 | { 120 | "type": "string", 121 | "description": "columns on which to partition by" 122 | }, 123 | { 124 | "type": "array", 125 | "uniqueItems": true, 126 | "items": { 127 | "type": "string" 128 | }, 129 | "description": "columns on which to partition by" 130 | } 131 | ] 132 | }, 133 | "cluster_by": { 134 | "oneOf": [ 135 | { 136 | "type": "string", 137 | "description": "columns on which to appply liquid clustering" 138 | }, 139 | { 140 | "type": "array", 141 | "uniqueItems": true, 142 | "items": { 143 | "type": "string" 144 | }, 145 | "description": "columns on which to appply liquid clustering" 146 | } 147 | ] 148 | }, 149 | "z_order_by": { 150 | "oneOf": [ 151 | { 152 | "type": "string", 153 | "description": "column on which to appply z-ording" 154 | }, 155 | { 156 | "type": "array", 157 | "uniqueItems": true, 158 | "items": { 159 | "type": "string" 160 | }, 161 | "description": "columns on which to appply z-ording" 162 | } 163 | ] 164 | }, 165 | "id": { 166 | "oneOf": [ 167 | { 168 | "type": "string", 169 | "description": "column(s) that comprise the unique identifier" 170 | }, 171 | { 172 | "type": "array", 173 | "uniqueItems": true, 174 | "items": { 175 | "type": "string" 176 | }, 177 | "description": "column(s) that comprise the unique identifier" 178 | } 179 | ] 180 | }, 181 | "vacuum": { 182 | "type": "integer", 183 | "description": "vacuum retention threshold in the number of hours", 184 | "minimum": 0 185 | } 186 | } 187 | }, 188 | "delta_constraints": { 189 | "type": "object", 190 | "description": "holds key value pairs of delta constraints", 191 | "minProperties": 1, 192 | "patternProperties":{ 193 | "^\\S+$": { 194 | "type": "string", 195 | "description": "check constraint logic" 196 | } 197 | } 198 | }, 199 | "custom_properties": { 200 | "type": "object", 201 | "description": "holds key value pairs of custom properties", 202 | "minProperties": 1, 203 | "patternProperties":{ 204 | "^\\S+$": { 205 | "type": ["string","number","boolean"], 206 | "description": "custom property" 207 | } 208 | } 209 | }, 210 | "thresholds": { 211 | "type": "object", 212 | "description": "table etl thresholds", 213 | "properties": { 214 | "invalid_ratio": { 215 | "type": "number", 216 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 217 | "exclusiveMinimum": 0, 218 | "maximum": 1 219 | }, 220 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 221 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 222 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 223 | } 224 | }, 225 | "delta_properties": { 226 | "type": "object", 227 | "description": "holds key value pairs of delta properties", 228 | "minProperties": 1, 229 | "properties":{ 230 | "delta.appendOnly": { 231 | "type": "boolean", 232 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 233 | }, 234 | "delta.autoOptimize.autoCompact": { 235 | "type": ["string","boolean"], 236 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 
237 | }, 238 | "delta.autoOptimize.optimizeWrite": { 239 | "type": ["string","boolean"], 240 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 241 | }, 242 | "delta.checkpoint.writeStatsAsJson": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 245 | }, 246 | "delta.checkpoint.writeStatsAsStruct": { 247 | "type": ["string","boolean"], 248 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 249 | }, 250 | "delta.columnMapping.mode": { 251 | "type": "string", 252 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 253 | }, 254 | "delta.compatibility.symlinkFormatManifest.enabled": { 255 | "type": ["string","boolean"], 256 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 257 | }, 258 | "delta.dataSkippingNumIndexedCols": { 259 | "type": "integer", 260 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 261 | }, 262 | "delta.deletedFileRetentionDuration": { 263 | "type": "integer", 264 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 265 | }, 266 | "delta.enableChangeDataFeed": { 267 | "type": ["string","boolean"], 268 | "description": "true to enable change data feed." 269 | }, 270 | "delta.isolationLevel": { 271 | "type": "string", 272 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 273 | }, 274 | "delta.logRetentionDuration": { 275 | "type": "string", 276 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 277 | }, 278 | "delta.minReaderVersion": { 279 | "type": "integer", 280 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 281 | }, 282 | "delta.minWriterVersion": { 283 | "type": "integer", 284 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 285 | }, 286 | "delta.randomizeFilePrefixes": { 287 | "type": ["string","boolean"], 288 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 289 | }, 290 | "delta.randomPrefixLength": { 291 | "type": "integer", 292 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 
293 | }, 294 | "delta.setTransactionRetentionDuration": { 295 | "type": "string", 296 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 297 | }, 298 | "delta.targetFileSize": { 299 | "type": "string", 300 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 301 | }, 302 | "delta.tuneFileSizesForRewrites": { 303 | "type": ["string","boolean"], 304 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 305 | } 306 | } 307 | } 308 | } 309 | } -------------------------------------------------------------------------------- /schema_testing/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the source stage", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required": ["read"] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version" 64 | ], 65 | 66 | "$defs": { 67 | "read": { 68 | "type": "object", 69 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 70 | "minProperties": 1, 71 | "maxProperties": 1, 72 | "patternProperties": { 73 | "^\\S+$": { 74 | "type": "object", 75 | "description": "name of the volume holding the files", 76 | "minProperties": 1, 77 | "patternProperties": { 78 | "^\\S+$": { 79 | "type": ["string", "null"], 80 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 81 | } 82 | } 83 | } 84 | } 85 | }, 86 | "delta_lake": { 87 | "type": "object", 88 | "description": "defines a table object as a delta lake table", 89 | "properties": { 90 | "delta_properties": { "$ref": "#/$defs/delta_properties" } 91 | 92 | }, 93 | "patternProperties": { 94 | "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 95 | } 96 | 97 | }, 98 | "delta_lake_database": { 99 | "type": "object", 100 | "description": "database containing tables", 101 | "minProperties": 1, 102 | "properties": { 103 | "catalog": { 104 | "type": ["string", "null"], 105 | "description": "the default catalog name for the database" 106 | } 107 | }, 108 | "patternProperties": { 109 | "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 110 | } 111 | }, 112 | "delta_lake_table": { 113 | "type": "object", 114 | "description": "defines a deltalake table and it's properties", 115 | "properties":{ 116 | "managed": { 117 | "type": "boolean", 118 | "description": "whether it's a managed table or unmanaged that requires a location" 119 | }, 120 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 121 | "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 122 | "custom_properties": { "$ref": "#/$defs/custom_properties" }, 123 | "depends_on": { 124 | "type": "array", 125 | "items": { 126 | "type": "string" 127 | }, 128 | "uniqueItems": true, 129 | "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)", 130 | "description": "use to denote dependency on other tables referenced their index this is the stage.database.table. The form stage.database.* can also be used to reference all tables" 131 | }, 132 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 133 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 134 | "partition_by": { 135 | "oneOf": [ 136 | { 137 | "type": "string", 138 | "description": "columns on which to partition by" 139 | }, 140 | { 141 | "type": "array", 142 | "uniqueItems": true, 143 | "items": { 144 | "type": "string" 145 | }, 146 | "description": "columns on which to partition by" 147 | } 148 | ] 149 | }, 150 | "cluster_by": { 151 | "oneOf": [ 152 | { 153 | "type": "string", 154 | "description": "columns on which to appply liquid clustering" 155 | }, 156 | { 157 | "type": "array", 158 | "uniqueItems": true, 159 | "items": { 160 | "type": "string" 161 | }, 162 | "description": "columns on which to appply liquid clustering" 163 | } 164 | ] 165 | }, 166 | "z_order_by": { 167 | "oneOf": [ 168 | { 169 | "type": "string", 170 | "description": "column on which to appply z-ording" 171 | }, 172 | { 173 | "type": "array", 174 | "uniqueItems": true, 175 | "items": { 176 | "type": "string" 177 | }, 178 | "description": "columns on which to appply z-ording" 179 | } 180 | ] 181 | }, 182 | "id": { 183 | "oneOf": [ 184 | { 185 | "type": "string", 186 | "description": "column(s) that comprise the unique identifier" 187 | }, 188 | { 189 | "type": "array", 190 | "uniqueItems": true, 191 | "items": { 192 | "type": "string" 193 | }, 194 | "description": "column(s) that comprise the unique identifier" 195 | } 196 | ] 197 | }, 198 | "vacuum": { 199 | "type": "integer", 200 | "description": "vacuum retention threshold in the number of hours", 201 | "minimum": 0 202 | } 203 | } 204 | }, 205 | "delta_constraints": { 206 | "type": "object", 207 | "description": "holds key value pairs of delta constraints", 208 | "minProperties": 1, 209 | "patternProperties":{ 210 | "^\\S+$": { 211 | "type": "string", 212 | "description": "check constraint logic" 213 | } 
214 | } 215 | }, 216 | "custom_properties": { 217 | "type": "object", 218 | "description": "holds key value pairs of custom properties", 219 | "minProperties": 1, 220 | "patternProperties":{ 221 | "^\\S+$": { 222 | "type": ["string","number","boolean"], 223 | "description": "custom property" 224 | } 225 | } 226 | }, 227 | "thresholds": { 228 | "type": "object", 229 | "description": "table etl thresholds", 230 | "properties": { 231 | "invalid_ratio": { 232 | "type": "number", 233 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 234 | "exclusiveMinimum": 0, 235 | "maximum": 1 236 | }, 237 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 238 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 239 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 240 | } 241 | }, 242 | "delta_properties": { 243 | "type": "object", 244 | "description": "holds key value pairs of delta properties", 245 | "minProperties": 1, 246 | "properties":{ 247 | "delta.appendOnly": { 248 | "type": "boolean", 249 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 250 | }, 251 | "delta.autoOptimize.autoCompact": { 252 | "type": ["string","boolean"], 253 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 254 | }, 255 | "delta.autoOptimize.optimizeWrite": { 256 | "type": ["string","boolean"], 257 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 258 | }, 259 | "delta.checkpoint.writeStatsAsJson": { 260 | "type": ["string","boolean"], 261 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 262 | }, 263 | "delta.checkpoint.writeStatsAsStruct": { 264 | "type": ["string","boolean"], 265 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 266 | }, 267 | "delta.columnMapping.mode": { 268 | "type": "string", 269 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 270 | }, 271 | "delta.compatibility.symlinkFormatManifest.enabled": { 272 | "type": ["string","boolean"], 273 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 274 | }, 275 | "delta.dataSkippingNumIndexedCols": { 276 | "type": "integer", 277 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 278 | }, 279 | "delta.deletedFileRetentionDuration": { 280 | "type": "integer", 281 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. 
This is to prevent failures in stale readers after compactions or partition overwrites." 282 |             }, 283 |             "delta.enableChangeDataFeed": { 284 |                 "type": ["string","boolean"], 285 |                 "description": "true to enable change data feed." 286 |             }, 287 |             "delta.isolationLevel": { 288 |                 "type": "string", 289 |                 "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 290 |             }, 291 |             "delta.logRetentionDuration": { 292 |                 "type": "string", 293 |                 "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 294 |             }, 295 |             "delta.minReaderVersion": { 296 |                 "type": "integer", 297 |                 "description": "The minimum required protocol reader version for a reader that allows reading from this Delta table." 298 |             }, 299 |             "delta.minWriterVersion": { 300 |                 "type": "integer", 301 |                 "description": "The minimum required protocol writer version for a writer that allows writing to this Delta table." 302 |             }, 303 |             "delta.randomizeFilePrefixes": { 304 |                 "type": ["string","boolean"], 305 |                 "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 306 |             }, 307 |             "delta.randomPrefixLength": { 308 |                 "type": "integer", 309 |                 "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 310 |             }, 311 |             "delta.setTransactionRetentionDuration": { 312 |                 "type": "string", 313 |                 "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent." 314 |             }, 315 |             "delta.targetFileSize": { 316 |                 "type": "string", 317 |                 "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 318 |             }, 319 |             "delta.tuneFileSizesForRewrites": { 320 |                 "type": ["string","boolean"], 321 |                 "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table."
322 | } 323 | } 324 | } 325 | } 326 | } -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the source stage", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required": ["read"] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version" 64 | ], 65 | 66 | "$defs": { 67 | "read": { 68 | "type": "object", 69 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 70 | "minProperties": 1, 71 | "maxProperties": 1, 72 | "patternProperties": { 73 | "^\\S+$": { 74 | "type": "object", 75 | "description": "name of the volume holding the files", 76 | "minProperties": 1, 77 | "patternProperties": { 78 | "^\\S+$": { 79 | "type": ["string", "null"], 80 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 81 |                         } 82 |                     } 83 |                 } 84 |             } 85 |         }, 86 |         "delta_lake": { 87 |             "type": "object", 88 |             "description": "defines a table object as a delta lake table", 89 |             "properties": { 90 |                 "delta_properties": { "$ref": "#/$defs/delta_properties" } 91 | 92 |             }, 93 |             "patternProperties": { 94 |                 "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 95 |             } 96 | 97 |         }, 98 |         "delta_lake_database": { 99 |             "type": "object", 100 |             "description": "database containing tables", 101 |             "minProperties": 1, 102 |             "properties": { 103 |                 "catalog": { 104 |                     "type": ["string", "null"], 105 |                     "description": "the default catalog name for the database" 106 |                 } 107 |             }, 108 |             "patternProperties": { 109 |                 "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 110 |             } 111 |         }, 112 |         "delta_lake_table": { 113 |             "type": ["object", "null"], 114 |             "description": "defines a delta lake table and its properties", 115 |             "properties":{ 116 |                 "managed": { 117 |                     "type": "boolean", 118 |                     "description": "whether it is a managed table or an unmanaged table that requires a location" 119 |                 }, 120 |                 "delta_properties": { "$ref": "#/$defs/delta_properties" }, 121 |                 "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 122 |                 "custom_properties": { "$ref": "#/$defs/custom_properties" }, 123 |                 "depends_on": { 124 |                     "type": "array", 125 |                     "items": { 126 |                         "type": "string", 127 |                         "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)" 128 |                     }, 129 |                     "uniqueItems": true, 130 |                     "description": "used to declare a dependency on other tables, referenced by their index in the form stage.database.table. The form stage.database.* can also be used to reference all tables in a database" 131 |                 }, 132 |                 "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 133 |                 "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 134 |                 "partition_by": { 135 |                     "oneOf": [ 136 |                         { 137 |                             "type": "string", 138 |                             "description": "columns to partition by" 139 |                         }, 140 |                         { 141 |                             "type": "array", 142 |                             "uniqueItems": true, 143 |                             "items": { 144 |                                 "type": "string" 145 |                             }, 146 |                             "description": "columns to partition by" 147 |                         } 148 |                     ] 149 |                 }, 150 |                 "cluster_by": { 151 |                     "oneOf": [ 152 |                         { 153 |                             "type": "string", 154 |                             "description": "columns on which to apply liquid clustering" 155 |                         }, 156 |                         { 157 |                             "type": "array", 158 |                             "uniqueItems": true, 159 |                             "items": { 160 |                                 "type": "string" 161 |                             }, 162 |                             "description": "columns on which to apply liquid clustering" 163 |                         } 164 |                     ] 165 |                 }, 166 |                 "z_order_by": { 167 |                     "oneOf": [ 168 |                         { 169 |                             "type": "string", 170 |                             "description": "column on which to apply z-ordering" 171 |                         }, 172 |                         { 173 |                             "type": "array", 174 |                             "uniqueItems": true, 175 |                             "items": { 176 |                                 "type": "string" 177 |                             }, 178 |                             "description": "columns on which to apply z-ordering" 179 |                         } 180 |                     ] 181 |                 }, 182 |                 "id": { 183 |                     "oneOf": [ 184 |                         { 185 |                             "type": "string", 186 |                             "description": "column(s) that comprise the unique identifier" 187 |                         }, 188 |                         { 189 |                             "type": "array", 190 |                             "uniqueItems": true, 191 |                             "items": { 192 |                                 "type": "string" 193 |                             }, 194 |                             "description": "column(s) that comprise the unique identifier" 195 |                         } 196 |                     ] 197 |                 }, 198 |                 "vacuum": { 199 |                     "type": "integer", 200 |                     "description": "vacuum retention threshold in hours", 201 |                     "minimum": 0 202 |                 } 203 |             } 204 |         }, 205 |         "delta_constraints": { 206 |             "type": "object", 207 |             "description": "holds key value pairs of delta constraints", 208 |             "minProperties": 1, 209 |             "patternProperties":{ 210 |                 "^\\S+$": { 211 |                     "type": "string", 212 |                     "description": "check constraint logic"
213 | } 214 | } 215 | }, 216 | "custom_properties": { 217 | "type": "object", 218 | "description": "holds key value pairs of custom properties", 219 | "minProperties": 1, 220 | "patternProperties":{ 221 | "^\\S+$": { 222 | "type": ["string","number","boolean"], 223 | "description": "custom property" 224 | } 225 | } 226 | }, 227 | "thresholds": { 228 | "type": "object", 229 | "description": "table etl thresholds", 230 | "properties": { 231 | "invalid_ratio": { 232 | "type": "number", 233 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 234 | "exclusiveMinimum": 0, 235 | "maximum": 1 236 | }, 237 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 238 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 239 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 240 | } 241 | }, 242 | "delta_properties": { 243 | "type": "object", 244 | "description": "holds key value pairs of delta properties", 245 | "minProperties": 1, 246 | "properties":{ 247 | "delta.appendOnly": { 248 | "type": "boolean", 249 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 250 | }, 251 | "delta.autoOptimize.autoCompact": { 252 | "type": ["string","boolean"], 253 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 254 | }, 255 | "delta.autoOptimize.optimizeWrite": { 256 | "type": ["string","boolean"], 257 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 258 | }, 259 | "delta.checkpoint.writeStatsAsJson": { 260 | "type": ["string","boolean"], 261 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 262 | }, 263 | "delta.checkpoint.writeStatsAsStruct": { 264 | "type": ["string","boolean"], 265 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 266 | }, 267 | "delta.columnMapping.mode": { 268 | "type": "string", 269 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 270 | }, 271 | "delta.compatibility.symlinkFormatManifest.enabled": { 272 | "type": ["string","boolean"], 273 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 274 | }, 275 | "delta.dataSkippingNumIndexedCols": { 276 | "type": "integer", 277 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 278 | }, 279 | "delta.deletedFileRetentionDuration": { 280 | "type": "integer", 281 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. 
This is to prevent failures in stale readers after compactions or partition overwrites." 282 |             }, 283 |             "delta.enableChangeDataFeed": { 284 |                 "type": ["string","boolean"], 285 |                 "description": "true to enable change data feed." 286 |             }, 287 |             "delta.isolationLevel": { 288 |                 "type": "string", 289 |                 "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 290 |             }, 291 |             "delta.logRetentionDuration": { 292 |                 "type": "string", 293 |                 "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 294 |             }, 295 |             "delta.minReaderVersion": { 296 |                 "type": "integer", 297 |                 "description": "The minimum required protocol reader version for a reader that allows reading from this Delta table." 298 |             }, 299 |             "delta.minWriterVersion": { 300 |                 "type": "integer", 301 |                 "description": "The minimum required protocol writer version for a writer that allows writing to this Delta table." 302 |             }, 303 |             "delta.randomizeFilePrefixes": { 304 |                 "type": ["string","boolean"], 305 |                 "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 306 |             }, 307 |             "delta.randomPrefixLength": { 308 |                 "type": "integer", 309 |                 "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 310 |             }, 311 |             "delta.setTransactionRetentionDuration": { 312 |                 "type": "string", 313 |                 "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent." 314 |             }, 315 |             "delta.targetFileSize": { 316 |                 "type": "string", 317 |                 "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 318 |             }, 319 |             "delta.tuneFileSizesForRewrites": { 320 |                 "type": ["string","boolean"], 321 |                 "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 322 |                 } 323 |             } 324 |         } 325 |     } 326 | } --------------------------------------------------------------------------------
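The schema above defines the shape of a yetl tables configuration: a required version plus the optional audit_control, source, landing, raw and base stages, all built from the read and delta_lake definitions under $defs. Below is a minimal validation sketch (not a file in this repository) showing how a tables config could be checked against the packaged copy of the schema with the jsonschema package; the volume, database and table names in the example config are illustrative assumptions, as is the relative path to the schema file.

import json
import jsonschema

# Load the packaged tables schema (path assumed relative to the repository root).
with open("yetl/resource/sibytes_yetl_tables_schema.json") as f:
    schema = json.load(f)

# Hypothetical tables config: one landing volume feeding one raw delta lake table.
tables_config = {
    "version": "3.0.0",
    "landing": {
        # `read` allows exactly one volume; values are file (part)names or null.
        "read": {"landing": {"customer_details": None}},
    },
    "raw": {
        "delta_lake": {
            "raw_db": {  # database
                "catalog": None,
                "customer_details": {"managed": True, "id": "id", "vacuum": 720},
            }
        }
    },
}

# Raises jsonschema.exceptions.ValidationError if the config does not conform.
jsonschema.validate(instance=tables_config, schema=schema)
print("tables config conforms to the schema")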