├── .env ├── test ├── config │ └── test_project │ │ ├── databricks │ │ └── Workflows │ │ │ └── workflow_template.yaml │ │ ├── pipelines │ │ ├── tables.xlsx │ │ ├── tables_invalid.yaml │ │ ├── test_tables.yaml │ │ ├── tables.yaml │ │ ├── json_schema │ │ │ ├── sibytes_yetl_project_schema.json │ │ │ ├── sibytes_yetl_pipeline_schema.json │ │ │ └── sibytes_yetl_tables_schema.json │ │ └── autoloader.yaml │ │ ├── dataNone │ │ └── _delta_log │ │ │ ├── .00000000000000000000.json.crc │ │ │ └── 00000000000000000000.json │ │ ├── logging.yaml │ │ ├── test_project.yaml │ │ ├── sql │ │ └── raw_dbx_patterns_control │ │ │ ├── header_footer.sql │ │ │ └── raw_audit.sql │ │ └── schema │ │ ├── customer_details_1.yaml │ │ └── customer_details_2.yaml ├── unit │ ├── test_validation.py │ ├── test_timeslice.py │ └── test_utils.py └── integration │ └── test_configuration_load.py ├── yetl ├── resource │ ├── tables.xlsx │ ├── logging.yaml │ ├── project.yaml │ ├── __init__.py │ ├── sibytes_yetl_project_schema.json │ ├── sibytes_yetl_pipeline_schema.json │ └── sibytes_yetl_tables_schema.json ├── cli │ ├── metadata_provider │ │ └── __init__.py │ └── _init.py ├── config │ ├── table │ │ ├── _table_type.py │ │ ├── _write.py │ │ ├── __init__.py │ │ ├── _factory.py │ │ ├── _table.py │ │ ├── _deltalake.py │ │ └── _read.py │ ├── _stage_type.py │ ├── _table_mapping.py │ ├── __init__.py │ ├── _spark_context.py │ ├── _project.py │ ├── _logging_config.py │ ├── _decorators.py │ ├── _config.py │ ├── _utils.py │ ├── _timeslice.py │ └── _tables.py ├── workflow │ ├── __init__.py │ ├── _notebook.py │ ├── _dlt.py │ └── _multi_threaded.py ├── validation │ ├── __init__.py │ └── _validate.py ├── __init__.py └── __main__.py ├── .flake8 ├── pytest.ini ├── typings └── __builtins__.pyi ├── local_cleanup.sh ├── .vscode ├── settings.json └── launch.json ├── README.md ├── requirements38.txt ├── requirements310.txt ├── main.py ├── setup.py ├── ci.yaml ├── schema_testing ├── tables.yaml ├── autoloader.yaml ├── sibytes_yetl_pipeline_schema.json └── sibytes_yetl_tables_schema.json └── .gitignore /.env: -------------------------------------------------------------------------------- 1 | YETL_CONFIG=./test/config 2 | YETL_ENVIRONMENT=local -------------------------------------------------------------------------------- /test/config/test_project/databricks/Workflows/workflow_template.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /yetl/resource/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/yetl/resource/tables.xlsx -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ; extend-ignore = D 3 | per-file-ignores = 4 | # line too long 5 | yetl/*.py: E501 6 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | env = 4 | YETL_CONFIG=./test/config 5 | YETL_ENVIRONMENT=local 6 | -------------------------------------------------------------------------------- /typings/__builtins__.pyi: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | from databricks.sdk.runtime import * 4 | except ModuleNotFoundError: 5 | pass 
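# Re-exports the Databricks runtime globals (e.g. `spark`, `dbutils`) so that local
# editors and type checkers can resolve them; if the databricks-sdk package is not
# installed the import above simply falls through to `pass`.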
6 | -------------------------------------------------------------------------------- /yetl/cli/metadata_provider/__init__.py: -------------------------------------------------------------------------------- 1 | from ._xlsx import XlsMetadata, ImportFormat 2 | 3 | __all__ = ["XlsMetadata", "ImportFormat"] 4 | -------------------------------------------------------------------------------- /local_cleanup.sh: -------------------------------------------------------------------------------- 1 | rm -r -f ./metastore_db 2 | rm -r -f ./spark-warehouse 3 | rm -f derby.log 4 | rm -r -f ./test/config/test_project/data 5 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/test/config/test_project/pipelines/tables.xlsx -------------------------------------------------------------------------------- /yetl/config/table/_table_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class TableType(str, Enum): 5 | read = "read" 6 | write = "write" 7 | delta_lake = "delta_lake" 8 | -------------------------------------------------------------------------------- /test/config/test_project/dataNone/_delta_log/.00000000000000000000.json.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibytes/yetl/HEAD/test/config/test_project/dataNone/_delta_log/.00000000000000000000.json.crc -------------------------------------------------------------------------------- /yetl/workflow/__init__.py: -------------------------------------------------------------------------------- 1 | from ._notebook import Notebook 2 | from ._multi_threaded import execute_notebooks 3 | from ._dlt import create_dlt 4 | 5 | __all__ = ["Notebook", "execute_notebooks", "create_dlt"] 6 | -------------------------------------------------------------------------------- /yetl/config/_stage_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class StageType(str, Enum): 5 | audit_control = "audit_control" 6 | source = "source" 7 | landing = "landing" 8 | raw = "raw" 9 | base = "base" 10 | curated = "curated" 11 | extract = "extract" 12 | -------------------------------------------------------------------------------- /yetl/config/table/_write.py: -------------------------------------------------------------------------------- 1 | from ._table import Table 2 | import logging 3 | from typing import Any 4 | 5 | 6 | class Write(Table): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | self._logger = logging.getLogger(self.__class__.__name__) 10 | self._render() 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.testing.pytestArgs": [ 4 | "test" 5 | ], 6 | "python.testing.unittestEnabled": false, 7 | "python.testing.pytestEnabled": true, 8 | "python.envFile": "${workspaceFolder}/.databricks/.databricks.env", 9 | "databricks.python.envFile": "${workspaceFolder}/.env" 10 | } -------------------------------------------------------------------------------- /yetl/config/_table_mapping.py: 
-------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Union, Any, Dict 3 | from .table import Table 4 | 5 | 6 | class TableMapping(BaseModel): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | 10 | destination: Table = Field(...) 11 | source: Union[Dict[str, Table], Table] = Field(...) 12 | -------------------------------------------------------------------------------- /yetl/resource/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | default: 4 | format: "%(levelname)s : %(asctime)s : %(name)s : %(filename)s.%(funcName)s: line(%(lineno)s) : %(message)s" 5 | handlers: 6 | console: 7 | class: logging.StreamHandler 8 | formatter: default 9 | stream: ext://sys.stdout 10 | root: 11 | level: INFO 12 | handlers: [console] 13 | -------------------------------------------------------------------------------- /test/config/test_project/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | default: 4 | format: "%(levelname)s : %(asctime)s : %(name)s : %(filename)s.%(funcName)s: line(%(lineno)s) : %(message)s" 5 | handlers: 6 | console: 7 | class: logging.StreamHandler 8 | formatter: default 9 | stream: ext://sys.stdout 10 | root: 11 | level: DEBUG 12 | handlers: [console] 13 | -------------------------------------------------------------------------------- /yetl/validation/__init__.py: -------------------------------------------------------------------------------- 1 | from ._validate import ( 2 | get_table_schema, 3 | get_pipeline_schema, 4 | get_project_schema, 5 | validate_tables, 6 | validate_pipeline, 7 | SchemaFiles, 8 | get_schema, 9 | ) 10 | 11 | 12 | __all__ = [ 13 | "get_table_schema", 14 | "get_pipeline_schema", 15 | "get_project_schema", 16 | "validate_tables", 17 | "validate_pipeline", 18 | "SchemaFiles", 19 | "get_schema", 20 | ] 21 | -------------------------------------------------------------------------------- /yetl/config/table/__init__.py: -------------------------------------------------------------------------------- 1 | from ._deltalake import DeltaLake 2 | from ._read import Read 3 | from ._table import Table, ValidationThreshold, ValidationThresholdType 4 | from ._factory import factory as table_factory 5 | from ._table_type import TableType 6 | 7 | 8 | __all__ = [ 9 | "DeltaLake", 10 | "Read", 11 | "table_factory", 12 | "DataSet", 13 | "Table", 14 | "ValidationThreshold", 15 | "TableType", 16 | "ValidationThresholdType", 17 | ] 18 | -------------------------------------------------------------------------------- /yetl/workflow/_notebook.py: -------------------------------------------------------------------------------- 1 | # used to carry notebook data 2 | from pydantic import BaseModel, Field 3 | from typing import Any 4 | 5 | 6 | class Notebook(BaseModel): 7 | def __init__(self, **data: Any) -> None: 8 | super().__init__(**data) 9 | # add the notebook path to parameters for error reporting. 10 | self.parameters["notebook"] = self.path 11 | 12 | path: str = Field(...) 
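# timeout is the per-run timeout (in seconds) handed to dbutils.notebook.run by
# execute_notebooks; retry is the number of additional attempts allowed when a run fails.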
13 | timeout: int = Field(default=3600) 14 | parameters: dict = Field(default={}) 15 | retry: int = Field(default=0) 16 | enabled: bool = Field(default=True) 17 | -------------------------------------------------------------------------------- /yetl/resource/project.yaml: -------------------------------------------------------------------------------- 1 | version: 0.0.0 2 | 3 | name: default 4 | sql: ./sql 5 | spark_schema: ./schema 6 | pipeline: ./pipelines 7 | databricks_notebooks: ./databricks/notebooks 8 | databricks_workflows: ./databricks/workflows 9 | databricks_queries: ./databricks/queries 10 | 11 | 12 | spark: 13 | logging_level: ERROR 14 | config: 15 | spark.master: local 16 | spark.databricks.delta.allowArbitraryProperties.enabled: true 17 | spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog 18 | spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension 19 | -------------------------------------------------------------------------------- /test/config/test_project/test_project.yaml: -------------------------------------------------------------------------------- 1 | version: 3.0.0 2 | 3 | name: test_project 4 | sql: ./sql 5 | spark_schema: ./schema 6 | pipeline: ./pipelines 7 | databricks_notebooks: ./databricks/notebooks 8 | databricks_workflows: ./databricks/workflows 9 | databricks_queries: ./databricks/queries 10 | 11 | 12 | spark: 13 | logging_level: ERROR 14 | config: 15 | spark.master: local 16 | spark.databricks.delta.allowArbitraryProperties.enabled: true 17 | spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog 18 | spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension -------------------------------------------------------------------------------- /test/config/test_project/sql/raw_dbx_patterns_control/header_footer.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE IF NOT EXISTS `raw_dbx_patterns_control`.`header_footer` 3 | ( 4 | header struct, 5 | raw_header string, 6 | footer struct, 7 | raw_footer string, 8 | _process_id bigint, 9 | _load_date timestamp, 10 | _metadata struct 11 | ) 12 | USING DELTA 13 | LOCATION '{{location}}' 14 | TBLPROPERTIES ( 15 | {{delta_properties}} 16 | ); -------------------------------------------------------------------------------- /yetl/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import ( 2 | Config, 3 | Timeslice, 4 | TimesliceNow, 5 | TimesliceUtcNow, 6 | Read, 7 | DeltaLake, 8 | TableMapping, 9 | Tables, 10 | StageType, 11 | yetl_flow, 12 | ValidationThreshold, 13 | ValidationThresholdType, 14 | ) 15 | 16 | __all__ = [ 17 | "Config", 18 | "Timeslice", 19 | "TimesliceNow", 20 | "TimesliceUtcNow", 21 | "Read", 22 | "DeltaLake", 23 | "TableMapping", 24 | "Tables", 25 | "StageType", 26 | "yetl_flow", 27 | "ValidationThreshold", 28 | "ValidationThresholdType", 29 | ] 30 | -------------------------------------------------------------------------------- /yetl/resource/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib.resources import files as resources 3 | except Exception: 4 | from importlib import resources 5 | 6 | _PACKAGE = "yetl.resource" 7 | 8 | 9 | def get_resource_text(resource: str): 10 | try: 11 | data = resources(_PACKAGE).joinpath(resource).read_text() 12 | except Exception: 13 | data = resources.read_text(_PACKAGE, resource) 14 | return data 15 | 16 | 17 | def 
get_resource_binary(resource: str): 18 | try: 19 | schema = resources(_PACKAGE).joinpath(resource).read_bytes() 20 | except Exception: 21 | schema = resources.read_binary(_PACKAGE, resource) 22 | return schema 23 | -------------------------------------------------------------------------------- /yetl/config/__init__.py: -------------------------------------------------------------------------------- 1 | from ._config import Config 2 | from ._timeslice import Timeslice, TimesliceNow, TimesliceUtcNow 3 | from .table import ( 4 | DeltaLake, 5 | Read, 6 | ValidationThreshold, 7 | ValidationThresholdType, 8 | TableType, 9 | ) 10 | from ._tables import Tables 11 | from ._table_mapping import TableMapping 12 | from ._stage_type import StageType 13 | from ._decorators import yetl_flow 14 | 15 | 16 | __all__ = [ 17 | "Config", 18 | "Timeslice", 19 | "TimesliceNow", 20 | "TimesliceUtcNow", 21 | "Read", 22 | "DeltaLake", 23 | "TableMapping", 24 | "Tables", 25 | "StageType", 26 | "yetl_flow", 27 | "ValidationThreshold", 28 | "ValidationThresholdType", 29 | "TableType", 30 | ] 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What Is Yetl 2 | 3 | Website: https://www.yetl.io/ 4 | 5 | 6 | ## Development Setup 7 | 8 | ``` 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | ## Unit Tests 13 | 14 | To run the unit tests with a coverage report. 15 | 16 | ``` 17 | pip install -e . 18 | pytest test/unit --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml --cov-report=html 19 | ``` 20 | 21 | ## Integration Tests 22 | 23 | To run the integration tests with a coverage report. 24 | 25 | ``` 26 | pip install -e . 27 | pytest test/integration --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml --cov-report=html 28 | ``` 29 | 30 | ## Build 31 | 32 | ``` 33 | python setup.py sdist bdist_wheel 34 | ``` 35 | 36 | ## Publish 37 | 38 | 39 | ``` 40 | twine upload dist/* 41 | ``` 42 | -------------------------------------------------------------------------------- /test/config/test_project/schema/customer_details_1.yaml: -------------------------------------------------------------------------------- 1 | fields: 2 | - metadata: {} 3 | name: flag 4 | nullable: true 5 | type: string 6 | - metadata: {} 7 | name: period 8 | nullable: true 9 | type: timestamp 10 | - metadata: {} 11 | name: id 12 | nullable: true 13 | type: integer 14 | - metadata: {} 15 | name: first_name 16 | nullable: true 17 | type: string 18 | - metadata: {} 19 | name: last_name 20 | nullable: true 21 | type: string 22 | - metadata: {} 23 | name: email 24 | nullable: true 25 | type: string 26 | - metadata: {} 27 | name: gender 28 | nullable: true 29 | type: string 30 | - metadata: {} 31 | name: job_title 32 | nullable: true 33 | type: string 34 | - metadata: {} 35 | name: amount 36 | nullable: true 37 | type: double 38 | type: struct 39 | -------------------------------------------------------------------------------- /test/config/test_project/schema/customer_details_2.yaml: -------------------------------------------------------------------------------- 1 | fields: 2 | - metadata: {} 3 | name: flag 4 | nullable: true 5 | type: string 6 | - metadata: {} 7 | name: period 8 | nullable: true 9 | type: timestamp 10 | - metadata: {} 11 | name: id 12 | nullable: true 13 | type: integer 14 | - metadata: {} 15 | name: first_name 16 | nullable: true 17 | type: string 18 | - metadata: {} 19 | name: last_name 20 
| nullable: true 21 | type: string 22 | - metadata: {} 23 | name: email 24 | nullable: true 25 | type: string 26 | - metadata: {} 27 | name: gender 28 | nullable: true 29 | type: string 30 | - metadata: {} 31 | name: job_title 32 | nullable: true 33 | type: string 34 | - metadata: {} 35 | name: amount 36 | nullable: true 37 | type: double 38 | type: struct 39 | -------------------------------------------------------------------------------- /requirements38.txt: -------------------------------------------------------------------------------- 1 | attrs==23.1.0 2 | black==23.3.0 3 | click==8.1.4 4 | coverage==7.2.7 5 | delta-spark==2.4.0 6 | et-xmlfile==1.1.0 7 | exceptiongroup==1.1.2 8 | flake8==6.0.0 9 | importlib-metadata==6.8.0 10 | iniconfig==2.0.0 11 | Jinja2==3.1.2 12 | jsonschema==4.16.0 13 | jsonschema-specifications==2023.6.1 14 | MarkupSafe==2.1.3 15 | mccabe==0.7.0 16 | mypy-extensions==1.0.0 17 | numpy==1.25.0 18 | openpyxl==3.1.2 19 | packaging==23.1 20 | pandas==2.0.3 21 | pathspec==0.11.1 22 | platformdirs==3.8.1 23 | pluggy==1.2.0 24 | py4j==0.10.9.7 25 | pyaml==23.7.0 26 | pycodestyle==2.10.0 27 | pydantic==1.10.6 28 | pyflakes==3.0.1 29 | pyrsistent==0.19.3 30 | pyspark==3.4.1 31 | pytest==7.4.0 32 | pytest-cov==4.1.0 33 | pytest-env==0.8.2 34 | python-dateutil==2.8.2 35 | pytz==2023.3 36 | PyYAML==6.0 37 | referencing==0.29.1 38 | rpds-py==0.8.10 39 | six==1.16.0 40 | tomli==2.0.1 41 | typer==0.9.0 42 | typing_extensions==4.7.1 43 | tzdata==2023.3 44 | zipp==3.15.0 45 | -------------------------------------------------------------------------------- /test/config/test_project/sql/raw_dbx_patterns_control/raw_audit.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS `raw_dbx_patterns_control`.`raw_audit` 2 | ( 3 | `file_name` string, 4 | source_database string, 5 | source_table string, 6 | `database` string, 7 | `table` string, 8 | 9 | total_count bigint, 10 | valid_count bigint, 11 | invalid_count bigint, 12 | invalid_ratio double, 13 | expected_row_count bigint, 14 | warning_thresholds struct< 15 | invalid_ratio:double, 16 | invalid_rows:bigint, 17 | max_rows:bigint, 18 | min_rows:bigint 19 | >, 20 | exception_thresholds struct< 21 | invalid_ratio:double, 22 | invalid_rows:bigint, 23 | max_rows:bigint, 24 | min_rows:bigint 25 | >, 26 | file_path string, 27 | file_size bigint, 28 | file_modification_time timestamp, 29 | _process_id bigint, 30 | _load_date timestamp 31 | ) 32 | USING DELTA 33 | LOCATION '{{location}}' 34 | TBLPROPERTIES ( 35 | {{delta_properties}} 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /test/unit/test_validation.py: -------------------------------------------------------------------------------- 1 | from yetl.validation import _validate as v 2 | import yaml 3 | from jsonschema import SchemaError, ValidationError 4 | 5 | 6 | def test_get_table_schema(): 7 | 8 | schema = v.get_table_schema() 9 | 10 | assert isinstance(schema, dict) 11 | assert schema is not None 12 | 13 | def test_validate_valid_tables(): 14 | 15 | with open("./test/config/test_project/pipelines/tables.yaml", "r", encoding="utf-8") as f: 16 | tables = yaml.safe_load(f) 17 | 18 | try: 19 | v.validate_tables(tables) 20 | assert True 21 | except ValidationError: 22 | assert False 23 | 24 | def test_validate_invalid_tables(): 25 | 26 | with open("./test/config/test_project/pipelines/tables_invalid.yaml", "r", encoding="utf-8") as f: 27 | tables = yaml.safe_load(f) 28 | 29 | 
try: 30 | v.validate_tables(tables) 31 | assert False 32 | except Exception as e: 33 | assert isinstance(e, ValidationError) 34 | assert e.message == "'version' is a required property" 35 | 36 | -------------------------------------------------------------------------------- /requirements310.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.5.0 2 | attrs==23.1.0 3 | black==23.7.0 4 | click==8.1.5 5 | coverage==7.2.7 6 | delta-spark==2.4.0 7 | et-xmlfile==1.1.0 8 | exceptiongroup==1.1.2 9 | flake8==6.0.0 10 | importlib-metadata==6.8.0 11 | iniconfig==2.0.0 12 | Jinja2==3.1.2 13 | jsonschema==4.18.3 14 | jsonschema-specifications==2023.6.1 15 | MarkupSafe==2.1.3 16 | mccabe==0.7.0 17 | mypy-extensions==1.0.0 18 | numpy==1.25.1 19 | openpyxl==3.1.2 20 | packaging==23.1 21 | pandas==2.0.3 22 | pathspec==0.11.1 23 | platformdirs==3.9.0 24 | pluggy==1.2.0 25 | py4j==0.10.9.7 26 | pycodestyle==2.10.0 27 | pydantic==2.0.3 28 | pydantic_core==2.3.0 29 | pyflakes==3.0.1 30 | pyspark==3.4.1 31 | pytest==7.4.0 32 | pytest-cov==4.1.0 33 | pytest-env==0.8.2 34 | python-dateutil==2.8.2 35 | pytz==2023.3 36 | PyYAML==6.0 37 | referencing==0.29.1 38 | rpds-py==0.8.10 39 | six==1.16.0 40 | tomli==2.0.1 41 | typer==0.9.0 42 | typing_extensions==4.7.1 43 | tzdata==2023.3 44 | -e git+https://github.com/sibytes/yetl.git@30cce5c49e9b4673e4e398265ee5aca39c34b14d#egg=yetl_framework 45 | zipp==3.16.2 46 | -------------------------------------------------------------------------------- /yetl/config/table/_factory.py: -------------------------------------------------------------------------------- 1 | from ._deltalake import DeltaLake 2 | from ._read import Read 3 | from ._write import Write 4 | from ._table import Table 5 | import logging 6 | from ._table_type import TableType 7 | 8 | 9 | class TableFactory: 10 | def __init__(self) -> None: 11 | self._logger = logging.getLogger(self.__class__.__name__) 12 | self._dataset = {} 13 | self._table = {} 14 | 15 | def register_table_type(self, io_type: TableType, table_type: type): 16 | self._logger.debug(f"Register table type {table_type} as {type}") 17 | self._table[io_type] = table_type 18 | 19 | def make(self, table_type: TableType, config: dict) -> Table: 20 | self._logger.debug(f"Get {table_type.name} from factory dataset") 21 | table_class = self._table.get(table_type) 22 | 23 | if not table_class: 24 | self._logger.debug( 25 | f"TableType {table_type.name} not registered in the table factory" 26 | ) 27 | raise ValueError(table_type) 28 | 29 | return table_class( 30 | **config, 31 | ) 32 | 33 | 34 | factory = TableFactory() 35 | factory.register_table_type(TableType.read, Read) 36 | factory.register_table_type(TableType.delta_lake, DeltaLake) 37 | factory.register_table_type(TableType.write, Write) 38 | -------------------------------------------------------------------------------- /yetl/config/_spark_context.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pyspark.sql import SparkSession 4 | from delta import configure_spark_with_delta_pip 5 | from ._utils import is_databricks 6 | 7 | _logger = logging.getLogger(__name__) 8 | 9 | 10 | def get_spark_context(project: str, config: dict = None): 11 | if is_databricks(): 12 | _logger.debug("Getting databricks spark context") 13 | try: 14 | from databricks.sdk.runtime import spark 15 | 16 | return spark 17 | except Exception: 18 | _logger.info("cannot create spark context, spark not 
found.") 19 | return None 20 | 21 | else: 22 | _logger.debug("Getting local spark context") 23 | 24 | if config is None: 25 | config = { 26 | "spark.master": "local", 27 | "spark.databricks.delta.allowArbitraryProperties.enabled": True, 28 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 29 | "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", 30 | } 31 | 32 | msg = json.dumps(config, indent=4, default=str) 33 | _logger.debug(msg) 34 | 35 | builder = SparkSession.builder 36 | 37 | for k, v in config.items(): 38 | builder = builder.config(k, v) 39 | 40 | builder.appName(project) 41 | spark = configure_spark_with_delta_pip(builder).getOrCreate() 42 | return spark 43 | -------------------------------------------------------------------------------- /yetl/workflow/_dlt.py: -------------------------------------------------------------------------------- 1 | from ..config import StageType, Config 2 | from ..config.table import Table 3 | from typing import Callable 4 | import logging 5 | 6 | _logger = logging.getLogger(__name__) 7 | 8 | 9 | def create_dlt( 10 | config: Config, 11 | stage: StageType, 12 | dlt_funct: Callable[[Table, Table], None], 13 | debug: bool = False, 14 | **kwargs, 15 | ): 16 | tables = config.tables.lookup_table( 17 | stage=stage, 18 | first_match=False, 19 | # this will filter the tables on a custom property 20 | # in the tables parameter you can add whatever custom properties you want 21 | # either for filtering or to use in pipelines 22 | **kwargs, 23 | ) 24 | 25 | for t in tables: 26 | table_mapping = config.get_table_mapping( 27 | stage=stage, 28 | table=t.table, 29 | # dlt does this so yetl doesn't need to 30 | create_database=False, 31 | create_table=False, 32 | ) 33 | # TODO: not sure if we need checkpoints in DLT 34 | # config.set_checkpoint( 35 | # table_mapping.source, table_mapping.destination 36 | # ) 37 | src = table_mapping.source 38 | dst = table_mapping.destination 39 | if debug: 40 | msg = f"{src.database}.{src.table} => {dst.database}.{dst.table}" 41 | _logger.info(msg) 42 | else: 43 | dlt_funct(table_mapping.source, table_mapping.destination) 44 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "Python: Main", 10 | "type": "python", 11 | "request": "launch", 12 | "program": "main.py", 13 | "console": "integratedTerminal", 14 | "envFile": "${workspaceFolder}/.env", 15 | "args": [ 16 | // "validate", 17 | // "test_project", 18 | // "autoloader", 19 | // "./test/config" 20 | 21 | // "init", "test_yetl" 22 | 23 | // "import-tables", 24 | // "./test/config/test_project/pipelines/tables.xlsx", 25 | // "./test/config/test_project/pipelines/test_tables.yaml" 26 | 27 | "import-tables", 28 | "./test/config/fnz_pb/pipelines/tables.xlsx", 29 | "./test/config/fnz_pb/pipelines/tables.yaml" 30 | ], 31 | "env": { } 32 | }, 33 | { 34 | "name": "Test", 35 | "type": "python", 36 | "request": "launch", 37 | "console": "internalConsole", 38 | "envFile": "${workspaceFolder}/.env" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from yetl import * 2 | import os 3 | import shutil 4 | 5 | 6 | # from yetl import __main__ 7 | 8 | def tear_down(): 9 | shutil.rmtree("./test/config/test_project/data", ignore_errors=True) 10 | shutil.rmtree("./metastore_db", ignore_errors=True) 11 | shutil.rmtree("./spark-warehouse", ignore_errors=True) 12 | try: 13 | os.remove("./derby.log") 14 | except Exception: 15 | pass 16 | 17 | 18 | tear_down() 19 | pipeline = "autoloader" 20 | config_path = "./test/config" 21 | project = "test_project" 22 | timeslice = Timeslice(day="*", month="*", year="*") 23 | config = Config( 24 | project=project, 25 | pipeline=pipeline, 26 | config_path=config_path, 27 | timeslice=timeslice, 28 | ) 29 | 30 | # tables = config.tables.create_table( 31 | # stage=StageType.audit_control, 32 | # first_match=False, 33 | # catalog="development" 34 | # ) 35 | 36 | table_mapping = config.get_table_mapping( 37 | stage=StageType.raw, 38 | table="header_footer", 39 | catalog=None, 40 | create_table=False 41 | ) 42 | 43 | 44 | # source: Read = table_mapping.source["customer_details_1"] 45 | # destination: DeltaLake = table_mapping.destination 46 | # config.set_checkpoint(source=source, destination=destination) 47 | 48 | 49 | # t:Timeslice = Timeslice.parse_iso_date("*-*-") 50 | # print(t.strftime("%Y%m%d")) 51 | 52 | 53 | 54 | # @yetl_flow( 55 | # project="test_project", 56 | # stage=StageType.audit_control, 57 | # config_path="./test/config", 58 | # catalog=None 59 | # ) 60 | # def autoloader(table_mapping:TableMapping): 61 | # return table_mapping 62 | 63 | 64 | # result = autoloader(table="header_footer") 65 | # tear_down() 66 | 67 | -------------------------------------------------------------------------------- /yetl/validation/_validate.py: -------------------------------------------------------------------------------- 1 | import json 2 | from ..resource import get_resource_text 3 | import jsonschema as js 4 | from enum import Enum 5 | 6 | 7 | class SchemaFiles(Enum): 8 | tables_schema = "sibytes_yetl_tables_schema.json" 9 | pipeline_schema = "sibytes_yetl_pipeline_schema.json" 10 | project_schema = "sibytes_yetl_project_schema.json" 11 | 12 | 13 | def get_table_schema(): 14 | """Get the tables json schema from the package resource""" 15 | schema = get_resource_text(SchemaFiles.tables_schema.value) 16 | json_schema = json.loads(schema) 17 | 18 | return json_schema 19 | 20 | 21 | def 
get_pipeline_schema(): 22 | """Get the pipeline json schema from the package resource""" 23 | schema = get_resource_text(SchemaFiles.pipeline_schema.value) 24 | json_schema = json.loads(schema) 25 | 26 | return json_schema 27 | 28 | 29 | def get_project_schema(): 30 | """Get the project json schema from the package resource""" 31 | schema = get_resource_text(SchemaFiles.project_schema.value) 32 | json_schema = json.loads(schema) 33 | 34 | return json_schema 35 | 36 | 37 | def get_schema(schema_file: SchemaFiles): 38 | if schema_file == SchemaFiles.pipeline_schema: 39 | return get_pipeline_schema() 40 | if schema_file == SchemaFiles.project_schema: 41 | return get_project_schema() 42 | if schema_file == SchemaFiles.tables_schema: 43 | return get_table_schema() 44 | 45 | 46 | def validate_tables(data: dict) -> bool: 47 | schema = get_table_schema() 48 | js.validate(instance=data, schema=schema) 49 | 50 | 51 | def validate_pipeline(data: dict) -> bool: 52 | schema = get_pipeline_schema() 53 | js.validate(instance=data, schema=schema) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | # The directory containing this file 5 | HERE = pathlib.Path(__file__).parent 6 | 7 | # The text of the README file 8 | README = (HERE / "README.md").read_text() 9 | 10 | # This call to setup() does all the work 11 | setup( 12 | name="yetl-framework", 13 | version="3.0.0", 14 | description="yet (another spark) etl framework", 15 | long_description=README, 16 | long_description_content_type="text/markdown", 17 | url="https://www.yetl.io/", 18 | project_urls={ 19 | "GitHub": "https://github.com/sibytes/yetl", 20 | "Documentation": "https://www.yetl.io/", 21 | }, 22 | author="Shaun Ryan", 23 | author_email="shaun_chiburi@hotmail.com", 24 | license="MIT", 25 | classifiers=[ 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.8", 29 | "Programming Language :: Python :: 3.9", 30 | "Programming Language :: Python :: 3.10", 31 | ], 32 | include_package_data=True, 33 | package_dir={"": "."}, 34 | package_data={"yetl.resource": ["*.json", "*.yaml", "*.xlsx"]}, 35 | packages=[ 36 | "yetl", 37 | "yetl.resource", 38 | "yetl.validation", 39 | "yetl.cli", 40 | "yetl.cli.metadata_provider", 41 | "yetl.config", 42 | "yetl.config.table", 43 | "yetl.workflow", 44 | 45 | ], 46 | install_requires=[ 47 | 'PyYAML', 48 | 'jinja2', 49 | 'pydantic', 50 | 'jsonschema', 51 | 'typer', 52 | 'pandas', 53 | 'openpyxl', 54 | 'delta-spark', 55 | 'pyspark' 56 | ], 57 | zip_safe=False 58 | ) 59 | -------------------------------------------------------------------------------- /yetl/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | from .cli import _init 3 | from typing_extensions import Annotated 4 | from .cli.metadata_provider import XlsMetadata, ImportFormat 5 | from .config import Config 6 | from typing import Optional 7 | import logging 8 | 9 | app = typer.Typer() 10 | 11 | 12 | @app.command() 13 | def init(project: str, directory: str = "."): 14 | """Initialise the project directory with the suggested structure and start config files 15 | 16 | --directory:str - Where you want the project to be initialised 17 | """ 18 | _init.init(project, directory) 19 | 20 | 21 | @app.command() 22 | def import_tables( 23 | source: 
str, 24 | destination: str, 25 | format: Annotated[ 26 | ImportFormat, typer.Option(case_sensitive=False) 27 | ] = ImportFormat.excel, 28 | ): 29 | """Import tables configuration from an external source such as an Excel file. 30 | 31 | source:str - The URI of the table metadata e.g. the file path if importing a csv \n 32 | format:ImportFormat - The format of the table metadata to import e.g. excel 33 | """ 34 | metadata = XlsMetadata(source=source) 35 | metadata.write(path=destination) 36 | 37 | 38 | @app.command() 39 | def validate( 40 | project: Annotated[str, typer.Argument()], 41 | pipeline: Annotated[str, typer.Argument()], 42 | config_path: Annotated[Optional[str], typer.Argument()] = None, 43 | ): 44 | """Validate that configuration meets the schema and deserialises. 45 | 46 | project:str - Name of the project to validate \n 47 | pipeline:str - Name of the pipeline config to validate \n 48 | config_path:str - Path to the project configuration root \n 49 | 50 | """ 51 | _logger = logging.getLogger(__name__) 52 | 53 | _logger.info(f"validating project {project} {pipeline}") 54 | Config(project=project, pipeline=pipeline, config_path=config_path) 55 | _logger.info(f"{project} {pipeline} is Valid!") 56 | 57 | 58 | if __name__ in ["yetl.__main__", "__main__"]: 59 | app() 60 | -------------------------------------------------------------------------------- /yetl/config/_project.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from enum import Enum 3 | from typing import Union, Dict, Any 4 | import os 5 | 6 | 7 | class SparkLoggingLevel(Enum): 8 | INFO = "INFO" 9 | DEBUG = "DEBUG" 10 | WARNING = "WARNING" 11 | ERROR = "ERROR" 12 | 13 | 14 | class SparkConfig(BaseModel): 15 | logging_level: SparkLoggingLevel = Field(default=SparkLoggingLevel.INFO) 16 | config: Dict[str, Union[str, bool]] = Field( 17 | default={ 18 | "spark.master": "local", 19 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 20 | "spark.databricks.delta.merge.repartitionBeforeWrite.enabled": True, 21 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 22 | "spark.sql.catalogImplementation": "hive", 23 | } 24 | ) 25 | 26 | 27 | class Project(BaseModel): 28 | def __init__(self, **data: Any) -> None: 29 | super().__init__(**data) 30 | self.sql = os.path.join(self.config_path, self.sql.replace("./", "")) 31 | self.pipelines = os.path.join( 32 | self.config_path, self.pipelines.replace("./", "") 33 | ) 34 | self.databricks_notebooks = os.path.join( 35 | self.config_path, self.databricks_notebooks.replace("./", "") 36 | ) 37 | self.databricks_workflows = os.path.join( 38 | self.config_path, self.databricks_workflows.replace("./", "") 39 | ) 40 | self.databricks_queries = os.path.join( 41 | self.config_path, self.databricks_queries.replace("./", "") 42 | ) 43 | 44 | config_path: str = Field(...) 45 | name: str = Field(...)
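# note: the relative path defaults below are re-based onto config_path in __init__ above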
46 | sql: str = Field(default="./sql") 47 | pipelines: str = Field(default="./pipelines") 48 | databricks_notebooks: str = Field(default="./databricks/notebooks") 49 | databricks_workflows: str = Field(default="./databricks/workflows") 50 | databricks_queries: str = Field(default="./databricks/queries") 51 | 52 | spark: SparkConfig = Field(default=SparkConfig()) 53 | -------------------------------------------------------------------------------- /ci.yaml: -------------------------------------------------------------------------------- 1 | trigger: 2 | branches: 3 | include: 4 | - main 5 | 6 | # variables: 7 | # - group: data-platform-kv 8 | # - group: databricks 9 | 10 | pool: 11 | vmImage: 'ubuntu-latest' 12 | strategy: 13 | matrix: 14 | Python310: 15 | python.version: '3.10' 16 | 17 | steps: 18 | - task: UsePythonVersion@0 19 | inputs: 20 | versionSpec: '$(python.version)' 21 | displayName: 'Use Python $(python.version)' 22 | 23 | - script: | 24 | python -m pip install --upgrade pip setuptools wheel twine 25 | pip install -r requirements310.txt 26 | displayName: 'Install dependencies' 27 | 28 | - script: | 29 | python -m flake8 ./yetl 30 | displayName: 'Run lint tests' 31 | 32 | - script: | 33 | python setup.py sdist bdist_wheel 34 | ls dist/ 35 | displayName: 'Artifact creation' 36 | 37 | - script: | 38 | pip install . 39 | pytest test/ --junitxml=junit/test-results.xml --cov=yetl --cov-report=xml 40 | displayName: 'Unit & Integration Tests' 41 | 42 | 43 | - task: PublishTestResults@2 44 | condition: succeededOrFailed() 45 | inputs: 46 | testResultsFiles: '**/test-*.xml' 47 | testRunTitle: 'Publish test results for Python $(python.version)' 48 | 49 | - task: PublishCodeCoverageResults@1 50 | inputs: 51 | codeCoverageTool: Cobertura 52 | summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 53 | reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov' 54 | 55 | - task: CopyFiles@2 56 | inputs: 57 | SourceFolder: '$(Build.SourcesDirectory)' 58 | Contents: | 59 | dist/** 60 | deployment/** 61 | TargetFolder: '$(Build.ArtifactStagingDirectory)' 62 | 63 | - task: PublishBuildArtifacts@1 64 | inputs: 65 | PathtoPublish: '$(Build.ArtifactStagingDirectory)' 66 | ArtifactName: 'drop' 67 | publishLocation: 'Container' 68 | displayName: 'Publish Build Artefacts' 69 | 70 | - task: TwineAuthenticate@0 71 | inputs: 72 | # artifactFeeds: 'sibytes' 73 | externalFeeds: 'pypi' 74 | displayName: 'Authenticate Twine' 75 | 76 | - script: | 77 | twine upload -r pypi --config-file $(PYPIRC_PATH) $(Build.SourcesDirectory)/dist/* 78 | continueOnError: true 79 | displayName: 'Publish to Artefact Store' 80 | -------------------------------------------------------------------------------- /yetl/config/_logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import yaml 4 | import os 5 | from yaml import YAMLError 6 | from ._utils import YETL_CONFIG 7 | 8 | 9 | def configure_logging(project: str = None, config_path=None): 10 | """Configures logging from a yaml logging configuration file 11 | 12 | The configuration file is called logging.yaml. Its directory 13 | location is taken from the YETL_CONFIG environment variable.
14 | If this does not exist then it is defaulted to ./config/ 15 | """ 16 | log_config_file = config_path 17 | if not log_config_file: 18 | log_config_file = os.getenv(YETL_CONFIG, "./config") 19 | 20 | if project: 21 | project_log_config_file = os.path.join(log_config_file, project) 22 | 23 | project_log_config_file = f"{project_log_config_file}/logging.yaml" 24 | project_log_config_file = os.path.abspath(project_log_config_file) 25 | 26 | log_config_file = f"{log_config_file}/logging.yaml" 27 | log_config_file = os.path.abspath(log_config_file) 28 | 29 | if os.path.exists(project_log_config_file): 30 | log_config_file = project_log_config_file 31 | 32 | # check that it exists 33 | if not os.path.exists(log_config_file): 34 | msg = f"Config logging file path does not exist {log_config_file}" 35 | raise Exception(msg) 36 | 37 | # load the logging configuration into the logger 38 | with open(log_config_file, "r") as f: 39 | try: 40 | config = yaml.safe_load(f.read()) 41 | logging.config.dictConfig(config) 42 | 43 | # if it errors because of invalid yaml format then 44 | # provide details so the users can easily find and correct 45 | # if it's a different exception just let it raise 46 | except YAMLError as e: 47 | location = "" 48 | if hasattr(e, "problem_mark"): 49 | mark = e.problem_mark 50 | location = f"Error position ({mark.line}, {mark.column})" 51 | 52 | if hasattr(e, "problem"): 53 | problem = f"{e.problem}." 54 | 55 | raise Exception( 56 | f"Invalid yaml format in {log_config_file}. {problem} {location}" 57 | ) 58 | -------------------------------------------------------------------------------- /test/config/test_project/dataNone/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"commitInfo":{"timestamp":1697012148607,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.autooptimize.autocompact\":\"true\",\"delta.autooptimize.optimizewrite\":\"true\",\"delta.appendOnly\":\"true\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.4.1 Delta-Lake/2.4.0","txnId":"ecefe756-4bcb-42bc-85dd-3d148e23d8f2"}} 2 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 3 | 
{"metaData":{"id":"7ddc211e-d6d1-4928-849d-c513682fb508","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"header\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"row_count\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"period\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"batch\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"raw_header\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"footer\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"flag\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"period\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"raw_footer\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_process_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_load_date\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"_metadata\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"file_path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"file_modification_time\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.autooptimize.autocompact":"true","delta.autooptimize.optimizewrite":"true","delta.appendOnly":"true"},"createdTime":1697012148597}} 4 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables_invalid.yaml: -------------------------------------------------------------------------------- 1 | # version: 3.0.0 2 | 3 | audit_control: 4 | delta_lake: 5 | raw_dbx_patterns_control: 6 | header_footer: 7 | depends_on: 8 | - raw.raw_dbx_patterns.* 9 | partition_by: none 10 | sql: ../sql/{{database}}/{{table}}.sql 11 | vacuum: 30 12 | catalog: hive_metastore 13 | raw_audit: 14 | depends_on: 15 | - raw.raw_dbx_patterns.* 16 | - audit_control.raw_dbx_patterns_control.header_footer 17 | sql: ../sql/{{database}}/{{table}}.sql 18 | vacuum: 30 19 | 20 | landing: 21 | read: 22 | landing_dbx_patterns: 23 | catalog: hive_metastore 24 | customer_details_1: null 25 | customer_details_2: null 26 | 27 | raw: 28 | delta_lake: 29 | raw_dbx_patterns: 30 | catalog: hive_metastore 31 | customers: 32 | custom_properties: 33 | process_group: 1 34 | rentention_days: 365 35 | depends_on: 36 | - landing.landing_dbx_patterns.customer_details_1 37 | - landing.landing_dbx_patterns.customer_details_2 38 | exception_thresholds: 39 | invalid_ratio: 0.2 40 | invalid_rows: 2 41 | max_rows: 1000 42 | min_rows: 0 43 | id: id 44 | vacuum: 30 45 | warning_thresholds: 46 | invalid_ratio: 0.1 47 | invalid_rows: 0 48 | max_rows: 100 49 | min_rows: 5 50 | z_order_by: 51 | - _load_date_1 52 | - _load_date_2 53 | 54 | base: 55 | delta_lake: 56 | base_dbx_patterns: 57 | catalog: hive_metastore 58 | customer_details_1: 59 | delta_properties: 60 | delta.appendOnly: true 61 | delta.autoOptimize.autoCompact: true 62 | delta.autoOptimize.optimizeWrite: true 63 | delta.enableChangeDataFeed: false 64 | depends_on: 65 | - 
raw.raw_dbx_patterns.customers 66 | id: id 67 | vacuum: 30 68 | customer_details_2: 69 | delta_properties: 70 | delta.appendOnly: true 71 | delta.autoOptimize.autoCompact: true 72 | delta.autoOptimize.optimizeWrite: true 73 | delta.enableChangeDataFeed: false 74 | depends_on: 75 | - raw.raw_dbx_patterns.customers 76 | id: id 77 | vacuum: 30 78 | 79 | -------------------------------------------------------------------------------- /yetl/config/_decorators.py: -------------------------------------------------------------------------------- 1 | # implicit, not referenced - must be the 1st import 2 | from ._logging_config import configure_logging 3 | import logging 4 | from ._config import Config 5 | from ._timeslice import Timeslice 6 | from ._stage_type import StageType 7 | from .table import Table 8 | 9 | 10 | def yetl_flow( 11 | stage: StageType, 12 | project: str, 13 | pipeline: str = None, 14 | config_path: str = None, 15 | catalog: str = None, 16 | ): 17 | def decorate(function): 18 | def wrap_function(*args, **kwargs): 19 | configure_logging(project) 20 | _logger = logging.getLogger(__name__) 21 | 22 | _pipeline = pipeline 23 | if not _pipeline: 24 | _pipeline = function.__name__ 25 | 26 | _logger.info(f"Loading pipeline configuration {_pipeline}") 27 | 28 | timeslice = kwargs.get("timeslice", Timeslice(day="*", month="*", year="*")) 29 | if "timeslice" in kwargs.keys(): 30 | del kwargs["timeslice"] 31 | 32 | try: 33 | table = kwargs["table"] 34 | del kwargs["table"] 35 | except KeyError as e: 36 | raise Exception(f"{e} is a required argument for a yetl flow function") 37 | 38 | config = Config( 39 | project=project, 40 | pipeline=_pipeline, 41 | config_path=config_path, 42 | timeslice=timeslice, 43 | ) 44 | table_mapping = config.get_table_mapping( 45 | stage=stage, 46 | table=table, 47 | catalog=catalog, 48 | ) 49 | 50 | destination: Table = table_mapping.destination 51 | sources = table_mapping.source 52 | if isinstance(sources, dict): 53 | for _, source in sources.items(): 54 | config.set_checkpoint(source=source, destination=destination) 55 | else: 56 | config.set_checkpoint(source=sources, destination=destination) 57 | 58 | _logger.info(f"Calling function {function.__name__}") 59 | ret = function( 60 | *args, 61 | table_mapping=table_mapping, 62 | **kwargs, 63 | ) 64 | return ret 65 | 66 | return wrap_function 67 | 68 | return decorate 69 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/test_tables.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=./json_schema/sibytes_yetl_tables_schema.json 2 | 3 | version: 3.0.0 4 | 5 | audit_control: 6 | delta_lake: 7 | raw_dbx_patterns_control: 8 | catalog: hive_metastore 9 | header_footer: 10 | cluster_by: none 11 | depends_on: 12 | - raw.raw_dbx_patterns.* 13 | partition_by: none 14 | sql: ../sql/{{database}}/{{table}}.sql 15 | vacuum: 30 16 | raw_audit: 17 | depends_on: 18 | - raw.raw_dbx_patterns.* 19 | - audit_control.raw_dbx_patterns_control.header_footer 20 | sql: ../sql/{{database}}/{{table}}.sql 21 | vacuum: 30 22 | 23 | landing: 24 | read: 25 | landing_dbx_patterns: 26 | catalog: hive_metastore 27 | customer_details_1: null 28 | customer_details_2: null 29 | 30 | raw: 31 | delta_lake: 32 | raw_dbx_patterns: 33 | catalog: hive_metastore 34 | customers: 35 | custom_properties: 36 | process_group: 1 37 | rentention_days: 365 38 | depends_on: 39 | - 
landing.landing_dbx_patterns.customer_details_1 40 | - landing.landing_dbx_patterns.customer_details_2 41 | exception_thresholds: 42 | invalid_ratio: 0.2 43 | invalid_rows: 2 44 | max_rows: 1000 45 | min_rows: 0 46 | id: id 47 | vacuum: 30 48 | warning_thresholds: 49 | invalid_ratio: 0.1 50 | invalid_rows: 0 51 | max_rows: 100 52 | min_rows: 5 53 | z_order_by: 54 | - _load_date_1 55 | - _load_date_2 56 | 57 | base: 58 | delta_lake: 59 | base_dbx_patterns: 60 | catalog: hive_metastore 61 | customer_details_1: 62 | delta_properties: 63 | delta.appendOnly: true 64 | delta.autoOptimize.autoCompact: true 65 | delta.autoOptimize.optimizeWrite: true 66 | delta.enableChangeDataFeed: false 67 | depends_on: 68 | - raw.raw_dbx_patterns.customers 69 | id: id 70 | vacuum: 30 71 | customer_details_2: 72 | delta_properties: 73 | delta.appendOnly: true 74 | delta.autoOptimize.autoCompact: true 75 | delta.autoOptimize.optimizeWrite: true 76 | delta.enableChangeDataFeed: false 77 | depends_on: 78 | - raw.raw_dbx_patterns.customers 79 | id: id 80 | vacuum: 30 81 | 82 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/tables.yaml: -------------------------------------------------------------------------------- 1 | version: 3.0.0 2 | 3 | audit_control: 4 | delta_lake: 5 | raw_dbx_patterns_control: 6 | header_footer: 7 | depends_on: 8 | - raw.raw_dbx_patterns.* 9 | partition_by: none 10 | sql: ../sql/{{database}}/{{table}}.sql 11 | vacuum: 30 12 | catalog: null 13 | raw_audit: 14 | depends_on: 15 | - raw.raw_dbx_patterns.* 16 | - audit_control.raw_dbx_patterns_control.header_footer 17 | sql: ../sql/{{database}}/{{table}}.sql 18 | vacuum: 30 19 | 20 | source: 21 | delta_lake: 22 | source_dbx_patterns: 23 | catalog: hive_metastore 24 | customer_details_1: null 25 | customer_details_2: null 26 | 27 | landing: 28 | read: 29 | landing_dbx_patterns: 30 | catalog: hive_metastore 31 | customer_details_1: null 32 | customer_details_2: null 33 | 34 | raw: 35 | delta_lake: 36 | raw_dbx_patterns: 37 | catalog: null 38 | customers: 39 | custom_properties: 40 | process_group: 1 41 | rentention_days: 365 42 | depends_on: 43 | - landing.landing_dbx_patterns.customer_details_1 44 | - landing.landing_dbx_patterns.customer_details_2 45 | exception_thresholds: 46 | invalid_ratio: 0.2 47 | invalid_rows: 2 48 | max_rows: 1000 49 | min_rows: 0 50 | id: id 51 | vacuum: 30 52 | warning_thresholds: 53 | invalid_ratio: 0.1 54 | invalid_rows: 0 55 | max_rows: 100 56 | min_rows: 5 57 | z_order_by: 58 | - _load_date_1 59 | - _load_date_2 60 | 61 | base: 62 | delta_lake: 63 | base_dbx_patterns: 64 | catalog: null 65 | customer_details_1: 66 | delta_properties: 67 | delta.appendOnly: true 68 | delta.autoOptimize.autoCompact: true 69 | delta.autoOptimize.optimizeWrite: true 70 | delta.enableChangeDataFeed: false 71 | depends_on: 72 | - raw.raw_dbx_patterns.customers 73 | id: id 74 | vacuum: 30 75 | customer_details_2: 76 | delta_properties: 77 | delta.appendOnly: true 78 | delta.autoOptimize.autoCompact: true 79 | delta.autoOptimize.optimizeWrite: true 80 | delta.enableChangeDataFeed: false 81 | depends_on: 82 | - raw.raw_dbx_patterns.customers 83 | id: id 84 | vacuum: 30 85 | 86 | -------------------------------------------------------------------------------- /schema_testing/tables.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=sibytes_yetl_tables_schema.json 2 | 3 | version: 3.0.0 4 | 5 | 
audit_control: 6 | delta_lake: 7 | raw_dbx_patterns_control: 8 | header_footer: 9 | depends_on: 10 | - raw.raw_dbx_patterns.* 11 | partition_by: none 12 | sql: ../sql/{{database}}/{{table}}.sql 13 | vacuum: 30 14 | catalog: hive_metastore 15 | raw_audit: 16 | depends_on: 17 | - raw.raw_dbx_patterns.* 18 | - audit_control.raw_dbx_patterns_control.header_footer 19 | sql: ../sql/{{database}}/{{table}}.sql 20 | vacuum: 30 21 | 22 | source: 23 | delta_lake: 24 | source_dbx_patterns: 25 | catalog: hive_metastore 26 | customer_details_1: 27 | custom_properties: 28 | process_group: 1 29 | rentention_days: 365 30 | id: id 31 | customer_details_2: 32 | custom_properties: 33 | process_group: 1 34 | rentention_days: 365 35 | id: id 36 | 37 | 38 | landing: 39 | read: 40 | landing_dbx_patterns: 41 | catalog: hive_metastore 42 | customer_details_1: null 43 | customer_details_2: null 44 | 45 | 46 | raw: 47 | delta_lake: 48 | raw_dbx_patterns: 49 | catalog: hive_metastore 50 | customers: 51 | custom_properties: 52 | process_group: 1 53 | rentention_days: 365 54 | depends_on: 55 | - landing.landing_dbx_patterns.customer_details_1 56 | - landing.landing_dbx_patterns.customer_details_2 57 | exception_thresholds: 58 | invalid_ratio: 0.2 59 | invalid_rows: 2 60 | max_rows: 1000 61 | min_rows: 0 62 | id: id 63 | vacuum: 30 64 | warning_thresholds: 65 | invalid_ratio: 0.1 66 | invalid_rows: 0 67 | max_rows: 100 68 | min_rows: 5 69 | z_order_by: 70 | - _load_date_1 71 | - _load_date_2 72 | 73 | base: 74 | delta_lake: 75 | base_dbx_patterns: 76 | catalog: hive_metastore 77 | customer_details_1: 78 | delta_properties: 79 | delta.appendOnly: true 80 | delta.autoOptimize.autoCompact: true 81 | delta.autoOptimize.optimizeWrite: true 82 | delta.enableChangeDataFeed: false 83 | depends_on: 84 | - raw.raw_dbx_patterns.customers 85 | id: id 86 | vacuum: 30 87 | customer_details_2: 88 | delta_properties: 89 | delta.appendOnly: true 90 | delta.autoOptimize.autoCompact: true 91 | delta.autoOptimize.optimizeWrite: true 92 | delta.enableChangeDataFeed: false 93 | depends_on: 94 | - raw.raw_dbx_patterns.customers 95 | id: id 96 | vacuum: 30 97 | 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | # .env 106 | .venv 107 | env/ 108 | venv/ 109 | venv*/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | 133 | # .vscode 134 | .DS_Store 135 | 136 | 137 | # ignore database engines 138 | /spark-standalone/delta_lake/* 139 | /spark-standalone/apps/* 140 | /spark-warehouse/* 141 | /test/config/test_project/data/* 142 | # internal spark metastore 143 | /metastore_db/* 144 | derby.log 145 | 146 | 147 | /config/runs/* 148 | /scratch 149 | 150 | .databricks 151 | junit -------------------------------------------------------------------------------- /schema_testing/autoloader.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=sibytes_yetl_pipeline_schema.json 2 | 3 | version: 3.0.0 4 | tables: ./tables.yaml 5 | 6 | audit_control: 7 | delta_lake: 8 | # delta table properties can be set at stage level or table level 9 | delta_properties: 10 | delta.appendOnly: true 11 | delta.autoOptimize.autoCompact: true 12 | delta.autoOptimize.optimizeWrite: true 13 | managed: false 14 | container: datalake 15 | # location: /mnt/{{container}}/data/raw 16 | # path: "{{database}}/{{table}}" 17 | options: 18 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 19 | 20 | source: 21 | delta_lake: 22 | managed: false 23 | container: datalake 24 | location: /mnt/{{container}}/data/source 25 | path: "{{database}}/{{table}}" 26 | options: null 27 | 28 | landing: 29 | read: 30 | trigger: customerdetailscomplete-{{filename_date_format}}*.flg 31 | trigger_type: file 32 | container: datalake 33 | location: "/mnt/{{container}}/data/landing/dbx_patterns/{{table}}/{{path_date_format}}" 34 | filename: "{{table}}-{{filename_date_format}}*.csv" 35 | filename_date_format: "%Y%m%d" 36 | path_date_format: "%Y%m%d" 37 | # injects 
the time period column into the dataset 38 | # using either the path_date_format or the filename_date_format 39 | # as you specify 40 | slice_date: filename_date_format 41 | slice_date_column_name: _slice_date 42 | format: cloudFiles 43 | spark_schema: ../schema/{{table.lower()}}.yaml 44 | options: 45 | # autoloader 46 | cloudFiles.format: csv 47 | cloudFiles.schemaLocation: /mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}} 48 | cloudFiles.useIncrementalListing: auto 49 | # schema 50 | inferSchema: false 51 | enforceSchema: true 52 | columnNameOfCorruptRecord: _corrupt_record 53 | # csv 54 | header: false 55 | mode: PERMISSIVE 56 | encoding: windows-1252 57 | delimiter: "," 58 | escape: '"' 59 | nullValue: "" 60 | quote: '"' 61 | emptyValue: "" 62 | 63 | raw: 64 | delta_lake: 65 | # delta table properties can be set at stage level or table level 66 | delta_properties: 67 | delta.appendOnly: true 68 | delta.autoOptimize.autoCompact: true 69 | delta.autoOptimize.optimizeWrite: true 70 | delta.enableChangeDataFeed: false 71 | managed: false 72 | container: datalake 73 | location: /mnt/{{container}}/data/raw 74 | path: "{{database}}/{{table}}" 75 | options: 76 | mergeSchema: true 77 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 78 | 79 | base: 80 | delta_lake: 81 | container: datalake 82 | location: /mnt/{{container}}/data/base 83 | path: "{{database}}/{{table}}" 84 | options: null 85 | -------------------------------------------------------------------------------- /yetl/workflow/_multi_threaded.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, Future, as_completed 2 | from ._notebook import Notebook 3 | from typing import List 4 | 5 | 6 | # execute a notebook using databricks workflows 7 | def _execute_notebook(notebook: Notebook, dbutils): 8 | """Execute a notebookd using databricks workflows""" 9 | msg = { 10 | "_message": f"Executing notebook {notebook.path} parameters {notebook.parameters}", 11 | "status": "executing", 12 | "notebook": notebook.path, 13 | } 14 | print(msg["_message"], flush=True) 15 | 16 | try: 17 | result = dbutils.notebook.run( 18 | notebook.path, notebook.timeout, notebook.parameters 19 | ) 20 | msg = { 21 | "_message": f"Succeeded notebook {notebook.path}", 22 | "status": "succeeded", 23 | "notebook": notebook.path, 24 | } 25 | print(msg["_message"], flush=True) 26 | return result 27 | 28 | except Exception as e: 29 | if notebook.retry < 1: 30 | msg = { 31 | "_message": f"notebook {notebook.path} failed.", 32 | "status": "failed", 33 | "error": str(e), 34 | "notebook": notebook.path, 35 | } 36 | print(msg["_message"], flush=True) 37 | raise Exception(msg["_message"]) 38 | 39 | msg = { 40 | "_message": f"Retrying notebook {notebook.path}", 41 | "status": "executing", 42 | "notebook": notebook.path, 43 | } 44 | print(msg["_message"], flush=True) 45 | notebook.retry -= 1 46 | 47 | 48 | def _try_future(future: Future, catch=False): 49 | return future.result() 50 | 51 | 52 | def execute_notebooks(notebooks: List[Notebook], maxParallel: int, dbutils): 53 | msg = { 54 | "_message": f"Executing {len(notebooks)} with maxParallel of {maxParallel}", 55 | "notebooks": len(notebooks), 56 | "maxParallel": maxParallel, 57 | } 58 | print(msg["_message"], flush=True) 59 | 60 | with ThreadPoolExecutor(max_workers=maxParallel) as executor: 61 | results = [ 62 | executor.submit(_execute_notebook, notebook, dbutils) 63 | for notebook in notebooks 64 | if 
notebook.enabled 65 | ] 66 | 67 | # the individual notebooks handle their errors and pass back a packaged result 68 | # we will still need to handle the fact that the notebook execution call may fail 69 | # or a programmer missed the handling of an error in the notebook task 70 | # that's what tryFuture(future:Future) does 71 | results_list = [_try_future(r) for r in as_completed(results)] 72 | 73 | print( 74 | f"Finished executing {len(notebooks)} with maxParallel of {maxParallel}", 75 | flush=True, 76 | ) 77 | return results_list 78 | -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_project_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/project", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl project config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "name": { 13 | "type": "string", 14 | "description": "name of the project" 15 | }, 16 | "spark_schema": { 17 | "type": "string", 18 | "description": "relative project path to directory containing spark schema" 19 | }, 20 | "pipeline": { 21 | "type": "string", 22 | "description": "relative project path to directory containing yetl pipeline configuration" 23 | }, 24 | "databricks_notebooks": { 25 | "type": "string", 26 | "description": "relative project path to directory containing databricks notebooks" 27 | }, 28 | "databricks_workflows": { 29 | "type": "string", 30 | "description": "relative project path to directory containing databricks workflows" 31 | }, 32 | "databricks_queries": { 33 | "type": "string", 34 | "description": "relative project path to directory containing databricks queries" 35 | }, 36 | "spark": { 37 | "$ref": "#/$defs/spark" 38 | } 39 | 40 | }, 41 | "required": [ 42 | "version", 43 | "name", 44 | "spark_schema", 45 | "pipeline", 46 | "databricks_notebooks", 47 | "databricks_workflows", 48 | "databricks_queries" 49 | ], 50 | 51 | "$defs": { 52 | "spark": { 53 | "type": "object", 54 | "description": "defines spark logging and configuration for local execution", 55 | "properties": { 56 | "logging_level": { 57 | "type" : "string", 58 | "enum": [ 59 | "OFF", 60 | "FATAL", 61 | "ERROR", 62 | "WARN", 63 | "INFO", 64 | "DEBUG", 65 | "TRACE", 66 | "ALL" 67 | ] 68 | }, 69 | "config": { 70 | "type": "object", 71 | "description": "spark configuration key value pairs", 72 | "minProperties": 1, 73 | "patternProperties":{ 74 | "^\\S+$": { 75 | "type": ["string","number","boolean"], 76 | "description": "spark configuration properties" 77 | } 78 | } 79 | } 80 | 81 | } 82 | } 83 | } 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_project_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/project", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl project config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "name": { 13 | "type": "string", 14 | "description": "name of the project" 15 | }, 16 | "spark_schema": { 17 | "type": "string", 18 
| "description": "relative project path to directory containing spark schema" 19 | }, 20 | "pipeline": { 21 | "type": "string", 22 | "description": "relative project path to directory containing yetl pipeline configuration" 23 | }, 24 | "databricks_notebooks": { 25 | "type": "string", 26 | "description": "relative project path to directory containing databricks notebooks" 27 | }, 28 | "databricks_workflows": { 29 | "type": "string", 30 | "description": "relative project path to directory containing databricks workflows" 31 | }, 32 | "databricks_queries": { 33 | "type": "string", 34 | "description": "relative project path to directory containing databricks queries" 35 | }, 36 | "spark": { 37 | "$ref": "#/$defs/spark" 38 | } 39 | 40 | }, 41 | "required": [ 42 | "version", 43 | "name", 44 | "spark_schema", 45 | "pipeline", 46 | "databricks_notebooks", 47 | "databricks_workflows", 48 | "databricks_queries" 49 | ], 50 | 51 | "$defs": { 52 | "spark": { 53 | "type": "object", 54 | "description": "defines spark logging and configuration for local execution", 55 | "properties": { 56 | "logging_level": { 57 | "type" : "string", 58 | "enum": [ 59 | "OFF", 60 | "FATAL", 61 | "ERROR", 62 | "WARN", 63 | "INFO", 64 | "DEBUG", 65 | "TRACE", 66 | "ALL" 67 | ] 68 | }, 69 | "config": { 70 | "type": "object", 71 | "description": "spark configuration key value pairs", 72 | "minProperties": 1, 73 | "patternProperties":{ 74 | "^\\S+$": { 75 | "type": ["string","number","boolean"], 76 | "description": "spark configuration properties" 77 | } 78 | } 79 | } 80 | 81 | } 82 | } 83 | } 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /test/unit/test_timeslice.py: -------------------------------------------------------------------------------- 1 | from pydantic import ValidationError 2 | from yetl import Timeslice, TimesliceNow, TimesliceUtcNow 3 | from datetime import datetime 4 | 5 | 6 | def test_timeslice_all(): 7 | format = "%Y%m%d" 8 | timeslice = Timeslice(day="*", month="*", year="*") 9 | actual = timeslice.strftime(format) 10 | expected = "***" 11 | assert actual == expected 12 | 13 | 14 | def test_timeslice_year(): 15 | format = "%Y/%m/%d" 16 | timeslice = Timeslice(day="*", month="*", year=2023) 17 | actual = timeslice.strftime(format) 18 | expected = "2023/*/*" 19 | assert actual == expected 20 | 21 | 22 | def test_timeslice_month(): 23 | format = "%Y-%m-%d" 24 | timeslice = Timeslice(day="*", month=1, year=2023) 25 | actual = timeslice.strftime(format) 26 | expected = "2023-01-*" 27 | assert actual == expected 28 | 29 | 30 | def test_timeslice_day(): 31 | format = "%Y\\%m\\%d" 32 | timeslice = Timeslice(day=1, month=1, year=2023) 33 | actual = timeslice.strftime(format) 34 | expected = "2023\\01\\01" 35 | assert actual == expected 36 | 37 | 38 | def test_timeslice_invalid(): 39 | actual = None 40 | try: 41 | timeslice = Timeslice(day="s", month=1, year=2023) # noqa F841 42 | except ValidationError as e: 43 | actual = e 44 | 45 | assert isinstance(actual, ValidationError) 46 | 47 | 48 | def test_timeslice_invalid_date(): 49 | actual = None 50 | format = "%Y/%m/%d" 51 | try: 52 | timeslice = Timeslice(day=500, month=1, year=2023) 53 | actual = timeslice.strftime(format) 54 | except ValueError as e: 55 | actual = e 56 | 57 | assert isinstance(actual, ValueError) 58 | 59 | 60 | def test_timeslice_now(): 61 | now = datetime.now() 62 | timeslice = TimesliceNow() 63 | 64 | assert ( 65 | timeslice.day == now.day 66 | and timeslice.month == 
now.month 67 | and timeslice.year == now.year 68 | and timeslice.hour == now.hour 69 | and timeslice.minute == now.minute 70 | ) 71 | 72 | 73 | def test_timeslice_utcnow(): 74 | now = datetime.utcnow() 75 | timeslice = TimesliceUtcNow() 76 | 77 | assert ( 78 | timeslice.day == now.day 79 | and timeslice.month == now.month 80 | and timeslice.year == now.year 81 | and timeslice.hour == now.hour 82 | and timeslice.minute == now.minute 83 | ) 84 | 85 | 86 | def test_timeslice_invalid_format_code(): 87 | format = "%Y-%m-%d-%c" 88 | actual = "" 89 | try: 90 | timeslice = Timeslice(day="*", month=1, year=2023) 91 | actual = timeslice.strftime(format) 92 | except Exception as e: 93 | actual = str(e) 94 | 95 | expected = "The format contains the following unsupported format codes: %c" 96 | 97 | assert actual == expected 98 | 99 | 100 | def test_timeslice_str(): 101 | timeslice = Timeslice(day=1, month=1, year=2023) 102 | actual = str(timeslice) 103 | expected = "2023-01-01 00:00:00.000000" 104 | assert actual == expected 105 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/autoloader.yaml: -------------------------------------------------------------------------------- 1 | # yaml-language-server: $schema=./json_schema/sibytes_yetl_pipeline_schema.json 2 | 3 | version: 3.0.0 4 | tables: ./tables.yaml 5 | 6 | audit_control: 7 | delta_lake: 8 | # delta table properties can be set at stage level or table level 9 | delta_properties: 10 | delta.appendOnly: true 11 | delta.autoOptimize.autoCompact: true 12 | delta.autoOptimize.optimizeWrite: true 13 | managed: false 14 | container: datalake 15 | # location: /mnt/{{container}}/data/raw 16 | # path: "{{database}}/{{table}}" 17 | options: 18 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 19 | 20 | source: 21 | delta_lake: 22 | # delta table properties can be set at stage level or table level 23 | delta_properties: 24 | delta.appendOnly: true 25 | delta.autoOptimize.autoCompact: true 26 | delta.autoOptimize.optimizeWrite: true 27 | delta.enableChangeDataFeed: false 28 | managed: false 29 | container: datalake 30 | location: /mnt/{{container}}/data/source 31 | path: "{{database}}/{{table}}" 32 | options: null 33 | 34 | landing: 35 | read: 36 | trigger: customerdetailscomplete-{{filename_date_format}}*.flg 37 | trigger_type: file 38 | container: datalake 39 | location: "/mnt/{{container}}/data/landing/dbx_patterns/{{table}}/{{path_date_format}}" 40 | filename: "{{table}}-{{filename_date_format}}*.csv" 41 | filename_date_format: "%Y%m%d" 42 | path_date_format: "%Y%m%d" 43 | # injects the time period column into the dataset 44 | # using either the path_date_format or the filename_date_format 45 | # as you specify 46 | slice_date: filename_date_format 47 | slice_date_column_name: _slice_date 48 | format: cloudFiles 49 | spark_schema: ../schema/{{table.lower()}}.yaml 50 | options: 51 | # autoloader 52 | cloudFiles.format: csv 53 | cloudFiles.schemaLocation: /mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}} 54 | cloudFiles.useIncrementalListing: auto 55 | # schema 56 | inferSchema: false 57 | enforceSchema: true 58 | columnNameOfCorruptRecord: _corrupt_record 59 | # csv 60 | header: false 61 | mode: PERMISSIVE 62 | encoding: windows-1252 63 | delimiter: "," 64 | escape: '"' 65 | nullValue: "" 66 | quote: '"' 67 | emptyValue: "" 68 | 69 | raw: 70 | delta_lake: 71 | # delta table properties can be set at stage level or table level 72 | delta_properties: 73 | 
delta.appendOnly: true 74 | delta.autoOptimize.autoCompact: true 75 | delta.autoOptimize.optimizeWrite: true 76 | delta.enableChangeDataFeed: false 77 | managed: false 78 | container: datalake 79 | location: /mnt/{{container}}/data/raw 80 | path: "{{database}}/{{table}}" 81 | options: 82 | mergeSchema: true 83 | checkpointLocation: "/mnt/{{container}}/checkpoint/{{project}}/{{checkpoint}}" 84 | 85 | base: 86 | delta_lake: 87 | container: datalake 88 | location: /mnt/{{container}}/data/base 89 | path: "{{database}}/{{table}}" 90 | options: null 91 | -------------------------------------------------------------------------------- /test/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.types import ( 3 | StructType, 4 | StructField, 5 | StringType, 6 | IntegerType, 7 | DecimalType, 8 | LongType, 9 | ) 10 | from yetl.config import _utils as utils 11 | import re 12 | 13 | 14 | def remove_white_space(string: str): 15 | pattern = re.compile(r"\s+") 16 | string = re.sub(pattern, "", string) 17 | return string 18 | 19 | 20 | @pytest.fixture 21 | def spark_schema(): 22 | spark_schema = StructType( 23 | [ 24 | StructField("firstname", StringType(), True), 25 | StructField("middlename", StringType(), True), 26 | StructField("lastname", StringType(), True), 27 | StructField("id", LongType(), True), 28 | StructField("gender", StringType(), True), 29 | StructField("salary", DecimalType(4, 2), True), 30 | StructField("age", IntegerType(), True), 31 | ] 32 | ) 33 | 34 | return spark_schema 35 | 36 | 37 | @pytest.fixture 38 | def replacements(): 39 | replacements = { 40 | utils.JinjaVariables.DATABASE: "test_database", 41 | utils.JinjaVariables.TABLE: "test_table", 42 | utils.JinjaVariables.CHECKPOINT: "test_checkpoint", 43 | utils.JinjaVariables.FILENAME_DATE_FORMAT: "test_filename_date_format", 44 | utils.JinjaVariables.PATH_DATE_FORMAT: "test_path_date_format", 45 | utils.JinjaVariables.CONTAINER: "test_container", 46 | } 47 | return replacements 48 | 49 | 50 | def test_utils_get_dll_header(spark_schema): 51 | actual = utils.get_ddl(spark_schema=spark_schema, header=True) 52 | expected = [ 53 | "firstname string", 54 | "middlename string", 55 | "lastname string", 56 | "id bigint", 57 | "gender string", 58 | "salary decimal(4,2)", 59 | "age int", 60 | ] 61 | 62 | assert actual == expected 63 | 64 | 65 | def test_utils_get_dll_noheader(spark_schema): 66 | actual = utils.get_ddl(spark_schema=spark_schema, header=False) 67 | expected = [ 68 | "_c0 string", 69 | "_c1 string", 70 | "_c2 string", 71 | "_c3 bigint", 72 | "_c4 string", 73 | "_c5 decimal(4,2)", 74 | "_c6 int", 75 | ] 76 | 77 | assert actual == expected 78 | 79 | 80 | def test_render_jinja(replacements): 81 | data = """ 82 | {{database}} 83 | {{table}} 84 | {{checkpoint}} 85 | {{filename_date_format}} 86 | {{path_date_format}} 87 | {{container}} 88 | """ 89 | 90 | actual = utils.render_jinja(data, replacements) 91 | expected = """ 92 | test_database 93 | test_table 94 | test_checkpoint 95 | test_filename_date_format 96 | test_path_date_format 97 | test_container 98 | """ 99 | 100 | assert actual == expected 101 | 102 | 103 | def test_render_jinja_skip(): 104 | data = """ 105 | {{database}} 106 | """ 107 | 108 | replacements = {utils.JinjaVariables.DATABASE: None} 109 | 110 | actual = utils.render_jinja(data, replacements) 111 | expected = """ 112 | {{database}} 113 | """ 114 | 115 | assert actual == expected 116 | 117 | 118 | def test_get_html_table(): 119 | 
test_kv = {"test": "succeeded", "test1": {"test1_1": "also fine"}} 120 | actual = remove_white_space(utils.get_html_table(test_kv)) 121 | expected = """ 122 | 123 | 124 | 125 | 126 |
<tr><th>Name</th><th>Source</th><th>Destination</th></tr>
<tr><td>test</td><td>succeeded</td></tr>
<tr><td>test1.test1_1</td><td>also fine</td></tr>
127 | """ 128 | expected = remove_white_space(expected) 129 | 130 | assert actual == expected 131 | -------------------------------------------------------------------------------- /yetl/cli/_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import pkg_resources 4 | from ..validation import ( 5 | SchemaFiles, 6 | get_schema, 7 | ) 8 | import json 9 | from ..resource import get_resource_binary, get_resource_text 10 | 11 | 12 | def init(project: str, directory: str = "."): 13 | project = project.lower() 14 | project_path = os.path.abspath(directory) 15 | project_path = f"{project_path}/{project}" 16 | paths: dict = _make_project_dir(project_path, project) 17 | _create_log_file(project_path) 18 | _create_json_schema(project_path, paths["pipeline"]) 19 | _create_tables_excel(project_path, paths["pipeline"]) 20 | 21 | for _, p in paths.items(): 22 | _make_dirs(project_path, p) 23 | 24 | 25 | def _make_dirs(project_path: str, relative_path: str): 26 | relative_path.replace("./", "") 27 | path = f"{project_path}/{relative_path}" 28 | os.makedirs(path, exist_ok=True) 29 | 30 | 31 | def _create_json_schema(project_path: str, pipeline_dir: str): 32 | """Create json schema files to assist with vscode editing and validation""" 33 | 34 | json_schema_path = os.path.abspath(project_path) 35 | json_schema_path = os.path.join(json_schema_path, pipeline_dir, "json_schema") 36 | os.makedirs(json_schema_path, exist_ok=True) 37 | 38 | for f in SchemaFiles: 39 | schema = get_schema(f) 40 | schema_path = os.path.join(json_schema_path, f.value) 41 | with open(schema_path, "w", encoding="utf-8") as f: 42 | f.write(json.dumps(schema, indent=4)) 43 | 44 | 45 | def _get_default_config(name: str): 46 | """Get the default configuration""" 47 | config = get_resource_text(name) 48 | 49 | return config 50 | 51 | 52 | def _get_binary_template(name: str): 53 | """Get the binary template object""" 54 | data = get_resource_binary(name) 55 | 56 | return data 57 | 58 | 59 | def _create_log_file(project_path: str): 60 | config: dict = yaml.safe_load(_get_default_config("logging.yaml")) 61 | file_path = os.path.join(project_path, "logging.yaml") 62 | with open(file_path, "w", encoding="utf-8") as f: 63 | f.write(yaml.safe_dump(config, indent=4)) 64 | 65 | 66 | def _create_tables_excel(project_path: str, pipeline_dir: str): 67 | data: bytes = _get_binary_template("tables.xlsx") 68 | 69 | pipeline_path = os.path.abspath(project_path) 70 | pipeline_path = os.path.join(pipeline_path, pipeline_dir) 71 | file_path = os.path.join(pipeline_path, "tables.xlsx") 72 | with open(file_path, "wb") as f: 73 | f.write(data) 74 | 75 | 76 | def _make_project_dir(project_path: str, project: str): 77 | config: dict = yaml.safe_load(_get_default_config("project.yaml")) 78 | config["name"] = project 79 | config["version"] = pkg_resources.get_distribution("yetl-framework").version 80 | 81 | pipeline_path = config["pipeline"] 82 | paths = { 83 | "sql": config["sql"], 84 | "spark_schema": config["spark_schema"], 85 | "pipeline": pipeline_path, 86 | "databricks_notebooks": config["databricks_notebooks"], 87 | "databricks_workflows": config["databricks_workflows"], 88 | "databricks_queries": config["databricks_queries"], 89 | } 90 | 91 | try: 92 | os.makedirs(project_path, exist_ok=False) 93 | except Exception as e: 94 | raise Exception(f"project {project} already exists at this path") from e 95 | 96 | project_file_path = os.path.join(project_path, f"{project}.yaml") 97 | with 
open(project_file_path, "w", encoding="utf-8") as f: 98 | f.write( 99 | f"# yaml-language-server: $schema={pipeline_path}/json_schema/sibytes_yetl_project_schema.json\n\n" 100 | ) 101 | f.write(yaml.safe_dump(config, indent=4)) 102 | 103 | return paths 104 | -------------------------------------------------------------------------------- /yetl/config/table/_table.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import BaseModel, Field, PrivateAttr 3 | from .._utils import JinjaVariables 4 | from typing import Any, Dict, Union, List, Optional 5 | from .._timeslice import Timeslice 6 | from .._stage_type import StageType 7 | from ._table_type import TableType 8 | from .._project import Project 9 | from enum import Enum 10 | from .._utils import render_jinja 11 | 12 | 13 | class ValidationThresholdType(Enum): 14 | exception = ("exception",) 15 | warning = "warning" 16 | 17 | 18 | class ValidationThreshold(BaseModel): 19 | def __init__(self, **data: Any) -> None: 20 | super().__init__(**data) 21 | 22 | invalid_ratio: Optional[float] = Field(default=None) 23 | invalid_rows: Optional[int] = Field(default=None) 24 | max_rows: Optional[int] = Field(default=None) 25 | min_rows: Optional[int] = Field(default=None) 26 | 27 | @classmethod 28 | def default_select_sql(cls): 29 | sql = "null" 30 | return sql 31 | 32 | def select_sql(self): 33 | thresholds_sql = [] 34 | if self.invalid_ratio is not None: 35 | thresholds_sql.append( 36 | f"cast({self.invalid_ratio} as double) as invalid_ratio" 37 | ) 38 | else: 39 | thresholds_sql.append("null as invalid_ratio") 40 | 41 | if self.invalid_rows is not None: 42 | thresholds_sql.append(f"cast({self.invalid_rows} as long) as invalid_rows") 43 | else: 44 | thresholds_sql.append("null as invalid_rows") 45 | 46 | if self.max_rows is not None: 47 | thresholds_sql.append(f"cast({self.max_rows} as long) as max_rows") 48 | else: 49 | thresholds_sql.append("null as max_rows") 50 | 51 | if self.min_rows is not None: 52 | thresholds_sql.append(f"cast({self.min_rows} as long) as min_rows") 53 | else: 54 | thresholds_sql.append("null as min_rows") 55 | 56 | sql = f""" 57 | struct( 58 | {",".join(thresholds_sql)} 59 | ) 60 | """ 61 | 62 | return sql 63 | 64 | 65 | class Table(BaseModel): 66 | def __init__(self, **data: Any) -> None: 67 | super().__init__(**data) 68 | self._logger = logging.getLogger(self.__class__.__name__) 69 | 70 | _logger: Any = PrivateAttr(default=None) 71 | _rendered: bool = PrivateAttr(default=False) 72 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 73 | stage: StageType = Field(...) 74 | database: str = Field(...) 75 | table: str = Field(...) 76 | id: Union[str, List[str]] = Field(default=[]) 77 | custom_properties: Optional[Dict[str, Any]] = Field(default=None) 78 | table_type: TableType = Field(...) 79 | warning_thresholds: Optional[ValidationThreshold] = Field(default=None) 80 | exception_thresholds: Optional[ValidationThreshold] = Field(default=None) 81 | project: Project = Field(...) 82 | container: str = Field(...) 83 | location: str = Field(...) 84 | path: Optional[str] = Field(default=None) 85 | options: dict = Field(...) 86 | timeslice: Timeslice = Field(...) 87 | checkpoint: Optional[str] = Field(default=None) 88 | config_path: str = Field(...) 
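    # Illustrative sketch, not part of the original source: the warning/exception
    # thresholds above are populated from tables.yaml and rendered to SQL by
    # ValidationThreshold.select_sql(). Assuming the raw customers thresholds from
    # the test config:
    #   ValidationThreshold(invalid_ratio=0.2, invalid_rows=2, max_rows=1000, min_rows=0).select_sql()
    # returns, whitespace aside, roughly:
    #   struct(
    #       cast(0.2 as double) as invalid_ratio,
    #       cast(2 as long) as invalid_rows,
    #       cast(1000 as long) as max_rows,
    #       cast(0 as long) as min_rows
    #   )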
89 | catalog: Optional[str] = Field(default=None) 90 | 91 | def _render(self): 92 | self._replacements = { 93 | JinjaVariables.TABLE: self.table, 94 | JinjaVariables.DATABASE: self.database, 95 | JinjaVariables.CONTAINER: self.container, 96 | JinjaVariables.CHECKPOINT: self.checkpoint, 97 | JinjaVariables.PROJECT: self.project.name, 98 | JinjaVariables.CATALOG: self.catalog, 99 | } 100 | 101 | def render(self): 102 | self._replacements[JinjaVariables.CHECKPOINT] = self.checkpoint 103 | 104 | if self.options: 105 | for option, value in self.options.items(): 106 | if isinstance(value, str): 107 | self.options[option] = render_jinja(value, self._replacements) 108 | 109 | def thresholds_select_sql(self, threshold_type: ValidationThresholdType): 110 | if threshold_type == ValidationThresholdType.exception: 111 | if self.exception_thresholds: 112 | return self.exception_thresholds.select_sql() 113 | else: 114 | return ValidationThreshold.default_select_sql() 115 | 116 | if threshold_type == ValidationThresholdType.warning: 117 | if self.warning_thresholds: 118 | return self.warning_thresholds.select_sql() 119 | else: 120 | return ValidationThreshold.default_select_sql() 121 | 122 | def _set_catalog(self, catalog: str = None): 123 | if catalog: 124 | self.catalog = catalog 125 | 126 | def create_table(self, catalog: str = None): 127 | self._set_catalog(catalog) 128 | 129 | def create_database(self, catalog: str = None): 130 | self._set_catalog(catalog) 131 | 132 | def qualified_table_name(self): 133 | pass 134 | -------------------------------------------------------------------------------- /yetl/config/_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .table import Table 3 | from ._timeslice import Timeslice 4 | from ._tables import Tables, _INDEX_WILDCARD, KeyContants 5 | from ._stage_type import StageType 6 | from ._utils import abs_config_path, load_yaml, get_config_path, check_version 7 | from ._logging_config import configure_logging 8 | import logging 9 | from ._project import Project 10 | from ..validation import validate_tables, validate_pipeline 11 | from typing import Union 12 | 13 | 14 | class Config: 15 | def __init__( 16 | self, 17 | project: str, 18 | pipeline: str, 19 | timeslice: Timeslice = None, 20 | config_path: str = None, 21 | ): 22 | self.config_path = get_config_path(project, config_path) 23 | self._logger = logging.getLogger(self.__class__.__name__) 24 | configure_logging(project, self.config_path) 25 | if not timeslice: 26 | timeslice = Timeslice( 27 | year=_INDEX_WILDCARD, month=_INDEX_WILDCARD, day=_INDEX_WILDCARD 28 | ) 29 | self.project = self._load_project(project) 30 | self.pipeline = pipeline 31 | self.tables = self._load_tables(timeslice) 32 | 33 | def _load_project(self, project: str): 34 | project_file_path = os.path.join(self.config_path, f"{project}.yaml") 35 | project_config = load_yaml(project_file_path) 36 | check_version(project_config) 37 | project_config["config_path"] = self.config_path 38 | project = Project(**project_config) 39 | return project 40 | 41 | def _load_pipeline(self, pipeline: str): 42 | pipeline_file = f"{pipeline}.yaml" 43 | config_file_path = os.path.join(self.project.pipelines, pipeline_file) 44 | pipeline = load_yaml(config_file_path) 45 | validate_pipeline(pipeline) 46 | check_version(pipeline) 47 | return pipeline 48 | 49 | def _load_tables(self, timeslice: Timeslice): 50 | tables_config = self._load_pipeline(self.pipeline) 51 | tables_path = 
tables_config[KeyContants.TABLES.value] 52 | tables_path = abs_config_path(self.project.pipelines, tables_path) 53 | 54 | data: dict = load_yaml(tables_path) 55 | validate_tables(data) 56 | check_version(data) 57 | 58 | tables_config[KeyContants.TABLES.value] = data 59 | tables_config[KeyContants.TIMESLICE.value] = timeslice 60 | tables_config[KeyContants.CONFIG_PATH.value] = self.project.pipelines 61 | tables_config[KeyContants.PROJECT.value] = self.project 62 | 63 | tables = Tables(table_data=tables_config) 64 | return tables 65 | 66 | def create_tables( 67 | self, 68 | stage: Union[StageType, str] = _INDEX_WILDCARD, 69 | database=_INDEX_WILDCARD, 70 | catalog: str = None, 71 | **kwargs, 72 | ): 73 | return self.tables.create_table( 74 | stage=stage, 75 | database=database, 76 | first_match=False, 77 | catalog=catalog, 78 | **kwargs, 79 | ) 80 | 81 | def create_table( 82 | self, 83 | stage: Union[StageType, str] = _INDEX_WILDCARD, 84 | database=_INDEX_WILDCARD, 85 | table=_INDEX_WILDCARD, 86 | catalog: str = None, 87 | **kwargs, 88 | ): 89 | return self.tables.create_table( 90 | stage=stage, 91 | database=database, 92 | table=table, 93 | first_match=True, 94 | catalog=catalog, 95 | **kwargs, 96 | ) 97 | 98 | def get_table_mapping( 99 | self, 100 | stage: StageType, 101 | database: str = _INDEX_WILDCARD, 102 | table: str = _INDEX_WILDCARD, 103 | create_database: bool = False, 104 | create_table: bool = False, 105 | catalog: str = None, 106 | ): 107 | table_mapping = self.tables.get_table_mapping( 108 | stage=stage, 109 | table=table, 110 | database=database, 111 | create_database=create_database, 112 | create_table=create_table, 113 | catalog=catalog, 114 | ) 115 | 116 | return table_mapping 117 | 118 | def set_checkpoint( 119 | self, 120 | source: Table, 121 | destination: Table, 122 | checkpoint_name: str = None, 123 | ): 124 | if not checkpoint_name: 125 | checkpoint_name = f"{source.database}.{source.table}-{destination.database}.{destination.table}" 126 | 127 | source.checkpoint = checkpoint_name 128 | source.render() 129 | destination.checkpoint = checkpoint_name 130 | destination.render() 131 | 132 | def lookup_table( 133 | self, 134 | stage: Union[StageType, str] = _INDEX_WILDCARD, 135 | database=_INDEX_WILDCARD, 136 | table=_INDEX_WILDCARD, 137 | first_match: bool = True, 138 | create_database: bool = False, 139 | create_table: bool = False, 140 | catalog: str = None, 141 | **kwargs, 142 | ): 143 | return self.tables.lookup_table( 144 | stage=stage, 145 | database=database, 146 | table=table, 147 | first_match=first_match, 148 | create_database=create_database, 149 | create_table=create_table, 150 | catalog=catalog, 151 | **kwargs, 152 | ) 153 | -------------------------------------------------------------------------------- /test/integration/test_configuration_load.py: -------------------------------------------------------------------------------- 1 | from yetl import Config, Timeslice, StageType, Read, DeltaLake, yetl_flow, TableMapping, ValidationThreshold 2 | from yetl.config._project import SparkLoggingLevel 3 | from yetl.config.table import TableType 4 | from yetl.config.table._read import SliceDateFormat 5 | import pytest 6 | import os 7 | import shutil 8 | 9 | 10 | 11 | @pytest.fixture() 12 | def tear_down(): 13 | def tear_down_fn(): 14 | shutil.rmtree("./test/config/test_project/data", ignore_errors=True) 15 | shutil.rmtree("./metastore_db", ignore_errors=True) 16 | shutil.rmtree("./spark-warehouse", ignore_errors=True) 17 | try: 18 | os.remove("./derby.log") 19 | 
except Exception: 20 | pass 21 | return tear_down_fn 22 | 23 | @pytest.fixture() 24 | def root_path(): 25 | 26 | root = os.path.abspath(os.getcwd()) 27 | return root 28 | 29 | 30 | def test_configuration_load(tear_down, root_path): 31 | tear_down() 32 | pipeline = "autoloader" 33 | config_path = "./test/config" 34 | project = "test_project" 35 | timeslice = Timeslice(day="*", month="*", year="*") 36 | config = Config( 37 | project=project, 38 | pipeline=pipeline, 39 | config_path=config_path, 40 | timeslice=timeslice 41 | ) 42 | table_mapping = config.get_table_mapping( 43 | stage=StageType.raw, table="customers" 44 | ) 45 | 46 | source: Read = table_mapping.source["customer_details_1"] 47 | destination: DeltaLake = table_mapping.destination 48 | config.set_checkpoint(source=source, destination=destination) 49 | 50 | assert source.table == "customer_details_1" 51 | assert source.slice_date == SliceDateFormat.FILENAME_DATE_FORMAT 52 | assert source.slice_date_column_name == "_slice_date" 53 | 54 | assert destination.table == "customers" 55 | assert destination.stage == StageType.raw 56 | assert destination.database =='raw_dbx_patterns' 57 | assert destination.table=='customers' 58 | assert destination.id=='id' 59 | 60 | assert destination.custom_properties == {'process_group': 1,'rentention_days': 365} 61 | assert destination.table_type == TableType.delta_lake 62 | assert destination.warning_thresholds == ValidationThreshold(invalid_ratio=0.1, invalid_rows=0, max_rows=100, min_rows=5) 63 | assert destination.exception_thresholds == ValidationThreshold(invalid_ratio=0.2, invalid_rows=2, max_rows=1000, min_rows=0) 64 | assert destination.project.config_path == f'{root_path}/test/config/test_project' 65 | assert destination.project.name == 'test_project' 66 | assert destination.project.sql == f'{root_path}/test/config/test_project/sql' 67 | assert destination.project.pipelines == f'{root_path}/test/config/test_project/pipelines' 68 | assert destination.project.databricks_notebooks == f'{root_path}/test/config/test_project/databricks/notebooks' 69 | assert destination.project.databricks_workflows == f'{root_path}/test/config/test_project/databricks/workflows' 70 | assert destination.project.databricks_queries == f'{root_path}/test/config/test_project/databricks/queries' 71 | # assert destination.project.spark.config == { 72 | # 'spark.master': 'local', 'spark.databricks.delta.allowArbitraryProperties.enabled': True, 73 | # 'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog', 74 | # 'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension' 75 | # } 76 | assert destination.project.spark.logging_level == SparkLoggingLevel.ERROR 77 | assert destination.container == 'datalake' 78 | assert destination.location == f'{root_path}/test/config/test_project/data/mnt/datalake/data/raw/raw_dbx_patterns/customers' 79 | assert destination.path == 'raw_dbx_patterns/customers' 80 | assert destination.options == {'mergeSchema': True, 'checkpointLocation': '/mnt/datalake/checkpoint/test_project/landing_dbx_patterns.customer_details_1-raw_dbx_patterns.customers'} 81 | assert destination.timeslice == Timeslice(year='*', month='*', day='*', hour=0, minute=0, second=0, microsecond=0) 82 | assert destination.checkpoint == 'landing_dbx_patterns.customer_details_1-raw_dbx_patterns.customers' 83 | assert destination.delta_constraints == None 84 | assert destination.partition_by == None 85 | assert destination.z_order_by == ["_load_date_1", "_load_date_2"] 86 | assert 
destination.managed == False 87 | assert destination.sql == None 88 | assert destination.vacuum == 30 89 | assert destination.catalog == None 90 | 91 | 92 | 93 | def test_decorator_configuration_load(tear_down): 94 | @yetl_flow( 95 | project="test_project", 96 | stage=StageType.raw, 97 | config_path="./test/config", 98 | catalog=None 99 | ) 100 | def autoloader(table_mapping:TableMapping): 101 | return table_mapping 102 | 103 | 104 | result = autoloader(table="customers") 105 | tear_down() 106 | assert result.source["customer_details_1"].table == "customer_details_1" 107 | assert result.destination.table == "customers" 108 | 109 | 110 | def test_decorator_configuration_audit_load(tear_down): 111 | @yetl_flow( 112 | project="test_project", 113 | stage=StageType.audit_control, 114 | config_path="./test/config", 115 | catalog=None 116 | ) 117 | def autoloader(table_mapping:TableMapping): 118 | return table_mapping 119 | 120 | 121 | result = autoloader(table="header_footer") 122 | tear_down() 123 | assert result.source.table == "customers" 124 | assert result.destination.table == "header_footer" 125 | 126 | 127 | -------------------------------------------------------------------------------- /yetl/config/_utils.py: -------------------------------------------------------------------------------- 1 | import jinja2 2 | from enum import Enum 3 | import yaml 4 | from pyspark.sql.types import StructType 5 | from typing import Dict 6 | import os 7 | import pkg_resources 8 | import logging 9 | 10 | 11 | YETL_CONFIG = "YETL_CONFIG" 12 | _ENCODING = "utf-8" 13 | _DBX_WORKSPACE_PATH = "/Workspace" 14 | _DBX_REPO_PATH = "/Workspace/Repos" 15 | 16 | 17 | class VersionNotFoundException(Exception): 18 | def __init__(self, *args: object) -> None: 19 | super().__init__(*args) 20 | 21 | 22 | class JinjaVariables(Enum): 23 | DATABASE = "database" 24 | TABLE = "table" 25 | CHECKPOINT = "checkpoint" 26 | FILENAME_DATE_FORMAT = "filename_date_format" 27 | PATH_DATE_FORMAT = "path_date_format" 28 | CONTAINER = "container" 29 | DELTA_PROPERTIES = "delta_properties" 30 | LOCATION = "location" 31 | PROJECT = "project" 32 | CATALOG = "catalog" 33 | 34 | 35 | def is_databricks(): 36 | return "DATABRICKS_RUNTIME_VERSION" in os.environ 37 | 38 | 39 | def check_version(data: dict): 40 | _logger = logging.getLogger(__name__) 41 | version = data.get("version") 42 | 43 | if version is None: 44 | raise VersionNotFoundException() 45 | 46 | del data["version"] 47 | 48 | pkg_version = pkg_resources.get_distribution("yetl-framework").version 49 | pkg_version = pkg_version.split(".") 50 | 51 | try: 52 | version = version.split(".") 53 | if pkg_version[0] != version[0] or pkg_version[1] != version[1]: 54 | _logger.warning( 55 | f"Configuration and library shows that it's incompatible config version config_version={version} package_version={pkg_version}" 56 | ) 57 | 58 | except Exception as e: 59 | raise VersionNotFoundException from e 60 | 61 | 62 | def render_jinja(data: str, replacements: Dict[JinjaVariables, str]): 63 | _logger = logging.getLogger(__name__) 64 | _logger.debug(f"Rendering Jinja string {data}") 65 | if data and isinstance(data, str): 66 | replace = {k.value: v for (k, v) in replacements.items()} 67 | skip = False 68 | for k, v in replace.items(): 69 | if v is None and "{{" + k + "}}" in data.replace(" ", ""): 70 | skip = True 71 | break 72 | 73 | if not skip: 74 | template: jinja2.Template = jinja2.Template(data) 75 | data = template.render(replace) 76 | _logger.debug(f"Rendered Jinja string {data}") 77 | 78 | 
return data 79 | 80 | 81 | def abs_config_path(root: str, path: str): 82 | _logger = logging.getLogger(__name__) 83 | if not os.path.isabs(path): 84 | path = os.path.join(root, path) 85 | _logger.debug(f"Absolute config path {path}") 86 | return path 87 | 88 | 89 | def get_config_path(project: str, path: str): 90 | _logger = logging.getLogger(__name__) 91 | 92 | default_path = "." 93 | if is_databricks(): 94 | if os.path.exists(_DBX_WORKSPACE_PATH): 95 | default_path = f"{_DBX_WORKSPACE_PATH}" 96 | 97 | if os.getcwd().startswith(_DBX_REPO_PATH): 98 | repo_path = "/".join(os.getcwd().split("/")[0:5]) 99 | default_path = repo_path 100 | 101 | if not path: 102 | path = os.getenv(YETL_CONFIG, default_path) 103 | path = os.path.abspath(path) 104 | path = os.path.join(path, project) 105 | _logger.info(f"Absolute root config path {path}") 106 | return path 107 | 108 | 109 | def load_schema(path: str): 110 | _logger = logging.getLogger(__name__) 111 | schema = load_yaml(path) 112 | _logger.info(f"Loading schema {path}") 113 | schema = StructType.fromJson(schema) 114 | 115 | return schema 116 | 117 | 118 | def load_yaml(path: str): 119 | _logger = logging.getLogger(__name__) 120 | _logger.info(f"Loading yaml file {path}") 121 | with open(path, "r", encoding=_ENCODING) as f: 122 | try: 123 | data = yaml.safe_load(f) 124 | except yaml.YAMLError as e: 125 | location = "" 126 | if hasattr(e, "problem_mark"): 127 | mark = e.problem_mark 128 | location = f"Error position ({mark.line}, {mark.column})" 129 | 130 | if hasattr(e, "problem"): 131 | problem = f"{e.problem}." 132 | 133 | raise Exception(f"Invalid yaml format in {path}. {problem} {location}") 134 | _logger.debug(data) 135 | return data 136 | 137 | 138 | def load_text(path: str): 139 | _logger = logging.getLogger(__name__) 140 | _logger.info(f"Loading text file {path}") 141 | with open(path, "r", encoding=_ENCODING) as f: 142 | data = f.read() 143 | _logger.debug(data) 144 | return data 145 | 146 | 147 | def get_ddl(spark_schema: StructType, header: bool = True): 148 | _logger = logging.getLogger(__name__) 149 | _logger.debug(f"Converting spark schema to ddl with header={str(header)}") 150 | if header: 151 | ddl = [f"{f.name} {f.dataType.simpleString()}" for f in spark_schema.fields] 152 | _logger.debug(ddl) 153 | else: 154 | ddl = [ 155 | f"_c{i} {f.dataType.simpleString()}" 156 | for i, f in enumerate(spark_schema.fields) 157 | ] 158 | _logger.debug(ddl) 159 | 160 | return ddl 161 | 162 | 163 | def get_html_table(data: dict): 164 | rows = [] 165 | for k, v in data.items(): 166 | if isinstance(v, dict): 167 | for ki, vi in v.items(): 168 | row = f"{k}.{ki}{vi}" 169 | rows.append(row) 170 | else: 171 | row = f"{k}{v}" 172 | rows.append(row) 173 | 174 | html = "".join(rows) 175 | 176 | html = f""" 177 | 178 | 179 | {html} 180 |
<tr><th>Name</th><th>Source</th><th>Destination</th></tr>
181 | """ 182 | 183 | return html 184 | -------------------------------------------------------------------------------- /yetl/config/table/_deltalake.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import Field, PrivateAttr 3 | from .._utils import ( 4 | JinjaVariables, 5 | render_jinja, 6 | is_databricks, 7 | abs_config_path, 8 | load_text, 9 | ) 10 | from typing import Any, Dict, Union, List, Optional 11 | from .._timeslice import Timeslice 12 | import os 13 | from .._stage_type import StageType 14 | from ._table import Table 15 | from ..deltalake import DeltaLakeFn 16 | from pyspark.sql.types import StructType 17 | 18 | 19 | class DeltaLake(Table): 20 | def __init__(self, **data: Any) -> None: 21 | super().__init__(**data) 22 | self._logger = logging.getLogger(self.__class__.__name__) 23 | self._spark = DeltaLakeFn(project=self.project) 24 | self._render() 25 | 26 | @classmethod 27 | def in_allowed_stages(cls, stage: StageType): 28 | return stage in (stage.raw, stage.base, stage.curated) 29 | 30 | _logger: Any = PrivateAttr(default=None) 31 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 32 | _spark: DeltaLakeFn = PrivateAttr(default=None) 33 | depends_on: Optional[List[str]] = Field(default=[]) 34 | delta_properties: Optional[Dict[str, Union[str, bool, int, float]]] = Field( 35 | default=None 36 | ) 37 | delta_constraints: Optional[Dict[str, str]] = Field(default=None) 38 | partition_by: Optional[Union[List[str], str]] = Field(default=None) 39 | cluster_by: Optional[Union[List[str], str]] = Field(default=None) 40 | z_order_by: Optional[Union[List[str], str]] = Field(default=None) 41 | vacuum: Optional[int] = Field(default=31) 42 | options: Optional[Union[dict, None]] = Field(default=None) 43 | timeslice: Timeslice = Field(...) 44 | location: Optional[str] = Field(default=None) 45 | stage: StageType = Field(...) 
46 | managed: Optional[bool] = Field(default=False) 47 | 48 | sql: Optional[str] = Field(default=None) 49 | 50 | def _load_sql(self, path: str): 51 | path = abs_config_path(self.config_path, path) 52 | sql = load_text(path) 53 | return sql 54 | 55 | def _render(self): 56 | super()._render() 57 | if not self._rendered: 58 | if self.delta_properties: 59 | delta_properties_sql = self._spark.get_delta_properties_sql( 60 | self.delta_properties 61 | ) 62 | self._replacements[ 63 | JinjaVariables.DELTA_PROPERTIES 64 | ] = delta_properties_sql 65 | self.database = render_jinja(self.database, self._replacements) 66 | self.table = render_jinja(self.table, self._replacements) 67 | self.location = render_jinja(self.location, self._replacements) 68 | self.path = render_jinja(self.path, self._replacements) 69 | if self.location and self.path: 70 | self.location = os.path.join(self.location, self.path) 71 | if not is_databricks(): 72 | self.location = f"{self.config_path}/../data{self.location}" 73 | self.location = os.path.abspath(self.location) 74 | self._replacements[JinjaVariables.LOCATION] = self.location 75 | 76 | if self.sql: 77 | # render the path 78 | self.sql = render_jinja(self.sql, self._replacements) 79 | # load the file 80 | self.sql = self._load_sql(self.sql) 81 | # render the SQL 82 | self.sql = render_jinja(self.sql, self._replacements) 83 | 84 | if self.options: 85 | for option, value in self.options.items(): 86 | if isinstance(value, str): 87 | self.options[option] = render_jinja(value, self._replacements) 88 | 89 | self._rendered = True 90 | 91 | def create_database(self, catalog: str = None): 92 | super().create_database(catalog=catalog) 93 | self._spark.create_database(self.database, catalog=self.catalog) 94 | 95 | # TODO: alter table 96 | def create_table( 97 | self, catalog: str = None, schema: StructType = None, create_database=True 98 | ): 99 | if create_database: 100 | self.create_database(catalog=catalog) 101 | super().create_table(catalog=catalog) 102 | if self._spark.table_exists( 103 | database=self.database, table=self.table, catalog=self.catalog 104 | ): 105 | pass 106 | # TODO: alter table 107 | if self.catalog: 108 | msg = f"table `{self.catalog}`.{self.database}`.`{self.table}` already exists." 109 | else: 110 | msg = f"table {self.database}`.`{self.table}` already exists." 
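                # Illustrative note, not part of the original source: with the test
                # fixture (database=raw_dbx_patterns, table=customers, no catalog) the
                # message logged here reads roughly
                #   table raw_dbx_patterns.customers already exists.
                # and creation is skipped; otherwise the table is created below, either
                # managed or at the external location, depending on the managed flag.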
111 | self._logger.info(msg) 112 | else: 113 | if self.managed: 114 | self._spark.create_table( 115 | database=self.database, 116 | table=self.table, 117 | delta_properties=self.delta_properties, 118 | sql=self.sql, 119 | catalog=self.catalog, 120 | schema=schema, 121 | cluster_by=self.cluster_by, 122 | partition_by=self.partition_by, 123 | ) 124 | else: 125 | self._spark.create_table( 126 | database=self.database, 127 | table=self.table, 128 | delta_properties=self.delta_properties, 129 | path=self.location, 130 | sql=self.sql, 131 | catalog=self.catalog, 132 | schema=schema, 133 | cluster_by=self.cluster_by, 134 | partition_by=self.partition_by, 135 | ) 136 | 137 | def qualified_table_name(self): 138 | name = f"`{self.database}`.`{self.table}`" 139 | if self.catalog: 140 | name = f"`{self.catalog}`.{name}" 141 | return name 142 | -------------------------------------------------------------------------------- /yetl/config/_timeslice.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Literal, Union 3 | from pydantic import BaseModel, Field 4 | from typing import Any 5 | import re 6 | 7 | _WILDCARD = "*" 8 | Wildcard = Literal["*"] 9 | 10 | _UNSUPPORTED_FORMAT_CODES = [ 11 | "%c", 12 | "%x", 13 | "%X", 14 | "%G", 15 | "%u", 16 | "%V", 17 | "%z", 18 | "%Z", 19 | "%I", 20 | "%p", 21 | "%b", 22 | "%B", 23 | "%a", 24 | "%A", 25 | "%w", 26 | ] 27 | 28 | 29 | class Timeslice(BaseModel): 30 | def __init__(__pydantic_self__, **data: Any) -> None: 31 | super().__init__(**data) 32 | 33 | year: Union[int, Wildcard] = Field(...) 34 | month: Union[int, Wildcard] = Field(default=_WILDCARD) 35 | day: Union[int, Wildcard] = Field(default=_WILDCARD) 36 | hour: Union[int, Wildcard] = Field(default=0) 37 | minute: Union[int, Wildcard] = Field(default=0) 38 | second: Union[int, Wildcard] = Field(default=0) 39 | microsecond: Union[int, Wildcard] = Field(default=0) 40 | 41 | @classmethod 42 | def parse_iso_date(cls, iso_date: str): 43 | if iso_date == "*": 44 | iso_date = "*-*-*" 45 | pattern = "^(([12]\d{3}|[*])-(0[1-9]|1[0-2]|[*])-(0[1-9]|[12]\d|3[01]|[*]))$" # noqa W605 46 | result = re.match(pattern, iso_date) 47 | 48 | if result: 49 | parts = iso_date.split("-") 50 | args = {"year": parts[0], "month": parts[1], "day": parts[2]} 51 | return cls(**args) 52 | else: 53 | raise Exception( 54 | f"{iso_date} is an invalid iso date string. Must be the format YYYY-mm-dd" 55 | ) 56 | 57 | def strftime(self, format: str): 58 | """This will format and return the timeslice using python format codes. Only a subset of format codes are suppoered by design 59 | %d - Day of the month as a zero-padded decimal number. 60 | %m - Month as a zero-padded decimal number. 61 | %y - Year without century as a zero-padded decimal number. 62 | %Y - Year with century as a decimal number. 63 | %H - Hour (24-hour clock) as a zero-padded decimal number. 64 | %M - Minute as a zero-padded decimal number. 65 | %S - Second as a zero-padded decimal number. 66 | %f - Microsecond as a decimal number, zero-padded to 6 digits. 67 | %% - A literal '%' character. 68 | %j - Day of the year as a zero-padded decimal number. 69 | %U - Week number of the year (Sunday as the first day of the week) as a zero-padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. 70 | %W - Week number of the year (Monday as the first day of the week) as a zero-padded decimal number. 
All days in a new year preceding the first Monday are considered to be in week 0. 71 | NOT SUPPORTED - %c - Locale’s appropriate date and time representation. 72 | NOT SUPPORTED - %x - Locale’s appropriate date representation. 73 | NOT SUPPORTED - %X - Locale’s appropriate time representation. 74 | NOT SUPPORTED - %G - ISO 8601 year with century representing the year that contains the greater part of the ISO week (%V). 75 | NOT SUPPORTED - %u - ISO 8601 weekday as a decimal number where 1 is Monday. 76 | NOT SUPPORTED - %V - ISO 8601 week as a decimal number with Monday as the first day of the week. Week 01 is the week containing Jan 4. 77 | NOT SUPPORTED - %z - UTC offset in the form ±HHMM[SS[.ffffff]] (empty string if the object is naive). 78 | NOT SUPPORTED - %Z - Time zone name (empty string if the object is naive). 79 | NOT SUPPORTED - %I - Hour (12-hour clock) as a zero-padded decimal number. 80 | NOT SUPPORTED - %p - Locale’s equivalent of either AM or PM. 81 | NOT SUPPORTED - %b - Month as locale’s abbreviated name. 82 | NOT SUPPORTED - %B - Month as locale’s full name. 83 | NOT SUPPORTED - %a - Weekday as locale’s abbreviated name. 84 | NOT SUPPORTED - %A - Weekday as locale’s full name. 85 | NOT SUPPORTED - %w - Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. 86 | """ 87 | 88 | unsupported_codes = [c for c in _UNSUPPORTED_FORMAT_CODES if c in format] 89 | 90 | if unsupported_codes: 91 | unsupported_codes = ",".join(unsupported_codes) 92 | raise Exception( 93 | f"The format contains the following unsupported format codes: {unsupported_codes}" 94 | ) 95 | 96 | format, _year = self._format_wildcard(format, self.year, ["%y", "%Y"], 1900) 97 | format, _month = self._format_wildcard(format, self.month, "%m", 1) 98 | format, _day = self._format_wildcard(format, self.day, "%d", 1) 99 | 100 | format, _hour = self._format_wildcard(format, self.hour, "%H") 101 | format, _minutue = self._format_wildcard(format, self.minute, "%M") 102 | format, _second = self._format_wildcard(format, self.second, "%S") 103 | format, _microsecond = self._format_wildcard(format, self.microsecond, "%f") 104 | 105 | timeslice = datetime( 106 | _year, _month, _day, _hour, _minutue, _second, _microsecond 107 | ) 108 | 109 | formatted = timeslice.strftime(format) 110 | return formatted 111 | 112 | def _format_wildcard( 113 | self, 114 | format: str, 115 | datepart: Union[int, Wildcard], 116 | format_code: Union[list, str], 117 | default=0, 118 | ): 119 | if datepart == _WILDCARD: 120 | if isinstance(format_code, str): 121 | format = format.replace(format_code, f"{_WILDCARD}") 122 | elif isinstance(format_code, list): 123 | for f in format_code: 124 | format = format.replace(f, f"{_WILDCARD}") 125 | datepart = default 126 | 127 | return format, datepart 128 | 129 | def __str__(self) -> str: 130 | return self.strftime("%Y-%m-%d %H:%M:%S.%f") 131 | 132 | 133 | class TimesliceNow(Timeslice): 134 | def __init__(self) -> None: 135 | now = datetime.now() 136 | args = { 137 | "year": now.year, 138 | "month": now.month, 139 | "day": now.day, 140 | "hour": now.hour, 141 | "minute": now.minute, 142 | "second": now.second, 143 | "microsecond": now.microsecond, 144 | } 145 | super().__init__(**args) 146 | 147 | 148 | class TimesliceUtcNow(Timeslice): 149 | def __init__(self) -> None: 150 | now = datetime.utcnow() 151 | args = { 152 | "year": now.year, 153 | "month": now.month, 154 | "day": now.day, 155 | "hour": now.hour, 156 | "minute": now.minute, 157 | "second": now.second, 158 | "microsecond": 
now.microsecond, 159 | } 160 | super().__init__(**args) 161 | -------------------------------------------------------------------------------- /yetl/config/table/_read.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pydantic import Field, PrivateAttr 3 | from .._utils import JinjaVariables, render_jinja, get_ddl, load_schema, abs_config_path 4 | from typing import Any, Dict, List, Union, Optional 5 | from enum import Enum 6 | import os 7 | from pyspark.sql.types import StructType 8 | from pyspark.sql.streaming import StreamingQuery 9 | from pyspark.sql import DataFrame 10 | from pyspark.sql import functions as fn 11 | from .._stage_type import StageType 12 | from ._table import Table 13 | from ..deltalake import DeltaLakeFn 14 | 15 | 16 | class TriggerType(Enum): 17 | File = "file" 18 | 19 | 20 | class SliceDateFormat(Enum): 21 | FILENAME_DATE_FORMAT = JinjaVariables.FILENAME_DATE_FORMAT.value 22 | PATH_DATE_FORMAT = JinjaVariables.PATH_DATE_FORMAT.value 23 | 24 | 25 | class Read(Table): 26 | _OPTION_CF_SCHEMA_HINTS = "cloudFiles.schemaHints" 27 | _OPTION_CORRUPT_RECORD_NAME = "columnNameOfCorruptRecord" 28 | 29 | def add_timeslice( 30 | self, df: DataFrame, filepath_column_name: str = "_metadata.file_path" 31 | ): 32 | if self.slice_date == SliceDateFormat.FILENAME_DATE_FORMAT: 33 | date_format = self.path_date_format 34 | 35 | if self.slice_date == SliceDateFormat.PATH_DATE_FORMAT: 36 | date_format = self.filename_date_format 37 | 38 | pattern = DeltaLakeFn.to_regex_search_pattern(date_format) 39 | spark_format_string = DeltaLakeFn.to_spark_format_code(date_format) 40 | 41 | df = ( 42 | df.withColumn(self.slice_date_column_name, fn.col(filepath_column_name)) 43 | .withColumn( 44 | self.slice_date_column_name, 45 | fn.regexp_extract(fn.col(self.slice_date_column_name), pattern, 0), 46 | ) 47 | .withColumn( 48 | self.slice_date_column_name, 49 | fn.to_timestamp(self.slice_date_column_name, spark_format_string), 50 | ) 51 | ) 52 | 53 | return df 54 | 55 | def __init__(self, **data: Any) -> None: 56 | super().__init__(**data) 57 | self._logger = logging.getLogger(self.__class__.__name__) 58 | self._render() 59 | self.path = os.path.join(self.location, self.filename) 60 | 61 | _logger: Any = PrivateAttr(default=None) 62 | _replacements: Dict[JinjaVariables, str] = PrivateAttr(default=None) 63 | managed: Optional[bool] = Field(default=False) 64 | trigger: Optional[str] = Field(default=None) 65 | trigger_type: Optional[TriggerType] = Field(default=None) 66 | filename: str = Field(...) 67 | filename_date_format: str = Field(...) 68 | path_date_format: str = Field(...) 69 | format: str = Field(...) 70 | spark_schema: Optional[Union[StructType, str]] = Field(default=None) 71 | ddl: Optional[List[str]] = Field(default=None) 72 | headerless_ddl: Optional[List[str]] = Field(default=None) 73 | stage: StageType = Field(...) 
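    # Illustrative sketch, not part of the original source: the slice_date settings
    # below drive add_timeslice() above, which extracts a date from the file path and
    # adds it as a timestamp column. Assuming the test config (date format %Y%m%d,
    # slice_date_column_name=_slice_date), a hypothetical file such as
    #   customer_details_1-20230101.csv
    # would gain _slice_date = 2023-01-01 00:00:00, parsed out of _metadata.file_path.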
74 | slice_date: Optional[SliceDateFormat] = Field( 75 | default=SliceDateFormat.FILENAME_DATE_FORMAT 76 | ) 77 | slice_date_column_name: Optional[str] = Field(default="_slice_date") 78 | 79 | def _render(self): 80 | super()._render() 81 | self._replacements[ 82 | JinjaVariables.FILENAME_DATE_FORMAT 83 | ] = self.timeslice.strftime(self.filename_date_format) 84 | self._replacements[JinjaVariables.PATH_DATE_FORMAT] = self.timeslice.strftime( 85 | self.path_date_format 86 | ) 87 | if not self._rendered: 88 | self.location = render_jinja(self.location, self._replacements) 89 | self.filename = render_jinja(self.filename, self._replacements) 90 | self.database = render_jinja(self.database, self._replacements) 91 | self.table = render_jinja(self.table, self._replacements) 92 | self.trigger = render_jinja(self.trigger, self._replacements) 93 | 94 | if self.options: 95 | for option, value in self.options.items(): 96 | self.options[option] = render_jinja(value, self._replacements) 97 | 98 | self._config_schema_hints() 99 | 100 | if isinstance(self.spark_schema, str): 101 | path = self.spark_schema 102 | path = render_jinja(path, self._replacements) 103 | path = abs_config_path(self.config_path, path) 104 | if os.path.exists(path): 105 | self._load_schema(path) 106 | else: 107 | self.spark_schema = path 108 | self._logger.warning( 109 | f"Schema path doesn't exist, the schema has not been loaded and remains the path {path}." 110 | ) 111 | 112 | corrupt_record_name = self.options.get( 113 | self._OPTION_CORRUPT_RECORD_NAME, None 114 | ) 115 | if isinstance(self.spark_schema, StructType) and corrupt_record_name: 116 | if corrupt_record_name not in self.spark_schema.names: 117 | self.spark_schema.add(field=corrupt_record_name, data_type="string") 118 | 119 | if self.options: 120 | for option, value in self.options.items(): 121 | if isinstance(value, str): 122 | self.options[option] = render_jinja(value, self._replacements) 123 | 124 | self._rendered = True 125 | 126 | def _config_schema_hints(self): 127 | path = self.options.get(self._OPTION_CF_SCHEMA_HINTS, None) 128 | if path and "/" in path: 129 | self._load_schema(path) 130 | 131 | if self.options.get("header"): 132 | self.options[self._OPTION_CF_SCHEMA_HINTS] = ", ".join(self.ddl) 133 | else: 134 | self.options[self._OPTION_CF_SCHEMA_HINTS] = ", ".join( 135 | self.headerless_ddl 136 | ) 137 | 138 | def _load_schema(self, path: str): 139 | path = abs_config_path(self.config_path, path) 140 | if not self.spark_schema or isinstance(self.spark_schema, str): 141 | self.spark_schema = load_schema(path) 142 | if not self.ddl: 143 | self.ddl = get_ddl(self.spark_schema, header=True) 144 | if not self.headerless_ddl: 145 | self.headerless_ddl = get_ddl(self.spark_schema, header=False) 146 | 147 | def rename_headerless(self, df: Union[StreamingQuery, DataFrame]): 148 | columns = [c for c in df.columns if c not in ["_rescued_data"]] 149 | columns_cnt = len(columns) 150 | ddls = len(self.ddl) 151 | if columns_cnt != ddls: 152 | raise Exception( 153 | f"Headerless files with schema hints must have a fully hinted schema since it must work positionally. 
columns!=ddl ({columns_cnt}!={ddls})" 154 | ) 155 | 156 | for i, _ in enumerate(columns): 157 | from_name = f"_c{i}" 158 | to_name = self.ddl[i].split(" ")[0].strip() 159 | logging.info(f"rename {from_name} to {to_name}") 160 | df: Union[StreamingQuery, DataFrame] = df.withColumnRenamed( 161 | from_name, to_name 162 | ) 163 | 164 | return df 165 | 166 | def qualified_table_name(self): 167 | return self.path 168 | 169 | class Config: 170 | arbitrary_types_allowed = True 171 | -------------------------------------------------------------------------------- /yetl/config/_tables.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field, PrivateAttr 2 | from typing import Union, Any, Dict, List, Optional 3 | from ._stage_type import StageType 4 | import fnmatch 5 | from ._table_mapping import TableMapping 6 | from .table import TableType 7 | from .table import table_factory 8 | from .table import Table 9 | from enum import Enum 10 | import logging 11 | 12 | _INDEX_WILDCARD = "*" 13 | 14 | 15 | class KeyContants(Enum): 16 | DATABASE = "database" 17 | TABLE = "table" 18 | TABLES = "tables" 19 | STAGE = "stage" 20 | TABLE_TYPE = "table_type" 21 | PROJECT = "project" 22 | TIMESLICE = "timeslice" 23 | CONFIG_PATH = "config_path" 24 | 25 | 26 | class PushDownProperties(Enum): 27 | DELTA_PROPETIES = "delta_properties" 28 | CATALOG = "catalog" 29 | 30 | @classmethod 31 | def has_value(cls, value): 32 | return value in cls._value2member_map_ 33 | 34 | @classmethod 35 | def has_not_value(cls, value): 36 | return value not in cls._value2member_map_ 37 | 38 | 39 | class Tables(BaseModel): 40 | def __init__(self, **data: Any) -> None: 41 | super().__init__(**data) 42 | self._logger = logging.getLogger(self.__class__.__name__) 43 | self._parse_configuration() 44 | self._build_tables() 45 | 46 | def _parse_configuration(self): 47 | push_down_properties = {} 48 | for stage_name, table_type in self.table_data["tables"].items(): 49 | stage_type = StageType(stage_name) 50 | for table_type_name, database in table_type.items(): 51 | table_type = TableType(table_type_name) 52 | push_down_properties = {} 53 | for database_name, table in database.items(): 54 | if PushDownProperties.has_not_value(database_name): 55 | catalog = table.get(PushDownProperties.CATALOG.value) 56 | if PushDownProperties.CATALOG.value in table: 57 | del table[PushDownProperties.CATALOG.value] 58 | for table_name, table_properties in table.items(): 59 | table_config = { 60 | KeyContants.DATABASE.value: database_name, 61 | KeyContants.TABLE.value: table_name, 62 | KeyContants.STAGE.value: stage_type, 63 | KeyContants.TABLE_TYPE.value: table_type, 64 | KeyContants.PROJECT.value: self.table_data.get( 65 | KeyContants.PROJECT.value 66 | ), 67 | KeyContants.TIMESLICE.value: self.table_data.get( 68 | KeyContants.TIMESLICE.value 69 | ), 70 | KeyContants.CONFIG_PATH.value: self.table_data.get( 71 | KeyContants.CONFIG_PATH.value 72 | ), 73 | } 74 | if table_properties: 75 | table_config = {**table_config, **table_properties} 76 | table_config = {**push_down_properties, **table_config} 77 | table_config[PushDownProperties.CATALOG.value] = catalog 78 | for p, v in push_down_properties.items(): 79 | if isinstance(v, dict) and table_config.get(p): 80 | table_config[p] = {**v, **table_config[p]} 81 | else: 82 | table_config[p] = v 83 | stage_config = self.table_data.get(stage_type.value, {}) 84 | stage_config = stage_config.get(table_type.value, {}) 85 | table_config = {**stage_config, 
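# Illustrative note on the merge at this point, inferred from the surrounding code
# (the concrete names in the example below are hypothetical): stage/table-type level
# settings from table_data act as defaults and are overridden by the table-specific
# table_config assembled above; database-level push-down properties (e.g.
# delta_properties) were merged just before, with dict-valued push-downs combined one
# level deep. A resulting entry in tables_index might look roughly like:
#   "raw.raw_sales.customer" -> {"database": "raw_sales", "table": "customer",
#                                "stage": StageType.raw, "table_type": TableType.delta_lake,
#                                "catalog": "hub", "delta_properties": {...}, ...}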
**table_config} 86 | index = f"{stage_name}.{database_name}.{table_name}" 87 | self.tables_index[index] = table_config 88 | else: 89 | push_down_properties[database_name] = table 90 | 91 | table_data: dict = Field(...) 92 | tables_index: Dict[str, Table] = Field(default={}) 93 | delta_properties: Optional[Dict[str, str]] = Field(default=None) 94 | _logger: Any = PrivateAttr(default=None) 95 | 96 | @classmethod 97 | def get_index( 98 | cls, 99 | stage: Union[StageType, str] = _INDEX_WILDCARD, 100 | database=_INDEX_WILDCARD, 101 | table=_INDEX_WILDCARD, 102 | ): 103 | if isinstance(stage, StageType): 104 | return f"{stage.name}.{database}.{table}" 105 | else: 106 | return f"{stage}.{database}.{table}" 107 | 108 | @classmethod 109 | def parse_index( 110 | cls, 111 | index: str, 112 | ): 113 | try: 114 | parts = index.split(".") 115 | stage = StageType[parts[0]] 116 | database = parts[1] 117 | table = parts[2] 118 | except Exception as e: 119 | raise Exception( 120 | f"attempted to parse an invalid index {index}. It must be of the form 'stage.database.table'" 121 | ) from e 122 | 123 | return stage, database, table 124 | 125 | def _build_tables(self): 126 | """ 127 | Parse through the table definitions dictionary and deserialize it 128 | into Table objects. The Table objects are then placed in a dictionary for easy 129 | lookup with a key of stage.database.table and the value being the Table 130 | object itself. This dictionary index is held on self.tables_index 131 | """ 132 | for index, table_config in self.tables_index.items(): 133 | self.tables_index[index] = table_factory.make( 134 | table_config["table_type"], table_config 135 | ) 136 | 137 | def create_table( 138 | self, 139 | stage: Union[StageType, str] = _INDEX_WILDCARD, 140 | database=_INDEX_WILDCARD, 141 | table=_INDEX_WILDCARD, 142 | first_match: bool = True, 143 | catalog: str = None, 144 | **kwargs, 145 | ): 146 | return self.lookup_table( 147 | stage=stage, 148 | database=database, 149 | table=table, 150 | first_match=first_match, 151 | create_database=True, 152 | create_table=True, 153 | catalog=catalog, 154 | **kwargs, 155 | ) 156 | 157 | def lookup_table( 158 | self, 159 | stage: Union[StageType, str] = _INDEX_WILDCARD, 160 | database=_INDEX_WILDCARD, 161 | table=_INDEX_WILDCARD, 162 | first_match: bool = True, 163 | create_database: bool = False, 164 | create_table: bool = False, 165 | catalog: str = None, 166 | **kwargs, 167 | ): 168 | index = Tables.get_index(stage, database, table) 169 | matches = fnmatch.filter(list(self.tables_index.keys()), index) 170 | 171 | if not matches: 172 | raise Exception(f"index {index} not found in tables_index") 173 | 174 | def match_property( 175 | table: Table, properties: Dict[str, Any], matches: List[str] 176 | ): 177 | for p, v in properties.items(): 178 | if ( 179 | isinstance(table.custom_properties, dict) 180 | and table.custom_properties.get(p) == v 181 | ): 182 | return True 183 | else: 184 | index = Tables.get_index(table.stage, table.database, table.table) 185 | if index in matches: 186 | matches.remove( 187 | Tables.get_index(table.stage, table.database, table.table) 188 | ) 189 | return False 190 | 191 | tables_index = dict(self.tables_index) 192 | if kwargs: 193 | tables_index = { 194 | k: v 195 | for k, v in self.tables_index.items() 196 | if match_property(v, kwargs, matches) 197 | } 198 | 199 | if first_match: 200 | matches = matches[0] 201 | table = tables_index[matches] 202 | msg_tables = f"{table.database}.{table.table}" 203 | self._logger.info(f"Matched tables: 
{msg_tables}") 204 | if create_database: 205 | table.create_database(catalog=catalog) 206 | if create_table: 207 | table.create_table(catalog=catalog) 208 | return table 209 | else: 210 | tables = [tables_index[i] for i in matches] 211 | msg_tables = "\n".join([f"{t.database}.{t.table}" for t in tables]) 212 | self._logger.info(f"Matched tables: {msg_tables}") 213 | db = "" 214 | if create_table or create_database: 215 | for t in tables: 216 | if create_database and db != t.database: 217 | db = t.database 218 | t.create_database(catalog=catalog) 219 | if create_table: 220 | t.create_table(catalog=catalog) 221 | return tables 222 | 223 | def get_table_mapping( 224 | self, 225 | stage: StageType, 226 | table=_INDEX_WILDCARD, 227 | database=_INDEX_WILDCARD, 228 | create_database: bool = False, 229 | create_table: bool = False, 230 | catalog: str = None, 231 | ): 232 | destination = self.lookup_table( 233 | stage=stage, 234 | database=database, 235 | table=table, 236 | first_match=True, 237 | create_database=create_database, 238 | create_table=create_table, 239 | catalog=catalog, 240 | ) 241 | source = {} 242 | 243 | tables = [] 244 | try: 245 | for index in destination.depends_on: 246 | do_stage, do_database, do_table = Tables.parse_index(index) 247 | tables = tables + self.lookup_table( 248 | stage=do_stage, 249 | table=do_table, 250 | database=do_database, 251 | first_match=False, 252 | create_database=create_database, 253 | create_table=create_table, 254 | catalog=catalog, 255 | ) 256 | except Exception as e: 257 | raise Exception(f"Error looking up dependencies for table {table}") from e 258 | 259 | for tbl in tables: 260 | source[tbl.table] = tbl 261 | 262 | if len(list(source.values())) == 1: 263 | source = list(source.values())[0] 264 | 265 | return TableMapping(source=source, destination=destination) 266 | -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "landing": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { 24 | "type": "object", 25 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 26 | "properties": { 27 | "trigger": { 28 | "type": ["string", "null"], 29 | "description": "filemask patter to use as a trgger" 30 | }, 31 | "trigger_type": { 32 | "type": ["string", "null"], 33 | "description": "type of trgger" 34 | }, 35 | "container": { 36 | "type": ["string", "null"], 37 | "description": "type of trgger" 38 | }, 39 | "location": { 40 | "type": ["string", "null"], 41 | "description": "file directory location" 42 | }, 43 | "filename": { 44 | "type": ["string", "null"], 45 | "description": "filename mask" 46 | }, 47 | "filename_date_format": { 48 | "type": ["string", "null"], 49 | "description": "define a 
date format jinja variable for filename dates" 50 | }, 51 | "path_date_format": { 52 | "type": ["string", "null"], 53 | "description": "define a date format jinja variable for file paths" 54 | }, 55 | "slice_date": { 56 | "type": "string", 57 | "enum": ["filename_date_format", "path_date_format"], 58 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 59 | }, 60 | "format": { 61 | "type": "string", 62 | "enum": ["cloudFiles", "csv", "json", "parquet"], 63 | "description": "format of the landing file" 64 | }, 65 | "spark_schema": { 66 | "type": "string", 67 | "description": "relative path to where the spark definition is held" 68 | }, 69 | "options": { "$ref": "#/$defs/options" } 70 | } 71 | } 72 | }, 73 | "required":[ 74 | "read" 75 | ] 76 | }, 77 | 78 | "raw": { 79 | "type": "object", 80 | "description": "definition of the raw database and tables", 81 | "properties": { 82 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 83 | } 84 | }, 85 | "base": { 86 | "type": "object", 87 | "description": "definition of the base database and tables", 88 | "properties": { 89 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 90 | } 91 | } 92 | }, 93 | "required": [ 94 | "version", 95 | "landing" 96 | ], 97 | 98 | "$defs": { 99 | "delta_lake": { 100 | "type": "object", 101 | "description": "defines a stage as a delta lake table stage", 102 | "properties": { 103 | "managed": { 104 | "type": "boolean", 105 | "description": "whether it's a managed table or unmanaged that requires a location" 106 | }, 107 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 108 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 109 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 110 | "container": { 111 | "type": ["string", "null"], 112 | "description": "type of trgger" 113 | }, 114 | "location": { 115 | "type": ["string", "null"], 116 | "description": "file location of managed tables for the stage" 117 | }, 118 | "path": { 119 | "type": ["string", "null"], 120 | "description": "path of the table appended to the location of the stage" 121 | }, 122 | "options": { "$ref": "#/$defs/options" }, 123 | "database": { 124 | "type": "string", 125 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 126 | }, 127 | "table": { 128 | "type": "string", 129 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 130 | } 131 | } 132 | }, 133 | "options": { 134 | "type": ["object","null"], 135 | "description": "holds key value pairs of custom properties", 136 | "minProperties": 1, 137 | "patternProperties":{ 138 | "^\\S+$": { 139 | "type": ["string","number","boolean"], 140 | "description": "value kay pairs of the spark DSL read options" 141 | } 142 | } 143 | }, 144 | "thresholds": { 145 | "type": "object", 146 | "description": "table etl thresholds", 147 | "properties": { 148 | "invalid_ratio": { 149 | "type": "number", 150 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 151 | "exclusiveMinimum": 0, 152 | "maximum": 1 153 | }, 154 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 155 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 156 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 157 | } 158 | }, 159 | 
"delta_properties": { 160 | "type": "object", 161 | "description": "holds key value pairs of delta properties", 162 | "minProperties": 1, 163 | "properties":{ 164 | "delta.appendOnly": { 165 | "type": "boolean", 166 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 167 | }, 168 | "delta.autoOptimize.autoCompact": { 169 | "type": ["string","boolean"], 170 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 171 | }, 172 | "delta.autoOptimize.optimizeWrite": { 173 | "type": ["string","boolean"], 174 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 175 | }, 176 | "delta.checkpoint.writeStatsAsJson": { 177 | "type": ["string","boolean"], 178 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 179 | }, 180 | "delta.checkpoint.writeStatsAsStruct": { 181 | "type": ["string","boolean"], 182 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 183 | }, 184 | "delta.columnMapping.mode": { 185 | "type": "string", 186 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 187 | }, 188 | "delta.compatibility.symlinkFormatManifest.enabled": { 189 | "type": ["string","boolean"], 190 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 191 | }, 192 | "delta.dataSkippingNumIndexedCols": { 193 | "type": "integer", 194 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 195 | }, 196 | "delta.deletedFileRetentionDuration": { 197 | "type": "integer", 198 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 199 | }, 200 | "delta.enableChangeDataFeed": { 201 | "type": ["string","boolean"], 202 | "description": "true to enable change data feed." 203 | }, 204 | "delta.isolationLevel": { 205 | "type": "string", 206 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 207 | }, 208 | "delta.logRetentionDuration": { 209 | "type": "string", 210 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 211 | }, 212 | "delta.minReaderVersion": { 213 | "type": "integer", 214 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
215 | }, 216 | "delta.minWriterVersion": { 217 | "type": "integer", 218 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 219 | }, 220 | "delta.randomizeFilePrefixes": { 221 | "type": ["string","boolean"], 222 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 223 | }, 224 | "delta.randomPrefixLength": { 225 | "type": "integer", 226 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 227 | }, 228 | "delta.setTransactionRetentionDuration": { 229 | "type": "string", 230 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 231 | }, 232 | "delta.targetFileSize": { 233 | "type": "string", 234 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 235 | }, 236 | "delta.tuneFileSizesForRewrites": { 237 | "type": ["string","boolean"], 238 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 239 | } 240 | } 241 | } 242 | } 243 | } -------------------------------------------------------------------------------- /schema_testing/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required":[ 46 | "read" 47 | ] 48 | }, 49 | 50 | "raw": { 51 | "type": "object", 52 | "description": "definition of the raw database and tables", 53 | "properties": { 54 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 55 | } 56 | }, 57 | "base": { 58 | "type": "object", 59 | "description": "definition of the base database and tables", 60 | "properties": { 61 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 62 | } 63 | } 64 | }, 65 | "required": [ 66 | "version", 67 | "landing" 68 | ], 69 | 70 | "$defs": { 71 | "read": { 72 | "type": "object", 73 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 74 | "properties": { 75 | "trigger": { 76 | "type": 
["string", "null"], 77 | "description": "filemask patter to use as a trgger" 78 | }, 79 | "trigger_type": { 80 | "type": ["string", "null"], 81 | "description": "type of trgger" 82 | }, 83 | "container": { 84 | "type": ["string", "null"], 85 | "description": "type of trgger" 86 | }, 87 | "location": { 88 | "type": ["string", "null"], 89 | "description": "file directory location" 90 | }, 91 | "filename": { 92 | "type": ["string", "null"], 93 | "description": "filename mask" 94 | }, 95 | "filename_date_format": { 96 | "type": ["string", "null"], 97 | "description": "define a date format jinja variable for filename dates" 98 | }, 99 | "path_date_format": { 100 | "type": ["string", "null"], 101 | "description": "define a date format jinja variable for file paths" 102 | }, 103 | "slice_date": { 104 | "type": "string", 105 | "enum": ["filename_date_format", "path_date_format"], 106 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 107 | }, 108 | "format": { 109 | "type": "string", 110 | "enum": ["cloudFiles", "csv", "json", "parquet"], 111 | "description": "format of the landing file" 112 | }, 113 | "spark_schema": { 114 | "type": "string", 115 | "description": "relative path to where the spark definition is held" 116 | }, 117 | "options": { "$ref": "#/$defs/options" } 118 | } 119 | }, 120 | 121 | "delta_lake": { 122 | "type": "object", 123 | "description": "defines a stage as a delta lake table stage", 124 | "properties": { 125 | "managed": { 126 | "type": "boolean", 127 | "description": "whether it's a managed table or unmanaged that requires a location" 128 | }, 129 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 130 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 131 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 132 | "container": { 133 | "type": ["string", "null"], 134 | "description": "type of trgger" 135 | }, 136 | "location": { 137 | "type": ["string", "null"], 138 | "description": "file location of managed tables for the stage" 139 | }, 140 | "path": { 141 | "type": ["string", "null"], 142 | "description": "path of the table appended to the location of the stage" 143 | }, 144 | "options": { "$ref": "#/$defs/options" }, 145 | "database": { 146 | "type": "string", 147 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 148 | }, 149 | "table": { 150 | "type": "string", 151 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 152 | } 153 | } 154 | }, 155 | "options": { 156 | "type": ["object","null"], 157 | "description": "holds key value pairs of custom properties", 158 | "minProperties": 1, 159 | "patternProperties":{ 160 | "^\\S+$": { 161 | "type": ["string","number","boolean"], 162 | "description": "value kay pairs of the spark DSL read options" 163 | } 164 | } 165 | }, 166 | "thresholds": { 167 | "type": "object", 168 | "description": "table etl thresholds", 169 | "properties": { 170 | "invalid_ratio": { 171 | "type": "number", 172 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 173 | "exclusiveMinimum": 0, 174 | "maximum": 1 175 | }, 176 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 177 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 178 | "min_rows": {"type": 
"integer", "description": "integer specifying min rows threshold"} 179 | } 180 | }, 181 | "delta_properties": { 182 | "type": "object", 183 | "description": "holds key value pairs of delta properties", 184 | "minProperties": 1, 185 | "properties":{ 186 | "delta.appendOnly": { 187 | "type": "boolean", 188 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 189 | }, 190 | "delta.autoOptimize.autoCompact": { 191 | "type": ["string","boolean"], 192 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 193 | }, 194 | "delta.autoOptimize.optimizeWrite": { 195 | "type": ["string","boolean"], 196 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 197 | }, 198 | "delta.checkpoint.writeStatsAsJson": { 199 | "type": ["string","boolean"], 200 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 201 | }, 202 | "delta.checkpoint.writeStatsAsStruct": { 203 | "type": ["string","boolean"], 204 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 205 | }, 206 | "delta.columnMapping.mode": { 207 | "type": "string", 208 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 209 | }, 210 | "delta.compatibility.symlinkFormatManifest.enabled": { 211 | "type": ["string","boolean"], 212 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 213 | }, 214 | "delta.dataSkippingNumIndexedCols": { 215 | "type": "integer", 216 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 217 | }, 218 | "delta.deletedFileRetentionDuration": { 219 | "type": "integer", 220 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 221 | }, 222 | "delta.enableChangeDataFeed": { 223 | "type": ["string","boolean"], 224 | "description": "true to enable change data feed." 225 | }, 226 | "delta.isolationLevel": { 227 | "type": "string", 228 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 229 | }, 230 | "delta.logRetentionDuration": { 231 | "type": "string", 232 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 233 | }, 234 | "delta.minReaderVersion": { 235 | "type": "integer", 236 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
237 | }, 238 | "delta.minWriterVersion": { 239 | "type": "integer", 240 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 241 | }, 242 | "delta.randomizeFilePrefixes": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 245 | }, 246 | "delta.randomPrefixLength": { 247 | "type": "integer", 248 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 249 | }, 250 | "delta.setTransactionRetentionDuration": { 251 | "type": "string", 252 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 253 | }, 254 | "delta.targetFileSize": { 255 | "type": "string", 256 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 257 | }, 258 | "delta.tuneFileSizesForRewrites": { 259 | "type": ["string","boolean"], 260 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 261 | } 262 | } 263 | } 264 | } 265 | } -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_pipeline_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/pipeline", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "delta_lake": { "$ref": "#/$defs/delta_lake" }, 24 | "read": { "$ref": "#/$defs/read" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required":[ 46 | "read" 47 | ] 48 | }, 49 | 50 | "raw": { 51 | "type": "object", 52 | "description": "definition of the raw database and tables", 53 | "properties": { 54 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 55 | } 56 | }, 57 | "base": { 58 | "type": "object", 59 | "description": "definition of the base database and tables", 60 | "properties": { 61 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 62 | } 63 | } 64 | }, 65 | "required": [ 66 | "version", 67 | "landing" 68 | ], 69 | 70 | "$defs": { 71 | "read": { 72 | "type": "object", 73 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 74 | "properties": { 75 | "trigger": { 76 | "type": ["string", 
"null"], 77 | "description": "filemask patter to use as a trgger" 78 | }, 79 | "trigger_type": { 80 | "type": ["string", "null"], 81 | "description": "type of trgger" 82 | }, 83 | "container": { 84 | "type": ["string", "null"], 85 | "description": "type of trgger" 86 | }, 87 | "location": { 88 | "type": ["string", "null"], 89 | "description": "file directory location" 90 | }, 91 | "filename": { 92 | "type": ["string", "null"], 93 | "description": "filename mask" 94 | }, 95 | "filename_date_format": { 96 | "type": ["string", "null"], 97 | "description": "define a date format jinja variable for filename dates" 98 | }, 99 | "path_date_format": { 100 | "type": ["string", "null"], 101 | "description": "define a date format jinja variable for file paths" 102 | }, 103 | "slice_date": { 104 | "type": "string", 105 | "enum": ["filename_date_format", "path_date_format"], 106 | "description": "either the filename_date_format or the path_date_format used to shred the time period from the filename or path respectively" 107 | }, 108 | "format": { 109 | "type": "string", 110 | "enum": ["cloudFiles", "csv", "json", "parquet"], 111 | "description": "format of the landing file" 112 | }, 113 | "spark_schema": { 114 | "type": "string", 115 | "description": "relative path to where the spark definition is held" 116 | }, 117 | "options": { "$ref": "#/$defs/options" } 118 | } 119 | }, 120 | 121 | "delta_lake": { 122 | "type": "object", 123 | "description": "defines a stage as a delta lake table stage", 124 | "properties": { 125 | "managed": { 126 | "type": "boolean", 127 | "description": "whether it's a managed table or unmanaged that requires a location" 128 | }, 129 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 130 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 131 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 132 | "container": { 133 | "type": ["string", "null"], 134 | "description": "type of trgger" 135 | }, 136 | "location": { 137 | "type": ["string", "null"], 138 | "description": "file location of managed tables for the stage" 139 | }, 140 | "path": { 141 | "type": ["string", "null"], 142 | "description": "path of the table appended to the location of the stage" 143 | }, 144 | "options": { "$ref": "#/$defs/options" }, 145 | "database": { 146 | "type": "string", 147 | "description": "name of the database, {{ database }} variable will inherit the database name from table configuration" 148 | }, 149 | "table": { 150 | "type": "string", 151 | "description": "name of the database table, {{ table }} variable will inherit the database name from table configuration" 152 | } 153 | } 154 | }, 155 | "options": { 156 | "type": ["object","null"], 157 | "description": "holds key value pairs of custom properties", 158 | "minProperties": 1, 159 | "patternProperties":{ 160 | "^\\S+$": { 161 | "type": ["string","number","boolean"], 162 | "description": "value kay pairs of the spark DSL read options" 163 | } 164 | } 165 | }, 166 | "thresholds": { 167 | "type": "object", 168 | "description": "table etl thresholds", 169 | "properties": { 170 | "invalid_ratio": { 171 | "type": "number", 172 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 173 | "exclusiveMinimum": 0, 174 | "maximum": 1 175 | }, 176 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 177 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 178 | "min_rows": {"type": "integer", 
"description": "integer specifying min rows threshold"} 179 | } 180 | }, 181 | "delta_properties": { 182 | "type": "object", 183 | "description": "holds key value pairs of delta properties", 184 | "minProperties": 1, 185 | "properties":{ 186 | "delta.appendOnly": { 187 | "type": "boolean", 188 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 189 | }, 190 | "delta.autoOptimize.autoCompact": { 191 | "type": ["string","boolean"], 192 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 193 | }, 194 | "delta.autoOptimize.optimizeWrite": { 195 | "type": ["string","boolean"], 196 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 197 | }, 198 | "delta.checkpoint.writeStatsAsJson": { 199 | "type": ["string","boolean"], 200 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 201 | }, 202 | "delta.checkpoint.writeStatsAsStruct": { 203 | "type": ["string","boolean"], 204 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 205 | }, 206 | "delta.columnMapping.mode": { 207 | "type": "string", 208 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 209 | }, 210 | "delta.compatibility.symlinkFormatManifest.enabled": { 211 | "type": ["string","boolean"], 212 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 213 | }, 214 | "delta.dataSkippingNumIndexedCols": { 215 | "type": "integer", 216 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 217 | }, 218 | "delta.deletedFileRetentionDuration": { 219 | "type": "integer", 220 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 221 | }, 222 | "delta.enableChangeDataFeed": { 223 | "type": ["string","boolean"], 224 | "description": "true to enable change data feed." 225 | }, 226 | "delta.isolationLevel": { 227 | "type": "string", 228 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 229 | }, 230 | "delta.logRetentionDuration": { 231 | "type": "string", 232 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 233 | }, 234 | "delta.minReaderVersion": { 235 | "type": "integer", 236 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 
237 | }, 238 | "delta.minWriterVersion": { 239 | "type": "integer", 240 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 241 | }, 242 | "delta.randomizeFilePrefixes": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 245 | }, 246 | "delta.randomPrefixLength": { 247 | "type": "integer", 248 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 249 | }, 250 | "delta.setTransactionRetentionDuration": { 251 | "type": "string", 252 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 253 | }, 254 | "delta.targetFileSize": { 255 | "type": "string", 256 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 257 | }, 258 | "delta.tuneFileSizesForRewrites": { 259 | "type": ["string","boolean"], 260 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 261 | } 262 | } 263 | } 264 | } 265 | } -------------------------------------------------------------------------------- /test/config/test_project/pipelines/json_schema/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "landing": { 20 | "type": "object", 21 | "description": "definition of the landing stage and files", 22 | "properties": { 23 | "read": { 24 | "type": "object", 25 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 26 | "minProperties": 1, 27 | "maxProperties": 1, 28 | "patternProperties": { 29 | "^\\S+$": { 30 | "type": "object", 31 | "description": "name of the volume holding the files", 32 | "minProperties": 1, 33 | "patternProperties": { 34 | "^\\S+$": { 35 | "type": ["string", "null"], 36 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 37 | } 38 | } 39 | } 40 | } 41 | } 42 | }, 43 | "required":[ 44 | "read" 45 | ] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version", 64 | "landing", 65 | "raw" 66 | ], 67 | 68 | "$defs": { 69 | "delta_lake": { 70 | "type": "object", 71 | "description": "defines a table object as a delta lake table", 72 | "properties": { 73 | "delta_properties": { "$ref": "#/$defs/delta_properties" } 74 | 75 | }, 76 | "patternProperties": { 77 | "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 78 | } 79 | 80 | }, 81 | "delta_lake_database": { 82 | "type": "object", 83 | "description": "database containing tables", 84 | "minProperties": 1, 85 | "properties": { 86 | "catalog": { 87 | "type": "string", 88 | "description": "the default catalog name for the database" 89 | } 90 | }, 91 | "patternProperties": { 92 | "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 93 | } 94 | }, 95 | "delta_lake_table": { 96 | "type": "object", 97 | "description": "defines a deltalake table and it's properties", 98 | "properties":{ 99 | "managed": { 100 | "type": "boolean", 101 | "description": "whether it's a managed table or unmanaged that requires a location" 102 | }, 103 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 104 | "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 105 | "custom_properties": { "$ref": "#/$defs/custom_properties" }, 106 | "depends_on": { 107 | "type": "array", 108 | "items": { 109 | "type": "string" 110 | }, 111 | "uniqueItems": true, 112 | "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)", 113 | "description": "use to denote dependency on other tables referenced their index this is the stage.database.table. 
The form stage.database.* can also be used to reference all tables" 114 | }, 115 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 116 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 117 | "partition_by": { 118 | "oneOf": [ 119 | { 120 | "type": "string", 121 | "description": "columns on which to partition by" 122 | }, 123 | { 124 | "type": "array", 125 | "uniqueItems": true, 126 | "items": { 127 | "type": "string" 128 | }, 129 | "description": "columns on which to partition by" 130 | } 131 | ] 132 | }, 133 | "cluster_by": { 134 | "oneOf": [ 135 | { 136 | "type": "string", 137 | "description": "columns on which to appply liquid clustering" 138 | }, 139 | { 140 | "type": "array", 141 | "uniqueItems": true, 142 | "items": { 143 | "type": "string" 144 | }, 145 | "description": "columns on which to appply liquid clustering" 146 | } 147 | ] 148 | }, 149 | "z_order_by": { 150 | "oneOf": [ 151 | { 152 | "type": "string", 153 | "description": "column on which to appply z-ording" 154 | }, 155 | { 156 | "type": "array", 157 | "uniqueItems": true, 158 | "items": { 159 | "type": "string" 160 | }, 161 | "description": "columns on which to appply z-ording" 162 | } 163 | ] 164 | }, 165 | "id": { 166 | "oneOf": [ 167 | { 168 | "type": "string", 169 | "description": "column(s) that comprise the unique identifier" 170 | }, 171 | { 172 | "type": "array", 173 | "uniqueItems": true, 174 | "items": { 175 | "type": "string" 176 | }, 177 | "description": "column(s) that comprise the unique identifier" 178 | } 179 | ] 180 | }, 181 | "vacuum": { 182 | "type": "integer", 183 | "description": "vacuum retention threshold in the number of hours", 184 | "minimum": 0 185 | } 186 | } 187 | }, 188 | "delta_constraints": { 189 | "type": "object", 190 | "description": "holds key value pairs of delta constraints", 191 | "minProperties": 1, 192 | "patternProperties":{ 193 | "^\\S+$": { 194 | "type": "string", 195 | "description": "check constraint logic" 196 | } 197 | } 198 | }, 199 | "custom_properties": { 200 | "type": "object", 201 | "description": "holds key value pairs of custom properties", 202 | "minProperties": 1, 203 | "patternProperties":{ 204 | "^\\S+$": { 205 | "type": ["string","number","boolean"], 206 | "description": "custom property" 207 | } 208 | } 209 | }, 210 | "thresholds": { 211 | "type": "object", 212 | "description": "table etl thresholds", 213 | "properties": { 214 | "invalid_ratio": { 215 | "type": "number", 216 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 217 | "exclusiveMinimum": 0, 218 | "maximum": 1 219 | }, 220 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 221 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 222 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 223 | } 224 | }, 225 | "delta_properties": { 226 | "type": "object", 227 | "description": "holds key value pairs of delta properties", 228 | "minProperties": 1, 229 | "properties":{ 230 | "delta.appendOnly": { 231 | "type": "boolean", 232 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 233 | }, 234 | "delta.autoOptimize.autoCompact": { 235 | "type": ["string","boolean"], 236 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 
237 | }, 238 | "delta.autoOptimize.optimizeWrite": { 239 | "type": ["string","boolean"], 240 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 241 | }, 242 | "delta.checkpoint.writeStatsAsJson": { 243 | "type": ["string","boolean"], 244 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 245 | }, 246 | "delta.checkpoint.writeStatsAsStruct": { 247 | "type": ["string","boolean"], 248 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 249 | }, 250 | "delta.columnMapping.mode": { 251 | "type": "string", 252 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 253 | }, 254 | "delta.compatibility.symlinkFormatManifest.enabled": { 255 | "type": ["string","boolean"], 256 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 257 | }, 258 | "delta.dataSkippingNumIndexedCols": { 259 | "type": "integer", 260 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 261 | }, 262 | "delta.deletedFileRetentionDuration": { 263 | "type": "integer", 264 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. This is to prevent failures in stale readers after compactions or partition overwrites." 265 | }, 266 | "delta.enableChangeDataFeed": { 267 | "type": ["string","boolean"], 268 | "description": "true to enable change data feed." 269 | }, 270 | "delta.isolationLevel": { 271 | "type": "string", 272 | "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 273 | }, 274 | "delta.logRetentionDuration": { 275 | "type": "string", 276 | "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 277 | }, 278 | "delta.minReaderVersion": { 279 | "type": "integer", 280 | "description": "The minimum required protocol reader version for a reader that allows to read from this Delta table." 281 | }, 282 | "delta.minWriterVersion": { 283 | "type": "integer", 284 | "description": "The minimum required protocol writer version for a writer that allows to write to this Delta table." 285 | }, 286 | "delta.randomizeFilePrefixes": { 287 | "type": ["string","boolean"], 288 | "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 289 | }, 290 | "delta.randomPrefixLength": { 291 | "type": "integer", 292 | "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 
293 | }, 294 | "delta.setTransactionRetentionDuration": { 295 | "type": "string", 296 | "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent. " 297 | }, 298 | "delta.targetFileSize": { 299 | "type": "string", 300 | "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 301 | }, 302 | "delta.tuneFileSizesForRewrites": { 303 | "type": ["string","boolean"], 304 | "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 305 | } 306 | } 307 | } 308 | } 309 | } -------------------------------------------------------------------------------- /schema_testing/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the source stage", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required": ["read"] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version" 64 | ], 65 | 66 | "$defs": { 67 | "read": { 68 | "type": "object", 69 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 70 | "minProperties": 1, 71 | "maxProperties": 1, 72 | "patternProperties": { 73 | "^\\S+$": { 74 | "type": "object", 75 | "description": "name of the volume holding the files", 76 | "minProperties": 1, 77 | "patternProperties": { 78 | "^\\S+$": { 79 | "type": ["string", "null"], 80 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 81 | } 82 | } 83 | } 84 | } 85 | }, 86 | "delta_lake": { 87 | "type": "object", 88 | "description": "defines a table object as a delta lake table", 89 | "properties": { 90 | "delta_properties": { "$ref": "#/$defs/delta_properties" } 91 | 92 | }, 93 | "patternProperties": { 94 | "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 95 | } 96 | 97 | }, 98 | "delta_lake_database": { 99 | "type": "object", 100 | "description": "database containing tables", 101 | "minProperties": 1, 102 | "properties": { 103 | "catalog": { 104 | "type": ["string", "null"], 105 | "description": "the default catalog name for the database" 106 | } 107 | }, 108 | "patternProperties": { 109 | "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 110 | } 111 | }, 112 | "delta_lake_table": { 113 | "type": "object", 114 | "description": "defines a deltalake table and it's properties", 115 | "properties":{ 116 | "managed": { 117 | "type": "boolean", 118 | "description": "whether it's a managed table or unmanaged that requires a location" 119 | }, 120 | "delta_properties": { "$ref": "#/$defs/delta_properties" }, 121 | "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 122 | "custom_properties": { "$ref": "#/$defs/custom_properties" }, 123 | "depends_on": { 124 | "type": "array", 125 | "items": { 126 | "type": "string" 127 | }, 128 | "uniqueItems": true, 129 | "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)", 130 | "description": "use to denote dependency on other tables referenced their index this is the stage.database.table. The form stage.database.* can also be used to reference all tables" 131 | }, 132 | "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 133 | "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 134 | "partition_by": { 135 | "oneOf": [ 136 | { 137 | "type": "string", 138 | "description": "columns on which to partition by" 139 | }, 140 | { 141 | "type": "array", 142 | "uniqueItems": true, 143 | "items": { 144 | "type": "string" 145 | }, 146 | "description": "columns on which to partition by" 147 | } 148 | ] 149 | }, 150 | "cluster_by": { 151 | "oneOf": [ 152 | { 153 | "type": "string", 154 | "description": "columns on which to appply liquid clustering" 155 | }, 156 | { 157 | "type": "array", 158 | "uniqueItems": true, 159 | "items": { 160 | "type": "string" 161 | }, 162 | "description": "columns on which to appply liquid clustering" 163 | } 164 | ] 165 | }, 166 | "z_order_by": { 167 | "oneOf": [ 168 | { 169 | "type": "string", 170 | "description": "column on which to appply z-ording" 171 | }, 172 | { 173 | "type": "array", 174 | "uniqueItems": true, 175 | "items": { 176 | "type": "string" 177 | }, 178 | "description": "columns on which to appply z-ording" 179 | } 180 | ] 181 | }, 182 | "id": { 183 | "oneOf": [ 184 | { 185 | "type": "string", 186 | "description": "column(s) that comprise the unique identifier" 187 | }, 188 | { 189 | "type": "array", 190 | "uniqueItems": true, 191 | "items": { 192 | "type": "string" 193 | }, 194 | "description": "column(s) that comprise the unique identifier" 195 | } 196 | ] 197 | }, 198 | "vacuum": { 199 | "type": "integer", 200 | "description": "vacuum retention threshold in the number of hours", 201 | "minimum": 0 202 | } 203 | } 204 | }, 205 | "delta_constraints": { 206 | "type": "object", 207 | "description": "holds key value pairs of delta constraints", 208 | "minProperties": 1, 209 | "patternProperties":{ 210 | "^\\S+$": { 211 | "type": "string", 212 | "description": "check constraint logic" 213 | } 
214 | } 215 | }, 216 | "custom_properties": { 217 | "type": "object", 218 | "description": "holds key value pairs of custom properties", 219 | "minProperties": 1, 220 | "patternProperties":{ 221 | "^\\S+$": { 222 | "type": ["string","number","boolean"], 223 | "description": "custom property" 224 | } 225 | } 226 | }, 227 | "thresholds": { 228 | "type": "object", 229 | "description": "table etl thresholds", 230 | "properties": { 231 | "invalid_ratio": { 232 | "type": "number", 233 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 234 | "exclusiveMinimum": 0, 235 | "maximum": 1 236 | }, 237 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 238 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 239 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 240 | } 241 | }, 242 | "delta_properties": { 243 | "type": "object", 244 | "description": "holds key value pairs of delta properties", 245 | "minProperties": 1, 246 | "properties":{ 247 | "delta.appendOnly": { 248 | "type": "boolean", 249 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 250 | }, 251 | "delta.autoOptimize.autoCompact": { 252 | "type": ["string","boolean"], 253 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 254 | }, 255 | "delta.autoOptimize.optimizeWrite": { 256 | "type": ["string","boolean"], 257 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 258 | }, 259 | "delta.checkpoint.writeStatsAsJson": { 260 | "type": ["string","boolean"], 261 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 262 | }, 263 | "delta.checkpoint.writeStatsAsStruct": { 264 | "type": ["string","boolean"], 265 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 266 | }, 267 | "delta.columnMapping.mode": { 268 | "type": "string", 269 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 270 | }, 271 | "delta.compatibility.symlinkFormatManifest.enabled": { 272 | "type": ["string","boolean"], 273 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 274 | }, 275 | "delta.dataSkippingNumIndexedCols": { 276 | "type": "integer", 277 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 278 | }, 279 | "delta.deletedFileRetentionDuration": { 280 | "type": "integer", 281 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. 
This is to prevent failures in stale readers after compactions or partition overwrites." 282 |             }, 283 |             "delta.enableChangeDataFeed": { 284 |                 "type": ["string","boolean"], 285 |                 "description": "true to enable change data feed." 286 |             }, 287 |             "delta.isolationLevel": { 288 |                 "type": "string", 289 |                 "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 290 |             }, 291 |             "delta.logRetentionDuration": { 292 |                 "type": "string", 293 |                 "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 294 |             }, 295 |             "delta.minReaderVersion": { 296 |                 "type": "integer", 297 |                 "description": "The minimum required protocol reader version for a reader that allows reading from this Delta table." 298 |             }, 299 |             "delta.minWriterVersion": { 300 |                 "type": "integer", 301 |                 "description": "The minimum required protocol writer version for a writer that allows writing to this Delta table." 302 |             }, 303 |             "delta.randomizeFilePrefixes": { 304 |                 "type": ["string","boolean"], 305 |                 "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 306 |             }, 307 |             "delta.randomPrefixLength": { 308 |                 "type": "integer", 309 |                 "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 310 |             }, 311 |             "delta.setTransactionRetentionDuration": { 312 |                 "type": "string", 313 |                 "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent." 314 |             }, 315 |             "delta.targetFileSize": { 316 |                 "type": "string", 317 |                 "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 318 |             }, 319 |             "delta.tuneFileSizesForRewrites": { 320 |                 "type": ["string","boolean"], 321 |                 "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table."
322 | } 323 | } 324 | } 325 | } 326 | } -------------------------------------------------------------------------------- /yetl/resource/sibytes_yetl_tables_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$id": "https://yetl.io/schemas/tables", 3 | 4 | "type":"object", 5 | "description": "Root of the yetl tables config", 6 | "properties": { 7 | "version": { 8 | "type": "string", 9 | "description": "version of yetl that the configuration is compatible with", 10 | "pattern": "^(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)$" 11 | }, 12 | "audit_control": { 13 | "type": "object", 14 | "description": "definition of the audit_control database and tables", 15 | "properties": { 16 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 17 | } 18 | }, 19 | "source": { 20 | "type": "object", 21 | "description": "definition of the source stage", 22 | "properties": { 23 | "read": { "$ref": "#/$defs/read" }, 24 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 25 | }, 26 | "oneOf": [ 27 | { 28 | "required":[ 29 | "read" 30 | ] 31 | }, 32 | { 33 | "required":[ 34 | "delta_lake" 35 | ] 36 | } 37 | ] 38 | }, 39 | "landing": { 40 | "type": "object", 41 | "description": "definition of the landing stage and files", 42 | "properties": { 43 | "read": { "$ref": "#/$defs/read" } 44 | }, 45 | "required": ["read"] 46 | }, 47 | "raw": { 48 | "type": "object", 49 | "description": "definition of the raw database and tables", 50 | "properties": { 51 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 52 | } 53 | }, 54 | "base": { 55 | "type": "object", 56 | "description": "definition of the base database and tables", 57 | "properties": { 58 | "delta_lake": { "$ref": "#/$defs/delta_lake" } 59 | } 60 | } 61 | }, 62 | "required": [ 63 | "version" 64 | ], 65 | 66 | "$defs": { 67 | "read": { 68 | "type": "object", 69 | "description": "read table type is used for spark read table properties, typically used for reading files in object storage", 70 | "minProperties": 1, 71 | "maxProperties": 1, 72 | "patternProperties": { 73 | "^\\S+$": { 74 | "type": "object", 75 | "description": "name of the volume holding the files", 76 | "minProperties": 1, 77 | "patternProperties": { 78 | "^\\S+$": { 79 | "type": ["string", "null"], 80 | "description": "name or partname of the file that indicates the data table e.g. 
customers_20201001.csv would be customers" 81 |                         } 82 |                     } 83 |                 } 84 |             } 85 |         }, 86 |         "delta_lake": { 87 |             "type": "object", 88 |             "description": "defines a table object as a delta lake table", 89 |             "properties": { 90 |                 "delta_properties": { "$ref": "#/$defs/delta_properties" } 91 | 92 |             }, 93 |             "patternProperties": { 94 |                 "^\\S+$": { "$ref": "#/$defs/delta_lake_database" } 95 |             } 96 | 97 |         }, 98 |         "delta_lake_database": { 99 |             "type": "object", 100 |             "description": "database containing tables", 101 |             "minProperties": 1, 102 |             "properties": { 103 |                 "catalog": { 104 |                     "type": ["string", "null"], 105 |                     "description": "the default catalog name for the database" 106 |                 } 107 |             }, 108 |             "patternProperties": { 109 |                 "^(?!catalog)(\\S+)$": { "$ref": "#/$defs/delta_lake_table" } 110 |             } 111 |         }, 112 |         "delta_lake_table": { 113 |             "type": ["object", "null"], 114 |             "description": "defines a delta lake table and its properties", 115 |             "properties":{ 116 |                 "managed": { 117 |                     "type": "boolean", 118 |                     "description": "whether it is a managed table or an unmanaged table that requires a location" 119 |                 }, 120 |                 "delta_properties": { "$ref": "#/$defs/delta_properties" }, 121 |                 "delta_constraints": { "$ref": "#/$defs/delta_constraints" }, 122 |                 "custom_properties": { "$ref": "#/$defs/custom_properties" }, 123 |                 "depends_on": { 124 |                     "type": "array", 125 |                     "items": { 126 |                         "type": "string", 127 |                         "pattern": "^(\\S+\\.)(\\S+\\.)(\\*|\\S+)" 128 |                     }, 129 |                     "uniqueItems": true, 130 |                     "description": "used to declare a dependency on other tables, referenced by their index in the form stage.database.table. The form stage.database.* can also be used to reference all tables in a database" 131 |                 }, 132 |                 "exception_thresholds": { "$ref": "#/$defs/thresholds" }, 133 |                 "warning_thresholds": { "$ref": "#/$defs/thresholds" }, 134 |                 "partition_by": { 135 |                     "oneOf": [ 136 |                         { 137 |                             "type": "string", 138 |                             "description": "columns to partition by" 139 |                         }, 140 |                         { 141 |                             "type": "array", 142 |                             "uniqueItems": true, 143 |                             "items": { 144 |                                 "type": "string" 145 |                             }, 146 |                             "description": "columns to partition by" 147 |                         } 148 |                     ] 149 |                 }, 150 |                 "cluster_by": { 151 |                     "oneOf": [ 152 |                         { 153 |                             "type": "string", 154 |                             "description": "columns on which to apply liquid clustering" 155 |                         }, 156 |                         { 157 |                             "type": "array", 158 |                             "uniqueItems": true, 159 |                             "items": { 160 |                                 "type": "string" 161 |                             }, 162 |                             "description": "columns on which to apply liquid clustering" 163 |                         } 164 |                     ] 165 |                 }, 166 |                 "z_order_by": { 167 |                     "oneOf": [ 168 |                         { 169 |                             "type": "string", 170 |                             "description": "column on which to apply z-ordering" 171 |                         }, 172 |                         { 173 |                             "type": "array", 174 |                             "uniqueItems": true, 175 |                             "items": { 176 |                                 "type": "string" 177 |                             }, 178 |                             "description": "columns on which to apply z-ordering" 179 |                         } 180 |                     ] 181 |                 }, 182 |                 "id": { 183 |                     "oneOf": [ 184 |                         { 185 |                             "type": "string", 186 |                             "description": "column(s) that comprise the unique identifier" 187 |                         }, 188 |                         { 189 |                             "type": "array", 190 |                             "uniqueItems": true, 191 |                             "items": { 192 |                                 "type": "string" 193 |                             }, 194 |                             "description": "column(s) that comprise the unique identifier" 195 |                         } 196 |                     ] 197 |                 }, 198 |                 "vacuum": { 199 |                     "type": "integer", 200 |                     "description": "vacuum retention threshold in hours", 201 |                     "minimum": 0 202 |                 } 203 |             } 204 |         }, 205 |         "delta_constraints": { 206 |             "type": "object", 207 |             "description": "holds key value pairs of delta constraints", 208 |             "minProperties": 1, 209 |             "patternProperties":{ 210 |                 "^\\S+$": { 211 |                     "type": "string", 212 |                     "description": "check constraint logic"
213 | } 214 | } 215 | }, 216 | "custom_properties": { 217 | "type": "object", 218 | "description": "holds key value pairs of custom properties", 219 | "minProperties": 1, 220 | "patternProperties":{ 221 | "^\\S+$": { 222 | "type": ["string","number","boolean"], 223 | "description": "custom property" 224 | } 225 | } 226 | }, 227 | "thresholds": { 228 | "type": "object", 229 | "description": "table etl thresholds", 230 | "properties": { 231 | "invalid_ratio": { 232 | "type": "number", 233 | "description": "decimal between 0 and 1 specifying the ratio of invalid rows to valid rows threshold", 234 | "exclusiveMinimum": 0, 235 | "maximum": 1 236 | }, 237 | "invalid_rows": {"type": "integer", "description": "integer specifying invalid rows threshold"}, 238 | "max_rows": {"type": "integer", "description": "integer specifying max rows threshold"}, 239 | "min_rows": {"type": "integer", "description": "integer specifying min rows threshold"} 240 | } 241 | }, 242 | "delta_properties": { 243 | "type": "object", 244 | "description": "holds key value pairs of delta properties", 245 | "minProperties": 1, 246 | "properties":{ 247 | "delta.appendOnly": { 248 | "type": "boolean", 249 | "description": "true for this Delta table to be append-only. If append-only, existing records cannot be deleted, and existing values cannot be updated." 250 | }, 251 | "delta.autoOptimize.autoCompact": { 252 | "type": ["string","boolean"], 253 | "description": "auto for Delta Lake to automatically optimize the layout of the files for this Delta table." 254 | }, 255 | "delta.autoOptimize.optimizeWrite": { 256 | "type": ["string","boolean"], 257 | "description": "true for Delta Lake to automatically optimize the layout of the files for this Delta table during writes." 258 | }, 259 | "delta.checkpoint.writeStatsAsJson": { 260 | "type": ["string","boolean"], 261 | "description": "true for Delta Lake to write file statistics in checkpoints in JSON format for the stats column." 262 | }, 263 | "delta.checkpoint.writeStatsAsStruct": { 264 | "type": ["string","boolean"], 265 | "description": "true for Delta Lake to write file statistics to checkpoints in struct format for the stats_parsed column and to write partition values as a struct for partitionValues_parsed." 266 | }, 267 | "delta.columnMapping.mode": { 268 | "type": "string", 269 | "description": "Whether column mapping is enabled for Delta table columns and the corresponding Parquet columns that use different names." 270 | }, 271 | "delta.compatibility.symlinkFormatManifest.enabled": { 272 | "type": ["string","boolean"], 273 | "description": "true for Delta Lake to configure the Delta table so that all write operations on the table automatically update the manifests." 274 | }, 275 | "delta.dataSkippingNumIndexedCols": { 276 | "type": "integer", 277 | "description": "The number of columns for Delta Lake to collect statistics about for data skipping. A value of -1 means to collect statistics for all columns. Updating this property does not automatically collect statistics again; instead, it redefines the statistics schema of the Delta table. Specifically, it changes the behavior of future statistics collection (such as during appends and optimizations) as well as data skipping (such as ignoring column statistics beyond this number, even when such statistics exist)." 278 | }, 279 | "delta.deletedFileRetentionDuration": { 280 | "type": "integer", 281 | "description": "The shortest duration for Delta Lake to keep logically deleted data files before deleting them physically. 
This is to prevent failures in stale readers after compactions or partition overwrites." 282 |             }, 283 |             "delta.enableChangeDataFeed": { 284 |                 "type": ["string","boolean"], 285 |                 "description": "true to enable change data feed." 286 |             }, 287 |             "delta.isolationLevel": { 288 |                 "type": "string", 289 |                 "description": "The degree to which a transaction must be isolated from modifications made by concurrent transactions." 290 |             }, 291 |             "delta.logRetentionDuration": { 292 |                 "type": "string", 293 |                 "description": "How long the history for a Delta table is kept. VACUUM operations override this retention threshold." 294 |             }, 295 |             "delta.minReaderVersion": { 296 |                 "type": "integer", 297 |                 "description": "The minimum required protocol reader version for a reader that allows reading from this Delta table." 298 |             }, 299 |             "delta.minWriterVersion": { 300 |                 "type": "integer", 301 |                 "description": "The minimum required protocol writer version for a writer that allows writing to this Delta table." 302 |             }, 303 |             "delta.randomizeFilePrefixes": { 304 |                 "type": ["string","boolean"], 305 |                 "description": "true for Delta Lake to generate a random prefix for a file path instead of partition information." 306 |             }, 307 |             "delta.randomPrefixLength": { 308 |                 "type": "integer", 309 |                 "description": "When delta.randomizeFilePrefixes is set to true, the number of characters that Delta Lake generates for random prefixes." 310 |             }, 311 |             "delta.setTransactionRetentionDuration": { 312 |                 "type": "string", 313 |                 "description": "The shortest duration within which new snapshots will retain transaction identifiers (for example, SetTransactions). When a new snapshot sees a transaction identifier older than or equal to the duration specified by this property, the snapshot considers it expired and ignores it. The SetTransaction identifier is used when making the writes idempotent." 314 |             }, 315 |             "delta.targetFileSize": { 316 |                 "type": "string", 317 |                 "description": "The target file size in bytes or higher units for file tuning. For example, 104857600 (bytes) or 100mb." 318 |             }, 319 |             "delta.tuneFileSizesForRewrites": { 320 |                 "type": ["string","boolean"], 321 |                 "description": "true to always use lower file sizes for all data layout optimization operations on the Delta table." 322 |                 } 323 |             } 324 |         } 325 |     } 326 | } --------------------------------------------------------------------------------
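The schema above defines the shape of a yetl tables configuration: a required version plus the optional audit_control, source, landing, raw and base stages, all built from the read and delta_lake definitions under $defs. Below is a minimal validation sketch (not a file in this repository) showing how a tables config could be checked against the packaged copy of the schema with the jsonschema package; the volume, database and table names in the example config are illustrative assumptions, as is the relative path to the schema file.

import json
import jsonschema

# Load the packaged tables schema (path assumed relative to the repository root).
with open("yetl/resource/sibytes_yetl_tables_schema.json") as f:
    schema = json.load(f)

# Hypothetical tables config: one landing volume feeding one raw delta lake table.
tables_config = {
    "version": "3.0.0",
    "landing": {
        # `read` allows exactly one volume; values are file (part)names or null.
        "read": {"landing": {"customer_details": None}},
    },
    "raw": {
        "delta_lake": {
            "raw_db": {  # database
                "catalog": None,
                "customer_details": {"managed": True, "id": "id", "vacuum": 720},
            }
        }
    },
}

# Raises jsonschema.exceptions.ValidationError if the config does not conform.
jsonschema.validate(instance=tables_config, schema=schema)
print("tables config conforms to the schema")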