├── tests
    ├── __init__.py
    ├── data
    │   ├── output
    │   │   ├── json_to_cols.csv
    │   │   ├── campaign_performance_csv.parquet
    │   │   ├── campaign_performance_parquet.parquet
    │   │   ├── json_to_cols_unique.csv
    │   │   ├── json_to_rows.csv
    │   │   ├── explode_multi.csv
    │   │   ├── campaign_performance_parquet.csv
    │   │   ├── campaign_performance_csv.csv
    │   │   ├── data.singer
    │   │   ├── chunk_csv_campaign_performance.singer
    │   │   └── chunk_parquet_campaign_performance.singer
    │   └── input
    │   │   ├── campaign_performance-20250427T202442.parquet
    │   │   ├── json_to_cols.csv
    │   │   ├── json_to_cols_unique.csv
    │   │   ├── multi_json.csv
    │   │   ├── json_to_rows.csv
    │   │   └── campaign_csv-20250427T202522.csv
    └── etl_test.py
├── gluestick
    ├── utils
    │   ├── __init__.py
    │   └── polars_utils.py
    ├── readers
    │   ├── __init__.py
    │   ├── pl_reader.py
    │   └── pl_lazyframe_reader.py
    ├── __init__.py
    ├── reader.py
    ├── pandas_utils.py
    ├── singer.py
    └── etl_utils.py
├── requirements.txt
├── .travis.yml
├── mypy.ini
├── pyproject.toml
├── setup.py
├── LICENSE
├── README.md
├── tox.ini
├── .github
    └── workflows
    │   └── ci_workflow.yml
└── .gitignore


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gluestick/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gluestick/readers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | xlrd==1.2.0
2 | numpy==1.19.2
3 | pandas==1.1.3
4 | singer-python>=4.0.0
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.7"
4 | install:
5 |   - pip install -r requirements.txt
6 | script:
7 |   - pytest
8 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.9
3 | warn_unused_configs = True
4 | 
5 | [mypy-backoff.*]
6 | ignore_missing_imports = True
7 | 


--------------------------------------------------------------------------------
/tests/data/output/json_to_cols.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Metadata.FirstName,Metadata.LastName
2 | Company 1,John,Smith
3 | Company 2,Jane,Smith
4 | 


--------------------------------------------------------------------------------
/tests/data/output/campaign_performance_csv.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/output/campaign_performance_csv.parquet


--------------------------------------------------------------------------------
/tests/data/output/campaign_performance_parquet.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/output/campaign_performance_parquet.parquet


--------------------------------------------------------------------------------
/tests/data/input/campaign_performance-20250427T202442.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/input/campaign_performance-20250427T202442.parquet


--------------------------------------------------------------------------------
/tests/data/input/json_to_cols.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Metadata
2 | Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith""}"
3 | Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith""}"
4 | 


--------------------------------------------------------------------------------
/tests/data/output/json_to_cols_unique.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Metadata.FirstName,Metadata.LastName,Metadata.Type,Metadata.SubType
2 | Company 1,John,Smith,Person,Other
3 | Company 2,Jane,Smith,Person,Parent
4 | 


--------------------------------------------------------------------------------
/tests/data/input/json_to_cols_unique.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Metadata
2 | Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith"", ""Type"": ""Person"", ""SubType"": ""Other""}"
3 | Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith"", ""Type"": ""Person"", ""SubType"": ""Parent""}"
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/data/output/json_to_rows.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Line Detail.Id,Line Detail.Desc,Line Detail.Amount
2 | Company 1,1,Bolts,101.15
3 | Company 1,2,Smith,90.8
4 | Company 2,1,Braces,51.15
5 | Company 2,2,Wood,190.1
6 | Company 3,1,Braces,51.15
7 | Company 4,NaN,NaN,NaN
8 | Company 5,1,Braces,51.15


--------------------------------------------------------------------------------
/gluestick/__init__.py:
--------------------------------------------------------------------------------
1 | """Import functions and classes to gluestick."""
2 | 
3 | from .etl_utils import *  # noqa
4 | from .pandas_utils import *  # noqa
5 | from .singer import *  # noqa
6 | from .reader import *  # noqa
7 | from .readers.pl_lazyframe_reader import *  # noqa
8 | from .readers.pl_reader import *  # noqa
9 | 


--------------------------------------------------------------------------------
/tests/data/output/explode_multi.csv:
--------------------------------------------------------------------------------
1 | CompanyId,Customer Name,Total,Metadata.FirstName,Metadata.LastName,LineDetail.Id,LineDetail.Desc,LineDetail.Amount
2 | 100,Company 1,191.95,John,Smith,1,Bolts,101.15
3 | 100,Company 1,191.95,John,Smith,2,Smith,90.8
4 | 200,Company 2,241.25,Jane,Smith,1,Braces,51.15
5 | 200,Company 2,241.25,Jane,Smith,2,Wood,190.1


--------------------------------------------------------------------------------
/tests/data/input/multi_json.csv:
--------------------------------------------------------------------------------
1 | CompanyId,Customer Name,Metadata,LineDetail,Total
2 | 100,Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith""}","[ {""Id"": ""1"", ""Desc"": ""Bolts"", ""Amount"": 101.15}, {""Id"": ""2"", ""Desc"": ""Smith"", ""Amount"": 90.80} ]",191.95
3 | 200,Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith""}","[ {""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}, {""Id"": ""2"", ""Desc"": ""Wood"", ""Amount"": 190.10} ]",241.25
4 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.tox]
 2 | pytest = "^6.2.5"
 3 | tox = "^3.24.4"
 4 | flake8 = "^3.9.2"
 5 | black = "^21.9b0"
 6 | pydocstyle = "^6.1.1"
 7 | mypy = "^0.910"
 8 | types-requests = "^2.26.1"
 9 | isort = "^5.10.1"
10 | 
11 | [tool.isort]
12 | profile = "black"
13 | multi_line_output = 3 # Vertical Hanging Indent
14 | src_paths = "tap_skuvault"
15 | 
16 | [build-system]
17 | requires = [ "setuptools >= 35.0.2", "wheel >= 0.29.0"]
18 | build-backend = "setuptools.build_meta"
19 | 


--------------------------------------------------------------------------------
/tests/data/input/json_to_rows.csv:
--------------------------------------------------------------------------------
1 | Customer Name,Line Detail
2 | Company 1,"[ {""Id"": ""1"", ""Desc"": ""Bolts"", ""Amount"": 101.15}, {""Id"": ""2"", ""Desc"": ""Smith"", ""Amount"": 90.80} ]"
3 | Company 2,"[ {""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}, {""Id"": ""2"", ""Desc"": ""Wood"", ""Amount"": 190.10} ]"
4 | Company 3,"[{""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}]"
5 | Company 4,
6 | Company 5,"{""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}"
7 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setup(
 7 |     name="gluestick",
 8 |     version="3.0.3",
 9 |     description="ETL utility functions built for the hotglue iPaaS platform",
10 |     long_description=long_description,
11 |     long_description_content_type="text/markdown",
12 |     url="https://github.com/hotgluexyz/gluestick",
13 |     install_requires=[
14 |         "singer-python>=4.0.0",
15 |         "numpy>=1.4",
16 |         "pandas>=1.2.5",
17 |         "pyarrow>=8.0.0",
18 |         "pytz>=2022.6",
19 |         "polars==1.34.0"
20 |     ],
21 |     author="hotglue",
22 |     author_email="hello@hotglue.xyz",
23 |     license="MIT",
24 |     packages=find_packages(include=["gluestick", "gluestick.*"]),
25 |     zip_safe=False,
26 | )
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 hotglue
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | gluestick [![Build Status](https://travis-ci.org/hotgluexyz/gluestick.svg?branch=master)](https://travis-ci.org/hotgluexyz/gluestick)
 2 | =============
 3 | 
 4 | A small Python module containing quick utility functions for standard ETL processes.
 5 | 
 6 | ## Installation ##
 7 | 
 8 | ```
 9 | pip install gluestick
10 | ```
11 | 
12 | ## Links ##
13 | 
14 | * [Source]
15 | * [Wiki]
16 | * [Issues]
17 | * [Slack]
18 | 
19 | ## License ##
20 | [MIT]
21 | 
22 | ## Dependencies ##
23 | * NumPy
24 | * Pandas
25 | 
26 | ## Contributing ##
27 | This project is maintained by the [hotglue] team. We welcome contributions from the 
28 | community via issues and pull requests.
29 | 
30 | If you wish to chat with our team, feel free to join our [Slack]!
31 | 
32 | 
33 | [Source]: https://github.com/hotgluexyz/gluestick
34 | [Wiki]: https://github.com/hotgluexyz/gluestick/wiki
35 | [Issues]: https://github.com/hotgluexyz/gluestick/issues
36 | [MIT]: https://tldrlegal.com/license/mit-license
37 | [hotglue]: https://hotglue.xyz
38 | [Slack]: https://bit.ly/2KBGGq1
39 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py38
 3 | isolated_build = true
 4 | 
 5 | [testenv]
 6 | deps = 
 7 |     pytest>=6.2.5
 8 |     tox>=3.24.4
 9 |     flake8>=3.9.2
10 |     black>=21.9b0
11 |     pydocstyle>=6.1.1
12 |     mypy>=0.910
13 |     types-requests>=2.26.1
14 |     isort>=5.10.1
15 | commands =
16 |     pytest
17 |     black --check gluestick/
18 |     flake8 gluestick
19 |     pydocstyle gluestick
20 |     mypy gluestick --exclude='tests'
21 | 
22 | [testenv:pytest]
23 | envlist = py37, py38, py39
24 | deps = pytest>=6.2.5
25 | commands = pytest
26 | 
27 | [testenv:format]
28 | deps = 
29 |     black>=21.9b0
30 |     isort>=5.10.1
31 | commands =
32 |     black gluestick/
33 |     isort gluestick
34 | 
35 | [testenv:lint]
36 | deps = 
37 |     flake8>=3.9.2
38 |     black>=21.9b0
39 |     pydocstyle>=6.1.1
40 |     mypy>=0.910
41 |     isort>=5.10.1
42 | commands =
43 |     black --check --diff gluestick/
44 |     isort --check gluestick
45 |     flake8 gluestick
46 |     pydocstyle gluestick
47 |     mypy gluestick --exclude='tests' --ignore-missing-imports
48 | 
49 | [flake8]
50 | ignore = W503,C901,E501,E722,E721
51 | max-complexity = 10
52 | 
53 | [pydocstyle]
54 | ignore = D105,D203,D213,D210,D413,D411,D401,D100
55 | 


--------------------------------------------------------------------------------
/tests/data/output/campaign_performance_parquet.csv:
--------------------------------------------------------------------------------
 1 | customer_id,campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date
 2 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.374269,1409648.1,171,DESKTOP,2025-03-25
 3 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870433,1503589.8,301,MOBILE,2025-03-25
 4 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.33333334,10000.0,3,TABLET,2025-03-25
 5 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.3478261,1427715.4,161,DESKTOP,2025-03-26
 6 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.33670035,1931005.0,297,MOBILE,2025-03-26
 7 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.34117648,1322940.5,170,DESKTOP,2025-03-27
 8 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2,340,MOBILE,2025-03-27
 9 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000.0,2,TABLET,2025-03-27
10 | 


--------------------------------------------------------------------------------
/.github/workflows/ci_workflow.yml:
--------------------------------------------------------------------------------
 1 | ### A CI workflow template that runs linting and python testing
 2 | 
 3 | name: Test gluestick
 4 | 
 5 | on: [push]
 6 | 
 7 | jobs:
 8 |   # linting:
 9 | 
10 |   #   runs-on: ubuntu-latest
11 |   #   strategy:
12 |   #     matrix:
13 |   #       # Only lint using the primary version used for dev
14 |   #       python-version: ["3.10"]
15 | 
16 |   #   steps:
17 |   #   - uses: actions/checkout@v2
18 |   #   - name: Set up Python ${{ matrix.python-version }}
19 |   #     uses: actions/setup-python@v2
20 |   #     with:
21 |   #       python-version: ${{ matrix.python-version }}
22 |   #   - name: Install dependencies
23 |   #     run: |
24 |   #       pip install .
25 |   #       pip install tox
26 |   #   - name: Run lint command from tox.ini
27 |   #     run: |
28 |   #       tox -e lint
29 | 
30 |   pytest:
31 | 
32 |     runs-on: ubuntu-latest
33 |     env:
34 |       GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
35 |     strategy:
36 |       matrix:
37 |         python-version: ["3.10"]
38 | 
39 |     steps:
40 |     - uses: actions/checkout@v2
41 |     - name: Set up Python ${{ matrix.python-version }}
42 |       uses: actions/setup-python@v2
43 |       with:
44 |         python-version: ${{ matrix.python-version }}
45 |     - name: Install dependencies
46 |       run: |
47 |         pip install .
48 |         pip install pytest
49 |     - name: Test with pytest
50 |       run: |
51 |         pytest --capture=no
52 | 


--------------------------------------------------------------------------------
/gluestick/utils/polars_utils.py:
--------------------------------------------------------------------------------
 1 | import polars as pl
 2 | 
 3 | def map_pd_type_to_polars(type_name):
 4 |     if not isinstance(type_name, str):
 5 |         # its a pd type class
 6 |         type_name = type_name.__name__
 7 | 
 8 |     if type_name == "Int64":
 9 |         return pl.Int64
10 |     elif type_name == "Float64":
11 |         return pl.Float64
12 |     elif type_name in ["Boolean", "bool", "boolean"]:
13 |         return pl.Boolean
14 |     elif type_name == "String":
15 |         return pl.String
16 |     elif type_name == "Datetime":
17 |         return pl.Datetime(time_unit="ns", time_zone="UTC")
18 |     elif type_name == "Date":
19 |         return pl.Date
20 |     elif type_name == "Time":
21 |         return pl.Time
22 |     elif type_name == "object":
23 |         return pl.String
24 |     elif type_name == "float":
25 |         return pl.Float64
26 |     elif type_name == "int":
27 |         return pl.Int64
28 |     else:
29 |         raise ValueError(f"Unknown type: {type_name}")
30 | 
31 | def cast_lf_from_schema(lf: pl.LazyFrame, types_params: dict):
32 |     return lf.with_columns([
33 |                     pl.col(col).cast(dtype, strict=True) for col, dtype in types_params.items()
34 |             ])
35 | 
36 | def cast_df_from_schema(df: pl.DataFrame, types_params: dict):
37 |     return df.with_columns([
38 |                     pl.col(col).cast(dtype, strict=True) for col, dtype in types_params.items()
39 |             ])
40 | 


--------------------------------------------------------------------------------
/tests/data/input/campaign_csv-20250427T202522.csv:
--------------------------------------------------------------------------------
 1 | campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date,customer_id
 2 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.3742690058479532,1409648.078125,171,DESKTOP,2025-03-25,7950307320
 3 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870431893687707,1503589.7435897435,301,MOBILE,2025-03-25,7950307320
 4 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.3333333333333333,10000,3,TABLET,2025-03-25,7950307320
 5 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.34782608695652173,1427715.357142857,161,DESKTOP,2025-03-26,7950307320
 6 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.3367003367003367,1931004.94,297,MOBILE,2025-03-26,7950307320
 7 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.3411764705882353,1322940.5344827587,170,DESKTOP,2025-03-27,7950307320
 8 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2647058824,340,MOBILE,2025-03-27,7950307320
 9 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000,2,TABLET,2025-03-27,7950307320
10 | 


--------------------------------------------------------------------------------
/tests/data/output/campaign_performance_csv.csv:
--------------------------------------------------------------------------------
 1 | campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date,customer_id
 2 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.3742690058479532,1409648.078125,171,DESKTOP,2025-03-25,7950307320
 3 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870431893687707,1503589.7435897435,301,MOBILE,2025-03-25,7950307320
 4 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.3333333333333333,10000,3,TABLET,2025-03-25,7950307320
 5 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.34782608695652173,1427715.357142857,161,DESKTOP,2025-03-26,7950307320
 6 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.3367003367003367,1931004.94,297,MOBILE,2025-03-26,7950307320
 7 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.3411764705882353,1322940.5344827587,170,DESKTOP,2025-03-27,7950307320
 8 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2647058824,340,MOBILE,2025-03-27,7950307320
 9 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000,2,TABLET,2025-03-27,7950307320
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 86 | __pypackages__/
 87 | 
 88 | # Celery stuff
 89 | celerybeat-schedule
 90 | celerybeat.pid
 91 | 
 92 | # SageMath parsed files
 93 | *.sage.py
 94 | 
 95 | # Environments
 96 | .env
 97 | .venv
 98 | env/
 99 | venv/
100 | ENV/
101 | env.bak/
102 | venv.bak/
103 | 
104 | # Spyder project settings
105 | .spyderproject
106 | .spyproject
107 | 
108 | # Rope project settings
109 | .ropeproject
110 | 
111 | # mkdocs documentation
112 | /site
113 | 
114 | # mypy
115 | .mypy_cache/
116 | .dmypy.json
117 | dmypy.json
118 | 
119 | # Pyre type checker
120 | .pyre/
121 | 
122 | # pytype static type analyzer
123 | .pytype/
124 | 
125 | # Cython debug symbols
126 | cython_debug/
127 | 
128 | # Misc
129 | .vscode
130 | .DS_Store
131 | 


--------------------------------------------------------------------------------
/tests/data/output/data.singer:
--------------------------------------------------------------------------------
 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]}
 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "64", "metrics__costMicros": "90217477", "metrics__ctr": 0.3742690086364746, "metrics__averageCpc": 1409648.125, "metrics__impressions": "171", "segments__device": "DESKTOP", "segments__date": "2025-03-25"}}
 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "117", "metrics__costMicros": "175920000", "metrics__ctr": 0.3887043297290802, "metrics__averageCpc": 1503589.75, "metrics__impressions": "301", "segments__device": "MOBILE", "segments__date": "2025-03-25"}}
 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "1", "metrics__costMicros": "10000", "metrics__ctr": 0.3333333432674408, "metrics__averageCpc": 10000.0, "metrics__impressions": "3", "segments__device": "TABLET", "segments__date": "2025-03-25"}}
 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "56", "metrics__costMicros": "79952060", "metrics__ctr": 0.3478260934352875, "metrics__averageCpc": 1427715.375, "metrics__impressions": "161", "segments__device": "DESKTOP", "segments__date": "2025-03-26"}}
 6 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "100", "metrics__costMicros": "193100494", "metrics__ctr": 0.33670035004615784, "metrics__averageCpc": 1931005.0, "metrics__impressions": "297", "segments__device": "MOBILE", "segments__date": "2025-03-26"}}
 7 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "58", "metrics__costMicros": "76730551", "metrics__ctr": 0.34117648005485535, "metrics__averageCpc": 1322940.5, "metrics__impressions": "170", "segments__device": "DESKTOP", "segments__date": "2025-03-27"}}
 8 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "102", "metrics__costMicros": "185445105", "metrics__ctr": 0.30000001192092896, "metrics__averageCpc": 1818089.25, "metrics__impressions": "340", "segments__device": "MOBILE", "segments__date": "2025-03-27"}}
 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "1", "metrics__costMicros": "830000", "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": "2", "segments__device": "TABLET", "segments__date": "2025-03-27"}}
10 | {"type": "STATE", "value": {}}


--------------------------------------------------------------------------------
/tests/data/output/chunk_csv_campaign_performance.singer:
--------------------------------------------------------------------------------
 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["integer", "null"]}, "metrics__costMicros": {"type": ["integer", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["integer", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}, "customer_id": {"type": ["integer", "null"]}}}, "key_properties": ["id"]}
 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 64, "metrics__costMicros": 90217477, "metrics__ctr": 0.3742690058479532, "metrics__averageCpc": 1409648.078125, "metrics__impressions": 171, "segments__device": "DESKTOP", "segments__date": "2025-03-25", "customer_id": 7950307320}}
 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 117, "metrics__costMicros": 175920000, "metrics__ctr": 0.388704318936877, "metrics__averageCpc": 1503589.7435897435, "metrics__impressions": 301, "segments__device": "MOBILE", "segments__date": "2025-03-25", "customer_id": 7950307320}}
 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 1, "metrics__costMicros": 10000, "metrics__ctr": 0.3333333333333333, "metrics__averageCpc": 10000.0, "metrics__impressions": 3, "segments__device": "TABLET", "segments__date": "2025-03-25", "customer_id": 7950307320}}
 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 56, "metrics__costMicros": 79952060, "metrics__ctr": 0.3478260869565217, "metrics__averageCpc": 1427715.357142857, "metrics__impressions": 161, "segments__device": "DESKTOP", "segments__date": "2025-03-26", "customer_id": 7950307320}}
 6 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 100, "metrics__costMicros": 193100494, "metrics__ctr": 0.3367003367003367, "metrics__averageCpc": 1931004.94, "metrics__impressions": 297, "segments__device": "MOBILE", "segments__date": "2025-03-26", "customer_id": 7950307320}}
 7 | {"type": "STATE", "value": {}}
 8 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["integer", "null"]}, "metrics__costMicros": {"type": ["integer", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["integer", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}, "customer_id": {"type": ["integer", "null"]}}}, "key_properties": ["id"]}
 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 58, "metrics__costMicros": 76730551, "metrics__ctr": 0.3411764705882353, "metrics__averageCpc": 1322940.5344827587, "metrics__impressions": 170, "segments__device": "DESKTOP", "segments__date": "2025-03-27", "customer_id": 7950307320}}
10 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 102, "metrics__costMicros": 185445105, "metrics__ctr": 0.3, "metrics__averageCpc": 1818089.2647058824, "metrics__impressions": 340, "segments__device": "MOBILE", "segments__date": "2025-03-27", "customer_id": 7950307320}}
11 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 1, "metrics__costMicros": 830000, "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": 2, "segments__device": "TABLET", "segments__date": "2025-03-27", "customer_id": 7950307320}}
12 | {"type": "STATE", "value": {}}
13 | 


--------------------------------------------------------------------------------
/tests/data/output/chunk_parquet_campaign_performance.singer:
--------------------------------------------------------------------------------
 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]}
 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "64", "metrics__costMicros": "90217477", "metrics__ctr": 0.3742690086364746, "metrics__averageCpc": 1409648.125, "metrics__impressions": "171", "segments__device": "DESKTOP", "segments__date": "2025-03-25"}}
 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "117", "metrics__costMicros": "175920000", "metrics__ctr": 0.3887043297290802, "metrics__averageCpc": 1503589.75, "metrics__impressions": "301", "segments__device": "MOBILE", "segments__date": "2025-03-25"}}
 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "1", "metrics__costMicros": "10000", "metrics__ctr": 0.3333333432674408, "metrics__averageCpc": 10000.0, "metrics__impressions": "3", "segments__device": "TABLET", "segments__date": "2025-03-25"}}
 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "56", "metrics__costMicros": "79952060", "metrics__ctr": 0.3478260934352875, "metrics__averageCpc": 1427715.375, "metrics__impressions": "161", "segments__device": "DESKTOP", "segments__date": "2025-03-26"}}
 6 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "100", "metrics__costMicros": "193100494", "metrics__ctr": 0.33670035004615784, "metrics__averageCpc": 1931005.0, "metrics__impressions": "297", "segments__device": "MOBILE", "segments__date": "2025-03-26"}}
 7 | {"type": "STATE", "value": {}}
 8 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]}
 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "58", "metrics__costMicros": "76730551", "metrics__ctr": 0.34117648005485535, "metrics__averageCpc": 1322940.5, "metrics__impressions": "170", "segments__device": "DESKTOP", "segments__date": "2025-03-27"}}
10 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "102", "metrics__costMicros": "185445105", "metrics__ctr": 0.30000001192092896, "metrics__averageCpc": 1818089.25, "metrics__impressions": "340", "segments__device": "MOBILE", "segments__date": "2025-03-27"}}
11 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "1", "metrics__costMicros": "830000", "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": "2", "segments__device": "TABLET", "segments__date": "2025-03-27"}}
12 | {"type": "STATE", "value": {}}
13 | 


--------------------------------------------------------------------------------
/gluestick/readers/pl_reader.py:
--------------------------------------------------------------------------------
  1 | from gluestick.reader import Reader
  2 | from gluestick.utils.polars_utils import map_pd_type_to_polars, cast_df_from_schema
  3 | import pyarrow.parquet as pq
  4 | import polars as pl
  5 | import pandas as pd
  6 | import os
  7 | 
  8 | 
  9 | class PolarsReader(Reader):
 10 | 
 11 |     def get(self, stream, default=None, catalog_types=True) -> pl.DataFrame | None:
 12 |         """
 13 |         Reads the given stream from sync output and returns a pl.DataFrame.
 14 | 
 15 |         Parameters
 16 |         ----------
 17 |         stream: str
 18 |             The name of the stream to read.
 19 |         default: pl.DataFrame | None
 20 |             The default value to return if the stream is not found.
 21 |         catalog_types: bool
 22 |             Whether to coerce the dataframe to the types given by the local catalog.
 23 |         """
 24 | 
 25 |         filepath = self.input_files.get(stream)
 26 |         if not filepath:
 27 |             return default
 28 | 
 29 |         if filepath.endswith(".parquet"):
 30 |             return self.get_parquet(stream, filepath, catalog_types)
 31 |         elif filepath.endswith(".csv"):
 32 |             return self.get_csv(stream, filepath, catalog_types)
 33 |         raise ValueError(f"Unsupported file type: {filepath}")
 34 | 
 35 |     def get_csv(self, stream, filepath, catalog_types=True):
 36 |         if catalog_types:
 37 |             catalog = self.read_catalog()
 38 |             if catalog:
 39 |                 headers = pd.read_csv(filepath, nrows=0).columns.tolist()
 40 |                 types_params = self.get_types_from_catalog(catalog, stream, headers=headers)
 41 |                 if types_params:
 42 |                     return pl.read_csv(filepath, dtypes=types_params)
 43 | 
 44 |         return pl.read_csv(filepath)
 45 | 
 46 |     def get_parquet(self, stream, filepath, catalog_types=True):
 47 |         df = pl.read_parquet(filepath)
 48 |         if catalog_types:
 49 |             catalog = self.read_catalog()
 50 |             if catalog:
 51 |                 headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist()
 52 |                 types_params = self.get_types_from_catalog(catalog, stream, headers=headers)
 53 |                 if types_params:
 54 |                     return cast_df_from_schema(df, types_params)
 55 |         return df
 56 | 
 57 |     def get_types_from_catalog(self, catalog, stream, headers=None):
 58 |         """Get the polars types base on the catalog definition."""
 59 |         type_information = super().get_types_from_catalog(catalog, stream, headers)
 60 |         pd_types = type_information.get("dtype", {})
 61 |         date_fields = type_information.get("parse_dates", [])
 62 |         pd_types = {
 63 |             k: "Datetime"
 64 |             if k in date_fields
 65 |             else v
 66 |             for k, v in pd_types.items()
 67 |         }
 68 |         return {col: map_pd_type_to_polars(pd_type) for col, pd_type in pd_types.items()}
 69 | 
 70 |     def read_snapshots(self,stream, snapshot_dir, **kwargs) -> pl.DataFrame | None:
 71 |         """Read a snapshot file and return a polars dataframe.
 72 | 
 73 |         Parameters
 74 |         ----------
 75 |         stream: str
 76 |             The name of the stream to read the snapshot from.
 77 |         snapshot_dir: str
 78 |             The path to the snapshot directory.
 79 |         """
 80 |         if os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.parquet"):
 81 |             return pl.read_parquet(source=f"{snapshot_dir}/{stream}.snapshot.parquet", **kwargs)
 82 |         elif os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.csv"):
 83 |             return pl.read_csv(source=f"{snapshot_dir}/{stream}.snapshot.csv", **kwargs)
 84 |         else:
 85 |             return None
 86 | 
 87 |     def snapshot_records(
 88 |         self,
 89 |         stream_data,
 90 |         stream,
 91 |         snapshot_dir,
 92 |         pk="id", 
 93 |         just_new=False, 
 94 |         use_csv=False, 
 95 |         overwrite=False, 
 96 |     ) -> pl.DataFrame | None:
 97 |         """Update a snapshot file and return the merged data.
 98 | 
 99 |         Parameters
100 |         ----------
101 |         stream_data: pl.DataFrame
102 |             The data to be included in the snapshot.
103 |         stream: str
104 |             The name of the stream of the snapshots.
105 |         snapshot_dir: str
106 |             The name of the stream of the snapshots.
107 |         pk: str or list of str
108 |             The primary key used for the snapshot.
109 |         just_new: bool
110 |             Return just the input data if True, else returns the whole data
111 |         use_csv: bool
112 |             Whether to use csv format for the snapshot instead of parquet.
113 |         overwrite: bool
114 |             Whether to overwrite the existing snapshot file instead of updating and merging.
115 | 
116 |         Returns
117 |         -------
118 |         return: pl.DataFrame
119 |             A polars dataframe with the merged data.
120 | 
121 |         """
122 | 
123 |         if isinstance(pk, str):
124 |             pk = [pk]
125 | 
126 |         snapshot_df = self.read_snapshots(stream, snapshot_dir)
127 |         if not overwrite and stream_data is not None and snapshot_df is not None:
128 |             
129 |             for key in pk:
130 |                 new_data_pk_df = stream_data.select(key)
131 |                 snapshot_df = snapshot_df.filter(
132 |                     ~pl.col(key).is_in(new_data_pk_df.get_column(key))
133 |                 )
134 | 
135 | 
136 |             merged_df = pl.concat(items=[snapshot_df, stream_data], how="diagonal_relaxed")
137 | 
138 |             if use_csv:
139 |                 merged_df.write_csv(f"{snapshot_dir}/{stream}.snapshot.csv")
140 |             else:
141 |                 merged_df.write_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet")
142 |             
143 | 
144 |             if just_new:
145 |                 return stream_data
146 |             else:
147 |                 return merged_df
148 |         elif stream_data is not None:
149 |             if use_csv:
150 |                 stream_data.write_csv(f"{snapshot_dir}/{stream}.snapshot.csv")
151 |             else:
152 |                 stream_data.write_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet")
153 | 
154 |             return stream_data
155 |         elif snapshot_df is not None:
156 |             return snapshot_df
157 |         else:
158 |             return None
159 | 
160 | 
161 | 
162 | 


--------------------------------------------------------------------------------
/gluestick/readers/pl_lazyframe_reader.py:
--------------------------------------------------------------------------------
  1 | from gluestick.reader import Reader
  2 | from gluestick.utils.polars_utils import map_pd_type_to_polars, cast_lf_from_schema
  3 | import pyarrow.parquet as pq
  4 | import polars as pl
  5 | import pandas as pd
  6 | import os
  7 | class PLLazyFrameReader(Reader):
  8 | 
  9 |     def get(self, stream, default=None, catalog_types=True) -> pl.LazyFrame | None:
 10 |         """
 11 |         Reads the given stream from sync output and returns a pl.LazyFrame.
 12 | 
 13 |         Parameters
 14 |         ----------
 15 |         stream: str
 16 |             The name of the stream to read.
 17 |         default: pl.LazyFrame | None
 18 |             The default value to return if the stream is not found.
 19 |         catalog_types: bool
 20 |             Whether to coerce the lazyframe to the types given by the local catalog.
 21 |         """
 22 | 
 23 |         filepath = self.input_files.get(stream)
 24 |         if not filepath:
 25 |             return default
 26 | 
 27 |         if filepath.endswith(".parquet"):
 28 |             return self.get_parquet(stream, filepath, catalog_types)
 29 |         elif filepath.endswith(".csv"):
 30 |             return self.get_csv(stream, filepath, catalog_types)
 31 |         raise ValueError(f"Unsupported file type: {filepath}")
 32 | 
 33 | 
 34 |     def get_csv(self, stream, filepath, catalog_types=True):
 35 |         if catalog_types:
 36 |             catalog = self.read_catalog()
 37 |             if catalog:
 38 |                 headers = pd.read_csv(filepath, nrows=0).columns.tolist()
 39 |                 types_params = self.get_types_from_catalog(catalog, stream, headers=headers)
 40 |                 if types_params:
 41 |                     return pl.scan_csv(filepath, schema=types_params)
 42 | 
 43 |         return pl.scan_csv(filepath)
 44 | 
 45 |     def get_parquet(self, stream, filepath, catalog_types=True):
 46 |         if catalog_types:
 47 |             catalog = self.read_catalog()
 48 |             if catalog:
 49 |                 headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist()
 50 |                 types_params = self.get_types_from_catalog(catalog, stream, headers=headers)
 51 |                 lf = pl.scan_parquet(filepath)
 52 |                 return cast_lf_from_schema(lf, types_params)
 53 | 
 54 |         return pl.scan_parquet(filepath)
 55 |             
 56 | 
 57 |     def get_types_from_catalog(self, catalog, stream, headers=None):
 58 |         """Get the polars types base on the catalog definition."""
 59 |         type_information = super().get_types_from_catalog(catalog, stream, headers)
 60 |         pd_types = type_information.get("dtype", {})
 61 |         date_fields = type_information.get("parse_dates", [])
 62 |         pd_types = {
 63 |             k: "Datetime" 
 64 |             if k in date_fields 
 65 |             else v
 66 |             for k,v in pd_types.items()
 67 |             }
 68 |         return {col: map_pd_type_to_polars(pd_type) for col, pd_type in pd_types.items()}
 69 | 
 70 |     def read_snapshots(self,stream, snapshot_dir, **kwargs) -> pl.LazyFrame | None:
 71 |         """Read a snapshot file.
 72 | 
 73 |         Parameters
 74 |         ----------
 75 |         stream: str
 76 |             The name of the stream to read the snapshot from.
 77 |         snapshot_dir: str
 78 |             The path to the snapshot directory.
 79 |         """
 80 |         if os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.parquet"):
 81 |             return pl.scan_parquet(source=f"{snapshot_dir}/{stream}.snapshot.parquet")
 82 |         elif os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.csv"):
 83 |             return pl.scan_csv(source=f"{snapshot_dir}/{stream}.snapshot.csv")
 84 |         else:
 85 |             return None
 86 | 
 87 |     def snapshot_records(
 88 |         self,
 89 |         stream_data,
 90 |         stream,
 91 |         snapshot_dir,
 92 |         pk="id", 
 93 |         just_new=False, 
 94 |         use_csv=False, 
 95 |         overwrite=False, 
 96 |     ) -> pl.LazyFrame | None:
 97 |         """Update a snapshot file and return the merged data.
 98 | 
 99 |         Parameters
100 |         ----------
101 |         stream_data: pl.LazyFrame
102 |             The data to be included in the snapshot.
103 |         stream: str
104 |             The name of the stream of the snapshots.
105 |         snapshot_dir: str
106 |             The name of the stream of the snapshots.
107 |         pk: str or list of str
108 |             The primary key used for the snapshot.
109 |         just_new: bool
110 |             Return just the input data if True, else returns the whole data
111 |         use_csv: bool
112 |             Whether to use csv format for the snapshot instead of parquet.
113 |         overwrite: bool
114 |             Whether to overwrite the existing snapshot file instead of updating and merging.
115 | 
116 |         Returns
117 |         -------
118 |         return: pl.LazyFrame
119 |             A polars lazyframe with the merged data.
120 | 
121 |         """
122 | 
123 |         if isinstance(pk, str):
124 |             pk = [pk]
125 | 
126 |         snapshot_lf = self.read_snapshots(stream, snapshot_dir)
127 |         if not overwrite and stream_data is not None and snapshot_lf is not None:
128 |             
129 |             for key in pk:
130 |                 new_data_pk_lf = stream_data.select(key).collect()
131 |                 snapshot_lf = snapshot_lf.filter(
132 |                 ~pl.col(key).is_in(new_data_pk_lf.get_column(key))
133 |                 )
134 | 
135 | 
136 |             merged_lf = pl.concat(items=[snapshot_lf, stream_data],how="vertical_relaxed")
137 | 
138 |             if use_csv:
139 |                 merged_lf.sink_csv(f"{snapshot_dir}/{stream}.temp.snapshot.csv")
140 |                 os.remove(f"{snapshot_dir}/{stream}.snapshot.csv")
141 |                 os.rename(f"{snapshot_dir}/{stream}.temp.snapshot.csv", f"{snapshot_dir}/{stream}.snapshot.csv")
142 |             else:
143 |                 merged_lf.sink_parquet(f"{snapshot_dir}/{stream}.temp.snapshot.parquet")
144 |                 os.remove(f"{snapshot_dir}/{stream}.snapshot.parquet")
145 |                 os.rename(f"{snapshot_dir}/{stream}.temp.snapshot.parquet", f"{snapshot_dir}/{stream}.snapshot.parquet")
146 |             
147 | 
148 |             if just_new:
149 |                 return stream_data
150 |             else:
151 |                 return merged_lf
152 |         elif stream_data is not None:
153 |             if use_csv:
154 |                 stream_data.sink_csv(f"{snapshot_dir}/{stream}.snapshot.csv")
155 |             else:
156 |                 stream_data.sink_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet")
157 | 
158 |             return stream_data
159 |         elif snapshot_lf is not None:
160 |             return snapshot_lf
161 |         else:
162 |             return None
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 


--------------------------------------------------------------------------------
/tests/etl_test.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import gluestick as gs
  4 | import pandas as pd
  5 | import pytest
  6 | 
  7 | # Tests gluestick ETL utilities
  8 | class TestETL(object):
  9 |     @classmethod
 10 |     def setup_class(cls):
 11 |         print("=====")
 12 |         print("setup")
 13 | 
 14 |     # TODO: Test join
 15 | 
 16 |     # Run explode_json_to_cols
 17 |     def test_explode_json_to_cols(self):
 18 |         print("=====")
 19 |         print("test_explode_json_to_cols")
 20 | 
 21 |         # Read data
 22 |         dirname = os.path.dirname(__file__)
 23 |         df = pd.read_csv(
 24 |             os.path.join(dirname, "data/input/json_to_cols.csv"), index_col=0
 25 |         )
 26 |         expected_df = pd.read_csv(
 27 |             os.path.join(dirname, "data/output/json_to_cols.csv"), index_col=0
 28 |         )
 29 | 
 30 |         # Explode
 31 |         r = gs.array_to_dict_reducer("Name", "StringValue")
 32 |         df2 = gs.explode_json_to_cols(df, "Metadata", reducer=r)
 33 |         print(df2)
 34 | 
 35 |         assert df2.equals(expected_df)
 36 |         print("test_explode_json_to_cols output is correct")
 37 | 
 38 |     def test_explode_json_to_cols_unique(self):
 39 |         print("=====")
 40 |         print("test_explode_json_to_cols_unique")
 41 | 
 42 |         # Read data
 43 |         dirname = os.path.dirname(__file__)
 44 |         df = pd.read_csv(
 45 |             os.path.join(dirname, "data/input/json_to_cols_unique.csv"), index_col=0
 46 |         )
 47 |         expected_df = pd.read_csv(
 48 |             os.path.join(dirname, "data/output/json_to_cols_unique.csv"), index_col=0
 49 |         )
 50 | 
 51 |         # Explode
 52 |         df2 = gs.explode_json_to_cols(df, "Metadata")
 53 |         print(df2)
 54 | 
 55 |         assert df2.equals(expected_df)
 56 |         print("test_explode_json_to_cols_unique output is correct")
 57 | 
 58 |     # Run explode_json_to_rows
 59 |     def test_explode_json_to_rows(self):
 60 |         print("=====")
 61 |         print("test_explode_json_to_rows")
 62 | 
 63 |         # Read data
 64 |         dirname = os.path.dirname(__file__)
 65 |         df = pd.read_csv(
 66 |             os.path.join(dirname, "data/input/json_to_rows.csv"), index_col=0
 67 |         )
 68 |         expected_df = pd.read_csv(
 69 |             os.path.join(dirname, "data/output/json_to_rows.csv"), index_col=0
 70 |         ).astype({"Line Detail.Id": "float64"})
 71 | 
 72 |         # Explode
 73 |         df2 = gs.explode_json_to_rows(df, "Line Detail").astype(
 74 |             {"Line Detail.Id": "float64"}
 75 |         )
 76 |         assert df2.equals(expected_df)
 77 |         print("test_explode_json_to_rows output is correct")
 78 | 
 79 |     def test_explode_multi(self):
 80 |         print("=====")
 81 |         print("test_explode_multi")
 82 | 
 83 |         # Read data
 84 |         dirname = os.path.dirname(__file__)
 85 |         df = pd.read_csv(
 86 |             os.path.join(dirname, "data/input/multi_json.csv"), index_col=0
 87 |         )
 88 |         expected_df = (
 89 |             pd.read_csv(
 90 |                 os.path.join(dirname, "data/output/explode_multi.csv"), index_col=0
 91 |             )
 92 |             .pipe(lambda x: x.astype({"LineDetail.Id": "float64"}))
 93 |             .pipe(lambda x: x.sort_index(axis=1))
 94 |         )
 95 | 
 96 |         transformed_df = (
 97 |             df.pipe(
 98 |                 gs.explode_json_to_cols,
 99 |                 "Metadata",
100 |                 reducer=gs.array_to_dict_reducer("Name", "StringValue"),
101 |             )
102 |             .pipe(gs.explode_json_to_rows, "LineDetail")
103 |             .pipe(lambda x: x.astype({"LineDetail.Id": "float64"}))
104 |             .pipe(lambda x: x.sort_index(axis=1))
105 |         )
106 |         assert transformed_df.equals(expected_df)
107 | 
108 |         # changing order should not matter
109 |         transformed_df = (
110 |             df.pipe(gs.explode_json_to_rows, "LineDetail")
111 |             .pipe(
112 |                 gs.explode_json_to_cols,
113 |                 "Metadata",
114 |                 reducer=gs.array_to_dict_reducer("Name", "StringValue"),
115 |             )
116 |             .pipe(lambda x: x.astype({"LineDetail.Id": "float64"}))
117 |             .pipe(lambda x: x.sort_index(axis=1))
118 |         )
119 |         assert transformed_df.equals(expected_df)
120 | 
121 |         print("test_explode_multi output is correct")
122 | 
123 | 
124 |     def test_to_export(self, tmp_path):
125 |         print("=====")
126 |         print("test_to_export")
127 |         dir_name = os.path.dirname(__file__)
128 |         input = gs.Reader(dir=os.path.join(dir_name, "data/input"))
129 | 
130 |         campaign_parquet_df = input.get("campaign_performance")
131 |         campaign_csv_df = input.get("campaign_csv")
132 | 
133 |         # Define stream name and output file
134 |         stream_name = "campaign_performance"
135 |         output_dir = tmp_path
136 | 
137 |         true_output_data = {}
138 |         
139 | 
140 |         singer_output_path = os.path.join(dir_name, "data/output/data.singer")
141 |         csv_csv_output_path = os.path.join(dir_name, "data/output/campaign_performance_csv.csv")
142 |         parquet_csv_output_path = os.path.join(dir_name, "data/output/campaign_performance_parquet.csv")
143 | 
144 |         parquet_parquet_output_path = os.path.join(dir_name, "data/output/campaign_performance_parquet.parquet")
145 |         csv_parquet_output_path = os.path.join(dir_name, "data/output/campaign_performance_csv.parquet")
146 |         true_output_data["singer"] = open(singer_output_path, "r").read()
147 | 
148 | 
149 | 
150 | 
151 |         for type, df, output_csv_path, output_parquet_path in [
152 |             ("parquet", campaign_parquet_df, parquet_csv_output_path, parquet_parquet_output_path), 
153 |             ("csv", campaign_csv_df, csv_csv_output_path, csv_parquet_output_path)
154 |         ]:
155 | 
156 |             # Read the output file
157 |             singer_output_file = output_dir / "data.singer"
158 |             if singer_output_file.exists():
159 |                 singer_output_file.unlink()
160 |             
161 |             # Test singer export
162 |             gs.to_export(
163 |                 campaign_parquet_df,
164 |                 name=stream_name,
165 |                 output_dir=output_dir,
166 |                 keys=["id"]
167 |             )
168 | 
169 |             assert singer_output_file.exists(), f"{type} -> Singer Output file {singer_output_file} does not exist."
170 | 
171 |             with open(singer_output_file, "r") as f:
172 |                 test_lines = [json.loads(line) for line in f if line.strip()]
173 | 
174 |             with open(singer_output_path, "r") as f:
175 |                 true_lines = [json.loads(line) for line in f if line.strip()]
176 |             
177 |             assert test_lines == true_lines, f"{type} -> Singer output is incorrect"
178 | 
179 |             # Test CSV Export
180 |             csv_output_file = output_dir / "campaign_performance.csv"
181 |             if csv_output_file.exists():
182 |                 csv_output_file.unlink()
183 |             
184 |             gs.to_export(
185 |                 df,
186 |                 name=stream_name,
187 |                 output_dir=output_dir,
188 |                 export_format="csv",
189 |                 keys=["id"]
190 |             )
191 | 
192 |             test_output_df = pd.read_csv(csv_output_file)
193 |             true_output_df = pd.read_csv(output_csv_path)
194 | 
195 |             assert csv_output_file.exists(), f"{type} -> CSV Output file {csv_output_file} does not exist."
196 | 
197 |             assert test_output_df.equals(true_output_df), f"{type} -> CSV output is incorrect"
198 | 
199 | 
200 |             # Test parquet export
201 |             parquet_output_file = output_dir / "campaign_performance.parquet"
202 |             if parquet_output_file.exists():
203 |                 parquet_output_file.unlink()
204 | 
205 |             true_output_df = pd.read_parquet(path=output_parquet_path)
206 |             
207 |             gs.to_export(
208 |                 df,
209 |                 name=stream_name,
210 |                 output_dir=output_dir,
211 |                 export_format="parquet",
212 |                 keys=["id"]
213 |             )
214 | 
215 |             test_output_df = pd.read_parquet(path=parquet_output_file)
216 | 
217 |             assert parquet_output_file.exists(), f"{type} -> Parquet Output file {parquet_output_file} does not exist."
218 | 
219 |             for col in test_output_df.columns:
220 |                 print("Dtype in test: ", test_output_df[col].dtype)
221 |                 print("Dtype in true: ", true_output_df[col].dtype)
222 |                 assert test_output_df[col].equals(true_output_df[col]), f"{type} -> Column {col} is incorrect"
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 |         print("test to_export output is correct")
230 | 
231 | 
232 | 
233 | 


--------------------------------------------------------------------------------
/gluestick/reader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import pandas as pd
  4 | from pandas.io.parsers import TextFileReader
  5 | import pyarrow as pa
  6 | import pyarrow.parquet as pq
  7 | 
  8 | class Reader:
  9 |     """A reader for gluestick ETL files."""
 10 | 
 11 |     ROOT_DIR = os.environ.get("ROOT_DIR", ".")
 12 |     INPUT_DIR = f"{ROOT_DIR}/sync-output"
 13 | 
 14 |     def __init__(self, dir=INPUT_DIR, root=ROOT_DIR):
 15 |         """Init the class and read directories.
 16 | 
 17 |         Parameters
 18 |         ----------
 19 |         dir: str
 20 |             Directory with the input data.
 21 |         root: str
 22 |             Root directory.
 23 | 
 24 |         """
 25 |         self.root = root
 26 |         self.dir = dir
 27 |         self.input_files = self.read_directories()
 28 | 
 29 |     def __dict__(self):
 30 |         return self.input_files
 31 | 
 32 |     def __str__(self):
 33 |         return str(list(self.input_files.keys()))
 34 | 
 35 |     def __repr__(self):
 36 |         return str(list(self.input_files.keys()))
 37 | 
 38 |     def read_parquet_with_chunks(self, filepath, chunksize):
 39 |         parquet_file = pq.ParquetFile(filepath)
 40 | 
 41 |         for batch in parquet_file.iter_batches(batch_size=chunksize):
 42 |             df = batch.to_pandas(safe=False)
 43 |             # TODO: add support for catalog types
 44 |             yield df
 45 | 
 46 |     def get(self, stream, default=None, catalog_types=False, **kwargs):
 47 |         """Read the selected file."""
 48 |         filepath = self.input_files.get(stream)
 49 |         if not filepath:
 50 |             return default
 51 |         if filepath.endswith(".parquet"):
 52 |             if kwargs.get("chunksize"):
 53 |                 return self.read_parquet_with_chunks(filepath, kwargs.get("chunksize"))
 54 | 
 55 |             catalog = self.read_catalog()
 56 |             if catalog and catalog_types:
 57 |                 try:
 58 |                     headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist()
 59 |                     types_params = self.get_types_from_catalog(catalog, stream, headers=headers)
 60 |                     dtype_dict = types_params.get('dtype')
 61 |                     parse_dates = types_params.get('parse_dates')
 62 | 
 63 |                     # Mapping pandas dtypes to pyarrow types
 64 |                     type_mapping = {
 65 |                         'int64': pa.int64(),
 66 |                         'float64': pa.float64(),
 67 |                         "<class 'float'>": pa.float64(),
 68 |                         'string': pa.string(),
 69 |                         'object': pa.string(),
 70 |                         'datetime64[ns]': pa.timestamp('ns'),
 71 |                         'bool': pa.bool_(),
 72 |                         'boolean': pa.bool_(),
 73 |                         # TODO: Add more mappings as needed
 74 |                     }
 75 | 
 76 |                     if dtype_dict:
 77 |                         # Convert dtype dictionary to pyarrow schema
 78 |                         fields = [(col, type_mapping[str(dtype).lower()]) for col, dtype in dtype_dict.items()]
 79 |                         fields.extend([(col, pa.timestamp('ns')) for col in parse_dates])
 80 |                         schema = pa.schema(fields)
 81 |                         df = pq.read_table(filepath, schema=schema).to_pandas(safe=False)
 82 |                         for col, dtype in dtype_dict.items():
 83 |                             # NOTE: bools require explicit conversion at the end because if there are empty values (NaN)
 84 |                             # pyarrow/pd defaults to convert to string
 85 |                             if str(dtype).lower() in ["bool", "boolean"]:
 86 |                                 df[col] = df[col].astype('boolean')
 87 |                             elif str(dtype).lower() in ["int64"]:
 88 |                                 df[col] = df[col].astype('Int64')
 89 |                             elif str(dtype).lower() in ["object", "string"]:
 90 |                                 df[col] = df[col].astype("string")
 91 |                         return df
 92 |                 except:
 93 |                     # NOTE: silencing errors to avoid breaking existing workflow
 94 |                     print(f"Failed to parse catalog_types for {stream}. Ignoring.")
 95 |                     pass
 96 | 
 97 |             return pq.read_table(filepath).to_pandas(safe=False)
 98 |         catalog = self.read_catalog()
 99 |         if catalog and catalog_types:
100 |             types_params = self.get_types_from_catalog(catalog, stream)
101 |             kwargs.update(types_params)
102 |         df = pd.read_csv(filepath, **kwargs)
103 | 
104 |         # needed to handle chunked CSVs properly
105 |         if isinstance(df, TextFileReader):
106 |             return df, kwargs.get("parse_dates", [])
107 | 
108 |         # if a date field value is empty read_csv will read it as "object"
109 |         # make sure all date fields are typed as date
110 |         for date_col in kwargs.get("parse_dates", []):
111 |             df[date_col] = pd.to_datetime(df[date_col], errors='coerce', utc=True)
112 | 
113 |         return df
114 | 
115 |     def get_metadata(self, stream):
116 |         """Get metadata from parquet file."""
117 |         file = self.input_files.get(stream)
118 |         if file is None:
119 |             raise FileNotFoundError(f"There is no file for stream with name {stream}.")
120 |         if file.endswith(".parquet"):
121 |             return {
122 |                 k.decode(): v.decode()
123 |                 for k, v in pq.read_metadata(file).metadata.items()
124 |             }
125 |         return {}
126 | 
127 |     def get_pk(self, stream):
128 |         """Get pk from parquet file or catalog if available."""
129 |         key_properties = []
130 |         if self.read_directories().get(stream, "").endswith(".parquet"):
131 |             metadata = self.get_metadata(stream)
132 |             if metadata.get("key_properties"):
133 |                 key_properties = eval(metadata["key_properties"])
134 |         else:
135 |             catalog = self.read_catalog()
136 | 
137 |             if catalog is not None:
138 |                 streams = next(
139 |                     (c for c in catalog["streams"] if c.get("stream") == stream), None
140 |                 )
141 |                 if streams and streams.get("metadata"):
142 |                     breadcrumb = next(
143 |                         s for s in streams["metadata"] if not s["breadcrumb"]
144 |                     )
145 |                     if breadcrumb:
146 |                         key_properties = breadcrumb.get("metadata", {}).get(
147 |                             "table-key-properties", []
148 |                         )
149 |         return key_properties
150 | 
151 |     def read_directories(self, ignore=[]):
152 |         """Read all the available directories for input files.
153 | 
154 |         Parameters
155 |         ----------
156 |         ignore: list
157 |             Stream names to ignore.
158 | 
159 |         Returns
160 |         -------
161 |         return: dict
162 |             Dict with the name of the streams and their paths.
163 | 
164 |         """
165 |         is_directory = os.path.isdir(self.dir)
166 |         all_files = []
167 |         results = {}
168 |         if is_directory:
169 |             for entry in os.listdir(self.dir):
170 |                 file_path = os.path.join(self.dir, entry)
171 |                 if os.path.isfile(file_path):
172 |                     if file_path.endswith(".csv") or file_path.endswith(".parquet"):
173 |                         all_files.append(file_path)
174 |         else:
175 |             all_files.append(self.dir)
176 | 
177 |         for file in all_files:
178 |             split_path = file.split("/")
179 |             entity_type = split_path[len(split_path) - 1].rsplit(".", 1)[0]
180 | 
181 |             if "-" in entity_type:
182 |                 entity_type = entity_type.rsplit("-", 1)[0]
183 | 
184 |             if entity_type not in results and entity_type not in ignore:
185 |                 results[entity_type] = file
186 | 
187 |         return results
188 | 
189 |     def read_catalog(self):
190 |         """Read the catalog.json file."""
191 |         file_name = f"{self.root}/catalog.json"
192 |         if os.path.isfile(file_name):
193 |             with open(file_name) as f:
194 |                 catalog = json.load(f)
195 |             print(f"Finished loading source catalog.")
196 |         else:
197 |             print(f"Source catalog not found at {file_name}.")
198 |             catalog = None
199 |         return catalog
200 |     
201 |     def clean_catalog(self, catalog):
202 |         clean_catalog = {}
203 |         if "streams" in catalog :
204 |             for stream_info in catalog ["streams"]:
205 |                 # Use 'stream' preferentially, fallback to 'tap_stream_id'
206 |                 stream_name = stream_info.get("stream") or stream_info.get("tap_stream_id")
207 |                 schema_properties = stream_info.get("schema", {}).get("properties", {})
208 |                 if stream_name and schema_properties:
209 |                     clean_catalog[stream_name] = schema_properties
210 |         print(f"Finished loading target schemas for streams: {list(clean_catalog.keys())}")
211 |         return clean_catalog
212 |     
213 |     def read_target_catalog(self, process_schema=False):
214 |         """Read the target catalog.json file."""
215 |         filename = f"{self.root}/target-catalog.json"
216 | 
217 |         if not os.path.exists(filename):
218 |             print(f"Target catalog not found at {filename}.")
219 |             return None
220 |         
221 |         with open(filename, "r", encoding="utf-8") as f:
222 |             raw_target_catalog = json.load(f)
223 |         
224 |         if not process_schema:
225 |             return raw_target_catalog
226 |         
227 |         return raw_target_catalog , self.clean_catalog(raw_target_catalog)
228 | 
229 |     def get_types_from_catalog(self, catalog, stream, headers=None):
230 |         """Get the pandas types base on the catalog definition.
231 | 
232 |         Parameters
233 |         ----------
234 |         catalog: dict
235 |             The singer catalog used on the tap.
236 |         stream: str
237 |             The name of the stream.
238 | 
239 |         Returns
240 |         -------
241 |         return: dict
242 |             Dict with arguments to be used by pandas.
243 | 
244 |         """
245 |         filepath = self.input_files.get(stream)
246 |         if headers is None:
247 |             headers = pd.read_csv(filepath, nrows=0).columns.tolist()
248 | 
249 |         streams = next((c for c in catalog["streams"] if c["stream"] == stream or c["tap_stream_id"] == stream), None)
250 |         if not streams:
251 |             return dict()
252 |         types = streams["schema"]["properties"]
253 | 
254 |         type_mapper = {"integer": "Int64", "number": float, "boolean": "boolean"}
255 | 
256 |         dtype = {}
257 |         parse_dates = []
258 |         for col in headers:
259 |             col_type = types.get(col)
260 |             if col_type:
261 |                 # if col has multiple types, use type with format if it not exists assign type object to support multiple types
262 |                 any_of_list = col_type.get("anyOf", [])
263 |                 if any_of_list:
264 |                     type_with_format = next((col_t for col_t in any_of_list if "format" in col_t), None)
265 |                     col_type = type_with_format if type_with_format else {"type": "object"}
266 |                 if col_type.get("format") == "date-time":
267 |                     parse_dates.append(col)
268 |                     continue
269 |                 if col_type.get("type"):
270 |                     catalog_type = [t for t in col_type["type"] if t != "null"]
271 |                     if len(catalog_type) == 1:
272 |                         dtype[col] = type_mapper.get(catalog_type[0], "object")
273 |                         continue
274 |             dtype[col] = "object"
275 | 
276 |         return dict(dtype=dtype, parse_dates=parse_dates)
277 | 


--------------------------------------------------------------------------------
/gluestick/pandas_utils.py:
--------------------------------------------------------------------------------
  1 | """Utilities for pandas dataframes containing objects."""
  2 | 
  3 | import ast
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from pandas.io.json._normalize import nested_to_record
  8 | from gluestick.reader import Reader
  9 | 
 10 | 
 11 | def json_tuple_to_cols(
 12 |     df,
 13 |     column_name,
 14 |     col_config={
 15 |         "cols": {"key_prop": "Name", "value_prop": "Value"},
 16 |         "look_up": {"key_prop": "name", "value_prop": "value"},
 17 |     },
 18 | ):
 19 |     """Convert a column with a JSON tuple in it to two column.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |     df: pd.DataFrame
 24 |         The input pandas data frame.
 25 |     column_name: str
 26 |         Column with the json tuple.
 27 |     col_config:
 28 |         Conversion config.
 29 | 
 30 |     Returns
 31 |     -------
 32 |     return: pd.DataFrame
 33 |         A dataframe with the new columns.
 34 | 
 35 |     Examples
 36 |     --------
 37 |      IN[51]: qb_lookup_keys = {'key_prop': 'name', 'value_prop': 'value'}
 38 |      IN[52]: invoices = json_tuple_to_cols(
 39 |         invoices,
 40 |         'Line.DiscountLineDetail.DiscountAccountRef',
 41 |         col_config={
 42 |             'cols': {
 43 |                 'key_prop': 'Discount Details',
 44 |                 'value_prop': 'Discount %'
 45 |                 },
 46 |             'look_up': qb_lookup_keys
 47 |             }
 48 |         )
 49 | 
 50 |     """
 51 | 
 52 |     def get_value(y, prop):
 53 |         value = y
 54 |         if type(value) is str:
 55 |             value = ast.literal_eval(y)
 56 |         if type(value) is dict:
 57 |             return value.get(prop)
 58 |         if type(value) is list:
 59 |             return value[0].get(prop)
 60 |         else:
 61 |             return None
 62 | 
 63 |     df[col_config["cols"]["key_prop"]] = df[column_name].apply(
 64 |         lambda y: get_value(y, col_config["look_up"]["key_prop"])
 65 |     )
 66 |     df[col_config["cols"]["value_prop"]] = df[column_name].apply(
 67 |         lambda y: get_value(y, col_config["look_up"]["value_prop"])
 68 |     )
 69 | 
 70 |     return df.drop(column_name, 1)
 71 | 
 72 | 
 73 | def rename(df, target_columns):
 74 |     """Rename columns in DataFrame using a json format.
 75 | 
 76 |     Notes
 77 |     -----
 78 |     Also allow for converting the types of the values.
 79 | 
 80 |     Parameters
 81 |     ----------
 82 |     df: pd.DataFrame
 83 |         The input pandas data frame.
 84 |     target_columns: dict
 85 |         Dictionary with the columns to rename.
 86 | 
 87 |     Returns
 88 |     -------
 89 |     return: pd.DataFrame
 90 |         Modified data frame with the renamed columns.
 91 | 
 92 |     Examples
 93 |     --------
 94 |     IN[52]: rename(df, )
 95 |     Out[52]:
 96 |     {'dict1.c': 1,
 97 |      'dict1.d': 2,
 98 |      'flat1': 1,
 99 |      'nested.d': 2,
100 |      'nested.e.c': 1,
101 |      'nested.e.d': 2}
102 | 
103 |     """
104 |     if target_columns is not None:
105 |         if isinstance(target_columns, list):
106 |             return df[target_columns]
107 |         elif isinstance(target_columns, dict):
108 |             idx1 = pd.Index(target_columns.keys())
109 |             idx2 = pd.Index(df.columns)
110 |             target_column_names = idx1.intersection(idx2).array
111 |             return df[target_column_names].rename(columns=target_columns)
112 |     return df
113 | 
114 | def enforce_exploded_col_types(df, column_name, stream=None):
115 |     """Enforce types for columns created by exploded fields for better consistency.
116 | 
117 |     Notes
118 |     -----
119 |     Enforce types for columns created by exploded fields using catalog if defined there
120 |     or enforce nullable booleans for consistency
121 | 
122 |     Parameters
123 |     ----------
124 |     df: pd.DataFrame
125 |         The input pandas data frame.
126 |     column_name: str
127 |         The name of the column that should be exploded.
128 |     stream: str
129 |         Stream name to enforce types using catalog typing
130 |     """
131 |     
132 |     # enforce types for booleans and integers
133 |     field_schema = None
134 |     exploded_columns = [col for col in df.columns if col.startswith(f"{column_name}.")]
135 |     
136 |     if stream:
137 |         input = Reader()
138 |         catalog = input.read_catalog()
139 |         stream_schema = [s for s in catalog["streams"] if s["tap_stream_id"] == stream]
140 |         if stream_schema:
141 |             field_schema = stream_schema[0].get("schema", {}).get("properties", {}).get(column_name)
142 | 
143 |     if field_schema and "properties" in field_schema:
144 |         for col in exploded_columns:
145 |             col_name = col.split(".")[-1]
146 |             col_type = field_schema.get("properties").get(col_name, {}).get("type")
147 |             if isinstance(col_type, list) and col_type:
148 |                 col_type = next(iter([t for t in col_type if t != "null"]), None)
149 |             if col_type:          
150 |                 if col_type in ["bool", "boolean"]:
151 |                     df[col] = df[col].astype("boolean")
152 |                 elif  col_type in ["int", "integer"]:
153 |                     df[col] = df[col].astype("Int64")
154 | 
155 |     else:
156 |         for col in  exploded_columns:
157 |             # if all column values are false let pandas infere type
158 |             if df[col].dropna().empty:
159 |                 continue
160 |                 
161 |             first_non_null_value = df[col].dropna().iloc[0]
162 |             if type(first_non_null_value) in [list, dict, str]:
163 |                 continue
164 |             # if all not null fields are bool type column as boolean
165 |             are_all_boolean = df[col].dropna().apply(lambda x: isinstance(x, bool)).all()
166 |             if are_all_boolean:
167 |                 df[col] = df[col].astype("boolean")
168 |                 continue
169 |             # Enforcing only boolean types if "field_schema" is not present, 
170 |             # as pandas automatically converts integers with NaN values (e.g., 2 to 2.0),
171 |     return df
172 | 
173 | 
174 | def explode_json_to_rows(df, column_name, drop=True, stream=None, **kwargs):
175 |     """Explodes a column with an array of objects into multiple rows.
176 | 
177 |     Notes
178 |     -----
179 |     Convert an array of objects into a list of dictionaries and explode it into
180 |     multiple rows and columns, one column for each dictionary key and one row for each
181 |     object inside the array.
182 | 
183 |     Parameters
184 |     ----------
185 |     df: pd.DataFrame
186 |         The input pandas data frame.
187 |     column_name: str
188 |         The name of the column that should be exploded.
189 |     drop: boolean
190 |         To drop or not the exploded column.
191 |     stream: str
192 |         Stream name to enforce types using catalog typing
193 |     **kwargs:
194 |         Additional arguments.
195 | 
196 | 
197 |     Returns
198 |     -------
199 |     return: pd.DataFrame
200 |         New data frame with the JSON line expanded into columns and rows.
201 | 
202 |     Examples
203 |     --------
204 |     IN[52]: explode_json_to_rows(df, df['Line'] )
205 |     an example of the line would be:
206 |     [
207 |         {
208 |             "Id":"1",
209 |             "LineNum":"1",
210 |             "Amount":275,
211 |             "DetailType":"SalesItemLineDetail",
212 |             "SalesItemLineDetail":{"ItemRef":{"value":"5","name":"Rock Fountain"},
213 |             "ItemAccountRef":{"value":"79","name":"Sales of Product Income"},
214 |             "TaxCodeRef":{"value":"TAX","name":null},
215 |             "SubTotalLineDetail":null,
216 |             "DiscountLineDetail":null
217 |         },
218 |         {
219 |             "Id":"2",
220 |             "LineNum":"2",
221 |             "Amount":12.75,
222 |             "DetailType":"SalesItemLineDetail",
223 |             "SalesItemLineDetail":{"ItemRef":{"value":"11","name":"Pump"},
224 |             "ItemAccountRef":{"value":"79","name":"Sales of Product Income"},
225 |             "TaxCodeRef":{"value":"TAX","name":null},
226 |             "SubTotalLineDetail":null,
227 |             "DiscountLineDetail":null
228 |         },
229 |         {
230 |             "Id":"3",
231 |             "LineNum":"3",
232 |             "Amount":47.5,
233 |             "DetailType":"SalesItemLineDetail",
234 |             "SalesItemLineDetail":{"ItemRef":{"value":"3","name":"Concrete"},
235 |             "ItemAccountRef":{
236 |                 "value":"48",
237 |                 "name":"Landscaping Services:Job Materials"
238 |                 },
239 |             "TaxCodeRef":{"value":"TAX","name":null},
240 |             "SubTotalLineDetail":null,
241 |             "DiscountLineDetail":null
242 |         },
243 |         {
244 |             "Id":null,
245 |             "LineNum":null,
246 |             "Amount":335.25,
247 |             "DetailType":"SubTotalLineDetail",
248 |             "SalesItemLineDetail":null,
249 |             "SubTotalLineDetail":{},
250 |             "DiscountLineDetail":null
251 |         }
252 |     ]
253 |     Out[52]:
254 |     Line.Id Line.LineNum  Line.Amount      Line.DetailType
255 |     Index
256 |     1037            1            1       275.00  SalesItemLineDetail
257 |     1037            2            2        12.75  SalesItemLineDetail
258 |     1037            3            3        47.50  SalesItemLineDetail
259 |     1037         None         None       335.25   SubTotalLineDetail
260 |     1036            1            1        50.00  SalesItemLineDetail
261 | 
262 |     """
263 |     # Explode to new rows
264 |     max_level = kwargs.get("max_level", 1)
265 | 
266 |     def to_list(y, parser=ast.literal_eval):
267 |         if type(y) is str:
268 |             y = parser(y)
269 | 
270 |         if type(y) is not list:
271 |             y = [y]
272 | 
273 |         return y
274 | 
275 |     def flatten(y):
276 |         if type(y) is dict:
277 |             return pd.Series(nested_to_record(y, sep=".", max_level=max_level))
278 |         else:
279 |             return pd.Series(dtype=np.float64)
280 | 
281 |     parser = kwargs.get("parser", ast.literal_eval)
282 |     df[column_name] = df[column_name].apply(to_list, parser=parser)
283 | 
284 |     df = df.explode(column_name)
285 | 
286 |     df = pd.concat(
287 |         [df, df[column_name].apply(flatten).add_prefix(f"{column_name}.")], axis=1
288 |     )
289 |     if drop:
290 |         df.drop(column_name, axis=1, inplace=True)
291 | 
292 |     # enforce types
293 |     df = enforce_exploded_col_types(df, column_name, stream)
294 |     return df
295 | 
296 | 
297 | def explode_json_to_cols(df: pd.DataFrame, column_name: str, **kwargs):
298 |     """Convert a JSON column that has an array value into a DataFrame.
299 | 
300 |     Notes
301 |     -----
302 |     Arrays such as [{"Name": "First", "Value": "Jo"},{"Name": "Last", "Value": "Do"}]
303 |     with a column for each value are converted to pandas DataFrame. Note that the new
304 |     series produced from the JSON will be de-duplicated and inner joined with the
305 |     index.
306 | 
307 |     Parameters
308 |     ----------
309 |     df: pd.DataFrame
310 |         The input pandas data frame.
311 |     column_name: str
312 |         The name of the column that should be exploded.
313 |     **kwargs:
314 |         Additional arguments.
315 | 
316 |     Returns
317 |     -------
318 |     return: pd.DataFrame
319 |         New data frame with the JSON line expanded into columns.
320 | 
321 |     Examples
322 |     --------
323 |     IN[5]: explode_json_to_cols(df, 'ProductRef' )
324 |     an example of the ProductRef would be:
325 |     {"value": "Hi Tea Chipper","name": "Product"},
326 |     Out[5]:
327 |     Product
328 |     Index
329 |     1037       Hi Tea Chipper
330 | 
331 |     """
332 |     drop = kwargs.get("drop", True)
333 |     expected_keys = kwargs.get("expected_keys", ["value", "name"])
334 | 
335 |     if not kwargs.get("inplace"):
336 |         df = df.copy()
337 | 
338 |     df[column_name] = df[column_name].fillna("{}")
339 |     parser = kwargs.get("parser", ast.literal_eval)
340 | 
341 |     df[column_name] = df[column_name].apply(
342 |         lambda x: parser(x) if isinstance(x, str) else x
343 |     )
344 | 
345 |     cols = df[column_name].apply(lambda x: x.keys()).explode().unique().tolist()
346 |     cols = [x for x in cols if x == x]
347 |     if cols:
348 |         default_dict = {c: np.nan for c in cols}
349 |         cols = [f"{column_name}.{col}" for col in cols]
350 |     else:
351 |         default_dict = {c: np.nan for c in expected_keys}
352 |         cols = [f"{column_name}.{col}" for col in expected_keys]
353 | 
354 |     def set_default_dict(object, default_dict):
355 |         if isinstance(object, dict):
356 |             for k, v in default_dict.items():
357 |                 object.setdefault(k, v)
358 |             return object
359 |         return np.nan
360 | 
361 |     df[column_name] = df[column_name].apply(lambda x: set_default_dict(x, default_dict))
362 |     df[cols] = df[column_name].apply(pd.Series)
363 | 
364 |     if drop:
365 |         df = df.drop(column_name, axis=1)
366 | 
367 |     return df
368 | 
369 | 
370 | def array_to_dict_reducer(key_prop=None, value_prop=None):
371 |     """Convert an array into a dictionary.
372 | 
373 |     Parameters
374 |     ----------
375 |     key_prop: str
376 |         Property in dictionary for key.
377 |     value_prop: str
378 |         Property in dictionary for value.
379 | 
380 |     Returns
381 |     -------
382 |     return: dict
383 |         A dictionary that has all the accumulated values.
384 | 
385 |     """
386 | 
387 |     def reducer(accumulator, current_value):
388 |         if type(current_value) is not dict:
389 |             raise AttributeError("Value being reduced must be a dictionary")
390 | 
391 |         if key_prop is not None and value_prop is not None:
392 |             key = current_value.get(key_prop)
393 |             current_value = current_value.get(value_prop)
394 |             accumulator[key] = current_value
395 |         else:
396 |             for key, value in current_value.items():
397 |                 accumulator[key] = value
398 | 
399 |         return accumulator
400 | 
401 |     return reducer
402 | 
403 | 
404 | def compress_rows_to_col(df: pd.DataFrame, column_prefix: str, pk):
405 |     """Compress exploded columns rows back to a single column.
406 | 
407 |     Parameters
408 |     ----------
409 |     df: pd.DataFrame
410 |         Input DataFrame to be compressed.
411 |     column_prefix: str
412 |         Column prefix to be compressed.
413 |     pk: str
414 |         Primary key to group on.
415 | 
416 |     Returns
417 |     -------
418 |     return: pd.DataFrame
419 |         A data frame with the compressed data.
420 | 
421 |     """
422 |     compress_cols = [col for col in df.columns if col.startswith(column_prefix)]
423 |     df_compress = df[compress_cols]
424 |     df.drop(compress_cols, inplace=True, axis=1)
425 | 
426 |     prefix_len = len(column_prefix) + 1
427 |     cols_rename = {c: c[prefix_len:] for c in compress_cols}
428 |     df_compress.rename(cols_rename, axis=1, inplace=True)
429 | 
430 |     df[column_prefix] = df_compress.apply(lambda x: str(x.to_dict()), axis=1)
431 | 
432 |     grouped = df.groupby(pk, axis=0)[column_prefix].apply(list).reset_index()
433 |     df.drop_duplicates(pk, inplace=True)
434 |     return df.merge(grouped, how="left", on=pk)
435 | 
436 | 


--------------------------------------------------------------------------------
/gluestick/singer.py:
--------------------------------------------------------------------------------
  1 | """Singer related util functions."""
  2 | 
  3 | import ast
  4 | import datetime
  5 | import json
  6 | import os
  7 | from contextlib import redirect_stdout
  8 | from functools import singledispatch, partial
  9 | import pandas as pd
 10 | import singer
 11 | from gluestick.reader import Reader
 12 | import polars as pl
 13 | 
 14 | def gen_singer_header(df: pd.DataFrame, allow_objects: bool, schema=None, catalog_schema=False, recursive_typing=True):
 15 |     """Generate singer headers based on pandas types.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     df: pandas.DataFrame
 20 |         The dataframe to extranct the types from.
 21 |     allow_objects: bool
 22 |         If the function should proccess objects in the columns.
 23 | 
 24 |     Returns
 25 |     -------
 26 |     return: dict
 27 |         Dict of pandas.DataFrames. the keys of which are the entity names
 28 | 
 29 |     """
 30 |     header_map = dict(type=["object", "null"], properties={})
 31 | 
 32 |     type_mapping = {
 33 |         "float": {"type": ["number", "null"]},
 34 |         "int": {"type": ["integer", "null"]},
 35 |         "bool": {"type": ["boolean", "null"]},
 36 |         "str": {"type": ["string", "null"]},
 37 |         "date": {
 38 |             "format": "date-time",
 39 |             "type": ["string", "null"],
 40 |         },
 41 |         "array": {"type": ["array", "null"], "items": {"type": ["object", "string", "null"]}},
 42 |     }
 43 | 
 44 |     if schema and not catalog_schema:
 45 |         header_map = schema
 46 |         return df, header_map
 47 | 
 48 |     for col in df.columns:
 49 |         dtype = df[col].dtype.__str__().lower()
 50 | 
 51 |         if "date" in dtype:
 52 |             df[col] = df[col].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 53 | 
 54 |         col_type = next((t for t in type_mapping.keys() if t in dtype), None)
 55 | 
 56 |         if col_type:
 57 |             header_map["properties"][col] = type_mapping[col_type]
 58 |         elif allow_objects:
 59 |             value = df[col].dropna()
 60 |             if value.empty:
 61 |                 header_map["properties"][col] = type_mapping["str"]
 62 |                 continue
 63 |             else:
 64 |                 first_value = value.iloc[0]
 65 | 
 66 |             if isinstance(first_value, list):
 67 |                 if recursive_typing:
 68 |                     new_input = {}
 69 |                     for row in value:
 70 |                         if len(row):
 71 |                             for arr_value in row:
 72 |                                 if isinstance(arr_value, dict):
 73 |                                     temp_dict = {k:v for k, v in arr_value.items() if (k not in new_input.keys()) or isinstance(v, float)}
 74 |                                     new_input.update(temp_dict)
 75 |                                 else:
 76 |                                     new_input = arr_value
 77 |                     _schema = dict(type=["array", "null"], items=to_singer_schema(new_input))
 78 |                     header_map["properties"][col] = _schema
 79 |                     if not new_input:
 80 |                         header_map["properties"][col] = {
 81 |                                 "items": type_mapping["str"],
 82 |                                 "type": ["array", "null"],
 83 |                             }
 84 |                 else:
 85 |                     header_map["properties"][col] = type_mapping["array"]
 86 |             elif isinstance(first_value, dict):
 87 |                 _schema = dict(type=["object", "null"], properties={})
 88 |                 for k, v in first_value.items():
 89 |                     _schema["properties"][k] = to_singer_schema(v)
 90 |                 header_map["properties"][col] = _schema
 91 |             else:
 92 |                 header_map["properties"][col] = type_mapping["str"]
 93 |         else:
 94 |             header_map["properties"][col] = type_mapping["str"]
 95 | 
 96 |             def check_null(x):
 97 |                 if isinstance(x, list) or isinstance(x, dict):
 98 |                     return json.dumps(x, default=str)
 99 |                 elif not pd.isna(x):
100 |                     return str(x)
101 |                 return x
102 | 
103 |             df[col] = df[col].apply(check_null)
104 | 
105 |     # update schema using types from catalog and keeping extra columns not defined in catalog
106 |     # i.e. tenant, sync_date, etc
107 |     if catalog_schema:
108 |         header_map["properties"].update(schema["properties"])
109 | 
110 |     return df, header_map
111 | 
112 | 
113 | def to_singer_schema(input):
114 |     """Generate singer headers based on pandas types.
115 | 
116 |     Parameters
117 |     ----------
118 |     input:
119 |         Object to extract the types from.
120 | 
121 |     Returns
122 |     -------
123 |     return: dict
124 |         Dict of the singer mapped types.
125 | 
126 |     """
127 |     if type(input) == dict:
128 |         property = dict(type=["object", "null"], properties={})
129 |         for k, v in input.items():
130 |             property["properties"][k] = to_singer_schema(v)
131 |         return property
132 |     elif type(input) == list:
133 |         if len(input):
134 |             return dict(type=["array", "null"], items=to_singer_schema(input[0]))
135 |         else:
136 |             return {"items": {"type": ["string", "null"]}, "type": ["array", "null"]}
137 |     elif type(input) == bool:
138 |         return {"type": ["boolean", "null"]}
139 |     elif type(input) == int:
140 |         return {"type": ["integer", "null"]}
141 |     elif type(input) == float:
142 |         return {"type": ["number", "null"]}
143 |     return {"type": ["string", "null"]}
144 | 
145 | 
146 | def unwrap_json_schema(schema):
147 |     def resolve_refs(schema, defs):
148 |         if isinstance(schema, dict):
149 |             if '$ref' in schema:
150 |                 ref_path = schema['$ref'].split('/')
151 |                 ref_name = ref_path[-1]
152 |                 return resolve_refs(defs[ref_name], defs)
153 |             else:
154 |                 resolved_schema = {}
155 |                 for k,v in schema.items():
156 |                     if type(v) != list and type(v) != dict:
157 |                         if k not in ['required', 'title']:
158 |                             resolved_schema[k] = v
159 |                     else:
160 |                         resolved_schema[k] = resolve_refs(v, defs)
161 |                 return resolved_schema
162 |         elif isinstance(schema, list):
163 |             return [resolve_refs(item, defs) for item in schema]
164 |         else:
165 |             return schema
166 | 
167 |     def simplify_anyof(schema):
168 |         if isinstance(schema, dict):
169 |             if 'anyOf' in schema:
170 |                 types = [item.get('type') for item in schema['anyOf'] if 'type' in item]
171 | 
172 |                 # Handle cases where anyOf contains more than just type definitions
173 |                 # For example, when it includes properties or other nested structures
174 |                 combined_schema = {}
175 |                 for item in schema['anyOf']:
176 |                     for key, value in item.items():
177 |                         combined_schema[key] = simplify_anyof(value)
178 |                 combined_schema['type'] = types
179 |                 return combined_schema
180 |             else:
181 |                 resolved_schema = {}
182 |                 for k,v in schema.items():
183 |                     if type(v) != list and type(v) != dict:
184 |                         if k not in ['required,' 'title']:
185 |                             resolved_schema[k] = v
186 |                     else:
187 |                         resolved_schema[k] = simplify_anyof(v)
188 |                 return resolved_schema
189 |         elif isinstance(schema, list):
190 |             return [simplify_anyof(item) for item in schema]
191 |         else:
192 |             return schema
193 | 
194 |     defs = schema.get('$defs', {})
195 |     resolved_schema = resolve_refs(schema, defs)
196 |     simplified_schema = simplify_anyof(resolved_schema)
197 |     simplified_schema.pop("$defs", None)
198 |     return simplified_schema
199 | 
200 | 
201 | def deep_convert_datetimes(value):
202 |     """Transforms all nested datetimes in a list or dict to %Y-%m-%dT%H:%M:%S.%fZ.
203 | 
204 |     Notes
205 |     -----
206 |     This function transforms all datetimes to %Y-%m-%dT%H:%M:%S.%fZ
207 | 
208 |     Parameters
209 |     ----------
210 |     value: list, dict, datetime
211 | 
212 |     Returns
213 |     -------
214 |     return: list or dict with all datetime values transformed to %Y-%m-%dT%H:%M:%S.%fZ
215 | 
216 |     """
217 |     if isinstance(value, list):
218 |         return [deep_convert_datetimes(child) for child in value]
219 |     elif isinstance(value, dict):
220 |         return {k: deep_convert_datetimes(v) for k, v in value.items()}
221 |     elif isinstance(value, datetime.datetime):
222 |         return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
223 |     elif isinstance(value, datetime.date):
224 |         return value.strftime("%Y-%m-%d")
225 |     return value
226 | 
227 | def parse_objs(x):
228 |     """Parse a stringified dict or list of dicts.
229 | 
230 |     Notes
231 |     -----
232 |     This function will parse a stringified dict or list of dicts
233 | 
234 |     Parameters
235 |     ----------
236 |     x: str
237 |         stringified dict or list of dicts.
238 | 
239 |     Returns
240 |     -------
241 |     return: dict, list
242 |         parsed dict or list of dicts.
243 | 
244 |     """
245 |     # if it's not a string, we just return the input
246 |     if type(x) != str:
247 |         return x
248 | 
249 |     try:
250 |         return ast.literal_eval(x)
251 |     except:
252 |         return json.loads(x)
253 | 
254 | 
255 | def get_catalog_schema(stream):
256 |     """Get a df schema using the catalog.
257 | 
258 |     Parameters
259 |     ----------
260 |     stream: str
261 |         Stream name in catalog.
262 | 
263 |     """
264 |     input = Reader()
265 |     catalog = input.read_catalog()
266 |     schema = next(
267 |         (str["schema"] for str in catalog["streams"] if str["stream"] == stream), None
268 |     )
269 |     if not schema:
270 |         raise Exception(f"No schema found in catalog for stream {stream}")
271 |     else:
272 |         # keep only relevant fields
273 |         schema = {k: v for k, v in schema.items() if k in ["type", "properties"]}
274 |         # need to ensure every array type has an items dict or we'll have issues
275 |         for p in schema.get("properties", dict()):
276 |             prop = schema["properties"][p]
277 |             if prop.get("type") == "array" or "array" in prop.get("type") and prop.get("items") is None:
278 |                 prop["items"] = dict()
279 |     return schema
280 | 
281 | 
282 | def parse_df_cols(df, schema):
283 |     """Parse all df list and dict columns according to schema.
284 | 
285 |     Parameters
286 |     ----------
287 |     stream: str
288 |         Stream name in catalog.
289 |     schema: dict
290 |         Schema that will be used to export the data.
291 | 
292 |     """
293 |     for col in df.columns:
294 |         col_type = schema["properties"].get(col, {}).get("type", [])
295 |         if (isinstance(col_type, list) and any(
296 |             item in ["object", "array"]
297 |             for item in col_type
298 |         )) or col_type in ["object", "array"]:
299 |             df[col] = df[col].apply(lambda x: parse_objs(x))
300 |     return df
301 | 
302 | @singledispatch
303 | def to_singer(
304 |     df,
305 |     stream,
306 |     output_dir,
307 |     keys=[],
308 |     filename="data.singer",
309 |     allow_objects=False,
310 |     schema=None,
311 |     unified_model=None,
312 |     keep_null_fields=False,
313 |     catalog_stream=None,
314 |     recursive_typing=True
315 | ):
316 |     raise NotImplementedError("to_singer is not implemented for this type")
317 | 
318 | @to_singer.register(pd.DataFrame)
319 | def pandas_df_to_singer(
320 |     df: pd.DataFrame,
321 |     stream,
322 |     output_dir,
323 |     keys=[],
324 |     filename="data.singer",
325 |     allow_objects=False,
326 |     schema=None,
327 |     unified_model=None,
328 |     keep_null_fields=False,
329 |     catalog_stream=None,
330 |     recursive_typing=True
331 | ):
332 |     """Convert a pandas DataFrame into a singer file.
333 | 
334 |     Parameters
335 |     ----------
336 |     df: pd.DataFrame
337 |         Object to extract the types from.
338 |     stream: str
339 |         Stream name to be used in the singer output file.
340 |     output_dir: str
341 |         Path to the output directory.
342 |     keys: list
343 |         The primary-keys to be used.
344 |     filename: str
345 |         The output file name.
346 |     allow_objects: boolean
347 |         Allow or not objects to the parsed, if false defaults types to str.
348 |     keep_null_fields: boolean
349 |         Flag to keep all null fields
350 |     catalog_stream: str
351 |         Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true
352 |         If this is not set it will use stream parameter to generate the catalog
353 |     recursive_typing: boolean
354 |         If true, the function will recursively convert arrays of objects to arrays of primitives.
355 |         If false, the function will fuzzy list types when generating singer header.
356 |     """
357 |     catalog_schema = os.environ.get("USE_CATALOG_SCHEMA", "false").lower() == "true"
358 |     include_all_unified_fields = os.environ.get("INCLUDE_ALL_UNIFIED_FIELDS", "false").lower() == "true" and unified_model is not None
359 | 
360 |     # drop columns with all null values except when we want to keep null fields
361 |     if allow_objects and not (catalog_schema or include_all_unified_fields or keep_null_fields):
362 |         df = df.dropna(how="all", axis=1)
363 |     else:
364 |         # df.dropna returns a new dataframe so df it's no longer pointing to the original dataframe, 
365 |         # if dropna is not applied we need to copy it or gen_singer_header will cast the original dataframe datetime columns as strings
366 |         df = df.copy()
367 | 
368 |     if catalog_schema or catalog_stream:
369 |         # it'll allow_objects but keeping all columns
370 |         allow_objects = True
371 |         # get schema from catalog
372 |         stream_name = catalog_stream or stream
373 |         schema = get_catalog_schema(stream_name)
374 |         # parse all fields that are typed as objects or lists
375 |         df = parse_df_cols(df, schema)
376 | 
377 |     elif unified_model:
378 |         schema = unwrap_json_schema(unified_model.model_json_schema())
379 | 
380 |     df, header_map = gen_singer_header(df, allow_objects, schema, catalog_schema, recursive_typing=recursive_typing)
381 |     output = os.path.join(output_dir, filename)
382 |     mode = "a" if os.path.isfile(output) else "w"
383 | 
384 |     with open(output, mode) as f:
385 |         with redirect_stdout(f):
386 |             singer.write_schema(stream, header_map, keys)
387 |             for _, row in df.iterrows():
388 |                 # keep null fields for catalog_schema, include_all_unified_fields and keep_null_fields
389 |                 if not (catalog_schema or include_all_unified_fields or keep_null_fields):
390 |                     filtered_row = row.dropna()
391 |                 else:
392 |                     filtered_row = row.where(pd.notna(row), None)
393 |                 filtered_row = filtered_row.to_dict()
394 |                 filtered_row = deep_convert_datetimes(filtered_row)
395 |                 singer.write_record(stream, filtered_row)
396 |             singer.write_state({})
397 | 
398 | 
399 | 
400 | def gen_singer_header_from_polars_schema(
401 |     schema: pl.Schema
402 | ) -> dict:
403 |     """
404 |     Generate Singer headers from a Polars schema.
405 | 
406 |     Parameters
407 |     ----------
408 |     schema : pl.Schema
409 |         Polars DataFrame schema.
410 | 
411 |     Returns
412 |     -------
413 |     dict
414 |         Singer schema dictionary with non-primitives stringified.
415 |     """
416 |     primitive_mapping = {
417 |         "Float64": {"type": ["number", "null"]},
418 |         "Float32": {"type": ["number", "null"]},
419 |         "Int64": {"type": ["integer", "null"]},
420 |         "Int32": {"type": ["integer", "null"]},
421 |         "Int16": {"type": ["integer", "null"]},
422 |         "Int8": {"type": ["integer", "null"]},
423 |         "UInt64": {"type": ["integer", "null"]},
424 |         "UInt32": {"type": ["integer", "null"]},
425 |         "UInt16": {"type": ["integer", "null"]},
426 |         "UInt8": {"type": ["integer", "null"]},
427 |         "Boolean": {"type": ["boolean", "null"]},
428 |         "Utf8": {"type": ["string", "null"]},
429 |         "Date": {"type": ["string", "null"], "format": "date"},
430 |         "Datetime": {"type": ["string", "null"], "format": "date-time"},
431 |         "Time": {"type": ["string", "null"], "format": "time"},
432 |     }
433 | 
434 |     def map_dtype(dtype) -> dict:
435 |         dtype_name = str(dtype)
436 |         # Only primitive types keep their mapping
437 |         if dtype_name.startswith("Struct("):
438 |             return {"type": ["object", "null"]}
439 |         
440 |         if dtype_name.startswith("Datetime("):
441 |             return {"type": ["string", "null"], "format": "date-time"}
442 |         
443 |         if dtype_name.startswith("List("):
444 |             return {"type": ["array", "null"], "items": {"type": ["any", "null"]}}
445 |         return primitive_mapping.get(dtype_name, {"type": ["string", "null"]})
446 | 
447 |     header_map = {
448 |         "type": ["object", "null"],
449 |         "properties": {col: map_dtype(dtype) for col, dtype in schema.items()}
450 |     }
451 | 
452 |     return header_map
453 | 
454 | 
455 | @to_singer.register(pl.DataFrame)
456 | def polars_df_to_singer(
457 |     df: pl.DataFrame,
458 |     stream,
459 |     output_dir,
460 |     keys=[],
461 |     filename="data.singer",
462 |     allow_objects=False,
463 |     schema=None,
464 |     unified_model=None,
465 |     keep_null_fields=False,
466 |     catalog_stream=None,
467 |     recursive_typing=True
468 | ):
469 |     """Convert a polars DataFrame into a singer file.
470 | 
471 |     Parameters
472 |     ----------
473 |     df: pl.DataFrame
474 |         Polars DataFrame to convert to singer.
475 |     stream: str
476 |         Stream name to be used in the singer output file.
477 |     output_dir: str
478 |         Path to the output directory.
479 |     keys: list
480 |         The primary-keys to be used.
481 |     filename: str
482 |         The output file name.
483 |     allow_objects: boolean
484 |         Allow or not objects to the parsed, if false defaults types to str.
485 |     keep_null_fields: boolean
486 |         Flag to keep all null fields
487 |     catalog_stream: str
488 |         Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true
489 |         If this is not set it will use stream parameter to generate the catalog
490 |     recursive_typing: boolean
491 |         If true, the function will recursively convert arrays of objects to arrays of primitives.
492 |         If false, the function will fuzzy list types when generating singer header.
493 |     """
494 | 
495 |     output = os.path.join(output_dir, filename)
496 |     mode = "a" if os.path.isfile(output) else "w"
497 | 
498 |     header_map = gen_singer_header_from_polars_schema(df.schema)
499 |  
500 | 
501 | 
502 |     with open(output, mode) as f:
503 |         with redirect_stdout(f):
504 |             singer.write_schema(stream, header_map, keys)
505 |             for row in df.iter_rows(named=True):
506 |                 row = {k: v.strftime("%Y-%m-%dT%H:%M:%S.%fZ") if isinstance(v, datetime.datetime) else v for k, v in row.items()}
507 |                 singer.write_record(stream, row)
508 | 
509 | 
510 |             
511 | 
512 | @to_singer.register(pl.LazyFrame)
513 | def polars_lf_to_singer(
514 |     df: pl.LazyFrame,
515 |     stream,
516 |     output_dir,
517 |     keys=[],
518 |     filename="data.singer",
519 |     allow_objects=False,
520 |     schema=None,
521 |     unified_model=None,
522 |     keep_null_fields=False,
523 |     catalog_stream=None,
524 |     recursive_typing=True
525 | ):
526 |     """Convert a polars Lazyframe into a singer file.
527 | 
528 |     Parameters
529 |     ----------
530 |     df: pd.DataFrame
531 |         Object to extract the types from.
532 |     stream: str
533 |         Stream name to be used in the singer output file.
534 |     output_dir: str
535 |         Path to the output directory.
536 |     keys: list
537 |         The primary-keys to be used.
538 |     filename: str
539 |         The output file name.
540 |     allow_objects: boolean
541 |         Allow or not objects to the parsed, if false defaults types to str.
542 |     keep_null_fields: boolean
543 |         Flag to keep all null fields
544 |     catalog_stream: str
545 |         Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true
546 |         If this is not set it will use stream parameter to generate the catalog
547 |     recursive_typing: boolean
548 |         If true, the function will recursively convert arrays of objects to arrays of primitives.
549 |         If false, the function will fuzzy list types when generating singer header.
550 |     """
551 | 
552 |     sink_fn = partial(
553 |         polars_df_to_singer,
554 |         stream=stream,
555 |         output_dir=output_dir,
556 |         keys=keys,
557 |         filename=filename,
558 |         allow_objects=allow_objects,
559 |         schema=schema,
560 |         unified_model=unified_model,
561 |         keep_null_fields=keep_null_fields,
562 |         catalog_stream=catalog_stream,
563 |         recursive_typing=recursive_typing,
564 |     )
565 |     df.sink_batches(sink_fn, chunk_size=1000)
566 | 


--------------------------------------------------------------------------------
/gluestick/etl_utils.py:
--------------------------------------------------------------------------------
   1 | """Utilities for hotglue ETL scripts."""
   2 | 
   3 | import hashlib
   4 | import json
   5 | import os
   6 | 
   7 | import pandas as pd
   8 | import numpy as np
   9 | import pyarrow.parquet as pq
  10 | from datetime import datetime
  11 | from pytz import utc
  12 | from gluestick.singer import to_singer
  13 | import re
  14 | from gluestick.reader import Reader
  15 | import polars as pl
  16 | from gluestick.readers.pl_lazyframe_reader import PLLazyFrameReader
  17 | from gluestick.readers.pl_reader import PolarsReader
  18 | from functools import singledispatch
  19 | 
  20 | 
  21 | def read_csv_folder(path, converters={}, index_cols={}, ignore=[]):
  22 |     """Read a set of CSV files in a folder using read_csv().
  23 | 
  24 |     Notes
  25 |     -----
  26 |     This method assumes that the files are being pulled in a stream and follow a
  27 |     naming convention with the stream/ entity / table name is the first word in the
  28 |     file name for example; Account-20200811T121507.csv is for an entity called
  29 |     ``Account``.
  30 | 
  31 |     Parameters
  32 |     ----------
  33 |     path: str
  34 |         The folder directory
  35 |     converters: dict
  36 |         A dictionary with an array of converters that are passed to
  37 |         read_csv, the key of the dictionary is the name of the entity.
  38 |     index_cols:
  39 |         A dictionary with an array of index_cols, the key of the dictionary is the name
  40 |         of the entity.
  41 |     ignore: list
  42 |         List of files to ignore
  43 | 
  44 |     Returns
  45 |     -------
  46 |     return: dict
  47 |         Dict of pandas.DataFrames. the keys of which are the entity names
  48 | 
  49 |     Examples
  50 |     --------
  51 |     IN[31]: entity_data = read_csv_folder(
  52 |         CSV_FOLDER_PATH,
  53 |         index_cols={'Invoice': 'DocNumber'},
  54 |         converters={'Invoice': {
  55 |             'Line': ast.literal_eval,
  56 |             'CustomField': ast.literal_eval,
  57 |             'Categories': ast.literal_eval
  58 |             }}
  59 |         )
  60 |     IN[32]: df = entity_data['Account']
  61 | 
  62 |     """
  63 |     is_directory = os.path.isdir(path)
  64 |     all_files = []
  65 |     results = {}
  66 |     if is_directory:
  67 |         for entry in os.listdir(path):
  68 |             if os.path.isfile(os.path.join(path, entry)) and os.path.join(
  69 |                 path, entry
  70 |             ).endswith(".csv"):
  71 |                 all_files.append(os.path.join(path, entry))
  72 | 
  73 |     else:
  74 |         all_files.append(path)
  75 | 
  76 |     for file in all_files:
  77 |         split_path = file.split("/")
  78 |         entity_type = split_path[len(split_path) - 1].rsplit(".csv", 1)[0]
  79 | 
  80 |         if "-" in entity_type:
  81 |             entity_type = entity_type.rsplit("-", 1)[0]
  82 | 
  83 |         if entity_type not in results and entity_type not in ignore:
  84 |             # print(f"Reading file of type {entity_type} in the data file {file}")
  85 |             results[entity_type] = pd.read_csv(
  86 |                 file,
  87 |                 index_col=index_cols.get(entity_type),
  88 |                 converters=converters.get(entity_type),
  89 |             )
  90 | 
  91 |     return results
  92 | 
  93 | 
  94 | def read_parquet_folder(path, ignore=[]):
  95 |     """Read a set of parquet files in a folder using read_parquet().
  96 | 
  97 |     Notes
  98 |     -----
  99 |     This method assumes that the files are being pulled in a stream and follow a
 100 |     naming convention with the stream/ entity / table name is the first word in the
 101 |     file name for example; Account-20200811T121507.parquet is for an entity called
 102 |     ``Account``.
 103 | 
 104 |     Parameters
 105 |     ----------
 106 |     path: str
 107 |         The folder directory
 108 |     ignore: list
 109 |         List of files to ignore
 110 | 
 111 |     Returns
 112 |     -------
 113 |     return: dict
 114 |         Dict of pandas.DataFrames. the keys of which are the entity names
 115 | 
 116 |     Examples
 117 |     --------
 118 |     IN[31]: entity_data = read_parquet_folder(PARQUET_FOLDER_PATH)
 119 |     IN[32]: df = entity_data['Account']
 120 | 
 121 |     """
 122 |     is_directory = os.path.isdir(path)
 123 |     all_files = []
 124 |     results = {}
 125 |     if is_directory:
 126 |         for entry in os.listdir(path):
 127 |             if os.path.isfile(os.path.join(path, entry)) and os.path.join(
 128 |                 path, entry
 129 |             ).endswith(".parquet"):
 130 |                 all_files.append(os.path.join(path, entry))
 131 | 
 132 |     else:
 133 |         all_files.append(path)
 134 | 
 135 |     for file in all_files:
 136 |         split_path = file.split("/")
 137 |         entity_type = split_path[len(split_path) - 1].rsplit(".parquet", 1)[0]
 138 | 
 139 |         if "-" in entity_type:
 140 |             entity_type = entity_type.rsplit("-", 1)[0]
 141 | 
 142 |         if entity_type not in results and entity_type not in ignore:
 143 |             df = pq.read_table(file, use_threads=False).to_pandas(safe=False, use_threads=False)
 144 |             # df = df.convert_dtypes()
 145 |             results[entity_type] = df
 146 | 
 147 |     return results
 148 | 
 149 | 
 150 | def read_snapshots(stream, snapshot_dir, **kwargs):
 151 |     """Read a snapshot file.
 152 | 
 153 |     Parameters
 154 |     ----------
 155 |     stream: str
 156 |         The name of the stream to extract the snapshots from.
 157 |     snapshot_dir: str
 158 |         The path for the directory where the snapshots are stored.
 159 |     **kwargs:
 160 |         Additional arguments that are passed to pandas read_csv.
 161 | 
 162 |     Returns
 163 |     -------
 164 |     return: pd.DataFrame
 165 |         A pandas dataframe with the snapshot data.
 166 | 
 167 |     """
 168 |     # Read snapshot file if it exists
 169 |     if os.path.isfile(f"{snapshot_dir}/{stream}.snapshot.parquet"):
 170 |         snapshot = pq.read_table(f"{snapshot_dir}/{stream}.snapshot.parquet", use_threads=False).to_pandas(safe=False, use_threads=False)
 171 |         # snapshot = snapshot.convert_dtypes()
 172 |     elif os.path.isfile(f"{snapshot_dir}/{stream}.snapshot.csv"):
 173 |         snapshot = pd.read_csv(f"{snapshot_dir}/{stream}.snapshot.csv", **kwargs)
 174 |     else:
 175 |         snapshot = None
 176 |     return snapshot
 177 | 
 178 | 
 179 | def snapshot_records(
 180 |     stream_data, stream, snapshot_dir, pk="id", just_new=False, use_csv=False, coerce_types= False, localize_datetime_types=False, overwrite=False, **kwargs
 181 | ):
 182 |     """Update a snapshot file.
 183 | 
 184 |     Parameters
 185 |     ----------
 186 |     stream_data: str
 187 |         DataFrame with the data to be included in the snapshot.
 188 |     stream: str
 189 |         The name of the stream of the snapshots.
 190 |     snapshot_dir: str
 191 |         The name of the stream of the snapshots.
 192 |     pk: str
 193 |         The primary key used for the snapshot.
 194 |     just_new: str
 195 |         Return just the input data if True, else returns the whole data
 196 |     coerce_types: bool
 197 |         Coerces types to the stream_data types if True, else mantains current snapshot types
 198 |     localize_datetime_types: bool
 199 |         Localizes datetime columns to UTC if True, else mantains current snapshot types
 200 |     **kwargs:
 201 |         Additional arguments that are passed to pandas read_csv.
 202 | 
 203 |     Returns
 204 |     -------
 205 |     return: pd.DataFrame
 206 |         A pandas dataframe with the snapshot data.
 207 | 
 208 |     """
 209 |     # Read snapshot file if it exists
 210 |     snapshot = read_snapshots(stream, snapshot_dir, **kwargs)
 211 | 
 212 |     # If snapshot file and stream data exist update the snapshot
 213 |     if not overwrite and stream_data is not None and snapshot is not None:
 214 |         snapshot_types = snapshot.dtypes
 215 | 
 216 |         if localize_datetime_types:
 217 |             # Localize datetime columns to UTC (datetime64[ns, UTC]) if they are not already
 218 |             for column, dtype in snapshot_types.items():
 219 |                 if dtype == "datetime64[ns]":
 220 |                     snapshot[column] = localize_datetime(snapshot, column)
 221 | 
 222 |         merged_data = pd.concat([snapshot, stream_data])
 223 |         # coerce snapshot types to incoming data types
 224 |         if coerce_types:
 225 |             if not stream_data.empty and not snapshot.empty:
 226 |                 # Save incoming data types
 227 |                 df_types = stream_data.dtypes
 228 |                 try:
 229 |                     for column, dtype in df_types.items():
 230 |                         if dtype == 'bool':
 231 |                             merged_data[column] = merged_data[column].astype('boolean')
 232 |                         elif dtype in ["int64", "int32", "Int32", "Int64"]:
 233 |                             merged_data[column] = merged_data[column].astype("Int64")
 234 |                         else:
 235 |                             merged_data[column] = merged_data[column].astype(dtype)
 236 |                 except Exception as e:
 237 |                     raise Exception(f"Snapshot failed while trying to convert field {column} from type {snapshot_types.get(column)} to type {dtype}")
 238 |         # drop duplicates
 239 |         merged_data = merged_data.drop_duplicates(pk, keep="last")
 240 |         # export data
 241 |         if use_csv:
 242 |             merged_data.to_csv(f"{snapshot_dir}/{stream}.snapshot.csv", index=False)
 243 |         else:
 244 |             merged_data.to_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet", index=False)
 245 | 
 246 |         if not just_new:
 247 |             return merged_data
 248 |         else:
 249 |             return stream_data
 250 | 
 251 |     # If there is no snapshot file snapshots and return the new data
 252 |     if stream_data is not None:
 253 |         if use_csv:
 254 |             stream_data.to_csv(f"{snapshot_dir}/{stream}.snapshot.csv", index=False)
 255 |         else:
 256 |             stream_data.to_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet", index=False)
 257 |         return stream_data
 258 | 
 259 |     if just_new or overwrite:
 260 |         return stream_data
 261 |     else:
 262 |         return snapshot
 263 | 
 264 | 
 265 | def get_row_hash(row, columns):
 266 |     """Update a snapshot file.
 267 | 
 268 |     Parameters
 269 |     ----------
 270 |     row: pd.DataSeries
 271 |         DataFrame row to create the hash from.
 272 | 
 273 |     Returns
 274 |     -------
 275 |     return: str
 276 |         A string with the hash for the row.
 277 | 
 278 |     """
 279 |     # ensure stable order
 280 |     values = []
 281 | 
 282 |     for col in columns:
 283 |         v = row[col]
 284 | 
 285 |         if (isinstance(v, list) or not pd.isna(v)) and v==v and (v not in [None, np.nan]):
 286 |             values.append(str(v))
 287 | 
 288 |     row_str = "".join(values)
 289 |     return hashlib.md5(row_str.encode()).hexdigest()
 290 | 
 291 | 
 292 | def drop_redundant(df, name, output_dir, pk=[], updated_flag=False, use_csv=False):
 293 |     """Drop the rows that were present in previous versions of the dataframe.
 294 | 
 295 |     Notes
 296 |     -----
 297 |     This function will create a hash for every row of the dataframe and snapshot it, if
 298 |     the same row was present in previous versions of the dataframe, it will be dropped.
 299 | 
 300 |     Parameters
 301 |     ----------
 302 |     df: pd.DataFrame
 303 |         The dataframe do be checked for duplicates
 304 |     name: str
 305 |         The name used to snapshot the hash.
 306 |     output_dir: str
 307 |         The snapshot directory to save the state in.
 308 |     pk: list, str
 309 |         Primary key(s) used to associate the state with.
 310 |     updated_flag: bool
 311 |         To create of not a column with a flag for new/updated rows for the given
 312 |         primary key.
 313 | 
 314 |     Returns
 315 |     -------
 316 |     return: pd.DataFrame
 317 |         Dataframe with the data after dropping the redundant rows.
 318 | 
 319 |     """
 320 |     df = df.copy()
 321 | 
 322 |     if pk:
 323 |         # PK needs to be unique, so we drop the duplicated values
 324 |         df = df.drop_duplicates(subset=pk)
 325 | 
 326 |     # get a sorted list of columns to build the hash
 327 |     columns = list(df.columns)
 328 |     columns.sort()
 329 | 
 330 |     df["hash"] = df.apply(lambda row: get_row_hash(row, columns), axis=1)
 331 |     # If there is a snapshot file compare and filter the hash
 332 |     hash_df = None
 333 |     if os.path.isfile(f"{output_dir}/{name}.hash.snapshot.parquet"):
 334 |         hash_df = pq.read_table(f"{output_dir}/{name}.hash.snapshot.parquet", use_threads=False).to_pandas(safe=False, use_threads=False)
 335 |     elif os.path.isfile(f"{output_dir}/{name}.hash.snapshot.csv"):
 336 |         hash_df = pd.read_csv(f"{output_dir}/{name}.hash.snapshot.csv")
 337 | 
 338 |     if hash_df is not None:
 339 |         pk = [pk] if not isinstance(pk, list) else pk
 340 | 
 341 |         if pk:
 342 |             hash_df = hash_df.drop_duplicates(subset=pk)
 343 | 
 344 |         if updated_flag and pk:
 345 |             updated_pk = df[pk].merge(hash_df[pk], on=pk, how="inner")
 346 |             updated_pk["_updated"] = True
 347 | 
 348 |         df = df.merge(
 349 |             hash_df[pk + ["hash"]], on=pk + ["hash"], how="left", indicator=True
 350 |         )
 351 |         df = df[df["_merge"] == "left_only"]
 352 |         df = df.drop("_merge", axis=1)
 353 | 
 354 |         if updated_flag and pk:
 355 |             df = df.merge(updated_pk, on=pk, how="left")
 356 |             df["_updated"] = df["_updated"].fillna(False)
 357 | 
 358 |     snapshot_records(df[pk + ["hash"]], f"{name}.hash", output_dir, pk, use_csv=use_csv)
 359 |     df = df.drop("hash", axis=1)
 360 |     return df
 361 | 
 362 | def clean_convert(input):
 363 |     """Cleans all None values from a list or dict.
 364 | 
 365 |     Notes
 366 |     -----
 367 |     This function will iterate through all the values of a list or dict 
 368 |     and delete all None values 
 369 | 
 370 |     Parameters
 371 |     ----------
 372 |     input: dict, list
 373 |         The dict or list that will be cleaned.
 374 | 
 375 |     Returns
 376 |     -------
 377 |     return: dict, list
 378 |         list or dict with the data after deleting all None values.
 379 | 
 380 |     """
 381 |     if isinstance(input, list):
 382 |         return [clean_convert(i) for i in input]
 383 |     elif isinstance(input, dict):
 384 |         output = {}
 385 |         for k, v in input.items():
 386 |             v = clean_convert(v)
 387 |             if isinstance(v, list):
 388 |                 output[k] = [i for i in v if not pd.isna(i)]
 389 |             elif not pd.isna(v):
 390 |                 output[k] = v
 391 |         return output
 392 |     elif isinstance(input, datetime):
 393 |         return input.isoformat()
 394 |     elif not pd.isna(input):
 395 |         return input
 396 | 
 397 | def map_fields(row, mapping):
 398 |     """Maps the row values according to the mapping dict.
 399 | 
 400 |     Notes
 401 |     -----
 402 |     This function will iterate through all the values of a mapping dict
 403 |     and map the values from the row accordingly
 404 | 
 405 |     Parameters
 406 |     ----------
 407 |     row: dict or dataframe row with the values to be mapped
 408 |     mapping: dict that estabilsh how to map the fields
 409 | 
 410 |     Returns
 411 |     -------
 412 |     return: dict
 413 |         dict with the mapped data.
 414 | 
 415 |     """
 416 |     output = {}
 417 |     for key, value in mapping.items():
 418 |         if isinstance(value, list):
 419 |             out_list = []
 420 |             for v in value:
 421 |                 mapped = map_fields(row, v)
 422 |                 if mapped:
 423 |                     out_list.append(mapped)
 424 |             if out_list:
 425 |                 output[key] = out_list
 426 |         elif isinstance(value, dict):
 427 |             mapped = map_fields(row, value)
 428 |             if mapped:
 429 |                 output[key] = mapped
 430 |         elif value is not None:
 431 |             if isinstance(row.get(value), list) or not pd.isna(row.get(value)):
 432 |                 output[key] = row.get(value)
 433 |     return output
 434 | 
 435 | def clean_obj_null_values(obj):
 436 |     """Replaces all null values by None.
 437 | 
 438 |     Notes
 439 |     -----
 440 |     This function will replace all null values by None so other functions 
 441 |     such as explode_json_to_cols, explode_json_to_rows, etc can be used
 442 | 
 443 |     Parameters
 444 |     ----------
 445 |     obj: str
 446 |         stringified dict or list where null values should be replaced.
 447 | 
 448 |     Returns
 449 |     -------
 450 |     return: str
 451 |         str with all null values replaced.
 452 | 
 453 |     """
 454 |     if not pd.isna(obj):
 455 |         obj = obj.replace('null', 'None')
 456 |         return obj
 457 |     else:
 458 |         return {}
 459 | 
 460 | 
 461 | def get_index_safely(arr, index):
 462 |     """Safely retrieves an item from an list by index.
 463 | 
 464 |     Parameters
 465 |     ----------
 466 |     arr: list
 467 |         List of items.
 468 |     index: int
 469 |         The index position of the item
 470 | 
 471 |     Returns
 472 |     -------
 473 |     return: any
 474 |         The item at the specified index, or `None` if the index is out of bounds.
 475 |     """
 476 |     try:
 477 |         return arr[index]
 478 |     except:
 479 |         return None
 480 | 
 481 | 
 482 | def build_string_format_variables(
 483 |     default_kwargs=dict(), use_tenant_metadata=True, subtenant_delimiter="_"
 484 | ):
 485 |     """Builds a dictionary of string format variables from multiple sources.
 486 | 
 487 |     Parameters
 488 |     ----------
 489 |     default_kwargs : dict
 490 |         A dictionary of default values for the format variables. Keys in this
 491 |         dictionary are reserved and cannot be overridden by tenant metadata.
 492 |     use_tenant_metadata : bool
 493 |         Whether to include variables derived from tenant metadata. If True,
 494 |         attempts to load metadata from the tenant configuration JSON file.
 495 |     subtenant_delimiter : str
 496 |         The delimiter used to split the `tenant_id` into root and sub-tenant
 497 |         components.
 498 | 
 499 |     Returns
 500 |     -------
 501 |     dict
 502 |         A dictionary containing the consolidated string format variables.
 503 | 
 504 |     """
 505 |     # Reserved keys are keys that may not be overriden by other sources of variabes (e.g., tenant metadata)
 506 |     # The keys in the "default_kwargs" are chosen to be these reserved keys
 507 |     reserved_keys = list(default_kwargs.keys())
 508 | 
 509 |     final_kwargs = default_kwargs.copy()
 510 | 
 511 |     # Build tenant metadata variable
 512 |     tenant_metadata = dict()
 513 |     if use_tenant_metadata:
 514 |         tenant_metadata_path = (
 515 |             f"{os.environ.get('ROOT')}/snapshots/tenant-config.json"
 516 |         )
 517 |         if os.path.exists(tenant_metadata_path):
 518 |             with open(tenant_metadata_path, "r") as file:
 519 |                 tenant_metadata = json.load(file)
 520 |         tenant_metadata = tenant_metadata.get("hotglue_metadata") or dict()
 521 |         tenant_metadata = tenant_metadata.get("metadata") or dict()
 522 | 
 523 |     # Iterate over "tenant_metadata" items and only add them in the "final_kwargs" if
 524 |     # the key is not in the "reserved_keys"
 525 |     for k, v in tenant_metadata.items():
 526 |         if k in reserved_keys:
 527 |             continue
 528 | 
 529 |         final_kwargs[k] = v
 530 | 
 531 |     flow_id = os.environ.get("FLOW")
 532 |     job_id = os.environ.get("JOB_ID")
 533 |     tap = os.environ.get("TAP")
 534 |     connector = os.environ.get("CONNECTOR_ID")
 535 |     tenant_id = os.environ.get("TENANT", "")
 536 |     env_id = os.environ.get("ENV_ID")
 537 | 
 538 |     splitted_tenant_id = tenant_id.split(subtenant_delimiter)
 539 |     root_tenant_id = splitted_tenant_id[0]
 540 |     sub_tenant_id = get_index_safely(splitted_tenant_id, 1) or ""
 541 | 
 542 |     final_kwargs.update(
 543 |         {
 544 |             "tenant": tenant_id,
 545 |             "tenant_id": tenant_id,
 546 |             "root_tenant_id": root_tenant_id,
 547 |             "sub_tenant_id": sub_tenant_id,
 548 |             "env_id": env_id,
 549 |             "flow_id": flow_id,
 550 |             "job_id": job_id,
 551 |             "tap": tap,
 552 |             "connector": connector,
 553 |         }
 554 |     )
 555 | 
 556 |     return final_kwargs
 557 | 
 558 | 
 559 | def format_str_safely(str_to_format, **format_variables):
 560 |     """Safely formats a string by replacing placeholders with provided values.
 561 | 
 562 |     Notes
 563 |     -----
 564 |     - This function skips placeholders with missing or empty values in
 565 |       `format_variables`.
 566 | 
 567 |     Parameters
 568 |     ----------
 569 |     str_to_format : str
 570 |         The string containing placeholders to be replaced. Placeholders
 571 |         should be in the format `{key}`.
 572 |     **format_variables : dict
 573 |         Keyword arguments representing the variables to replace in the string.
 574 | 
 575 |     Returns
 576 |     -------
 577 |     str
 578 |         A formatted string with the placeholders replaced by their
 579 |         corresponding values.
 580 | 
 581 |     """
 582 |     str_output = str_to_format
 583 | 
 584 |     for k, v in format_variables.items():
 585 |         if not v:
 586 |             continue
 587 |         str_output = re.sub(re.compile("{" + k + "}"), v, str_output)
 588 | 
 589 |     return str_output
 590 | 
 591 | 
 592 | @singledispatch
 593 | def to_export(
 594 |     data,
 595 |     name, 
 596 |     output_dir, 
 597 |     keys=[], 
 598 |     unified_model=None, 
 599 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"), 
 600 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"), 
 601 |     schema=None, 
 602 |     stringify_objects=False, 
 603 |     reserved_variables={}
 604 |     ):
 605 |     raise NotImplementedError("to_export is not implemented for this dataframe type")
 606 | 
 607 | 
 608 | @to_export.register(pd.DataFrame)
 609 | def pandas_df_to_export(
 610 |     data,
 611 |     name,
 612 |     output_dir,
 613 |     keys=[],
 614 |     unified_model=None,
 615 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
 616 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
 617 |     schema=None,
 618 |     stringify_objects=False,
 619 |     reserved_variables={},
 620 | ):
 621 |     """Parse a stringified dict or list of dicts.
 622 | 
 623 |     Notes
 624 |     -----
 625 |     This function will export the input data to a specified format
 626 | 
 627 |     Parameters
 628 |     ----------
 629 |     data: dataframe
 630 |         dataframe that will be transformed to a specified format.
 631 |     name: str
 632 |         name of the output file
 633 |     output_dir: str
 634 |         path of the folder that will store the output file
 635 |     output_file_prefix: str
 636 |         prefix of the output file name if needed
 637 |     export_format: str
 638 |         format to which the dataframe will be transformed
 639 |         supported values are: singer, parquet, json and csv
 640 |     unified_model: pydantic model
 641 |         pydantic model used to generate the schema for export format
 642 |         'singer'
 643 |     schema: dict
 644 |         customized schema used for export format 'singer'
 645 |     stringify_objects: bool
 646 |         for parquet files it will stringify complex structures as arrays
 647 |         of objects
 648 |     reserved_variables: dict
 649 |         A dictionary of default values for the format variables to be used
 650 |         in the output_file_prefix.
 651 | 
 652 |     Returns
 653 |     -------
 654 |     return: file
 655 |         it outputs a singer, parquet, json or csv file
 656 | 
 657 |     """
 658 |     # NOTE: This is meant to allow users to override the default output name for a specific stream
 659 |     if os.environ.get(f"HG_UNIFIED_OUTPUT_{name.upper()}"):
 660 |         name = os.environ[f"HG_UNIFIED_OUTPUT_{name.upper()}"]
 661 | 
 662 |     if output_file_prefix:
 663 |         # format output_file_prefix with env variables
 664 |         format_variables = build_string_format_variables(
 665 |             default_kwargs=reserved_variables
 666 |         )
 667 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
 668 |         composed_name = f"{output_file_prefix}{name}"
 669 |     else:
 670 |         composed_name = name
 671 | 
 672 |     if export_format == "singer":
 673 |         # get pk
 674 |         reader = Reader()
 675 |         keys = keys or reader.get_pk(name)
 676 |         # export data as singer
 677 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
 678 |     elif export_format == "parquet":
 679 |         if stringify_objects:
 680 |             data.to_parquet(
 681 |                 os.path.join(output_dir, f"{composed_name}.parquet"),
 682 |                 engine="fastparquet",
 683 |             )
 684 |         else:
 685 |             data.to_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
 686 |     elif export_format == "json":
 687 |         data.to_json(f"{output_dir}/{composed_name}.json", orient="records", date_format='iso')
 688 |     elif export_format == "jsonl":
 689 |         data.to_json(f"{output_dir}/{composed_name}.jsonl", orient='records', lines=True, date_format='iso')
 690 |     else:
 691 |         data.to_csv(f"{output_dir}/{composed_name}.csv", index=False)
 692 | 
 693 | 
 694 | @to_export.register(pl.LazyFrame)
 695 | def polars_lf_to_export(
 696 |     data,
 697 |     name,
 698 |     output_dir,
 699 |     keys=[],
 700 |     unified_model=None,
 701 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
 702 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
 703 |     schema=None,
 704 |     stringify_objects=False,
 705 |     reserved_variables={},
 706 | ):
 707 |     """Write a Polars LazyFrame to a specified format.
 708 | 
 709 |     Notes
 710 |     -----
 711 |     This function will export the input data to a specified format
 712 | 
 713 |     Parameters
 714 |     ----------
 715 |     data: Polars LazyFrame
 716 |         Polars LazyFrame that will be transformed to a specified format.
 717 |     name: str
 718 |         name of the output file
 719 |     output_dir: str
 720 |         path of the folder that will store the output file
 721 |     output_file_prefix: str
 722 |         prefix of the output file name if needed
 723 |     export_format: str
 724 |         format to which the dataframe will be transformed
 725 |         supported values are: singer, parquet, json and csv
 726 |     unified_model: pydantic model
 727 |         pydantic model used to generate the schema for export format
 728 |         'singer'
 729 |     schema: dict
 730 |         customized schema used for export format 'singer'
 731 |     stringify_objects: bool
 732 |         for parquet files it will stringify complex structures as arrays
 733 |         of objects
 734 |     reserved_variables: dict
 735 |         A dictionary of default values for the format variables to be used
 736 |         in the output_file_prefix.
 737 | 
 738 |     Returns
 739 |     -------
 740 |     return: file
 741 |         it outputs a singer, parquet, json or csv file
 742 | 
 743 |     """
 744 |     if output_file_prefix:
 745 |         # format output_file_prefix with env variables
 746 |         format_variables = build_string_format_variables(
 747 |             default_kwargs=reserved_variables
 748 |         )
 749 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
 750 |         composed_name = f"{output_file_prefix}{name}"
 751 |     else:
 752 |         composed_name = name
 753 | 
 754 |     if export_format == "singer":
 755 |         # get pk
 756 |         reader = PLLazyFrameReader()
 757 |         keys = keys or reader.get_pk(name)
 758 |         # export data as singer
 759 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
 760 |     elif export_format == "parquet":
 761 |         data.sink_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
 762 |     elif export_format == "csv":
 763 |         data.sink_csv(os.path.join(output_dir, f"{composed_name}.csv"))
 764 |     else:
 765 |         raise ValueError(f"Unsupported export format: {export_format}")
 766 | 
 767 | 
 768 | @to_export.register(pl.DataFrame)
 769 | def polars_df_to_export(
 770 |     data,
 771 |     name,
 772 |     output_dir,
 773 |     keys=[],
 774 |     unified_model=None,
 775 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
 776 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
 777 |     schema=None,
 778 |     stringify_objects=False,
 779 |     reserved_variables={},
 780 | ):
 781 |     """Write a Polars DataFrame to a specified format.
 782 | 
 783 |     Notes
 784 |     -----
 785 |     This function will export the input data to a specified format
 786 | 
 787 |     Parameters
 788 |     ----------
 789 |     data: Polars DataFrame
 790 |         Polars DataFrame that will be transformed to a specified format.
 791 |     name: str
 792 |         name of the output file
 793 |     output_dir: str
 794 |         path of the folder that will store the output file
 795 |     output_file_prefix: str
 796 |         prefix of the output file name if needed
 797 |     export_format: str
 798 |         format to which the dataframe will be transformed
 799 |         supported values are: singer, parquet, json and csv
 800 |     unified_model: pydantic model
 801 |         pydantic model used to generate the schema for export format
 802 |         'singer'
 803 |     schema: dict
 804 |         customized schema used for export format 'singer'
 805 |     stringify_objects: bool
 806 |         Unused for polars parquet; kept for signature compatibility
 807 |     reserved_variables: dict
 808 |         A dictionary of default values for the format variables to be used
 809 |         in the output_file_prefix.
 810 | 
 811 |     Returns
 812 |     -------
 813 |     return: file
 814 |         it outputs a singer, parquet, csv, json or jsonl file
 815 | 
 816 |     """
 817 |     if output_file_prefix:
 818 |         # format output_file_prefix with env variables
 819 |         format_variables = build_string_format_variables(
 820 |             default_kwargs=reserved_variables
 821 |         )
 822 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
 823 |         composed_name = f"{output_file_prefix}{name}"
 824 |     else:
 825 |         composed_name = name
 826 | 
 827 |     if export_format == "singer":
 828 |         # get pk
 829 |         reader = PolarsReader()
 830 |         keys = keys or reader.get_pk(name)
 831 |         # export data as singer
 832 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
 833 |     elif export_format == "parquet":
 834 |         data.write_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
 835 |     elif export_format == "csv":
 836 |         data.write_csv(os.path.join(output_dir, f"{composed_name}.csv"))
 837 |     elif export_format == "json":
 838 |         data.write_json(os.path.join(output_dir, f"{composed_name}.json"))
 839 |     elif export_format == "jsonl":
 840 |         data.write_ndjson(os.path.join(output_dir, f"{composed_name}.jsonl"))
 841 |     else:
 842 |         raise ValueError(f"Unsupported export format: {export_format}")
 843 | 
 844 | 
 845 | def localize_datetime(df, column_name):
 846 |     """
 847 |     Localize a Pandas DataFrame column to a specific timezone.
 848 |     Parameters:
 849 |     -----------
 850 |     df : pandas.DataFrame
 851 |         The DataFrame to be modified.
 852 |     column_name : str
 853 |         The name of the column to be localized.
 854 |     """
 855 |     # Convert the column to a Pandas Timestamp object
 856 |     df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
 857 |     # Localize the column to the specified timezone
 858 |     try:
 859 |         df[column_name] = df[column_name].dt.tz_localize(utc)
 860 |     except:
 861 |         df[column_name] = df[column_name].dt.tz_convert('UTC')
 862 | 
 863 |     return df[column_name]
 864 | 
 865 | def exception(exception, root_dir, error_message=None):
 866 |     """
 867 |     Stores an exception and a message into a file errors.txt, 
 868 |     then the executor reads the error from the txt file to showcase the right error.
 869 |     It should be used instead of raise Exception.
 870 |     Parameters:
 871 |     -----------
 872 |     exception : the exception caught in a try except code.
 873 |     root_dir : str
 874 |         The path of the roo_dir to store errors.txt
 875 |     error_message: str
 876 |         Additional message or data to make the error clearer.
 877 |     """
 878 |     if error_message:
 879 |         error = f"ERROR: {error_message}. Cause: {exception}"
 880 |     else:
 881 |         error = f"ERROR: {exception}"
 882 |     with open(f"{root_dir}/errors.txt", "w") as outfile:
 883 |         outfile.write(error)
 884 |     raise Exception(error)
 885 | 
 886 | def merge_id_from_snapshot(df, snapshot_dir, stream, flow_id, pk):
 887 |     """
 888 |     Merges DataFrame with target created snapshot to retrieve existing target ids.
 889 |     
 890 |     Parameters
 891 |     ----------
 892 |     df : pandas.DataFrame
 893 |         The DataFrame to be modified.
 894 |     snapshot_dir : str
 895 |         The path of the snapshot directory.
 896 |     stream : str
 897 |         The name of the stream.
 898 |     flow_id : str
 899 |         The id of the flow used to create the snapshot.
 900 |     pk : str
 901 |         The name of the primary key column to output.
 902 | 
 903 |     Returns
 904 |     -------
 905 |     pandas.DataFrame
 906 |         The DataFrame with the primary key column added.
 907 |     """
 908 | 
 909 |     # if no pk, set it to None and return
 910 |     if not pk:
 911 |         raise Exception(f"No PK found for '{stream}'. Cannot merge.")
 912 | 
 913 |     # if no externalId, raise an error
 914 |     if "externalId" not in df.columns:
 915 |         raise Exception(f"'externalId' missing for '{stream}'. Cannot merge.")
 916 |     
 917 |     # read snapshot
 918 |     prefix = f"{stream}_{flow_id}"
 919 |     print(f"Reading snapshot: '{prefix}'")
 920 |     snapshot_data_frame = read_snapshots(prefix, snapshot_dir)
 921 | 
 922 |     # if no snapshot, return dataframe
 923 |     if snapshot_data_frame is None or snapshot_data_frame.empty:
 924 |         print(f"No snapshot for '{prefix}'.")
 925 |         return df
 926 | 
 927 |     # get ids from snapshot
 928 |     ids = snapshot_data_frame[["InputId", "RemoteId"]].drop_duplicates(
 929 |         subset=["InputId"], keep="last"
 930 |     )
 931 | 
 932 |     # merge dataframe with snapshot
 933 |     merged = df.merge(
 934 |         ids,
 935 |         left_on="externalId",
 936 |         right_on="InputId",
 937 |         how="left",
 938 |         suffixes=("", "_snap"),
 939 |     )
 940 | 
 941 |     # rename RemoteId to pk
 942 |     if "RemoteId" in merged.columns:
 943 |         merged = merged.rename(columns={"RemoteId": pk})
 944 | 
 945 |     # drop InputId (not needed)
 946 |     if "InputId" in merged.columns:
 947 |         merged = merged.drop(columns=["InputId"])
 948 | 
 949 |     # set pk to None if not in snapshot
 950 |     if pk in merged.columns:
 951 |         merged[pk] = merged[pk].where(pd.notna(merged[pk]), None)
 952 |     print(f"Finished getting ids from snapshot for '{stream}'.")
 953 |     return merged
 954 | 
 955 | def read_tenant_custom_mapping(tenant_config, flow_id=None):
 956 |     """Read the tenant mapping from the tenant config.
 957 | 
 958 |     Parameters
 959 |     ----------
 960 |     tenant_config : dict
 961 |         The tenant config.
 962 |     """
 963 |     # read mapping from tenant config
 964 |     raw_mapping_data = tenant_config.get("hotglue_mapping", {}).get("mapping", {})
 965 |     if not raw_mapping_data:
 966 |         print("No 'hotglue_mapping.mapping' section found in tenant config.")
 967 |         return {}, {}
 968 | 
 969 |     custom_field_mappings = {}
 970 |     stream_name_mapping = {}
 971 | 
 972 |     # get flow_id from tenant config
 973 |     potential_flow_id_key = (
 974 |         list(raw_mapping_data.keys())[0]
 975 |         if len(raw_mapping_data) == 1
 976 |         else None
 977 |     )
 978 | 
 979 |     flow_id = flow_id or potential_flow_id_key
 980 |     raw_mapping_data = raw_mapping_data.get(flow_id)
 981 | 
 982 |     if not raw_mapping_data:
 983 |         print(f"No mapping found for flow_id: {flow_id}")
 984 |         return custom_field_mappings, stream_name_mapping
 985 |     
 986 |     if not isinstance(raw_mapping_data, dict):
 987 |         print(f"Unexpected structure in mapping content: Expected dict, got {type(raw_mapping_data)}")
 988 |         raise ValueError("Invalid mapping structure.")
 989 | 
 990 |     # process mapping
 991 |     for combined_stream_name, field_map in raw_mapping_data.items():
 992 |         try:
 993 |             # Key format is SourceStream/TargetStream
 994 |             source_stream, target_stream = combined_stream_name.split("/", 1)
 995 |             custom_field_mappings[source_stream] = field_map
 996 |             stream_name_mapping[source_stream] = target_stream
 997 |         except Exception as e:
 998 |             raise Exception(f"Error processing mapping key '{combined_stream_name}': {e}. Skipping.")
 999 |     return custom_field_mappings, stream_name_mapping
1000 | 


--------------------------------------------------------------------------------