├── tests ├── __init__.py ├── data │ ├── output │ │ ├── json_to_cols.csv │ │ ├── campaign_performance_csv.parquet │ │ ├── campaign_performance_parquet.parquet │ │ ├── json_to_cols_unique.csv │ │ ├── json_to_rows.csv │ │ ├── explode_multi.csv │ │ ├── campaign_performance_parquet.csv │ │ ├── campaign_performance_csv.csv │ │ ├── data.singer │ │ ├── chunk_csv_campaign_performance.singer │ │ └── chunk_parquet_campaign_performance.singer │ └── input │ │ ├── campaign_performance-20250427T202442.parquet │ │ ├── json_to_cols.csv │ │ ├── json_to_cols_unique.csv │ │ ├── multi_json.csv │ │ ├── json_to_rows.csv │ │ └── campaign_csv-20250427T202522.csv └── etl_test.py ├── gluestick ├── utils │ ├── __init__.py │ └── polars_utils.py ├── readers │ ├── __init__.py │ ├── pl_reader.py │ └── pl_lazyframe_reader.py ├── __init__.py ├── reader.py ├── pandas_utils.py ├── singer.py └── etl_utils.py ├── requirements.txt ├── .travis.yml ├── mypy.ini ├── pyproject.toml ├── setup.py ├── LICENSE ├── README.md ├── tox.ini ├── .github └── workflows │ └── ci_workflow.yml └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gluestick/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gluestick/readers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | xlrd==1.2.0 2 | numpy==1.19.2 3 | pandas==1.1.3 4 | singer-python>=4.0.0 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.7" 4 | install: 5 | - pip install -r requirements.txt 6 | script: 7 | - pytest 8 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | warn_unused_configs = True 4 | 5 | [mypy-backoff.*] 6 | ignore_missing_imports = True 7 | -------------------------------------------------------------------------------- /tests/data/output/json_to_cols.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Metadata.FirstName,Metadata.LastName 2 | Company 1,John,Smith 3 | Company 2,Jane,Smith 4 | -------------------------------------------------------------------------------- /tests/data/output/campaign_performance_csv.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/output/campaign_performance_csv.parquet -------------------------------------------------------------------------------- /tests/data/output/campaign_performance_parquet.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/output/campaign_performance_parquet.parquet -------------------------------------------------------------------------------- /tests/data/input/campaign_performance-20250427T202442.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hotgluexyz/gluestick/HEAD/tests/data/input/campaign_performance-20250427T202442.parquet -------------------------------------------------------------------------------- /tests/data/input/json_to_cols.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Metadata 2 | Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith""}" 3 | Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith""}" 4 | -------------------------------------------------------------------------------- /tests/data/output/json_to_cols_unique.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Metadata.FirstName,Metadata.LastName,Metadata.Type,Metadata.SubType 2 | Company 1,John,Smith,Person,Other 3 | Company 2,Jane,Smith,Person,Parent 4 | -------------------------------------------------------------------------------- /tests/data/input/json_to_cols_unique.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Metadata 2 | Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith"", ""Type"": ""Person"", ""SubType"": ""Other""}" 3 | Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith"", ""Type"": ""Person"", ""SubType"": ""Parent""}" 4 | 5 | -------------------------------------------------------------------------------- /tests/data/output/json_to_rows.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Line Detail.Id,Line Detail.Desc,Line Detail.Amount 2 | Company 1,1,Bolts,101.15 3 | Company 1,2,Smith,90.8 4 | Company 2,1,Braces,51.15 5 | Company 2,2,Wood,190.1 6 | Company 3,1,Braces,51.15 7 | Company 4,NaN,NaN,NaN 8 | Company 5,1,Braces,51.15 -------------------------------------------------------------------------------- /gluestick/__init__.py: -------------------------------------------------------------------------------- 1 | """Import functions and classes to gluestick.""" 2 | 3 | from .etl_utils import * # noqa 4 | from .pandas_utils import * # noqa 5 | from .singer import * # noqa 6 | from .reader import * # noqa 7 | from .readers.pl_lazyframe_reader import * # noqa 8 | from .readers.pl_reader import * # noqa 9 | -------------------------------------------------------------------------------- /tests/data/output/explode_multi.csv: -------------------------------------------------------------------------------- 1 | CompanyId,Customer Name,Total,Metadata.FirstName,Metadata.LastName,LineDetail.Id,LineDetail.Desc,LineDetail.Amount 2 | 100,Company 1,191.95,John,Smith,1,Bolts,101.15 3 | 100,Company 1,191.95,John,Smith,2,Smith,90.8 4 | 200,Company 2,241.25,Jane,Smith,1,Braces,51.15 5 | 200,Company 2,241.25,Jane,Smith,2,Wood,190.1 -------------------------------------------------------------------------------- /tests/data/input/multi_json.csv: -------------------------------------------------------------------------------- 1 | CompanyId,Customer Name,Metadata,LineDetail,Total 2 | 100,Company 1,"{""FirstName"": ""John"", ""LastName"": ""Smith""}","[ {""Id"": ""1"", ""Desc"": ""Bolts"", ""Amount"": 101.15}, {""Id"": ""2"", ""Desc"": ""Smith"", ""Amount"": 90.80} ]",191.95 3 | 200,Company 2,"{""FirstName"": ""Jane"", ""LastName"": ""Smith""}","[ {""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}, {""Id"": ""2"", ""Desc"": ""Wood"", ""Amount"": 190.10} ]",241.25 4 | 
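
Note: the fixture pairs above (multi_json.csv with explode_multi.csv, and the json_to_* input/output files) are exercised by gluestick's explode helpers. A minimal sketch of the multi-explode case, mirroring tests/etl_test.py later in this listing — the relative paths assume the working directory is tests/:

import gluestick as gs
import pandas as pd

# Input fixture: one row per company, a JSON object in "Metadata" and a
# JSON array in "LineDetail".
df = pd.read_csv("data/input/multi_json.csv", index_col=0)

# Flatten the Metadata object into "Metadata.*" columns, then fan the
# LineDetail array out into one row per array element.
transformed = (
    df.pipe(
        gs.explode_json_to_cols,
        "Metadata",
        reducer=gs.array_to_dict_reducer("Name", "StringValue"),
    )
    .pipe(gs.explode_json_to_rows, "LineDetail")
    .pipe(lambda x: x.astype({"LineDetail.Id": "float64"}))
    .pipe(lambda x: x.sort_index(axis=1))
)

expected = (
    pd.read_csv("data/output/explode_multi.csv", index_col=0)
    .astype({"LineDetail.Id": "float64"})
    .sort_index(axis=1)
)
assert transformed.equals(expected)
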
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.tox] 2 | pytest = "^6.2.5" 3 | tox = "^3.24.4" 4 | flake8 = "^3.9.2" 5 | black = "^21.9b0" 6 | pydocstyle = "^6.1.1" 7 | mypy = "^0.910" 8 | types-requests = "^2.26.1" 9 | isort = "^5.10.1" 10 | 11 | [tool.isort] 12 | profile = "black" 13 | multi_line_output = 3 # Vertical Hanging Indent 14 | src_paths = "tap_skuvault" 15 | 16 | [build-system] 17 | requires = [ "setuptools >= 35.0.2", "wheel >= 0.29.0"] 18 | build-backend = "setuptools.build_meta" 19 | -------------------------------------------------------------------------------- /tests/data/input/json_to_rows.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Line Detail 2 | Company 1,"[ {""Id"": ""1"", ""Desc"": ""Bolts"", ""Amount"": 101.15}, {""Id"": ""2"", ""Desc"": ""Smith"", ""Amount"": 90.80} ]" 3 | Company 2,"[ {""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}, {""Id"": ""2"", ""Desc"": ""Wood"", ""Amount"": 190.10} ]" 4 | Company 3,"[{""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}]" 5 | Company 4, 6 | Company 5,"{""Id"": ""1"", ""Desc"": ""Braces"", ""Amount"": 51.15}" 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="gluestick", 8 | version="3.0.3", 9 | description="ETL utility functions built for the hotglue iPaaS platform", 10 | long_description=long_description, 11 | long_description_content_type="text/markdown", 12 | url="https://github.com/hotgluexyz/gluestick", 13 | install_requires=[ 14 | "singer-python>=4.0.0", 15 | "numpy>=1.4", 16 | "pandas>=1.2.5", 17 | "pyarrow>=8.0.0", 18 | "pytz>=2022.6", 19 | "polars==1.34.0" 20 | ], 21 | author="hotglue", 22 | author_email="hello@hotglue.xyz", 23 | license="MIT", 24 | packages=find_packages(include=["gluestick", "gluestick.*"]), 25 | zip_safe=False, 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 hotglue 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gluestick [![Build Status](https://travis-ci.org/hotgluexyz/gluestick.svg?branch=master)](https://travis-ci.org/hotgluexyz/gluestick) 2 | ============= 3 | 4 | A small Python module containing quick utility functions for standard ETL processes. 5 | 6 | ## Installation ## 7 | 8 | ``` 9 | pip install gluestick 10 | ``` 11 | 12 | ## Links ## 13 | 14 | * [Source] 15 | * [Wiki] 16 | * [Issues] 17 | * [Slack] 18 | 19 | ## License ## 20 | [MIT] 21 | 22 | ## Dependencies ## 23 | * NumPy 24 | * Pandas 25 | 26 | ## Contributing ## 27 | This project is maintained by the [hotglue] team. We welcome contributions from the 28 | community via issues and pull requests. 29 | 30 | If you wish to chat with our team, feel free to join our [Slack]! 31 | 32 | 33 | [Source]: https://github.com/hotgluexyz/gluestick 34 | [Wiki]: https://github.com/hotgluexyz/gluestick/wiki 35 | [Issues]: https://github.com/hotgluexyz/gluestick/issues 36 | [MIT]: https://tldrlegal.com/license/mit-license 37 | [hotglue]: https://hotglue.xyz 38 | [Slack]: https://bit.ly/2KBGGq1 39 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38 3 | isolated_build = true 4 | 5 | [testenv] 6 | deps = 7 | pytest>=6.2.5 8 | tox>=3.24.4 9 | flake8>=3.9.2 10 | black>=21.9b0 11 | pydocstyle>=6.1.1 12 | mypy>=0.910 13 | types-requests>=2.26.1 14 | isort>=5.10.1 15 | commands = 16 | pytest 17 | black --check gluestick/ 18 | flake8 gluestick 19 | pydocstyle gluestick 20 | mypy gluestick --exclude='tests' 21 | 22 | [testenv:pytest] 23 | envlist = py37, py38, py39 24 | deps = pytest>=6.2.5 25 | commands = pytest 26 | 27 | [testenv:format] 28 | deps = 29 | black>=21.9b0 30 | isort>=5.10.1 31 | commands = 32 | black gluestick/ 33 | isort gluestick 34 | 35 | [testenv:lint] 36 | deps = 37 | flake8>=3.9.2 38 | black>=21.9b0 39 | pydocstyle>=6.1.1 40 | mypy>=0.910 41 | isort>=5.10.1 42 | commands = 43 | black --check --diff gluestick/ 44 | isort --check gluestick 45 | flake8 gluestick 46 | pydocstyle gluestick 47 | mypy gluestick --exclude='tests' --ignore-missing-imports 48 | 49 | [flake8] 50 | ignore = W503,C901,E501,E722,E721 51 | max-complexity = 10 52 | 53 | [pydocstyle] 54 | ignore = D105,D203,D213,D210,D413,D411,D401,D100 55 | -------------------------------------------------------------------------------- /tests/data/output/campaign_performance_parquet.csv: -------------------------------------------------------------------------------- 1 | customer_id,campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date 2 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.374269,1409648.1,171,DESKTOP,2025-03-25 3 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870433,1503589.8,301,MOBILE,2025-03-25 4 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.33333334,10000.0,3,TABLET,2025-03-25 5 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.3478261,1427715.4,161,DESKTOP,2025-03-26 6 | 
7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.33670035,1931005.0,297,MOBILE,2025-03-26 7 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.34117648,1322940.5,170,DESKTOP,2025-03-27 8 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2,340,MOBILE,2025-03-27 9 | 7950307320,customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000.0,2,TABLET,2025-03-27 10 | -------------------------------------------------------------------------------- /.github/workflows/ci_workflow.yml: -------------------------------------------------------------------------------- 1 | ### A CI workflow template that runs linting and python testing 2 | 3 | name: Test gluestick 4 | 5 | on: [push] 6 | 7 | jobs: 8 | # linting: 9 | 10 | # runs-on: ubuntu-latest 11 | # strategy: 12 | # matrix: 13 | # # Only lint using the primary version used for dev 14 | # python-version: ["3.10"] 15 | 16 | # steps: 17 | # - uses: actions/checkout@v2 18 | # - name: Set up Python ${{ matrix.python-version }} 19 | # uses: actions/setup-python@v2 20 | # with: 21 | # python-version: ${{ matrix.python-version }} 22 | # - name: Install dependencies 23 | # run: | 24 | # pip install . 25 | # pip install tox 26 | # - name: Run lint command from tox.ini 27 | # run: | 28 | # tox -e lint 29 | 30 | pytest: 31 | 32 | runs-on: ubuntu-latest 33 | env: 34 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 35 | strategy: 36 | matrix: 37 | python-version: ["3.10"] 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | - name: Set up Python ${{ matrix.python-version }} 42 | uses: actions/setup-python@v2 43 | with: 44 | python-version: ${{ matrix.python-version }} 45 | - name: Install dependencies 46 | run: | 47 | pip install . 
48 | pip install pytest 49 | - name: Test with pytest 50 | run: | 51 | pytest --capture=no 52 | -------------------------------------------------------------------------------- /gluestick/utils/polars_utils.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | def map_pd_type_to_polars(type_name): 4 | if not isinstance(type_name, str): 5 | # its a pd type class 6 | type_name = type_name.__name__ 7 | 8 | if type_name == "Int64": 9 | return pl.Int64 10 | elif type_name == "Float64": 11 | return pl.Float64 12 | elif type_name in ["Boolean", "bool", "boolean"]: 13 | return pl.Boolean 14 | elif type_name == "String": 15 | return pl.String 16 | elif type_name == "Datetime": 17 | return pl.Datetime(time_unit="ns", time_zone="UTC") 18 | elif type_name == "Date": 19 | return pl.Date 20 | elif type_name == "Time": 21 | return pl.Time 22 | elif type_name == "object": 23 | return pl.String 24 | elif type_name == "float": 25 | return pl.Float64 26 | elif type_name == "int": 27 | return pl.Int64 28 | else: 29 | raise ValueError(f"Unknown type: {type_name}") 30 | 31 | def cast_lf_from_schema(lf: pl.LazyFrame, types_params: dict): 32 | return lf.with_columns([ 33 | pl.col(col).cast(dtype, strict=True) for col, dtype in types_params.items() 34 | ]) 35 | 36 | def cast_df_from_schema(df: pl.DataFrame, types_params: dict): 37 | return df.with_columns([ 38 | pl.col(col).cast(dtype, strict=True) for col, dtype in types_params.items() 39 | ]) 40 | -------------------------------------------------------------------------------- /tests/data/input/campaign_csv-20250427T202522.csv: -------------------------------------------------------------------------------- 1 | campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date,customer_id 2 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.3742690058479532,1409648.078125,171,DESKTOP,2025-03-25,7950307320 3 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870431893687707,1503589.7435897435,301,MOBILE,2025-03-25,7950307320 4 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.3333333333333333,10000,3,TABLET,2025-03-25,7950307320 5 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.34782608695652173,1427715.357142857,161,DESKTOP,2025-03-26,7950307320 6 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.3367003367003367,1931004.94,297,MOBILE,2025-03-26,7950307320 7 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.3411764705882353,1322940.5344827587,170,DESKTOP,2025-03-27,7950307320 8 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2647058824,340,MOBILE,2025-03-27,7950307320 9 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000,2,TABLET,2025-03-27,7950307320 10 | -------------------------------------------------------------------------------- /tests/data/output/campaign_performance_csv.csv: -------------------------------------------------------------------------------- 1 | 
campaign__resourceName,campaign__status,campaign__name,metrics__clicks,metrics__costMicros,metrics__ctr,metrics__averageCpc,metrics__impressions,segments__device,segments__date,customer_id 2 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,64,90217477,0.3742690058479532,1409648.078125,171,DESKTOP,2025-03-25,7950307320 3 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,117,175920000,0.38870431893687707,1503589.7435897435,301,MOBILE,2025-03-25,7950307320 4 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,10000,0.3333333333333333,10000,3,TABLET,2025-03-25,7950307320 5 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,56,79952060,0.34782608695652173,1427715.357142857,161,DESKTOP,2025-03-26,7950307320 6 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,100,193100494,0.3367003367003367,1931004.94,297,MOBILE,2025-03-26,7950307320 7 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,58,76730551,0.3411764705882353,1322940.5344827587,170,DESKTOP,2025-03-27,7950307320 8 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,102,185445105,0.3,1818089.2647058824,340,MOBILE,2025-03-27,7950307320 9 | customers/7950307320/campaigns/12768130709,ENABLED,lorem-ipsum-search-brand,1,830000,0.5,830000,2,TABLET,2025-03-27,7950307320 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 86 | __pypackages__/ 87 | 88 | # Celery stuff 89 | celerybeat-schedule 90 | celerybeat.pid 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ 121 | 122 | # pytype static type analyzer 123 | .pytype/ 124 | 125 | # Cython debug symbols 126 | cython_debug/ 127 | 128 | # Misc 129 | .vscode 130 | .DS_Store 131 | -------------------------------------------------------------------------------- /tests/data/output/data.singer: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]} 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "64", "metrics__costMicros": "90217477", "metrics__ctr": 0.3742690086364746, "metrics__averageCpc": 1409648.125, "metrics__impressions": "171", "segments__device": "DESKTOP", "segments__date": "2025-03-25"}} 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "117", "metrics__costMicros": "175920000", "metrics__ctr": 0.3887043297290802, "metrics__averageCpc": 1503589.75, "metrics__impressions": "301", "segments__device": "MOBILE", "segments__date": "2025-03-25"}} 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "1", "metrics__costMicros": "10000", "metrics__ctr": 0.3333333432674408, "metrics__averageCpc": 10000.0, "metrics__impressions": "3", "segments__device": "TABLET", "segments__date": "2025-03-25"}} 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "56", "metrics__costMicros": "79952060", "metrics__ctr": 0.3478260934352875, "metrics__averageCpc": 1427715.375, "metrics__impressions": "161", "segments__device": "DESKTOP", "segments__date": "2025-03-26"}} 6 | {"type": 
"RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "100", "metrics__costMicros": "193100494", "metrics__ctr": 0.33670035004615784, "metrics__averageCpc": 1931005.0, "metrics__impressions": "297", "segments__device": "MOBILE", "segments__date": "2025-03-26"}} 7 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "58", "metrics__costMicros": "76730551", "metrics__ctr": 0.34117648005485535, "metrics__averageCpc": 1322940.5, "metrics__impressions": "170", "segments__device": "DESKTOP", "segments__date": "2025-03-27"}} 8 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "102", "metrics__costMicros": "185445105", "metrics__ctr": 0.30000001192092896, "metrics__averageCpc": 1818089.25, "metrics__impressions": "340", "segments__device": "MOBILE", "segments__date": "2025-03-27"}} 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "lorem-ipsum-search-brand", "metrics__clicks": "1", "metrics__costMicros": "830000", "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": "2", "segments__device": "TABLET", "segments__date": "2025-03-27"}} 10 | {"type": "STATE", "value": {}} -------------------------------------------------------------------------------- /tests/data/output/chunk_csv_campaign_performance.singer: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["integer", "null"]}, "metrics__costMicros": {"type": ["integer", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["integer", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}, "customer_id": {"type": ["integer", "null"]}}}, "key_properties": ["id"]} 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 64, "metrics__costMicros": 90217477, "metrics__ctr": 0.3742690058479532, "metrics__averageCpc": 1409648.078125, "metrics__impressions": 171, "segments__device": "DESKTOP", "segments__date": "2025-03-25", "customer_id": 7950307320}} 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", 
"metrics__clicks": 117, "metrics__costMicros": 175920000, "metrics__ctr": 0.388704318936877, "metrics__averageCpc": 1503589.7435897435, "metrics__impressions": 301, "segments__device": "MOBILE", "segments__date": "2025-03-25", "customer_id": 7950307320}} 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 1, "metrics__costMicros": 10000, "metrics__ctr": 0.3333333333333333, "metrics__averageCpc": 10000.0, "metrics__impressions": 3, "segments__device": "TABLET", "segments__date": "2025-03-25", "customer_id": 7950307320}} 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 56, "metrics__costMicros": 79952060, "metrics__ctr": 0.3478260869565217, "metrics__averageCpc": 1427715.357142857, "metrics__impressions": 161, "segments__device": "DESKTOP", "segments__date": "2025-03-26", "customer_id": 7950307320}} 6 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 100, "metrics__costMicros": 193100494, "metrics__ctr": 0.3367003367003367, "metrics__averageCpc": 1931004.94, "metrics__impressions": 297, "segments__device": "MOBILE", "segments__date": "2025-03-26", "customer_id": 7950307320}} 7 | {"type": "STATE", "value": {}} 8 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["integer", "null"]}, "metrics__costMicros": {"type": ["integer", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["integer", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}, "customer_id": {"type": ["integer", "null"]}}}, "key_properties": ["id"]} 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 58, "metrics__costMicros": 76730551, "metrics__ctr": 0.3411764705882353, "metrics__averageCpc": 1322940.5344827587, "metrics__impressions": 170, "segments__device": "DESKTOP", "segments__date": "2025-03-27", "customer_id": 7950307320}} 10 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": "customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 102, "metrics__costMicros": 185445105, "metrics__ctr": 0.3, "metrics__averageCpc": 1818089.2647058824, "metrics__impressions": 340, "segments__device": "MOBILE", "segments__date": "2025-03-27", "customer_id": 7950307320}} 11 | {"type": "RECORD", "stream": "campaign_performance", "record": {"campaign__resourceName": 
"customers/7950307320/campaigns/13768133709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": 1, "metrics__costMicros": 830000, "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": 2, "segments__device": "TABLET", "segments__date": "2025-03-27", "customer_id": 7950307320}} 12 | {"type": "STATE", "value": {}} 13 | -------------------------------------------------------------------------------- /tests/data/output/chunk_parquet_campaign_performance.singer: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]} 2 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "64", "metrics__costMicros": "90217477", "metrics__ctr": 0.3742690086364746, "metrics__averageCpc": 1409648.125, "metrics__impressions": "171", "segments__device": "DESKTOP", "segments__date": "2025-03-25"}} 3 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "117", "metrics__costMicros": "175920000", "metrics__ctr": 0.3887043297290802, "metrics__averageCpc": 1503589.75, "metrics__impressions": "301", "segments__device": "MOBILE", "segments__date": "2025-03-25"}} 4 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "1", "metrics__costMicros": "10000", "metrics__ctr": 0.3333333432674408, "metrics__averageCpc": 10000.0, "metrics__impressions": "3", "segments__device": "TABLET", "segments__date": "2025-03-25"}} 5 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "56", "metrics__costMicros": "79952060", "metrics__ctr": 0.3478260934352875, "metrics__averageCpc": 1427715.375, "metrics__impressions": "161", "segments__device": "DESKTOP", "segments__date": "2025-03-26"}} 6 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": 
"SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "100", "metrics__costMicros": "193100494", "metrics__ctr": 0.33670035004615784, "metrics__averageCpc": 1931005.0, "metrics__impressions": "297", "segments__device": "MOBILE", "segments__date": "2025-03-26"}} 7 | {"type": "STATE", "value": {}} 8 | {"type": "SCHEMA", "stream": "campaign_performance", "schema": {"type": ["object", "null"], "properties": {"customer_id": {"type": ["string", "null"]}, "campaign__resourceName": {"type": ["string", "null"]}, "campaign__status": {"type": ["string", "null"]}, "campaign__name": {"type": ["string", "null"]}, "metrics__clicks": {"type": ["string", "null"]}, "metrics__costMicros": {"type": ["string", "null"]}, "metrics__ctr": {"type": ["number", "null"]}, "metrics__averageCpc": {"type": ["number", "null"]}, "metrics__impressions": {"type": ["string", "null"]}, "segments__device": {"type": ["string", "null"]}, "segments__date": {"type": ["string", "null"]}}}, "key_properties": ["id"]} 9 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "58", "metrics__costMicros": "76730551", "metrics__ctr": 0.34117648005485535, "metrics__averageCpc": 1322940.5, "metrics__impressions": "170", "segments__device": "DESKTOP", "segments__date": "2025-03-27"}} 10 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "102", "metrics__costMicros": "185445105", "metrics__ctr": 0.30000001192092896, "metrics__averageCpc": 1818089.25, "metrics__impressions": "340", "segments__device": "MOBILE", "segments__date": "2025-03-27"}} 11 | {"type": "RECORD", "stream": "campaign_performance", "record": {"customer_id": "7950307320", "campaign__resourceName": "customers/7950307320/campaigns/12768130709", "campaign__status": "ENABLED", "campaign__name": "SuitShop-Google-OBcvr-OPpur-Search-Brand-RT-Hot", "metrics__clicks": "1", "metrics__costMicros": "830000", "metrics__ctr": 0.5, "metrics__averageCpc": 830000.0, "metrics__impressions": "2", "segments__device": "TABLET", "segments__date": "2025-03-27"}} 12 | {"type": "STATE", "value": {}} 13 | -------------------------------------------------------------------------------- /gluestick/readers/pl_reader.py: -------------------------------------------------------------------------------- 1 | from gluestick.reader import Reader 2 | from gluestick.utils.polars_utils import map_pd_type_to_polars, cast_df_from_schema 3 | import pyarrow.parquet as pq 4 | import polars as pl 5 | import pandas as pd 6 | import os 7 | 8 | 9 | class PolarsReader(Reader): 10 | 11 | def get(self, stream, default=None, catalog_types=True) -> pl.DataFrame | None: 12 | """ 13 | Reads the given stream from sync output and returns a pl.DataFrame. 14 | 15 | Parameters 16 | ---------- 17 | stream: str 18 | The name of the stream to read. 19 | default: pl.DataFrame | None 20 | The default value to return if the stream is not found. 21 | catalog_types: bool 22 | Whether to coerce the dataframe to the types given by the local catalog. 
23 | """ 24 | 25 | filepath = self.input_files.get(stream) 26 | if not filepath: 27 | return default 28 | 29 | if filepath.endswith(".parquet"): 30 | return self.get_parquet(stream, filepath, catalog_types) 31 | elif filepath.endswith(".csv"): 32 | return self.get_csv(stream, filepath, catalog_types) 33 | raise ValueError(f"Unsupported file type: {filepath}") 34 | 35 | def get_csv(self, stream, filepath, catalog_types=True): 36 | if catalog_types: 37 | catalog = self.read_catalog() 38 | if catalog: 39 | headers = pd.read_csv(filepath, nrows=0).columns.tolist() 40 | types_params = self.get_types_from_catalog(catalog, stream, headers=headers) 41 | if types_params: 42 | return pl.read_csv(filepath, dtypes=types_params) 43 | 44 | return pl.read_csv(filepath) 45 | 46 | def get_parquet(self, stream, filepath, catalog_types=True): 47 | df = pl.read_parquet(filepath) 48 | if catalog_types: 49 | catalog = self.read_catalog() 50 | if catalog: 51 | headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist() 52 | types_params = self.get_types_from_catalog(catalog, stream, headers=headers) 53 | if types_params: 54 | return cast_df_from_schema(df, types_params) 55 | return df 56 | 57 | def get_types_from_catalog(self, catalog, stream, headers=None): 58 | """Get the polars types base on the catalog definition.""" 59 | type_information = super().get_types_from_catalog(catalog, stream, headers) 60 | pd_types = type_information.get("dtype", {}) 61 | date_fields = type_information.get("parse_dates", []) 62 | pd_types = { 63 | k: "Datetime" 64 | if k in date_fields 65 | else v 66 | for k, v in pd_types.items() 67 | } 68 | return {col: map_pd_type_to_polars(pd_type) for col, pd_type in pd_types.items()} 69 | 70 | def read_snapshots(self,stream, snapshot_dir, **kwargs) -> pl.DataFrame | None: 71 | """Read a snapshot file and return a polars dataframe. 72 | 73 | Parameters 74 | ---------- 75 | stream: str 76 | The name of the stream to read the snapshot from. 77 | snapshot_dir: str 78 | The path to the snapshot directory. 79 | """ 80 | if os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.parquet"): 81 | return pl.read_parquet(source=f"{snapshot_dir}/{stream}.snapshot.parquet", **kwargs) 82 | elif os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.csv"): 83 | return pl.read_csv(source=f"{snapshot_dir}/{stream}.snapshot.csv", **kwargs) 84 | else: 85 | return None 86 | 87 | def snapshot_records( 88 | self, 89 | stream_data, 90 | stream, 91 | snapshot_dir, 92 | pk="id", 93 | just_new=False, 94 | use_csv=False, 95 | overwrite=False, 96 | ) -> pl.DataFrame | None: 97 | """Update a snapshot file and return the merged data. 98 | 99 | Parameters 100 | ---------- 101 | stream_data: pl.DataFrame 102 | The data to be included in the snapshot. 103 | stream: str 104 | The name of the stream of the snapshots. 105 | snapshot_dir: str 106 | The name of the stream of the snapshots. 107 | pk: str or list of str 108 | The primary key used for the snapshot. 109 | just_new: bool 110 | Return just the input data if True, else returns the whole data 111 | use_csv: bool 112 | Whether to use csv format for the snapshot instead of parquet. 113 | overwrite: bool 114 | Whether to overwrite the existing snapshot file instead of updating and merging. 115 | 116 | Returns 117 | ------- 118 | return: pl.DataFrame 119 | A polars dataframe with the merged data. 
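        Examples
        --------
        A hypothetical usage sketch (the snapshot directory name is a
        placeholder, not a path from this repository)::

            reader = PolarsReader()
            df = reader.get("campaign_performance")
            # Rows whose "id" already exists in the snapshot are replaced by
            # the incoming rows; all other snapshot rows are kept.
            merged = reader.snapshot_records(
                df, "campaign_performance", snapshot_dir="snapshots", pk="id"
            )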
120 | 121 | """ 122 | 123 | if isinstance(pk, str): 124 | pk = [pk] 125 | 126 | snapshot_df = self.read_snapshots(stream, snapshot_dir) 127 | if not overwrite and stream_data is not None and snapshot_df is not None: 128 | 129 | for key in pk: 130 | new_data_pk_df = stream_data.select(key) 131 | snapshot_df = snapshot_df.filter( 132 | ~pl.col(key).is_in(new_data_pk_df.get_column(key)) 133 | ) 134 | 135 | 136 | merged_df = pl.concat(items=[snapshot_df, stream_data], how="diagonal_relaxed") 137 | 138 | if use_csv: 139 | merged_df.write_csv(f"{snapshot_dir}/{stream}.snapshot.csv") 140 | else: 141 | merged_df.write_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet") 142 | 143 | 144 | if just_new: 145 | return stream_data 146 | else: 147 | return merged_df 148 | elif stream_data is not None: 149 | if use_csv: 150 | stream_data.write_csv(f"{snapshot_dir}/{stream}.snapshot.csv") 151 | else: 152 | stream_data.write_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet") 153 | 154 | return stream_data 155 | elif snapshot_df is not None: 156 | return snapshot_df 157 | else: 158 | return None 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /gluestick/readers/pl_lazyframe_reader.py: -------------------------------------------------------------------------------- 1 | from gluestick.reader import Reader 2 | from gluestick.utils.polars_utils import map_pd_type_to_polars, cast_lf_from_schema 3 | import pyarrow.parquet as pq 4 | import polars as pl 5 | import pandas as pd 6 | import os 7 | class PLLazyFrameReader(Reader): 8 | 9 | def get(self, stream, default=None, catalog_types=True) -> pl.LazyFrame | None: 10 | """ 11 | Reads the given stream from sync output and returns a pl.LazyFrame. 12 | 13 | Parameters 14 | ---------- 15 | stream: str 16 | The name of the stream to read. 17 | default: pl.LazyFrame | None 18 | The default value to return if the stream is not found. 19 | catalog_types: bool 20 | Whether to coerce the lazyframe to the types given by the local catalog. 
21 | """ 22 | 23 | filepath = self.input_files.get(stream) 24 | if not filepath: 25 | return default 26 | 27 | if filepath.endswith(".parquet"): 28 | return self.get_parquet(stream, filepath, catalog_types) 29 | elif filepath.endswith(".csv"): 30 | return self.get_csv(stream, filepath, catalog_types) 31 | raise ValueError(f"Unsupported file type: {filepath}") 32 | 33 | 34 | def get_csv(self, stream, filepath, catalog_types=True): 35 | if catalog_types: 36 | catalog = self.read_catalog() 37 | if catalog: 38 | headers = pd.read_csv(filepath, nrows=0).columns.tolist() 39 | types_params = self.get_types_from_catalog(catalog, stream, headers=headers) 40 | if types_params: 41 | return pl.scan_csv(filepath, schema=types_params) 42 | 43 | return pl.scan_csv(filepath) 44 | 45 | def get_parquet(self, stream, filepath, catalog_types=True): 46 | if catalog_types: 47 | catalog = self.read_catalog() 48 | if catalog: 49 | headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist() 50 | types_params = self.get_types_from_catalog(catalog, stream, headers=headers) 51 | lf = pl.scan_parquet(filepath) 52 | return cast_lf_from_schema(lf, types_params) 53 | 54 | return pl.scan_parquet(filepath) 55 | 56 | 57 | def get_types_from_catalog(self, catalog, stream, headers=None): 58 | """Get the polars types base on the catalog definition.""" 59 | type_information = super().get_types_from_catalog(catalog, stream, headers) 60 | pd_types = type_information.get("dtype", {}) 61 | date_fields = type_information.get("parse_dates", []) 62 | pd_types = { 63 | k: "Datetime" 64 | if k in date_fields 65 | else v 66 | for k,v in pd_types.items() 67 | } 68 | return {col: map_pd_type_to_polars(pd_type) for col, pd_type in pd_types.items()} 69 | 70 | def read_snapshots(self,stream, snapshot_dir, **kwargs) -> pl.LazyFrame | None: 71 | """Read a snapshot file. 72 | 73 | Parameters 74 | ---------- 75 | stream: str 76 | The name of the stream to read the snapshot from. 77 | snapshot_dir: str 78 | The path to the snapshot directory. 79 | """ 80 | if os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.parquet"): 81 | return pl.scan_parquet(source=f"{snapshot_dir}/{stream}.snapshot.parquet") 82 | elif os.path.isfile(path=f"{snapshot_dir}/{stream}.snapshot.csv"): 83 | return pl.scan_csv(source=f"{snapshot_dir}/{stream}.snapshot.csv") 84 | else: 85 | return None 86 | 87 | def snapshot_records( 88 | self, 89 | stream_data, 90 | stream, 91 | snapshot_dir, 92 | pk="id", 93 | just_new=False, 94 | use_csv=False, 95 | overwrite=False, 96 | ) -> pl.LazyFrame | None: 97 | """Update a snapshot file and return the merged data. 98 | 99 | Parameters 100 | ---------- 101 | stream_data: pl.LazyFrame 102 | The data to be included in the snapshot. 103 | stream: str 104 | The name of the stream of the snapshots. 105 | snapshot_dir: str 106 | The name of the stream of the snapshots. 107 | pk: str or list of str 108 | The primary key used for the snapshot. 109 | just_new: bool 110 | Return just the input data if True, else returns the whole data 111 | use_csv: bool 112 | Whether to use csv format for the snapshot instead of parquet. 113 | overwrite: bool 114 | Whether to overwrite the existing snapshot file instead of updating and merging. 115 | 116 | Returns 117 | ------- 118 | return: pl.LazyFrame 119 | A polars lazyframe with the merged data. 
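        Examples
        --------
        A hypothetical sketch (the snapshot directory name is a placeholder);
        the merged result stays lazy until it is collected::

            reader = PLLazyFrameReader()
            lf = reader.get("campaign_performance")
            merged = reader.snapshot_records(
                lf, "campaign_performance", snapshot_dir="snapshots", pk="id"
            )
            df = merged.collect()  # materialize the merged snapshot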
120 | 121 | """ 122 | 123 | if isinstance(pk, str): 124 | pk = [pk] 125 | 126 | snapshot_lf = self.read_snapshots(stream, snapshot_dir) 127 | if not overwrite and stream_data is not None and snapshot_lf is not None: 128 | 129 | for key in pk: 130 | new_data_pk_lf = stream_data.select(key).collect() 131 | snapshot_lf = snapshot_lf.filter( 132 | ~pl.col(key).is_in(new_data_pk_lf.get_column(key)) 133 | ) 134 | 135 | 136 | merged_lf = pl.concat(items=[snapshot_lf, stream_data],how="vertical_relaxed") 137 | 138 | if use_csv: 139 | merged_lf.sink_csv(f"{snapshot_dir}/{stream}.temp.snapshot.csv") 140 | os.remove(f"{snapshot_dir}/{stream}.snapshot.csv") 141 | os.rename(f"{snapshot_dir}/{stream}.temp.snapshot.csv", f"{snapshot_dir}/{stream}.snapshot.csv") 142 | else: 143 | merged_lf.sink_parquet(f"{snapshot_dir}/{stream}.temp.snapshot.parquet") 144 | os.remove(f"{snapshot_dir}/{stream}.snapshot.parquet") 145 | os.rename(f"{snapshot_dir}/{stream}.temp.snapshot.parquet", f"{snapshot_dir}/{stream}.snapshot.parquet") 146 | 147 | 148 | if just_new: 149 | return stream_data 150 | else: 151 | return merged_lf 152 | elif stream_data is not None: 153 | if use_csv: 154 | stream_data.sink_csv(f"{snapshot_dir}/{stream}.snapshot.csv") 155 | else: 156 | stream_data.sink_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet") 157 | 158 | return stream_data 159 | elif snapshot_lf is not None: 160 | return snapshot_lf 161 | else: 162 | return None 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /tests/etl_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import gluestick as gs 4 | import pandas as pd 5 | import pytest 6 | 7 | # Tests gluestick ETL utilities 8 | class TestETL(object): 9 | @classmethod 10 | def setup_class(cls): 11 | print("=====") 12 | print("setup") 13 | 14 | # TODO: Test join 15 | 16 | # Run explode_json_to_cols 17 | def test_explode_json_to_cols(self): 18 | print("=====") 19 | print("test_explode_json_to_cols") 20 | 21 | # Read data 22 | dirname = os.path.dirname(__file__) 23 | df = pd.read_csv( 24 | os.path.join(dirname, "data/input/json_to_cols.csv"), index_col=0 25 | ) 26 | expected_df = pd.read_csv( 27 | os.path.join(dirname, "data/output/json_to_cols.csv"), index_col=0 28 | ) 29 | 30 | # Explode 31 | r = gs.array_to_dict_reducer("Name", "StringValue") 32 | df2 = gs.explode_json_to_cols(df, "Metadata", reducer=r) 33 | print(df2) 34 | 35 | assert df2.equals(expected_df) 36 | print("test_explode_json_to_cols output is correct") 37 | 38 | def test_explode_json_to_cols_unique(self): 39 | print("=====") 40 | print("test_explode_json_to_cols_unique") 41 | 42 | # Read data 43 | dirname = os.path.dirname(__file__) 44 | df = pd.read_csv( 45 | os.path.join(dirname, "data/input/json_to_cols_unique.csv"), index_col=0 46 | ) 47 | expected_df = pd.read_csv( 48 | os.path.join(dirname, "data/output/json_to_cols_unique.csv"), index_col=0 49 | ) 50 | 51 | # Explode 52 | df2 = gs.explode_json_to_cols(df, "Metadata") 53 | print(df2) 54 | 55 | assert df2.equals(expected_df) 56 | print("test_explode_json_to_cols_unique output is correct") 57 | 58 | # Run explode_json_to_rows 59 | def test_explode_json_to_rows(self): 60 | print("=====") 61 | print("test_explode_json_to_rows") 62 | 63 | # Read data 64 | dirname = os.path.dirname(__file__) 65 | df = pd.read_csv( 66 | os.path.join(dirname, "data/input/json_to_rows.csv"), index_col=0 67 | ) 68 | expected_df = pd.read_csv( 69 
| os.path.join(dirname, "data/output/json_to_rows.csv"), index_col=0 70 | ).astype({"Line Detail.Id": "float64"}) 71 | 72 | # Explode 73 | df2 = gs.explode_json_to_rows(df, "Line Detail").astype( 74 | {"Line Detail.Id": "float64"} 75 | ) 76 | assert df2.equals(expected_df) 77 | print("test_explode_json_to_rows output is correct") 78 | 79 | def test_explode_multi(self): 80 | print("=====") 81 | print("test_explode_multi") 82 | 83 | # Read data 84 | dirname = os.path.dirname(__file__) 85 | df = pd.read_csv( 86 | os.path.join(dirname, "data/input/multi_json.csv"), index_col=0 87 | ) 88 | expected_df = ( 89 | pd.read_csv( 90 | os.path.join(dirname, "data/output/explode_multi.csv"), index_col=0 91 | ) 92 | .pipe(lambda x: x.astype({"LineDetail.Id": "float64"})) 93 | .pipe(lambda x: x.sort_index(axis=1)) 94 | ) 95 | 96 | transformed_df = ( 97 | df.pipe( 98 | gs.explode_json_to_cols, 99 | "Metadata", 100 | reducer=gs.array_to_dict_reducer("Name", "StringValue"), 101 | ) 102 | .pipe(gs.explode_json_to_rows, "LineDetail") 103 | .pipe(lambda x: x.astype({"LineDetail.Id": "float64"})) 104 | .pipe(lambda x: x.sort_index(axis=1)) 105 | ) 106 | assert transformed_df.equals(expected_df) 107 | 108 | # changing order should not matter 109 | transformed_df = ( 110 | df.pipe(gs.explode_json_to_rows, "LineDetail") 111 | .pipe( 112 | gs.explode_json_to_cols, 113 | "Metadata", 114 | reducer=gs.array_to_dict_reducer("Name", "StringValue"), 115 | ) 116 | .pipe(lambda x: x.astype({"LineDetail.Id": "float64"})) 117 | .pipe(lambda x: x.sort_index(axis=1)) 118 | ) 119 | assert transformed_df.equals(expected_df) 120 | 121 | print("test_explode_multi output is correct") 122 | 123 | 124 | def test_to_export(self, tmp_path): 125 | print("=====") 126 | print("test_to_export") 127 | dir_name = os.path.dirname(__file__) 128 | input = gs.Reader(dir=os.path.join(dir_name, "data/input")) 129 | 130 | campaign_parquet_df = input.get("campaign_performance") 131 | campaign_csv_df = input.get("campaign_csv") 132 | 133 | # Define stream name and output file 134 | stream_name = "campaign_performance" 135 | output_dir = tmp_path 136 | 137 | true_output_data = {} 138 | 139 | 140 | singer_output_path = os.path.join(dir_name, "data/output/data.singer") 141 | csv_csv_output_path = os.path.join(dir_name, "data/output/campaign_performance_csv.csv") 142 | parquet_csv_output_path = os.path.join(dir_name, "data/output/campaign_performance_parquet.csv") 143 | 144 | parquet_parquet_output_path = os.path.join(dir_name, "data/output/campaign_performance_parquet.parquet") 145 | csv_parquet_output_path = os.path.join(dir_name, "data/output/campaign_performance_csv.parquet") 146 | true_output_data["singer"] = open(singer_output_path, "r").read() 147 | 148 | 149 | 150 | 151 | for type, df, output_csv_path, output_parquet_path in [ 152 | ("parquet", campaign_parquet_df, parquet_csv_output_path, parquet_parquet_output_path), 153 | ("csv", campaign_csv_df, csv_csv_output_path, csv_parquet_output_path) 154 | ]: 155 | 156 | # Read the output file 157 | singer_output_file = output_dir / "data.singer" 158 | if singer_output_file.exists(): 159 | singer_output_file.unlink() 160 | 161 | # Test singer export 162 | gs.to_export( 163 | campaign_parquet_df, 164 | name=stream_name, 165 | output_dir=output_dir, 166 | keys=["id"] 167 | ) 168 | 169 | assert singer_output_file.exists(), f"{type} -> Singer Output file {singer_output_file} does not exist." 
170 | 171 | with open(singer_output_file, "r") as f: 172 | test_lines = [json.loads(line) for line in f if line.strip()] 173 | 174 | with open(singer_output_path, "r") as f: 175 | true_lines = [json.loads(line) for line in f if line.strip()] 176 | 177 | assert test_lines == true_lines, f"{type} -> Singer output is incorrect" 178 | 179 | # Test CSV Export 180 | csv_output_file = output_dir / "campaign_performance.csv" 181 | if csv_output_file.exists(): 182 | csv_output_file.unlink() 183 | 184 | gs.to_export( 185 | df, 186 | name=stream_name, 187 | output_dir=output_dir, 188 | export_format="csv", 189 | keys=["id"] 190 | ) 191 | 192 | test_output_df = pd.read_csv(csv_output_file) 193 | true_output_df = pd.read_csv(output_csv_path) 194 | 195 | assert csv_output_file.exists(), f"{type} -> CSV Output file {csv_output_file} does not exist." 196 | 197 | assert test_output_df.equals(true_output_df), f"{type} -> CSV output is incorrect" 198 | 199 | 200 | # Test parquet export 201 | parquet_output_file = output_dir / "campaign_performance.parquet" 202 | if parquet_output_file.exists(): 203 | parquet_output_file.unlink() 204 | 205 | true_output_df = pd.read_parquet(path=output_parquet_path) 206 | 207 | gs.to_export( 208 | df, 209 | name=stream_name, 210 | output_dir=output_dir, 211 | export_format="parquet", 212 | keys=["id"] 213 | ) 214 | 215 | test_output_df = pd.read_parquet(path=parquet_output_file) 216 | 217 | assert parquet_output_file.exists(), f"{type} -> Parquet Output file {parquet_output_file} does not exist." 218 | 219 | for col in test_output_df.columns: 220 | print("Dtype in test: ", test_output_df[col].dtype) 221 | print("Dtype in true: ", true_output_df[col].dtype) 222 | assert test_output_df[col].equals(true_output_df[col]), f"{type} -> Column {col} is incorrect" 223 | 224 | 225 | 226 | 227 | 228 | 229 | print("test to_export output is correct") 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /gluestick/reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from pandas.io.parsers import TextFileReader 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | 8 | class Reader: 9 | """A reader for gluestick ETL files.""" 10 | 11 | ROOT_DIR = os.environ.get("ROOT_DIR", ".") 12 | INPUT_DIR = f"{ROOT_DIR}/sync-output" 13 | 14 | def __init__(self, dir=INPUT_DIR, root=ROOT_DIR): 15 | """Init the class and read directories. 16 | 17 | Parameters 18 | ---------- 19 | dir: str 20 | Directory with the input data. 21 | root: str 22 | Root directory. 
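        Examples
        --------
        A minimal sketch (the directory is an assumption; by default the
        reader looks for sync output under ROOT_DIR/sync-output)::

            reader = Reader(dir="tests/data/input")
            df = reader.get("campaign_performance")  # DataFrame, or None if the stream is missing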
23 | 24 | """ 25 | self.root = root 26 | self.dir = dir 27 | self.input_files = self.read_directories() 28 | 29 | def __dict__(self): 30 | return self.input_files 31 | 32 | def __str__(self): 33 | return str(list(self.input_files.keys())) 34 | 35 | def __repr__(self): 36 | return str(list(self.input_files.keys())) 37 | 38 | def read_parquet_with_chunks(self, filepath, chunksize): 39 | parquet_file = pq.ParquetFile(filepath) 40 | 41 | for batch in parquet_file.iter_batches(batch_size=chunksize): 42 | df = batch.to_pandas(safe=False) 43 | # TODO: add support for catalog types 44 | yield df 45 | 46 | def get(self, stream, default=None, catalog_types=False, **kwargs): 47 | """Read the selected file.""" 48 | filepath = self.input_files.get(stream) 49 | if not filepath: 50 | return default 51 | if filepath.endswith(".parquet"): 52 | if kwargs.get("chunksize"): 53 | return self.read_parquet_with_chunks(filepath, kwargs.get("chunksize")) 54 | 55 | catalog = self.read_catalog() 56 | if catalog and catalog_types: 57 | try: 58 | headers = pq.read_table(filepath).to_pandas(safe=False).columns.tolist() 59 | types_params = self.get_types_from_catalog(catalog, stream, headers=headers) 60 | dtype_dict = types_params.get('dtype') 61 | parse_dates = types_params.get('parse_dates') 62 | 63 | # Mapping pandas dtypes to pyarrow types 64 | type_mapping = { 65 | 'int64': pa.int64(), 66 | 'float64': pa.float64(), 67 | "": pa.float64(), 68 | 'string': pa.string(), 69 | 'object': pa.string(), 70 | 'datetime64[ns]': pa.timestamp('ns'), 71 | 'bool': pa.bool_(), 72 | 'boolean': pa.bool_(), 73 | # TODO: Add more mappings as needed 74 | } 75 | 76 | if dtype_dict: 77 | # Convert dtype dictionary to pyarrow schema 78 | fields = [(col, type_mapping[str(dtype).lower()]) for col, dtype in dtype_dict.items()] 79 | fields.extend([(col, pa.timestamp('ns')) for col in parse_dates]) 80 | schema = pa.schema(fields) 81 | df = pq.read_table(filepath, schema=schema).to_pandas(safe=False) 82 | for col, dtype in dtype_dict.items(): 83 | # NOTE: bools require explicit conversion at the end because if there are empty values (NaN) 84 | # pyarrow/pd defaults to convert to string 85 | if str(dtype).lower() in ["bool", "boolean"]: 86 | df[col] = df[col].astype('boolean') 87 | elif str(dtype).lower() in ["int64"]: 88 | df[col] = df[col].astype('Int64') 89 | elif str(dtype).lower() in ["object", "string"]: 90 | df[col] = df[col].astype("string") 91 | return df 92 | except: 93 | # NOTE: silencing errors to avoid breaking existing workflow 94 | print(f"Failed to parse catalog_types for {stream}. 
Ignoring.") 95 | pass 96 | 97 | return pq.read_table(filepath).to_pandas(safe=False) 98 | catalog = self.read_catalog() 99 | if catalog and catalog_types: 100 | types_params = self.get_types_from_catalog(catalog, stream) 101 | kwargs.update(types_params) 102 | df = pd.read_csv(filepath, **kwargs) 103 | 104 | # needed to handle chunked CSVs properly 105 | if isinstance(df, TextFileReader): 106 | return df, kwargs.get("parse_dates", []) 107 | 108 | # if a date field value is empty read_csv will read it as "object" 109 | # make sure all date fields are typed as date 110 | for date_col in kwargs.get("parse_dates", []): 111 | df[date_col] = pd.to_datetime(df[date_col], errors='coerce', utc=True) 112 | 113 | return df 114 | 115 | def get_metadata(self, stream): 116 | """Get metadata from parquet file.""" 117 | file = self.input_files.get(stream) 118 | if file is None: 119 | raise FileNotFoundError(f"There is no file for stream with name {stream}.") 120 | if file.endswith(".parquet"): 121 | return { 122 | k.decode(): v.decode() 123 | for k, v in pq.read_metadata(file).metadata.items() 124 | } 125 | return {} 126 | 127 | def get_pk(self, stream): 128 | """Get pk from parquet file or catalog if available.""" 129 | key_properties = [] 130 | if self.read_directories().get(stream, "").endswith(".parquet"): 131 | metadata = self.get_metadata(stream) 132 | if metadata.get("key_properties"): 133 | key_properties = eval(metadata["key_properties"]) 134 | else: 135 | catalog = self.read_catalog() 136 | 137 | if catalog is not None: 138 | streams = next( 139 | (c for c in catalog["streams"] if c.get("stream") == stream), None 140 | ) 141 | if streams and streams.get("metadata"): 142 | breadcrumb = next( 143 | s for s in streams["metadata"] if not s["breadcrumb"] 144 | ) 145 | if breadcrumb: 146 | key_properties = breadcrumb.get("metadata", {}).get( 147 | "table-key-properties", [] 148 | ) 149 | return key_properties 150 | 151 | def read_directories(self, ignore=[]): 152 | """Read all the available directories for input files. 153 | 154 | Parameters 155 | ---------- 156 | ignore: list 157 | Stream names to ignore. 158 | 159 | Returns 160 | ------- 161 | return: dict 162 | Dict with the name of the streams and their paths. 
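        Examples
        --------
        Illustrative output only, assuming a ``Reader`` instance named
        ``reader``; the file name below is a placeholder:

        IN[1]: reader.read_directories()
        Out[1]: {'Account': './sync-output/Account-20200811T121507.csv'}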
163 | 164 | """ 165 | is_directory = os.path.isdir(self.dir) 166 | all_files = [] 167 | results = {} 168 | if is_directory: 169 | for entry in os.listdir(self.dir): 170 | file_path = os.path.join(self.dir, entry) 171 | if os.path.isfile(file_path): 172 | if file_path.endswith(".csv") or file_path.endswith(".parquet"): 173 | all_files.append(file_path) 174 | else: 175 | all_files.append(self.dir) 176 | 177 | for file in all_files: 178 | split_path = file.split("/") 179 | entity_type = split_path[len(split_path) - 1].rsplit(".", 1)[0] 180 | 181 | if "-" in entity_type: 182 | entity_type = entity_type.rsplit("-", 1)[0] 183 | 184 | if entity_type not in results and entity_type not in ignore: 185 | results[entity_type] = file 186 | 187 | return results 188 | 189 | def read_catalog(self): 190 | """Read the catalog.json file.""" 191 | file_name = f"{self.root}/catalog.json" 192 | if os.path.isfile(file_name): 193 | with open(file_name) as f: 194 | catalog = json.load(f) 195 | print(f"Finished loading source catalog.") 196 | else: 197 | print(f"Source catalog not found at {file_name}.") 198 | catalog = None 199 | return catalog 200 | 201 | def clean_catalog(self, catalog): 202 | clean_catalog = {} 203 | if "streams" in catalog : 204 | for stream_info in catalog ["streams"]: 205 | # Use 'stream' preferentially, fallback to 'tap_stream_id' 206 | stream_name = stream_info.get("stream") or stream_info.get("tap_stream_id") 207 | schema_properties = stream_info.get("schema", {}).get("properties", {}) 208 | if stream_name and schema_properties: 209 | clean_catalog[stream_name] = schema_properties 210 | print(f"Finished loading target schemas for streams: {list(clean_catalog.keys())}") 211 | return clean_catalog 212 | 213 | def read_target_catalog(self, process_schema=False): 214 | """Read the target catalog.json file.""" 215 | filename = f"{self.root}/target-catalog.json" 216 | 217 | if not os.path.exists(filename): 218 | print(f"Target catalog not found at {filename}.") 219 | return None 220 | 221 | with open(filename, "r", encoding="utf-8") as f: 222 | raw_target_catalog = json.load(f) 223 | 224 | if not process_schema: 225 | return raw_target_catalog 226 | 227 | return raw_target_catalog , self.clean_catalog(raw_target_catalog) 228 | 229 | def get_types_from_catalog(self, catalog, stream, headers=None): 230 | """Get the pandas types base on the catalog definition. 231 | 232 | Parameters 233 | ---------- 234 | catalog: dict 235 | The singer catalog used on the tap. 236 | stream: str 237 | The name of the stream. 238 | 239 | Returns 240 | ------- 241 | return: dict 242 | Dict with arguments to be used by pandas. 
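        Examples
        --------
        Illustrative output only; the stream and column names are placeholders
        and ``catalog`` is assumed to be a parsed catalog.json dict:

        IN[1]: reader.get_types_from_catalog(catalog, "invoices")
        Out[1]: {'dtype': {'id': 'Int64', 'status': 'object'}, 'parse_dates': ['updated_at']}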
243 | 244 | """ 245 | filepath = self.input_files.get(stream) 246 | if headers is None: 247 | headers = pd.read_csv(filepath, nrows=0).columns.tolist() 248 | 249 | streams = next((c for c in catalog["streams"] if c["stream"] == stream or c["tap_stream_id"] == stream), None) 250 | if not streams: 251 | return dict() 252 | types = streams["schema"]["properties"] 253 | 254 | type_mapper = {"integer": "Int64", "number": float, "boolean": "boolean"} 255 | 256 | dtype = {} 257 | parse_dates = [] 258 | for col in headers: 259 | col_type = types.get(col) 260 | if col_type: 261 | # if col has multiple types, use type with format if it not exists assign type object to support multiple types 262 | any_of_list = col_type.get("anyOf", []) 263 | if any_of_list: 264 | type_with_format = next((col_t for col_t in any_of_list if "format" in col_t), None) 265 | col_type = type_with_format if type_with_format else {"type": "object"} 266 | if col_type.get("format") == "date-time": 267 | parse_dates.append(col) 268 | continue 269 | if col_type.get("type"): 270 | catalog_type = [t for t in col_type["type"] if t != "null"] 271 | if len(catalog_type) == 1: 272 | dtype[col] = type_mapper.get(catalog_type[0], "object") 273 | continue 274 | dtype[col] = "object" 275 | 276 | return dict(dtype=dtype, parse_dates=parse_dates) 277 | -------------------------------------------------------------------------------- /gluestick/pandas_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for pandas dataframes containing objects.""" 2 | 3 | import ast 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pandas.io.json._normalize import nested_to_record 8 | from gluestick.reader import Reader 9 | 10 | 11 | def json_tuple_to_cols( 12 | df, 13 | column_name, 14 | col_config={ 15 | "cols": {"key_prop": "Name", "value_prop": "Value"}, 16 | "look_up": {"key_prop": "name", "value_prop": "value"}, 17 | }, 18 | ): 19 | """Convert a column with a JSON tuple in it to two column. 20 | 21 | Parameters 22 | ---------- 23 | df: pd.DataFrame 24 | The input pandas data frame. 25 | column_name: str 26 | Column with the json tuple. 27 | col_config: 28 | Conversion config. 29 | 30 | Returns 31 | ------- 32 | return: pd.DataFrame 33 | A dataframe with the new columns. 34 | 35 | Examples 36 | -------- 37 | IN[51]: qb_lookup_keys = {'key_prop': 'name', 'value_prop': 'value'} 38 | IN[52]: invoices = json_tuple_to_cols( 39 | invoices, 40 | 'Line.DiscountLineDetail.DiscountAccountRef', 41 | col_config={ 42 | 'cols': { 43 | 'key_prop': 'Discount Details', 44 | 'value_prop': 'Discount %' 45 | }, 46 | 'look_up': qb_lookup_keys 47 | } 48 | ) 49 | 50 | """ 51 | 52 | def get_value(y, prop): 53 | value = y 54 | if type(value) is str: 55 | value = ast.literal_eval(y) 56 | if type(value) is dict: 57 | return value.get(prop) 58 | if type(value) is list: 59 | return value[0].get(prop) 60 | else: 61 | return None 62 | 63 | df[col_config["cols"]["key_prop"]] = df[column_name].apply( 64 | lambda y: get_value(y, col_config["look_up"]["key_prop"]) 65 | ) 66 | df[col_config["cols"]["value_prop"]] = df[column_name].apply( 67 | lambda y: get_value(y, col_config["look_up"]["value_prop"]) 68 | ) 69 | 70 | return df.drop(column_name, 1) 71 | 72 | 73 | def rename(df, target_columns): 74 | """Rename columns in DataFrame using a json format. 75 | 76 | Notes 77 | ----- 78 | Also allow for converting the types of the values. 79 | 80 | Parameters 81 | ---------- 82 | df: pd.DataFrame 83 | The input pandas data frame. 
84 | target_columns: dict 85 | Dictionary with the columns to rename. 86 | 87 | Returns 88 | ------- 89 | return: pd.DataFrame 90 | Modified data frame with the renamed columns. 91 | 92 | Examples 93 | -------- 94 | IN[52]: rename(df, ) 95 | Out[52]: 96 | {'dict1.c': 1, 97 | 'dict1.d': 2, 98 | 'flat1': 1, 99 | 'nested.d': 2, 100 | 'nested.e.c': 1, 101 | 'nested.e.d': 2} 102 | 103 | """ 104 | if target_columns is not None: 105 | if isinstance(target_columns, list): 106 | return df[target_columns] 107 | elif isinstance(target_columns, dict): 108 | idx1 = pd.Index(target_columns.keys()) 109 | idx2 = pd.Index(df.columns) 110 | target_column_names = idx1.intersection(idx2).array 111 | return df[target_column_names].rename(columns=target_columns) 112 | return df 113 | 114 | def enforce_exploded_col_types(df, column_name, stream=None): 115 | """Enforce types for columns created by exploded fields for better consistency. 116 | 117 | Notes 118 | ----- 119 | Enforce types for columns created by exploded fields using catalog if defined there 120 | or enforce nullable booleans for consistency 121 | 122 | Parameters 123 | ---------- 124 | df: pd.DataFrame 125 | The input pandas data frame. 126 | column_name: str 127 | The name of the column that should be exploded. 128 | stream: str 129 | Stream name to enforce types using catalog typing 130 | """ 131 | 132 | # enforce types for booleans and integers 133 | field_schema = None 134 | exploded_columns = [col for col in df.columns if col.startswith(f"{column_name}.")] 135 | 136 | if stream: 137 | input = Reader() 138 | catalog = input.read_catalog() 139 | stream_schema = [s for s in catalog["streams"] if s["tap_stream_id"] == stream] 140 | if stream_schema: 141 | field_schema = stream_schema[0].get("schema", {}).get("properties", {}).get(column_name) 142 | 143 | if field_schema and "properties" in field_schema: 144 | for col in exploded_columns: 145 | col_name = col.split(".")[-1] 146 | col_type = field_schema.get("properties").get(col_name, {}).get("type") 147 | if isinstance(col_type, list) and col_type: 148 | col_type = next(iter([t for t in col_type if t != "null"]), None) 149 | if col_type: 150 | if col_type in ["bool", "boolean"]: 151 | df[col] = df[col].astype("boolean") 152 | elif col_type in ["int", "integer"]: 153 | df[col] = df[col].astype("Int64") 154 | 155 | else: 156 | for col in exploded_columns: 157 | # if all column values are false let pandas infere type 158 | if df[col].dropna().empty: 159 | continue 160 | 161 | first_non_null_value = df[col].dropna().iloc[0] 162 | if type(first_non_null_value) in [list, dict, str]: 163 | continue 164 | # if all not null fields are bool type column as boolean 165 | are_all_boolean = df[col].dropna().apply(lambda x: isinstance(x, bool)).all() 166 | if are_all_boolean: 167 | df[col] = df[col].astype("boolean") 168 | continue 169 | # Enforcing only boolean types if "field_schema" is not present, 170 | # as pandas automatically converts integers with NaN values (e.g., 2 to 2.0), 171 | return df 172 | 173 | 174 | def explode_json_to_rows(df, column_name, drop=True, stream=None, **kwargs): 175 | """Explodes a column with an array of objects into multiple rows. 176 | 177 | Notes 178 | ----- 179 | Convert an array of objects into a list of dictionaries and explode it into 180 | multiple rows and columns, one column for each dictionary key and one row for each 181 | object inside the array. 182 | 183 | Parameters 184 | ---------- 185 | df: pd.DataFrame 186 | The input pandas data frame. 
187 | column_name: str 188 | The name of the column that should be exploded. 189 | drop: boolean 190 | To drop or not the exploded column. 191 | stream: str 192 | Stream name to enforce types using catalog typing 193 | **kwargs: 194 | Additional arguments. 195 | 196 | 197 | Returns 198 | ------- 199 | return: pd.DataFrame 200 | New data frame with the JSON line expanded into columns and rows. 201 | 202 | Examples 203 | -------- 204 | IN[52]: explode_json_to_rows(df, df['Line'] ) 205 | an example of the line would be: 206 | [ 207 | { 208 | "Id":"1", 209 | "LineNum":"1", 210 | "Amount":275, 211 | "DetailType":"SalesItemLineDetail", 212 | "SalesItemLineDetail":{"ItemRef":{"value":"5","name":"Rock Fountain"}, 213 | "ItemAccountRef":{"value":"79","name":"Sales of Product Income"}, 214 | "TaxCodeRef":{"value":"TAX","name":null}, 215 | "SubTotalLineDetail":null, 216 | "DiscountLineDetail":null 217 | }, 218 | { 219 | "Id":"2", 220 | "LineNum":"2", 221 | "Amount":12.75, 222 | "DetailType":"SalesItemLineDetail", 223 | "SalesItemLineDetail":{"ItemRef":{"value":"11","name":"Pump"}, 224 | "ItemAccountRef":{"value":"79","name":"Sales of Product Income"}, 225 | "TaxCodeRef":{"value":"TAX","name":null}, 226 | "SubTotalLineDetail":null, 227 | "DiscountLineDetail":null 228 | }, 229 | { 230 | "Id":"3", 231 | "LineNum":"3", 232 | "Amount":47.5, 233 | "DetailType":"SalesItemLineDetail", 234 | "SalesItemLineDetail":{"ItemRef":{"value":"3","name":"Concrete"}, 235 | "ItemAccountRef":{ 236 | "value":"48", 237 | "name":"Landscaping Services:Job Materials" 238 | }, 239 | "TaxCodeRef":{"value":"TAX","name":null}, 240 | "SubTotalLineDetail":null, 241 | "DiscountLineDetail":null 242 | }, 243 | { 244 | "Id":null, 245 | "LineNum":null, 246 | "Amount":335.25, 247 | "DetailType":"SubTotalLineDetail", 248 | "SalesItemLineDetail":null, 249 | "SubTotalLineDetail":{}, 250 | "DiscountLineDetail":null 251 | } 252 | ] 253 | Out[52]: 254 | Line.Id Line.LineNum Line.Amount Line.DetailType 255 | Index 256 | 1037 1 1 275.00 SalesItemLineDetail 257 | 1037 2 2 12.75 SalesItemLineDetail 258 | 1037 3 3 47.50 SalesItemLineDetail 259 | 1037 None None 335.25 SubTotalLineDetail 260 | 1036 1 1 50.00 SalesItemLineDetail 261 | 262 | """ 263 | # Explode to new rows 264 | max_level = kwargs.get("max_level", 1) 265 | 266 | def to_list(y, parser=ast.literal_eval): 267 | if type(y) is str: 268 | y = parser(y) 269 | 270 | if type(y) is not list: 271 | y = [y] 272 | 273 | return y 274 | 275 | def flatten(y): 276 | if type(y) is dict: 277 | return pd.Series(nested_to_record(y, sep=".", max_level=max_level)) 278 | else: 279 | return pd.Series(dtype=np.float64) 280 | 281 | parser = kwargs.get("parser", ast.literal_eval) 282 | df[column_name] = df[column_name].apply(to_list, parser=parser) 283 | 284 | df = df.explode(column_name) 285 | 286 | df = pd.concat( 287 | [df, df[column_name].apply(flatten).add_prefix(f"{column_name}.")], axis=1 288 | ) 289 | if drop: 290 | df.drop(column_name, axis=1, inplace=True) 291 | 292 | # enforce types 293 | df = enforce_exploded_col_types(df, column_name, stream) 294 | return df 295 | 296 | 297 | def explode_json_to_cols(df: pd.DataFrame, column_name: str, **kwargs): 298 | """Convert a JSON column that has an array value into a DataFrame. 299 | 300 | Notes 301 | ----- 302 | Arrays such as [{"Name": "First", "Value": "Jo"},{"Name": "Last", "Value": "Do"}] 303 | with a column for each value are converted to pandas DataFrame. 
Note that the new 304 | series produced from the JSON will be de-duplicated and inner joined with the 305 | index. 306 | 307 | Parameters 308 | ---------- 309 | df: pd.DataFrame 310 | The input pandas data frame. 311 | column_name: str 312 | The name of the column that should be exploded. 313 | **kwargs: 314 | Additional arguments. 315 | 316 | Returns 317 | ------- 318 | return: pd.DataFrame 319 | New data frame with the JSON line expanded into columns. 320 | 321 | Examples 322 | -------- 323 | IN[5]: explode_json_to_cols(df, 'ProductRef' ) 324 | an example of the ProductRef would be: 325 | {"value": "Hi Tea Chipper","name": "Product"}, 326 | Out[5]: 327 | Product 328 | Index 329 | 1037 Hi Tea Chipper 330 | 331 | """ 332 | drop = kwargs.get("drop", True) 333 | expected_keys = kwargs.get("expected_keys", ["value", "name"]) 334 | 335 | if not kwargs.get("inplace"): 336 | df = df.copy() 337 | 338 | df[column_name] = df[column_name].fillna("{}") 339 | parser = kwargs.get("parser", ast.literal_eval) 340 | 341 | df[column_name] = df[column_name].apply( 342 | lambda x: parser(x) if isinstance(x, str) else x 343 | ) 344 | 345 | cols = df[column_name].apply(lambda x: x.keys()).explode().unique().tolist() 346 | cols = [x for x in cols if x == x] 347 | if cols: 348 | default_dict = {c: np.nan for c in cols} 349 | cols = [f"{column_name}.{col}" for col in cols] 350 | else: 351 | default_dict = {c: np.nan for c in expected_keys} 352 | cols = [f"{column_name}.{col}" for col in expected_keys] 353 | 354 | def set_default_dict(object, default_dict): 355 | if isinstance(object, dict): 356 | for k, v in default_dict.items(): 357 | object.setdefault(k, v) 358 | return object 359 | return np.nan 360 | 361 | df[column_name] = df[column_name].apply(lambda x: set_default_dict(x, default_dict)) 362 | df[cols] = df[column_name].apply(pd.Series) 363 | 364 | if drop: 365 | df = df.drop(column_name, axis=1) 366 | 367 | return df 368 | 369 | 370 | def array_to_dict_reducer(key_prop=None, value_prop=None): 371 | """Convert an array into a dictionary. 372 | 373 | Parameters 374 | ---------- 375 | key_prop: str 376 | Property in dictionary for key. 377 | value_prop: str 378 | Property in dictionary for value. 379 | 380 | Returns 381 | ------- 382 | return: dict 383 | A dictionary that has all the accumulated values. 384 | 385 | """ 386 | 387 | def reducer(accumulator, current_value): 388 | if type(current_value) is not dict: 389 | raise AttributeError("Value being reduced must be a dictionary") 390 | 391 | if key_prop is not None and value_prop is not None: 392 | key = current_value.get(key_prop) 393 | current_value = current_value.get(value_prop) 394 | accumulator[key] = current_value 395 | else: 396 | for key, value in current_value.items(): 397 | accumulator[key] = value 398 | 399 | return accumulator 400 | 401 | return reducer 402 | 403 | 404 | def compress_rows_to_col(df: pd.DataFrame, column_prefix: str, pk): 405 | """Compress exploded columns rows back to a single column. 406 | 407 | Parameters 408 | ---------- 409 | df: pd.DataFrame 410 | Input DataFrame to be compressed. 411 | column_prefix: str 412 | Column prefix to be compressed. 413 | pk: str 414 | Primary key to group on. 415 | 416 | Returns 417 | ------- 418 | return: pd.DataFrame 419 | A data frame with the compressed data. 
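    Examples
    --------
    An illustrative call only; the column prefix and primary key below are
    placeholders:

    IN[1]: compressed = compress_rows_to_col(df, column_prefix="Line", pk="Id")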
420 | 421 | """ 422 | compress_cols = [col for col in df.columns if col.startswith(column_prefix)] 423 | df_compress = df[compress_cols] 424 | df.drop(compress_cols, inplace=True, axis=1) 425 | 426 | prefix_len = len(column_prefix) + 1 427 | cols_rename = {c: c[prefix_len:] for c in compress_cols} 428 | df_compress.rename(cols_rename, axis=1, inplace=True) 429 | 430 | df[column_prefix] = df_compress.apply(lambda x: str(x.to_dict()), axis=1) 431 | 432 | grouped = df.groupby(pk, axis=0)[column_prefix].apply(list).reset_index() 433 | df.drop_duplicates(pk, inplace=True) 434 | return df.merge(grouped, how="left", on=pk) 435 | 436 | -------------------------------------------------------------------------------- /gluestick/singer.py: -------------------------------------------------------------------------------- 1 | """Singer related util functions.""" 2 | 3 | import ast 4 | import datetime 5 | import json 6 | import os 7 | from contextlib import redirect_stdout 8 | from functools import singledispatch, partial 9 | import pandas as pd 10 | import singer 11 | from gluestick.reader import Reader 12 | import polars as pl 13 | 14 | def gen_singer_header(df: pd.DataFrame, allow_objects: bool, schema=None, catalog_schema=False, recursive_typing=True): 15 | """Generate singer headers based on pandas types. 16 | 17 | Parameters 18 | ---------- 19 | df: pandas.DataFrame 20 | The dataframe to extranct the types from. 21 | allow_objects: bool 22 | If the function should proccess objects in the columns. 23 | 24 | Returns 25 | ------- 26 | return: dict 27 | Dict of pandas.DataFrames. the keys of which are the entity names 28 | 29 | """ 30 | header_map = dict(type=["object", "null"], properties={}) 31 | 32 | type_mapping = { 33 | "float": {"type": ["number", "null"]}, 34 | "int": {"type": ["integer", "null"]}, 35 | "bool": {"type": ["boolean", "null"]}, 36 | "str": {"type": ["string", "null"]}, 37 | "date": { 38 | "format": "date-time", 39 | "type": ["string", "null"], 40 | }, 41 | "array": {"type": ["array", "null"], "items": {"type": ["object", "string", "null"]}}, 42 | } 43 | 44 | if schema and not catalog_schema: 45 | header_map = schema 46 | return df, header_map 47 | 48 | for col in df.columns: 49 | dtype = df[col].dtype.__str__().lower() 50 | 51 | if "date" in dtype: 52 | df[col] = df[col].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") 53 | 54 | col_type = next((t for t in type_mapping.keys() if t in dtype), None) 55 | 56 | if col_type: 57 | header_map["properties"][col] = type_mapping[col_type] 58 | elif allow_objects: 59 | value = df[col].dropna() 60 | if value.empty: 61 | header_map["properties"][col] = type_mapping["str"] 62 | continue 63 | else: 64 | first_value = value.iloc[0] 65 | 66 | if isinstance(first_value, list): 67 | if recursive_typing: 68 | new_input = {} 69 | for row in value: 70 | if len(row): 71 | for arr_value in row: 72 | if isinstance(arr_value, dict): 73 | temp_dict = {k:v for k, v in arr_value.items() if (k not in new_input.keys()) or isinstance(v, float)} 74 | new_input.update(temp_dict) 75 | else: 76 | new_input = arr_value 77 | _schema = dict(type=["array", "null"], items=to_singer_schema(new_input)) 78 | header_map["properties"][col] = _schema 79 | if not new_input: 80 | header_map["properties"][col] = { 81 | "items": type_mapping["str"], 82 | "type": ["array", "null"], 83 | } 84 | else: 85 | header_map["properties"][col] = type_mapping["array"] 86 | elif isinstance(first_value, dict): 87 | _schema = dict(type=["object", "null"], properties={}) 88 | for k, v in first_value.items(): 
89 | _schema["properties"][k] = to_singer_schema(v) 90 | header_map["properties"][col] = _schema 91 | else: 92 | header_map["properties"][col] = type_mapping["str"] 93 | else: 94 | header_map["properties"][col] = type_mapping["str"] 95 | 96 | def check_null(x): 97 | if isinstance(x, list) or isinstance(x, dict): 98 | return json.dumps(x, default=str) 99 | elif not pd.isna(x): 100 | return str(x) 101 | return x 102 | 103 | df[col] = df[col].apply(check_null) 104 | 105 | # update schema using types from catalog and keeping extra columns not defined in catalog 106 | # i.e. tenant, sync_date, etc 107 | if catalog_schema: 108 | header_map["properties"].update(schema["properties"]) 109 | 110 | return df, header_map 111 | 112 | 113 | def to_singer_schema(input): 114 | """Generate singer headers based on pandas types. 115 | 116 | Parameters 117 | ---------- 118 | input: 119 | Object to extract the types from. 120 | 121 | Returns 122 | ------- 123 | return: dict 124 | Dict of the singer mapped types. 125 | 126 | """ 127 | if type(input) == dict: 128 | property = dict(type=["object", "null"], properties={}) 129 | for k, v in input.items(): 130 | property["properties"][k] = to_singer_schema(v) 131 | return property 132 | elif type(input) == list: 133 | if len(input): 134 | return dict(type=["array", "null"], items=to_singer_schema(input[0])) 135 | else: 136 | return {"items": {"type": ["string", "null"]}, "type": ["array", "null"]} 137 | elif type(input) == bool: 138 | return {"type": ["boolean", "null"]} 139 | elif type(input) == int: 140 | return {"type": ["integer", "null"]} 141 | elif type(input) == float: 142 | return {"type": ["number", "null"]} 143 | return {"type": ["string", "null"]} 144 | 145 | 146 | def unwrap_json_schema(schema): 147 | def resolve_refs(schema, defs): 148 | if isinstance(schema, dict): 149 | if '$ref' in schema: 150 | ref_path = schema['$ref'].split('/') 151 | ref_name = ref_path[-1] 152 | return resolve_refs(defs[ref_name], defs) 153 | else: 154 | resolved_schema = {} 155 | for k,v in schema.items(): 156 | if type(v) != list and type(v) != dict: 157 | if k not in ['required', 'title']: 158 | resolved_schema[k] = v 159 | else: 160 | resolved_schema[k] = resolve_refs(v, defs) 161 | return resolved_schema 162 | elif isinstance(schema, list): 163 | return [resolve_refs(item, defs) for item in schema] 164 | else: 165 | return schema 166 | 167 | def simplify_anyof(schema): 168 | if isinstance(schema, dict): 169 | if 'anyOf' in schema: 170 | types = [item.get('type') for item in schema['anyOf'] if 'type' in item] 171 | 172 | # Handle cases where anyOf contains more than just type definitions 173 | # For example, when it includes properties or other nested structures 174 | combined_schema = {} 175 | for item in schema['anyOf']: 176 | for key, value in item.items(): 177 | combined_schema[key] = simplify_anyof(value) 178 | combined_schema['type'] = types 179 | return combined_schema 180 | else: 181 | resolved_schema = {} 182 | for k,v in schema.items(): 183 | if type(v) != list and type(v) != dict: 184 | if k not in ['required,' 'title']: 185 | resolved_schema[k] = v 186 | else: 187 | resolved_schema[k] = simplify_anyof(v) 188 | return resolved_schema 189 | elif isinstance(schema, list): 190 | return [simplify_anyof(item) for item in schema] 191 | else: 192 | return schema 193 | 194 | defs = schema.get('$defs', {}) 195 | resolved_schema = resolve_refs(schema, defs) 196 | simplified_schema = simplify_anyof(resolved_schema) 197 | simplified_schema.pop("$defs", None) 198 | return 
simplified_schema 199 | 200 | 201 | def deep_convert_datetimes(value): 202 | """Transforms all nested datetimes in a list or dict to %Y-%m-%dT%H:%M:%S.%fZ. 203 | 204 | Notes 205 | ----- 206 | This function transforms all datetimes to %Y-%m-%dT%H:%M:%S.%fZ 207 | 208 | Parameters 209 | ---------- 210 | value: list, dict, datetime 211 | 212 | Returns 213 | ------- 214 | return: list or dict with all datetime values transformed to %Y-%m-%dT%H:%M:%S.%fZ 215 | 216 | """ 217 | if isinstance(value, list): 218 | return [deep_convert_datetimes(child) for child in value] 219 | elif isinstance(value, dict): 220 | return {k: deep_convert_datetimes(v) for k, v in value.items()} 221 | elif isinstance(value, datetime.datetime): 222 | return value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") 223 | elif isinstance(value, datetime.date): 224 | return value.strftime("%Y-%m-%d") 225 | return value 226 | 227 | def parse_objs(x): 228 | """Parse a stringified dict or list of dicts. 229 | 230 | Notes 231 | ----- 232 | This function will parse a stringified dict or list of dicts 233 | 234 | Parameters 235 | ---------- 236 | x: str 237 | stringified dict or list of dicts. 238 | 239 | Returns 240 | ------- 241 | return: dict, list 242 | parsed dict or list of dicts. 243 | 244 | """ 245 | # if it's not a string, we just return the input 246 | if type(x) != str: 247 | return x 248 | 249 | try: 250 | return ast.literal_eval(x) 251 | except: 252 | return json.loads(x) 253 | 254 | 255 | def get_catalog_schema(stream): 256 | """Get a df schema using the catalog. 257 | 258 | Parameters 259 | ---------- 260 | stream: str 261 | Stream name in catalog. 262 | 263 | """ 264 | input = Reader() 265 | catalog = input.read_catalog() 266 | schema = next( 267 | (str["schema"] for str in catalog["streams"] if str["stream"] == stream), None 268 | ) 269 | if not schema: 270 | raise Exception(f"No schema found in catalog for stream {stream}") 271 | else: 272 | # keep only relevant fields 273 | schema = {k: v for k, v in schema.items() if k in ["type", "properties"]} 274 | # need to ensure every array type has an items dict or we'll have issues 275 | for p in schema.get("properties", dict()): 276 | prop = schema["properties"][p] 277 | if prop.get("type") == "array" or "array" in prop.get("type") and prop.get("items") is None: 278 | prop["items"] = dict() 279 | return schema 280 | 281 | 282 | def parse_df_cols(df, schema): 283 | """Parse all df list and dict columns according to schema. 284 | 285 | Parameters 286 | ---------- 287 | stream: str 288 | Stream name in catalog. 289 | schema: dict 290 | Schema that will be used to export the data. 
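    Examples
    --------
    Illustrative only; the schema and column name below are placeholders:

    IN[1]: schema = {"properties": {"Line": {"type": ["array", "null"]}}}
    IN[2]: df = parse_df_cols(df, schema)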
291 | 292 | """ 293 | for col in df.columns: 294 | col_type = schema["properties"].get(col, {}).get("type", []) 295 | if (isinstance(col_type, list) and any( 296 | item in ["object", "array"] 297 | for item in col_type 298 | )) or col_type in ["object", "array"]: 299 | df[col] = df[col].apply(lambda x: parse_objs(x)) 300 | return df 301 | 302 | @singledispatch 303 | def to_singer( 304 | df, 305 | stream, 306 | output_dir, 307 | keys=[], 308 | filename="data.singer", 309 | allow_objects=False, 310 | schema=None, 311 | unified_model=None, 312 | keep_null_fields=False, 313 | catalog_stream=None, 314 | recursive_typing=True 315 | ): 316 | raise NotImplementedError("to_singer is not implemented for this type") 317 | 318 | @to_singer.register(pd.DataFrame) 319 | def pandas_df_to_singer( 320 | df: pd.DataFrame, 321 | stream, 322 | output_dir, 323 | keys=[], 324 | filename="data.singer", 325 | allow_objects=False, 326 | schema=None, 327 | unified_model=None, 328 | keep_null_fields=False, 329 | catalog_stream=None, 330 | recursive_typing=True 331 | ): 332 | """Convert a pandas DataFrame into a singer file. 333 | 334 | Parameters 335 | ---------- 336 | df: pd.DataFrame 337 | Object to extract the types from. 338 | stream: str 339 | Stream name to be used in the singer output file. 340 | output_dir: str 341 | Path to the output directory. 342 | keys: list 343 | The primary-keys to be used. 344 | filename: str 345 | The output file name. 346 | allow_objects: boolean 347 | Allow or not objects to the parsed, if false defaults types to str. 348 | keep_null_fields: boolean 349 | Flag to keep all null fields 350 | catalog_stream: str 351 | Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true 352 | If this is not set it will use stream parameter to generate the catalog 353 | recursive_typing: boolean 354 | If true, the function will recursively convert arrays of objects to arrays of primitives. 355 | If false, the function will fuzzy list types when generating singer header. 
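    Examples
    --------
    A minimal, illustrative call; ``df`` is assumed to be a pandas DataFrame
    and the stream name, output directory and key are placeholders:

    IN[1]: to_singer(df, "invoices", "./output", keys=["id"], allow_objects=True)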
356 | """ 357 | catalog_schema = os.environ.get("USE_CATALOG_SCHEMA", "false").lower() == "true" 358 | include_all_unified_fields = os.environ.get("INCLUDE_ALL_UNIFIED_FIELDS", "false").lower() == "true" and unified_model is not None 359 | 360 | # drop columns with all null values except when we want to keep null fields 361 | if allow_objects and not (catalog_schema or include_all_unified_fields or keep_null_fields): 362 | df = df.dropna(how="all", axis=1) 363 | else: 364 | # df.dropna returns a new dataframe so df it's no longer pointing to the original dataframe, 365 | # if dropna is not applied we need to copy it or gen_singer_header will cast the original dataframe datetime columns as strings 366 | df = df.copy() 367 | 368 | if catalog_schema or catalog_stream: 369 | # it'll allow_objects but keeping all columns 370 | allow_objects = True 371 | # get schema from catalog 372 | stream_name = catalog_stream or stream 373 | schema = get_catalog_schema(stream_name) 374 | # parse all fields that are typed as objects or lists 375 | df = parse_df_cols(df, schema) 376 | 377 | elif unified_model: 378 | schema = unwrap_json_schema(unified_model.model_json_schema()) 379 | 380 | df, header_map = gen_singer_header(df, allow_objects, schema, catalog_schema, recursive_typing=recursive_typing) 381 | output = os.path.join(output_dir, filename) 382 | mode = "a" if os.path.isfile(output) else "w" 383 | 384 | with open(output, mode) as f: 385 | with redirect_stdout(f): 386 | singer.write_schema(stream, header_map, keys) 387 | for _, row in df.iterrows(): 388 | # keep null fields for catalog_schema, include_all_unified_fields and keep_null_fields 389 | if not (catalog_schema or include_all_unified_fields or keep_null_fields): 390 | filtered_row = row.dropna() 391 | else: 392 | filtered_row = row.where(pd.notna(row), None) 393 | filtered_row = filtered_row.to_dict() 394 | filtered_row = deep_convert_datetimes(filtered_row) 395 | singer.write_record(stream, filtered_row) 396 | singer.write_state({}) 397 | 398 | 399 | 400 | def gen_singer_header_from_polars_schema( 401 | schema: pl.Schema 402 | ) -> dict: 403 | """ 404 | Generate Singer headers from a Polars schema. 405 | 406 | Parameters 407 | ---------- 408 | schema : pl.Schema 409 | Polars DataFrame schema. 410 | 411 | Returns 412 | ------- 413 | dict 414 | Singer schema dictionary with non-primitives stringified. 
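    Examples
    --------
    Illustrative schema; the column names below are placeholders:

    IN[1]: df = pl.DataFrame({"id": [1], "name": ["a"]})
    IN[2]: gen_singer_header_from_polars_schema(df.schema)
    Out[2]: {'type': ['object', 'null'],
             'properties': {'id': {'type': ['integer', 'null']},
                            'name': {'type': ['string', 'null']}}}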
415 | """ 416 | primitive_mapping = { 417 | "Float64": {"type": ["number", "null"]}, 418 | "Float32": {"type": ["number", "null"]}, 419 | "Int64": {"type": ["integer", "null"]}, 420 | "Int32": {"type": ["integer", "null"]}, 421 | "Int16": {"type": ["integer", "null"]}, 422 | "Int8": {"type": ["integer", "null"]}, 423 | "UInt64": {"type": ["integer", "null"]}, 424 | "UInt32": {"type": ["integer", "null"]}, 425 | "UInt16": {"type": ["integer", "null"]}, 426 | "UInt8": {"type": ["integer", "null"]}, 427 | "Boolean": {"type": ["boolean", "null"]}, 428 | "Utf8": {"type": ["string", "null"]}, 429 | "Date": {"type": ["string", "null"], "format": "date"}, 430 | "Datetime": {"type": ["string", "null"], "format": "date-time"}, 431 | "Time": {"type": ["string", "null"], "format": "time"}, 432 | } 433 | 434 | def map_dtype(dtype) -> dict: 435 | dtype_name = str(dtype) 436 | # Only primitive types keep their mapping 437 | if dtype_name.startswith("Struct("): 438 | return {"type": ["object", "null"]} 439 | 440 | if dtype_name.startswith("Datetime("): 441 | return {"type": ["string", "null"], "format": "date-time"} 442 | 443 | if dtype_name.startswith("List("): 444 | return {"type": ["array", "null"], "items": {"type": ["any", "null"]}} 445 | return primitive_mapping.get(dtype_name, {"type": ["string", "null"]}) 446 | 447 | header_map = { 448 | "type": ["object", "null"], 449 | "properties": {col: map_dtype(dtype) for col, dtype in schema.items()} 450 | } 451 | 452 | return header_map 453 | 454 | 455 | @to_singer.register(pl.DataFrame) 456 | def polars_df_to_singer( 457 | df: pl.DataFrame, 458 | stream, 459 | output_dir, 460 | keys=[], 461 | filename="data.singer", 462 | allow_objects=False, 463 | schema=None, 464 | unified_model=None, 465 | keep_null_fields=False, 466 | catalog_stream=None, 467 | recursive_typing=True 468 | ): 469 | """Convert a polars DataFrame into a singer file. 470 | 471 | Parameters 472 | ---------- 473 | df: pl.DataFrame 474 | Polars DataFrame to convert to singer. 475 | stream: str 476 | Stream name to be used in the singer output file. 477 | output_dir: str 478 | Path to the output directory. 479 | keys: list 480 | The primary-keys to be used. 481 | filename: str 482 | The output file name. 483 | allow_objects: boolean 484 | Allow or not objects to the parsed, if false defaults types to str. 485 | keep_null_fields: boolean 486 | Flag to keep all null fields 487 | catalog_stream: str 488 | Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true 489 | If this is not set it will use stream parameter to generate the catalog 490 | recursive_typing: boolean 491 | If true, the function will recursively convert arrays of objects to arrays of primitives. 492 | If false, the function will fuzzy list types when generating singer header. 
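    Examples
    --------
    A minimal, illustrative call; the stream name, output directory and key
    are placeholders:

    IN[1]: to_singer(pl.DataFrame({"id": [1, 2]}), "invoices", "./output", keys=["id"])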
493 | """ 494 | 495 | output = os.path.join(output_dir, filename) 496 | mode = "a" if os.path.isfile(output) else "w" 497 | 498 | header_map = gen_singer_header_from_polars_schema(df.schema) 499 | 500 | 501 | 502 | with open(output, mode) as f: 503 | with redirect_stdout(f): 504 | singer.write_schema(stream, header_map, keys) 505 | for row in df.iter_rows(named=True): 506 | row = {k: v.strftime("%Y-%m-%dT%H:%M:%S.%fZ") if isinstance(v, datetime.datetime) else v for k, v in row.items()} 507 | singer.write_record(stream, row) 508 | 509 | 510 | 511 | 512 | @to_singer.register(pl.LazyFrame) 513 | def polars_lf_to_singer( 514 | df: pl.LazyFrame, 515 | stream, 516 | output_dir, 517 | keys=[], 518 | filename="data.singer", 519 | allow_objects=False, 520 | schema=None, 521 | unified_model=None, 522 | keep_null_fields=False, 523 | catalog_stream=None, 524 | recursive_typing=True 525 | ): 526 | """Convert a polars Lazyframe into a singer file. 527 | 528 | Parameters 529 | ---------- 530 | df: pd.DataFrame 531 | Object to extract the types from. 532 | stream: str 533 | Stream name to be used in the singer output file. 534 | output_dir: str 535 | Path to the output directory. 536 | keys: list 537 | The primary-keys to be used. 538 | filename: str 539 | The output file name. 540 | allow_objects: boolean 541 | Allow or not objects to the parsed, if false defaults types to str. 542 | keep_null_fields: boolean 543 | Flag to keep all null fields 544 | catalog_stream: str 545 | Name of the stream in the catalog to be used to generate the schema if USE_CATALOG_SCHEMA is set as true 546 | If this is not set it will use stream parameter to generate the catalog 547 | recursive_typing: boolean 548 | If true, the function will recursively convert arrays of objects to arrays of primitives. 549 | If false, the function will fuzzy list types when generating singer header. 550 | """ 551 | 552 | sink_fn = partial( 553 | polars_df_to_singer, 554 | stream=stream, 555 | output_dir=output_dir, 556 | keys=keys, 557 | filename=filename, 558 | allow_objects=allow_objects, 559 | schema=schema, 560 | unified_model=unified_model, 561 | keep_null_fields=keep_null_fields, 562 | catalog_stream=catalog_stream, 563 | recursive_typing=recursive_typing, 564 | ) 565 | df.sink_batches(sink_fn, chunk_size=1000) 566 | -------------------------------------------------------------------------------- /gluestick/etl_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for hotglue ETL scripts.""" 2 | 3 | import hashlib 4 | import json 5 | import os 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import pyarrow.parquet as pq 10 | from datetime import datetime 11 | from pytz import utc 12 | from gluestick.singer import to_singer 13 | import re 14 | from gluestick.reader import Reader 15 | import polars as pl 16 | from gluestick.readers.pl_lazyframe_reader import PLLazyFrameReader 17 | from gluestick.readers.pl_reader import PolarsReader 18 | from functools import singledispatch 19 | 20 | 21 | def read_csv_folder(path, converters={}, index_cols={}, ignore=[]): 22 | """Read a set of CSV files in a folder using read_csv(). 23 | 24 | Notes 25 | ----- 26 | This method assumes that the files are being pulled in a stream and follow a 27 | naming convention with the stream/ entity / table name is the first word in the 28 | file name for example; Account-20200811T121507.csv is for an entity called 29 | ``Account``. 
30 | 31 | Parameters 32 | ---------- 33 | path: str 34 | The folder directory 35 | converters: dict 36 | A dictionary with an array of converters that are passed to 37 | read_csv, the key of the dictionary is the name of the entity. 38 | index_cols: 39 | A dictionary with an array of index_cols, the key of the dictionary is the name 40 | of the entity. 41 | ignore: list 42 | List of files to ignore 43 | 44 | Returns 45 | ------- 46 | return: dict 47 | Dict of pandas.DataFrames. the keys of which are the entity names 48 | 49 | Examples 50 | -------- 51 | IN[31]: entity_data = read_csv_folder( 52 | CSV_FOLDER_PATH, 53 | index_cols={'Invoice': 'DocNumber'}, 54 | converters={'Invoice': { 55 | 'Line': ast.literal_eval, 56 | 'CustomField': ast.literal_eval, 57 | 'Categories': ast.literal_eval 58 | }} 59 | ) 60 | IN[32]: df = entity_data['Account'] 61 | 62 | """ 63 | is_directory = os.path.isdir(path) 64 | all_files = [] 65 | results = {} 66 | if is_directory: 67 | for entry in os.listdir(path): 68 | if os.path.isfile(os.path.join(path, entry)) and os.path.join( 69 | path, entry 70 | ).endswith(".csv"): 71 | all_files.append(os.path.join(path, entry)) 72 | 73 | else: 74 | all_files.append(path) 75 | 76 | for file in all_files: 77 | split_path = file.split("/") 78 | entity_type = split_path[len(split_path) - 1].rsplit(".csv", 1)[0] 79 | 80 | if "-" in entity_type: 81 | entity_type = entity_type.rsplit("-", 1)[0] 82 | 83 | if entity_type not in results and entity_type not in ignore: 84 | # print(f"Reading file of type {entity_type} in the data file {file}") 85 | results[entity_type] = pd.read_csv( 86 | file, 87 | index_col=index_cols.get(entity_type), 88 | converters=converters.get(entity_type), 89 | ) 90 | 91 | return results 92 | 93 | 94 | def read_parquet_folder(path, ignore=[]): 95 | """Read a set of parquet files in a folder using read_parquet(). 96 | 97 | Notes 98 | ----- 99 | This method assumes that the files are being pulled in a stream and follow a 100 | naming convention with the stream/ entity / table name is the first word in the 101 | file name for example; Account-20200811T121507.parquet is for an entity called 102 | ``Account``. 103 | 104 | Parameters 105 | ---------- 106 | path: str 107 | The folder directory 108 | ignore: list 109 | List of files to ignore 110 | 111 | Returns 112 | ------- 113 | return: dict 114 | Dict of pandas.DataFrames. the keys of which are the entity names 115 | 116 | Examples 117 | -------- 118 | IN[31]: entity_data = read_parquet_folder(PARQUET_FOLDER_PATH) 119 | IN[32]: df = entity_data['Account'] 120 | 121 | """ 122 | is_directory = os.path.isdir(path) 123 | all_files = [] 124 | results = {} 125 | if is_directory: 126 | for entry in os.listdir(path): 127 | if os.path.isfile(os.path.join(path, entry)) and os.path.join( 128 | path, entry 129 | ).endswith(".parquet"): 130 | all_files.append(os.path.join(path, entry)) 131 | 132 | else: 133 | all_files.append(path) 134 | 135 | for file in all_files: 136 | split_path = file.split("/") 137 | entity_type = split_path[len(split_path) - 1].rsplit(".parquet", 1)[0] 138 | 139 | if "-" in entity_type: 140 | entity_type = entity_type.rsplit("-", 1)[0] 141 | 142 | if entity_type not in results and entity_type not in ignore: 143 | df = pq.read_table(file, use_threads=False).to_pandas(safe=False, use_threads=False) 144 | # df = df.convert_dtypes() 145 | results[entity_type] = df 146 | 147 | return results 148 | 149 | 150 | def read_snapshots(stream, snapshot_dir, **kwargs): 151 | """Read a snapshot file. 
152 | 153 | Parameters 154 | ---------- 155 | stream: str 156 | The name of the stream to extract the snapshots from. 157 | snapshot_dir: str 158 | The path for the directory where the snapshots are stored. 159 | **kwargs: 160 | Additional arguments that are passed to pandas read_csv. 161 | 162 | Returns 163 | ------- 164 | return: pd.DataFrame 165 | A pandas dataframe with the snapshot data. 166 | 167 | """ 168 | # Read snapshot file if it exists 169 | if os.path.isfile(f"{snapshot_dir}/{stream}.snapshot.parquet"): 170 | snapshot = pq.read_table(f"{snapshot_dir}/{stream}.snapshot.parquet", use_threads=False).to_pandas(safe=False, use_threads=False) 171 | # snapshot = snapshot.convert_dtypes() 172 | elif os.path.isfile(f"{snapshot_dir}/{stream}.snapshot.csv"): 173 | snapshot = pd.read_csv(f"{snapshot_dir}/{stream}.snapshot.csv", **kwargs) 174 | else: 175 | snapshot = None 176 | return snapshot 177 | 178 | 179 | def snapshot_records( 180 | stream_data, stream, snapshot_dir, pk="id", just_new=False, use_csv=False, coerce_types= False, localize_datetime_types=False, overwrite=False, **kwargs 181 | ): 182 | """Update a snapshot file. 183 | 184 | Parameters 185 | ---------- 186 | stream_data: str 187 | DataFrame with the data to be included in the snapshot. 188 | stream: str 189 | The name of the stream of the snapshots. 190 | snapshot_dir: str 191 | The name of the stream of the snapshots. 192 | pk: str 193 | The primary key used for the snapshot. 194 | just_new: str 195 | Return just the input data if True, else returns the whole data 196 | coerce_types: bool 197 | Coerces types to the stream_data types if True, else mantains current snapshot types 198 | localize_datetime_types: bool 199 | Localizes datetime columns to UTC if True, else mantains current snapshot types 200 | **kwargs: 201 | Additional arguments that are passed to pandas read_csv. 202 | 203 | Returns 204 | ------- 205 | return: pd.DataFrame 206 | A pandas dataframe with the snapshot data. 
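    Examples
    --------
    Illustrative usage; the stream name and snapshot directory below are
    placeholders:

    IN[1]: merged = snapshot_records(df, "invoices", "./snapshots", pk="id")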
207 | 208 | """ 209 | # Read snapshot file if it exists 210 | snapshot = read_snapshots(stream, snapshot_dir, **kwargs) 211 | 212 | # If snapshot file and stream data exist update the snapshot 213 | if not overwrite and stream_data is not None and snapshot is not None: 214 | snapshot_types = snapshot.dtypes 215 | 216 | if localize_datetime_types: 217 | # Localize datetime columns to UTC (datetime64[ns, UTC]) if they are not already 218 | for column, dtype in snapshot_types.items(): 219 | if dtype == "datetime64[ns]": 220 | snapshot[column] = localize_datetime(snapshot, column) 221 | 222 | merged_data = pd.concat([snapshot, stream_data]) 223 | # coerce snapshot types to incoming data types 224 | if coerce_types: 225 | if not stream_data.empty and not snapshot.empty: 226 | # Save incoming data types 227 | df_types = stream_data.dtypes 228 | try: 229 | for column, dtype in df_types.items(): 230 | if dtype == 'bool': 231 | merged_data[column] = merged_data[column].astype('boolean') 232 | elif dtype in ["int64", "int32", "Int32", "Int64"]: 233 | merged_data[column] = merged_data[column].astype("Int64") 234 | else: 235 | merged_data[column] = merged_data[column].astype(dtype) 236 | except Exception as e: 237 | raise Exception(f"Snapshot failed while trying to convert field {column} from type {snapshot_types.get(column)} to type {dtype}") 238 | # drop duplicates 239 | merged_data = merged_data.drop_duplicates(pk, keep="last") 240 | # export data 241 | if use_csv: 242 | merged_data.to_csv(f"{snapshot_dir}/{stream}.snapshot.csv", index=False) 243 | else: 244 | merged_data.to_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet", index=False) 245 | 246 | if not just_new: 247 | return merged_data 248 | else: 249 | return stream_data 250 | 251 | # If there is no snapshot file snapshots and return the new data 252 | if stream_data is not None: 253 | if use_csv: 254 | stream_data.to_csv(f"{snapshot_dir}/{stream}.snapshot.csv", index=False) 255 | else: 256 | stream_data.to_parquet(f"{snapshot_dir}/{stream}.snapshot.parquet", index=False) 257 | return stream_data 258 | 259 | if just_new or overwrite: 260 | return stream_data 261 | else: 262 | return snapshot 263 | 264 | 265 | def get_row_hash(row, columns): 266 | """Update a snapshot file. 267 | 268 | Parameters 269 | ---------- 270 | row: pd.DataSeries 271 | DataFrame row to create the hash from. 272 | 273 | Returns 274 | ------- 275 | return: str 276 | A string with the hash for the row. 277 | 278 | """ 279 | # ensure stable order 280 | values = [] 281 | 282 | for col in columns: 283 | v = row[col] 284 | 285 | if (isinstance(v, list) or not pd.isna(v)) and v==v and (v not in [None, np.nan]): 286 | values.append(str(v)) 287 | 288 | row_str = "".join(values) 289 | return hashlib.md5(row_str.encode()).hexdigest() 290 | 291 | 292 | def drop_redundant(df, name, output_dir, pk=[], updated_flag=False, use_csv=False): 293 | """Drop the rows that were present in previous versions of the dataframe. 294 | 295 | Notes 296 | ----- 297 | This function will create a hash for every row of the dataframe and snapshot it, if 298 | the same row was present in previous versions of the dataframe, it will be dropped. 299 | 300 | Parameters 301 | ---------- 302 | df: pd.DataFrame 303 | The dataframe do be checked for duplicates 304 | name: str 305 | The name used to snapshot the hash. 306 | output_dir: str 307 | The snapshot directory to save the state in. 308 | pk: list, str 309 | Primary key(s) used to associate the state with. 
310 | updated_flag: bool 311 | To create of not a column with a flag for new/updated rows for the given 312 | primary key. 313 | 314 | Returns 315 | ------- 316 | return: pd.DataFrame 317 | Dataframe with the data after dropping the redundant rows. 318 | 319 | """ 320 | df = df.copy() 321 | 322 | if pk: 323 | # PK needs to be unique, so we drop the duplicated values 324 | df = df.drop_duplicates(subset=pk) 325 | 326 | # get a sorted list of columns to build the hash 327 | columns = list(df.columns) 328 | columns.sort() 329 | 330 | df["hash"] = df.apply(lambda row: get_row_hash(row, columns), axis=1) 331 | # If there is a snapshot file compare and filter the hash 332 | hash_df = None 333 | if os.path.isfile(f"{output_dir}/{name}.hash.snapshot.parquet"): 334 | hash_df = pq.read_table(f"{output_dir}/{name}.hash.snapshot.parquet", use_threads=False).to_pandas(safe=False, use_threads=False) 335 | elif os.path.isfile(f"{output_dir}/{name}.hash.snapshot.csv"): 336 | hash_df = pd.read_csv(f"{output_dir}/{name}.hash.snapshot.csv") 337 | 338 | if hash_df is not None: 339 | pk = [pk] if not isinstance(pk, list) else pk 340 | 341 | if pk: 342 | hash_df = hash_df.drop_duplicates(subset=pk) 343 | 344 | if updated_flag and pk: 345 | updated_pk = df[pk].merge(hash_df[pk], on=pk, how="inner") 346 | updated_pk["_updated"] = True 347 | 348 | df = df.merge( 349 | hash_df[pk + ["hash"]], on=pk + ["hash"], how="left", indicator=True 350 | ) 351 | df = df[df["_merge"] == "left_only"] 352 | df = df.drop("_merge", axis=1) 353 | 354 | if updated_flag and pk: 355 | df = df.merge(updated_pk, on=pk, how="left") 356 | df["_updated"] = df["_updated"].fillna(False) 357 | 358 | snapshot_records(df[pk + ["hash"]], f"{name}.hash", output_dir, pk, use_csv=use_csv) 359 | df = df.drop("hash", axis=1) 360 | return df 361 | 362 | def clean_convert(input): 363 | """Cleans all None values from a list or dict. 364 | 365 | Notes 366 | ----- 367 | This function will iterate through all the values of a list or dict 368 | and delete all None values 369 | 370 | Parameters 371 | ---------- 372 | input: dict, list 373 | The dict or list that will be cleaned. 374 | 375 | Returns 376 | ------- 377 | return: dict, list 378 | list or dict with the data after deleting all None values. 379 | 380 | """ 381 | if isinstance(input, list): 382 | return [clean_convert(i) for i in input] 383 | elif isinstance(input, dict): 384 | output = {} 385 | for k, v in input.items(): 386 | v = clean_convert(v) 387 | if isinstance(v, list): 388 | output[k] = [i for i in v if not pd.isna(i)] 389 | elif not pd.isna(v): 390 | output[k] = v 391 | return output 392 | elif isinstance(input, datetime): 393 | return input.isoformat() 394 | elif not pd.isna(input): 395 | return input 396 | 397 | def map_fields(row, mapping): 398 | """Maps the row values according to the mapping dict. 399 | 400 | Notes 401 | ----- 402 | This function will iterate through all the values of a mapping dict 403 | and map the values from the row accordingly 404 | 405 | Parameters 406 | ---------- 407 | row: dict or dataframe row with the values to be mapped 408 | mapping: dict that estabilsh how to map the fields 409 | 410 | Returns 411 | ------- 412 | return: dict 413 | dict with the mapped data. 
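    Examples
    --------
    Illustrative field names only:

    IN[1]: row = {"first": "Jo", "last": "Do"}
    IN[2]: map_fields(row, {"name": {"given": "first", "family": "last"}})
    Out[2]: {'name': {'given': 'Jo', 'family': 'Do'}}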
414 | 415 | """ 416 | output = {} 417 | for key, value in mapping.items(): 418 | if isinstance(value, list): 419 | out_list = [] 420 | for v in value: 421 | mapped = map_fields(row, v) 422 | if mapped: 423 | out_list.append(mapped) 424 | if out_list: 425 | output[key] = out_list 426 | elif isinstance(value, dict): 427 | mapped = map_fields(row, value) 428 | if mapped: 429 | output[key] = mapped 430 | elif value is not None: 431 | if isinstance(row.get(value), list) or not pd.isna(row.get(value)): 432 | output[key] = row.get(value) 433 | return output 434 | 435 | def clean_obj_null_values(obj): 436 | """Replaces all null values by None. 437 | 438 | Notes 439 | ----- 440 | This function will replace all null values by None so other functions 441 | such as explode_json_to_cols, explode_json_to_rows, etc can be used 442 | 443 | Parameters 444 | ---------- 445 | obj: str 446 | stringified dict or list where null values should be replaced. 447 | 448 | Returns 449 | ------- 450 | return: str 451 | str with all null values replaced. 452 | 453 | """ 454 | if not pd.isna(obj): 455 | obj = obj.replace('null', 'None') 456 | return obj 457 | else: 458 | return {} 459 | 460 | 461 | def get_index_safely(arr, index): 462 | """Safely retrieves an item from an list by index. 463 | 464 | Parameters 465 | ---------- 466 | arr: list 467 | List of items. 468 | index: int 469 | The index position of the item 470 | 471 | Returns 472 | ------- 473 | return: any 474 | The item at the specified index, or `None` if the index is out of bounds. 475 | """ 476 | try: 477 | return arr[index] 478 | except: 479 | return None 480 | 481 | 482 | def build_string_format_variables( 483 | default_kwargs=dict(), use_tenant_metadata=True, subtenant_delimiter="_" 484 | ): 485 | """Builds a dictionary of string format variables from multiple sources. 486 | 487 | Parameters 488 | ---------- 489 | default_kwargs : dict 490 | A dictionary of default values for the format variables. Keys in this 491 | dictionary are reserved and cannot be overridden by tenant metadata. 492 | use_tenant_metadata : bool 493 | Whether to include variables derived from tenant metadata. If True, 494 | attempts to load metadata from the tenant configuration JSON file. 495 | subtenant_delimiter : str 496 | The delimiter used to split the `tenant_id` into root and sub-tenant 497 | components. 498 | 499 | Returns 500 | ------- 501 | dict 502 | A dictionary containing the consolidated string format variables. 
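    Examples
    --------
    Illustrative output; the real values come from environment variables and
    tenant metadata, so every value below is a placeholder:

    IN[1]: build_string_format_variables()
    Out[1]: {'tenant': 'acme_eu', 'tenant_id': 'acme_eu', 'root_tenant_id': 'acme',
             'sub_tenant_id': 'eu', 'env_id': 'prod', 'flow_id': 'flow-1',
             'job_id': 'job-1', 'tap': 'salesforce', 'connector': 'salesforce'}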
503 | 504 | """ 505 | # Reserved keys are keys that may not be overriden by other sources of variabes (e.g., tenant metadata) 506 | # The keys in the "default_kwargs" are chosen to be these reserved keys 507 | reserved_keys = list(default_kwargs.keys()) 508 | 509 | final_kwargs = default_kwargs.copy() 510 | 511 | # Build tenant metadata variable 512 | tenant_metadata = dict() 513 | if use_tenant_metadata: 514 | tenant_metadata_path = ( 515 | f"{os.environ.get('ROOT')}/snapshots/tenant-config.json" 516 | ) 517 | if os.path.exists(tenant_metadata_path): 518 | with open(tenant_metadata_path, "r") as file: 519 | tenant_metadata = json.load(file) 520 | tenant_metadata = tenant_metadata.get("hotglue_metadata") or dict() 521 | tenant_metadata = tenant_metadata.get("metadata") or dict() 522 | 523 | # Iterate over "tenant_metadata" items and only add them in the "final_kwargs" if 524 | # the key is not in the "reserved_keys" 525 | for k, v in tenant_metadata.items(): 526 | if k in reserved_keys: 527 | continue 528 | 529 | final_kwargs[k] = v 530 | 531 | flow_id = os.environ.get("FLOW") 532 | job_id = os.environ.get("JOB_ID") 533 | tap = os.environ.get("TAP") 534 | connector = os.environ.get("CONNECTOR_ID") 535 | tenant_id = os.environ.get("TENANT", "") 536 | env_id = os.environ.get("ENV_ID") 537 | 538 | splitted_tenant_id = tenant_id.split(subtenant_delimiter) 539 | root_tenant_id = splitted_tenant_id[0] 540 | sub_tenant_id = get_index_safely(splitted_tenant_id, 1) or "" 541 | 542 | final_kwargs.update( 543 | { 544 | "tenant": tenant_id, 545 | "tenant_id": tenant_id, 546 | "root_tenant_id": root_tenant_id, 547 | "sub_tenant_id": sub_tenant_id, 548 | "env_id": env_id, 549 | "flow_id": flow_id, 550 | "job_id": job_id, 551 | "tap": tap, 552 | "connector": connector, 553 | } 554 | ) 555 | 556 | return final_kwargs 557 | 558 | 559 | def format_str_safely(str_to_format, **format_variables): 560 | """Safely formats a string by replacing placeholders with provided values. 561 | 562 | Notes 563 | ----- 564 | - This function skips placeholders with missing or empty values in 565 | `format_variables`. 566 | 567 | Parameters 568 | ---------- 569 | str_to_format : str 570 | The string containing placeholders to be replaced. Placeholders 571 | should be in the format `{key}`. 572 | **format_variables : dict 573 | Keyword arguments representing the variables to replace in the string. 574 | 575 | Returns 576 | ------- 577 | str 578 | A formatted string with the placeholders replaced by their 579 | corresponding values. 
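    Examples
    --------
    Illustrative values:

    IN[1]: format_str_safely("{tenant}_{job_id}_export", tenant="acme", job_id="123")
    Out[1]: 'acme_123_export'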
559 | def format_str_safely(str_to_format, **format_variables):
560 |     """Safely formats a string by replacing placeholders with provided values.
561 | 
562 |     Notes
563 |     -----
564 |     - This function skips placeholders with missing or empty values in
565 |       `format_variables`.
566 | 
567 |     Parameters
568 |     ----------
569 |     str_to_format : str
570 |         The string containing placeholders to be replaced. Placeholders
571 |         should be in the format `{key}`.
572 |     **format_variables : dict
573 |         Keyword arguments representing the variables to replace in the string.
574 | 
575 |     Returns
576 |     -------
577 |     str
578 |         A formatted string with the placeholders replaced by their
579 |         corresponding values.
580 | 
581 |     """
582 |     str_output = str_to_format
583 | 
584 |     for k, v in format_variables.items():
585 |         if not v:
586 |             continue
587 |         str_output = re.sub(re.compile("{" + k + "}"), v, str_output)
588 | 
589 |     return str_output
590 | 
591 | 
592 | @singledispatch
593 | def to_export(
594 |     data,
595 |     name,
596 |     output_dir,
597 |     keys=[],
598 |     unified_model=None,
599 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
600 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
601 |     schema=None,
602 |     stringify_objects=False,
603 |     reserved_variables={}
604 | ):
605 |     raise NotImplementedError("to_export is not implemented for this dataframe type")
606 | 
607 | 
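A short sketch of format_str_safely (illustrative only; the placeholder names are hypothetical):

from gluestick import format_str_safely

prefix = format_str_safely("{tenant}-{job_id}-", tenant="acme", job_id="")
# -> "acme-{job_id}-": the empty job_id is skipped, so its placeholder is left intact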
608 | @to_export.register(pd.DataFrame)
609 | def pandas_df_to_export(
610 |     data,
611 |     name,
612 |     output_dir,
613 |     keys=[],
614 |     unified_model=None,
615 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
616 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
617 |     schema=None,
618 |     stringify_objects=False,
619 |     reserved_variables={},
620 | ):
621 |     """Write a pandas DataFrame to a specified format.
622 | 
623 |     Notes
624 |     -----
625 |     This function will export the input data to a specified format.
626 | 
627 |     Parameters
628 |     ----------
629 |     data: dataframe
630 |         dataframe that will be transformed to a specified format.
631 |     name: str
632 |         name of the output file
633 |     output_dir: str
634 |         path of the folder that will store the output file
635 |     output_file_prefix: str
636 |         prefix of the output file name if needed
637 |     export_format: str
638 |         format to which the dataframe will be transformed
639 |         supported values are: singer, parquet, json, jsonl and csv
640 |     unified_model: pydantic model
641 |         pydantic model used to generate the schema for export format
642 |         'singer'
643 |     schema: dict
644 |         customized schema used for export format 'singer'
645 |     stringify_objects: bool
646 |         for parquet files it will stringify complex structures as arrays
647 |         of objects
648 |     reserved_variables: dict
649 |         A dictionary of default values for the format variables to be used
650 |         in the output_file_prefix.
651 | 
652 |     Returns
653 |     -------
654 |     return: file
655 |         it outputs a singer, parquet, json, jsonl or csv file
656 | 
657 |     """
658 |     # NOTE: This is meant to allow users to override the default output name for a specific stream
659 |     if os.environ.get(f"HG_UNIFIED_OUTPUT_{name.upper()}"):
660 |         name = os.environ[f"HG_UNIFIED_OUTPUT_{name.upper()}"]
661 | 
662 |     if output_file_prefix:
663 |         # format output_file_prefix with env variables
664 |         format_variables = build_string_format_variables(
665 |             default_kwargs=reserved_variables
666 |         )
667 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
668 |         composed_name = f"{output_file_prefix}{name}"
669 |     else:
670 |         composed_name = name
671 | 
672 |     if export_format == "singer":
673 |         # get pk
674 |         reader = Reader()
675 |         keys = keys or reader.get_pk(name)
676 |         # export data as singer
677 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
678 |     elif export_format == "parquet":
679 |         if stringify_objects:
680 |             data.to_parquet(
681 |                 os.path.join(output_dir, f"{composed_name}.parquet"),
682 |                 engine="fastparquet",
683 |             )
684 |         else:
685 |             data.to_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
686 |     elif export_format == "json":
687 |         data.to_json(f"{output_dir}/{composed_name}.json", orient="records", date_format='iso')
688 |     elif export_format == "jsonl":
689 |         data.to_json(f"{output_dir}/{composed_name}.jsonl", orient='records', lines=True, date_format='iso')
690 |     else:
691 |         data.to_csv(f"{output_dir}/{composed_name}.csv", index=False)
692 | 
693 | 
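A usage sketch for the pandas implementation (illustrative only; "etl-output" is a hypothetical, pre-existing directory and the DataFrame is made up):

import pandas as pd
from gluestick import to_export

df = pd.DataFrame({"id": [1, 2], "name": ["Company 1", "Company 2"]})
# singledispatch routes this call to pandas_df_to_export
to_export(df, "contacts", "etl-output", export_format="csv")
# -> writes etl-output/contacts.csv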
694 | @to_export.register(pl.LazyFrame)
695 | def polars_lf_to_export(
696 |     data,
697 |     name,
698 |     output_dir,
699 |     keys=[],
700 |     unified_model=None,
701 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
702 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
703 |     schema=None,
704 |     stringify_objects=False,
705 |     reserved_variables={},
706 | ):
707 |     """Write a Polars LazyFrame to a specified format.
708 | 
709 |     Notes
710 |     -----
711 |     This function will export the input data to a specified format.
712 | 
713 |     Parameters
714 |     ----------
715 |     data: Polars LazyFrame
716 |         Polars LazyFrame that will be transformed to a specified format.
717 |     name: str
718 |         name of the output file
719 |     output_dir: str
720 |         path of the folder that will store the output file
721 |     output_file_prefix: str
722 |         prefix of the output file name if needed
723 |     export_format: str
724 |         format to which the dataframe will be transformed
725 |         supported values are: singer, parquet and csv
726 |     unified_model: pydantic model
727 |         pydantic model used to generate the schema for export format
728 |         'singer'
729 |     schema: dict
730 |         customized schema used for export format 'singer'
731 |     stringify_objects: bool
732 |         Unused for LazyFrame exports; kept for signature compatibility
733 |         with the other implementations
734 |     reserved_variables: dict
735 |         A dictionary of default values for the format variables to be used
736 |         in the output_file_prefix.
737 | 
738 |     Returns
739 |     -------
740 |     return: file
741 |         it outputs a singer, parquet or csv file
742 | 
743 |     """
744 |     if output_file_prefix:
745 |         # format output_file_prefix with env variables
746 |         format_variables = build_string_format_variables(
747 |             default_kwargs=reserved_variables
748 |         )
749 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
750 |         composed_name = f"{output_file_prefix}{name}"
751 |     else:
752 |         composed_name = name
753 | 
754 |     if export_format == "singer":
755 |         # get pk
756 |         reader = PLLazyFrameReader()
757 |         keys = keys or reader.get_pk(name)
758 |         # export data as singer
759 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
760 |     elif export_format == "parquet":
761 |         data.sink_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
762 |     elif export_format == "csv":
763 |         data.sink_csv(os.path.join(output_dir, f"{composed_name}.csv"))
764 |     else:
765 |         raise ValueError(f"Unsupported export format: {export_format}")
766 | 
767 | 
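A usage sketch for the LazyFrame implementation (illustrative only; "etl-output" is a hypothetical, pre-existing directory):

import polars as pl
from gluestick import to_export

lf = pl.LazyFrame({"id": [1, 2], "amount": [101.15, 51.15]})
# sink_parquet streams the result to disk without collecting the LazyFrame first
to_export(lf, "invoices", "etl-output", export_format="parquet")
# -> writes etl-output/invoices.parquet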
768 | @to_export.register(pl.DataFrame)
769 | def polars_df_to_export(
770 |     data,
771 |     name,
772 |     output_dir,
773 |     keys=[],
774 |     unified_model=None,
775 |     export_format=os.environ.get("DEFAULT_EXPORT_FORMAT", "singer"),
776 |     output_file_prefix=os.environ.get("OUTPUT_FILE_PREFIX"),
777 |     schema=None,
778 |     stringify_objects=False,
779 |     reserved_variables={},
780 | ):
781 |     """Write a Polars DataFrame to a specified format.
782 | 
783 |     Notes
784 |     -----
785 |     This function will export the input data to a specified format.
786 | 
787 |     Parameters
788 |     ----------
789 |     data: Polars DataFrame
790 |         Polars DataFrame that will be transformed to a specified format.
791 |     name: str
792 |         name of the output file
793 |     output_dir: str
794 |         path of the folder that will store the output file
795 |     output_file_prefix: str
796 |         prefix of the output file name if needed
797 |     export_format: str
798 |         format to which the dataframe will be transformed
799 |         supported values are: singer, parquet, csv, json and jsonl
800 |     unified_model: pydantic model
801 |         pydantic model used to generate the schema for export format
802 |         'singer'
803 |     schema: dict
804 |         customized schema used for export format 'singer'
805 |     stringify_objects: bool
806 |         Unused for polars parquet; kept for signature compatibility
807 |     reserved_variables: dict
808 |         A dictionary of default values for the format variables to be used
809 |         in the output_file_prefix.
810 | 
811 |     Returns
812 |     -------
813 |     return: file
814 |         it outputs a singer, parquet, csv, json or jsonl file
815 | 
816 |     """
817 |     if output_file_prefix:
818 |         # format output_file_prefix with env variables
819 |         format_variables = build_string_format_variables(
820 |             default_kwargs=reserved_variables
821 |         )
822 |         output_file_prefix = format_str_safely(output_file_prefix, **format_variables)
823 |         composed_name = f"{output_file_prefix}{name}"
824 |     else:
825 |         composed_name = name
826 | 
827 |     if export_format == "singer":
828 |         # get pk
829 |         reader = PolarsReader()
830 |         keys = keys or reader.get_pk(name)
831 |         # export data as singer
832 |         to_singer(data, composed_name, output_dir, keys=keys, allow_objects=True, unified_model=unified_model, schema=schema)
833 |     elif export_format == "parquet":
834 |         data.write_parquet(os.path.join(output_dir, f"{composed_name}.parquet"))
835 |     elif export_format == "csv":
836 |         data.write_csv(os.path.join(output_dir, f"{composed_name}.csv"))
837 |     elif export_format == "json":
838 |         data.write_json(os.path.join(output_dir, f"{composed_name}.json"))
839 |     elif export_format == "jsonl":
840 |         data.write_ndjson(os.path.join(output_dir, f"{composed_name}.jsonl"))
841 |     else:
842 |         raise ValueError(f"Unsupported export format: {export_format}")
843 | 
844 | 
845 | def localize_datetime(df, column_name):
846 |     """
847 |     Localize a Pandas DataFrame column to UTC.
848 |     Parameters:
849 |     -----------
850 |     df : pandas.DataFrame
851 |         The DataFrame to be modified.
852 |     column_name : str
853 |         The name of the column to be localized.
854 |     """
855 |     # Convert the column to a Pandas Timestamp object
856 |     df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
857 |     # Localize naive timestamps to UTC; convert the column if it is already tz-aware
858 |     try:
859 |         df[column_name] = df[column_name].dt.tz_localize(utc)
860 |     except Exception:
861 |         df[column_name] = df[column_name].dt.tz_convert('UTC')
862 | 
863 |     return df[column_name]
864 | 
865 | def exception(exception, root_dir, error_message=None):
866 |     """
867 |     Stores an exception and a message in an errors.txt file,
868 |     which the executor reads to surface the right error.
869 |     It should be used instead of a bare raise Exception.
870 |     Parameters:
871 |     -----------
872 |     exception : the exception caught in a try/except block.
873 |     root_dir : str
874 |         The path of the root_dir where errors.txt is stored.
875 |     error_message: str
876 |         Additional message or data to make the error clearer.
877 |     """
878 |     if error_message:
879 |         error = f"ERROR: {error_message}. Cause: {exception}"
880 |     else:
881 |         error = f"ERROR: {exception}"
882 |     with open(f"{root_dir}/errors.txt", "w") as outfile:
883 |         outfile.write(error)
884 |     raise Exception(error)
885 | 
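A usage sketch for exception() (illustrative only; root_dir here is a hypothetical job directory, and the snippet intentionally re-raises after writing the file):

from gluestick import exception

root_dir = "."
try:
    total = int("not-a-number")
except Exception as e:
    # writes ./errors.txt with the combined message, then raises
    exception(e, root_dir, error_message="Failed to parse invoice total")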
886 | def merge_id_from_snapshot(df, snapshot_dir, stream, flow_id, pk):
887 |     """
888 |     Merges a DataFrame with the target-created snapshot to retrieve existing target ids.
889 | 
890 |     Parameters
891 |     ----------
892 |     df : pandas.DataFrame
893 |         The DataFrame to be modified.
894 |     snapshot_dir : str
895 |         The path of the snapshot directory.
896 |     stream : str
897 |         The name of the stream.
898 |     flow_id : str
899 |         The id of the flow used to create the snapshot.
900 |     pk : str
901 |         The name of the primary key column to output.
902 | 
903 |     Returns
904 |     -------
905 |     pandas.DataFrame
906 |         The DataFrame with the primary key column added.
907 |     """
908 | 
909 |     # if no pk, raise an error
910 |     if not pk:
911 |         raise Exception(f"No PK found for '{stream}'. Cannot merge.")
912 | 
913 |     # if no externalId, raise an error
914 |     if "externalId" not in df.columns:
915 |         raise Exception(f"'externalId' missing for '{stream}'. Cannot merge.")
916 | 
917 |     # read snapshot
918 |     prefix = f"{stream}_{flow_id}"
919 |     print(f"Reading snapshot: '{prefix}'")
920 |     snapshot_data_frame = read_snapshots(prefix, snapshot_dir)
921 | 
922 |     # if no snapshot, return dataframe
923 |     if snapshot_data_frame is None or snapshot_data_frame.empty:
924 |         print(f"No snapshot for '{prefix}'.")
925 |         return df
926 | 
927 |     # get ids from snapshot
928 |     ids = snapshot_data_frame[["InputId", "RemoteId"]].drop_duplicates(
929 |         subset=["InputId"], keep="last"
930 |     )
931 | 
932 |     # merge dataframe with snapshot
933 |     merged = df.merge(
934 |         ids,
935 |         left_on="externalId",
936 |         right_on="InputId",
937 |         how="left",
938 |         suffixes=("", "_snap"),
939 |     )
940 | 
941 |     # rename RemoteId to pk
942 |     if "RemoteId" in merged.columns:
943 |         merged = merged.rename(columns={"RemoteId": pk})
944 | 
945 |     # drop InputId (not needed)
946 |     if "InputId" in merged.columns:
947 |         merged = merged.drop(columns=["InputId"])
948 | 
949 |     # set pk to None if not in snapshot
950 |     if pk in merged.columns:
951 |         merged[pk] = merged[pk].where(pd.notna(merged[pk]), None)
952 |     print(f"Finished getting ids from snapshot for '{stream}'.")
953 |     return merged
954 | 
955 | def read_tenant_custom_mapping(tenant_config, flow_id=None):
956 |     """Read the tenant mapping from the tenant config, returning (custom_field_mappings, stream_name_mapping).
957 | 
958 |     Parameters
959 |     ----------
960 |     tenant_config : dict
961 |         The tenant config containing the 'hotglue_mapping' section.
962 |     """
963 |     # read mapping from tenant config
964 |     raw_mapping_data = tenant_config.get("hotglue_mapping", {}).get("mapping", {})
965 |     if not raw_mapping_data:
966 |         print("No 'hotglue_mapping.mapping' section found in tenant config.")
967 |         return {}, {}
968 | 
969 |     custom_field_mappings = {}
970 |     stream_name_mapping = {}
971 | 
972 |     # get flow_id from tenant config
973 |     potential_flow_id_key = (
974 |         list(raw_mapping_data.keys())[0]
975 |         if len(raw_mapping_data) == 1
976 |         else None
977 |     )
978 | 
979 |     flow_id = flow_id or potential_flow_id_key
980 |     raw_mapping_data = raw_mapping_data.get(flow_id)
981 | 
982 |     if not raw_mapping_data:
983 |         print(f"No mapping found for flow_id: {flow_id}")
984 |         return custom_field_mappings, stream_name_mapping
985 | 
986 |     if not isinstance(raw_mapping_data, dict):
987 |         print(f"Unexpected structure in mapping content: Expected dict, got {type(raw_mapping_data)}")
988 |         raise ValueError("Invalid mapping structure.")
989 | 
990 |     # process mapping
991 |     for combined_stream_name, field_map in raw_mapping_data.items():
992 |         try:
993 |             # Key format is SourceStream/TargetStream
994 |             source_stream, target_stream = combined_stream_name.split("/", 1)
995 |             custom_field_mappings[source_stream] = field_map
996 |             stream_name_mapping[source_stream] = target_stream
997 |         except Exception as e:
998 |             raise Exception(f"Error processing mapping key '{combined_stream_name}': {e}")
999 |     return custom_field_mappings, stream_name_mapping
1000 | 
--------------------------------------------------------------------------------