├── .github ├── CODEOWNERS ├── pull-request-template.md └── workflows │ ├── release-please.yml │ └── test.yml ├── .gitignore ├── .release-please-manifest.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── WORKFLOW.md ├── butterfree ├── __init__.py ├── _cli │ ├── __init__.py │ ├── main.py │ └── migrate.py ├── automated │ ├── __init__.py │ └── feature_set_creation.py ├── clients │ ├── __init__.py │ ├── abstract_client.py │ ├── cassandra_client.py │ └── spark_client.py ├── configs │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ ├── abstract_config.py │ │ ├── cassandra_config.py │ │ ├── delta.py │ │ ├── kafka_config.py │ │ └── metastore_config.py │ └── environment.py ├── constants │ ├── __init__.py │ ├── columns.py │ ├── data_type.py │ ├── migrations.py │ ├── spark_constants.py │ └── window_definitions.py ├── dataframe_service │ ├── __init__.py │ ├── incremental_strategy.py │ ├── partitioning.py │ └── repartition.py ├── extract │ ├── __init__.py │ ├── pre_processing │ │ ├── __init__.py │ │ ├── explode_json_column_transform.py │ │ ├── filter_transform.py │ │ ├── forward_fill_transform.py │ │ ├── pivot_transform.py │ │ └── replace_transform.py │ ├── readers │ │ ├── __init__.py │ │ ├── file_reader.py │ │ ├── kafka_reader.py │ │ ├── reader.py │ │ └── table_reader.py │ └── source.py ├── hooks │ ├── __init__.py │ ├── hook.py │ ├── hookable_component.py │ └── schema_compatibility │ │ ├── __init__.py │ │ ├── cassandra_table_schema_compatibility_hook.py │ │ └── spark_table_schema_compatibility_hook.py ├── load │ ├── __init__.py │ ├── processing │ │ ├── __init__.py │ │ └── json_transform.py │ ├── sink.py │ └── writers │ │ ├── __init__.py │ │ ├── delta_feature_store_writer.py │ │ ├── delta_writer.py │ │ ├── historical_feature_store_writer.py │ │ ├── online_feature_store_writer.py │ │ └── writer.py ├── migrations │ ├── __init__.py │ └── database_migration │ │ ├── __init__.py │ │ ├── cassandra_migration.py │ │ ├── database_migration.py │ │ └── metastore_migration.py ├── pipelines │ ├── __init__.py │ └── feature_set_pipeline.py ├── reports │ ├── __init__.py │ └── metadata.py ├── testing │ ├── __init__.py │ └── dataframe │ │ └── __init__.py ├── transform │ ├── __init__.py │ ├── aggregated_feature_set.py │ ├── feature_set.py │ ├── features │ │ ├── __init__.py │ │ ├── feature.py │ │ ├── key_feature.py │ │ └── timestamp_feature.py │ ├── transformations │ │ ├── __init__.py │ │ ├── aggregated_transform.py │ │ ├── custom_transform.py │ │ ├── h3_transform.py │ │ ├── spark_function_transform.py │ │ ├── sql_expression_transform.py │ │ ├── stack_transform.py │ │ ├── transform_component.py │ │ └── user_defined_functions │ │ │ ├── __init__.py │ │ │ ├── mode.py │ │ │ └── most_frequent_set.py │ └── utils │ │ ├── __init__.py │ │ ├── date_range.py │ │ ├── function.py │ │ └── window_spec.py └── validations │ ├── __init__.py │ ├── basic_validaton.py │ └── validation.py ├── docs ├── Makefile ├── index.html ├── make.bat ├── requirements.txt └── source │ ├── butterfree.automated.rst │ ├── butterfree.clients.rst │ ├── butterfree.configs.db.rst │ ├── butterfree.configs.rst │ ├── butterfree.constants.rst │ ├── butterfree.dataframe_service.rst │ ├── butterfree.extract.pre_processing.rst │ ├── butterfree.extract.readers.rst │ ├── butterfree.extract.rst │ ├── butterfree.hooks.rst │ ├── butterfree.hooks.schema_compatibility.rst │ ├── butterfree.load.processing.rst │ ├── butterfree.load.rst │ ├── butterfree.load.writers.rst │ ├── butterfree.migrations.database_migration.rst │ ├── 
butterfree.migrations.rst │ ├── butterfree.pipelines.rst │ ├── butterfree.reports.rst │ ├── butterfree.rst │ ├── butterfree.testing.dataframe.rst │ ├── butterfree.testing.rst │ ├── butterfree.transform.features.rst │ ├── butterfree.transform.rst │ ├── butterfree.transform.transformations.rst │ ├── butterfree.transform.transformations.user_defined_functions.rst │ ├── butterfree.transform.utils.rst │ ├── butterfree.validations.rst │ ├── cli.md │ ├── conf.py │ ├── configuration.md │ ├── extract.md │ ├── getstart.md │ ├── home.md │ ├── index.rst │ ├── load.md │ ├── modules.rst │ ├── stream.md │ └── transform.md ├── examples ├── README.md ├── aggregated_feature_set │ └── aggregated_feature_set.ipynb ├── data │ ├── listing_events.json │ └── region.json ├── interval_runs │ └── interval_runs.ipynb ├── simple_feature_set │ └── simple_feature_set.ipynb ├── spark_function_and_window │ └── spark_function_and_window.ipynb ├── streaming_feature_set │ ├── events │ │ └── 20582255.json │ ├── pokedex.json │ └── streaming_feature_set.ipynb └── test_examples.py ├── mypy.ini ├── release-please-config.json ├── requirements.dev.txt ├── requirements.lint.txt ├── requirements.test.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── integration ├── __init__.py ├── butterfree │ ├── __init__.py │ ├── extract │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_source.py │ ├── load │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_sink.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_feature_set_pipeline.py │ └── transform │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_aggregated_feature_set.py │ │ └── test_feature_set.py └── input │ └── data.json ├── mocks ├── __init__.py └── entities │ ├── __init__.py │ ├── first │ ├── __init__.py │ └── first_pipeline.py │ └── second │ ├── __init__.py │ └── deeper │ ├── __init__.py │ └── second_pipeline.py └── unit ├── __init__.py └── butterfree ├── __init__.py ├── _cli ├── __init__.py └── test_migrate.py ├── automated ├── __init__.py └── test_feature_set_creation.py ├── clients ├── __init__.py ├── conftest.py ├── test_cassandra_client.py └── test_spark_client.py ├── configs ├── __init__.py ├── db │ ├── __init__.py │ ├── conftest.py │ ├── test_cassandra_config.py │ ├── test_kafka_config.py │ └── test_metastore_config.py └── test_environment.py ├── dataframe_service ├── __init__.py ├── conftest.py ├── test_incremental_srategy.py ├── test_partitioning.py └── test_repartition.py ├── extract ├── __init__.py ├── conftest.py ├── pre_processing │ ├── __init__.py │ ├── conftest.py │ ├── test_explode_json_column.py │ ├── test_filter_transform.py │ ├── test_forward_fill.py │ ├── test_pivot_transform.py │ └── test_replace_transform.py ├── readers │ ├── __init__.py │ ├── file-reader-test.csv │ ├── file-reader-test.json │ ├── test_file_reader.py │ ├── test_kafka_reader.py │ ├── test_reader.py │ └── test_table_reader.py └── test_source.py ├── hooks ├── __init__.py ├── schema_compatibility │ ├── __init__.py │ ├── test_cassandra_table_schema_compatibility_hook.py │ └── test_spark_table_schema_compatibility_hook.py └── test_hookable_component.py ├── load ├── __init__.py ├── conftest.py ├── processing │ ├── __init__.py │ ├── conftest.py │ └── test_json_transform.py ├── test_sink.py └── writers │ ├── __init__.py │ ├── test_delta_writer.py │ ├── test_historical_feature_store_writer.py │ └── test_online_feature_store_writer.py ├── migrations ├── __init__.py └── database_migration │ ├── __init__.py │ ├── conftest.py │ ├── 
test_cassandra_migration.py │ ├── test_database_migration.py │ └── test_metastore_migration.py ├── pipelines ├── __init__.py ├── conftest.py └── test_feature_set_pipeline.py ├── reports ├── __init__.py └── test_metadata.py ├── testing └── dataframe │ ├── __init__.py │ └── test_dataframe.py ├── transform ├── __init__.py ├── conftest.py ├── features │ ├── __init__.py │ ├── conftest.py │ ├── test_feature.py │ ├── test_key_feature.py │ └── test_timestamp_feature.py ├── test_aggregated_feature_set.py ├── test_feature_set.py └── transformations │ ├── __init__.py │ ├── conftest.py │ ├── test_aggregated_transform.py │ ├── test_custom_transform.py │ ├── test_h3_transform.py │ ├── test_spark_function_transform.py │ ├── test_sql_expression_transform.py │ ├── test_stack_transform.py │ ├── test_transform_component.py │ └── user_defined_functions │ ├── __init__.py │ ├── conftest.py │ ├── test_mode.py │ └── test_most_frequent.py └── validations ├── __init__.py ├── conftest.py └── test_basic_validation.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @quintoandar/mlcops -------------------------------------------------------------------------------- /.github/pull-request-template.md: -------------------------------------------------------------------------------- 1 | ## Why? :open_book: 2 | _Replace me for a cool overview of why this PR is being created. You can 3 | refer to the Jira task or Github issue here too. Never forget to put the 4 | tag of a related Jira task in the title._ 5 | 6 | ## What? :wrench: 7 | _Replace me for a detailed explanation of what is being modified._ 8 | _Want to add some awesome bullet points?_ 9 | - _First changes;_ 10 | - _Second changes;_ 11 | - _..._ 12 | 13 | _How about some cool checkboxes?_ 14 | - [X] _First changes;_ 15 | - [X] _Second changes;_ 16 | - [ ] _..._ 17 | 18 | ## Type of change 19 | Please delete options that are not relevant. 20 | 21 | - [ ] Bug fix (non-breaking change which fixes an issue) 22 | - [ ] New feature (non-breaking change which adds functionality) 23 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 24 | - [ ] This change requires a documentation update 25 | - [ ] Release 26 | 27 | ## How everything was tested? :straight_ruler: 28 | _Have you achieved all the acceptance criteria? How?_ 29 | _Is there any alternative flow in the testing process that you want to describe?_ 30 | 31 | ## Checklist 32 | - [ ] My code follows the style guidelines of this project (docstrings, type hinting and linter compliance); 33 | - [ ] I have performed a self-review of my own code; 34 | - [ ] I have made corresponding changes to the documentation; 35 | - [ ] I have added tests that prove my fix is effective or that my feature works; 36 | - [ ] New and existing unit tests pass locally with my changes; 37 | - [ ] Add labels to distinguish the type of pull request. Available labels are `bug`, `enhancement`, `feature`, and `review`. 
38 | 39 | ## Attention Points :warning: 40 | _Replace me for what the reviewer will need to pay attention to in the PR or just to cover any concerns after the merge._ 41 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - staging 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: googleapis/release-please-action@v4 17 | id: release 18 | with: 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | target-branch: staging 21 | config-file: release-please-config.json 22 | manifest-file: .release-please-manifest.json 23 | 24 | - uses: actions/checkout@v4 25 | if: ${{ steps.release.outputs.release_created }} 26 | 27 | - uses: actions/setup-python@v5 28 | if: ${{ steps.release.outputs.release_created }} 29 | with: 30 | python-version: '3.9' 31 | 32 | - name: Install dependencies 33 | if: ${{ steps.release.outputs.release_created }} 34 | run: make ci-install 35 | 36 | - name: Build package 37 | if: ${{ steps.release.outputs.release_created }} 38 | run: make package 39 | 40 | - name: Publish release to pypi.org 41 | if: ${{ steps.release.outputs.release_created }} 42 | env: 43 | PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 44 | PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 45 | run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* 46 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "Test" 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - staging 7 | - hotfix/** 8 | pull_request: 9 | 10 | jobs: 11 | Pipeline: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.9' 19 | 20 | - uses: actions/setup-java@v4 21 | with: 22 | java-version: '17' 23 | distribution: microsoft 24 | 25 | - uses: vemonet/setup-spark@v1 26 | with: 27 | spark-version: '3.5.1' 28 | hadoop-version: '3' 29 | 30 | - name: Install dependencies 31 | run: make ci-install 32 | 33 | - name: Style check 34 | run: PYTHONPATH=./pip/deps make style-check 35 | 36 | - name: Quality check 37 | run: PYTHONPATH=./pip/deps make quality-check 38 | 39 | - name: Static Type check 40 | run: PYTHONPATH=./pip/deps make type-check 41 | 42 | - name: Tests 43 | run: PYTHONPATH=./pip/deps make tests 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | *cov.xml 50 | test_folder/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # PyBuilder 68 | target/ 69 | pip/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # pytest 105 | .pytest_cache/ 106 | 107 | # PyCharm's Workspace 108 | .idea/ 109 | 110 | # Auto Generated: SHOULD NOT BE VERSIONED 111 | .version 112 | .package_name 113 | .repository_url 114 | .commit_hash 115 | *cov/ 116 | 117 | # VSCode Workspace 118 | spark-warehouse/ 119 | .vscode/ 120 | init/ 121 | 122 | # integration tests artifacts 123 | metastore_db/ 124 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | ".": "1.7.2" 3 | } 4 | -------------------------------------------------------------------------------- /butterfree/__init__.py: -------------------------------------------------------------------------------- 1 | """Module docstring example, following Google's docstring style.""" 2 | 3 | __version__ = "1.7.2" # x-release-please-version 4 | -------------------------------------------------------------------------------- /butterfree/_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/butterfree/_cli/__init__.py -------------------------------------------------------------------------------- /butterfree/_cli/main.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from butterfree._cli import migrate 4 | 5 | app = typer.Typer(no_args_is_help=True) 6 | app.add_typer(migrate.app, name="migrate") 7 | 8 | if __name__ == "__main__": 9 | app() 10 | -------------------------------------------------------------------------------- /butterfree/automated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/butterfree/automated/__init__.py -------------------------------------------------------------------------------- /butterfree/clients/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds connection clients.""" 2 | 3 | from butterfree.clients.abstract_client import AbstractClient 4 | from butterfree.clients.cassandra_client import CassandraClient 5 | from butterfree.clients.spark_client import SparkClient 6 | 7 | __all__ = ["SparkClient", "CassandraClient", "AbstractClient"] 8 | 
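A minimal usage sketch of the clients exported above, assuming only what appears elsewhere in this dump: SparkClient() takes no required arguments and exposes read_table(table, database), the call made by TableReader.consume further below; the table and database names here are hypothetical.

from butterfree.clients import SparkClient

spark_client = SparkClient()  # client wrapping the Spark connection
# read a metastore table into a DataFrame; names are placeholders
df = spark_client.read_table("listing_events", "feature_store")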
-------------------------------------------------------------------------------- /butterfree/clients/abstract_client.py: -------------------------------------------------------------------------------- 1 | """Abstract class for database clients.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Optional 5 | 6 | 7 | class AbstractClient(ABC): 8 | """Abstract base class for database clients.""" 9 | 10 | @property 11 | @abstractmethod 12 | def conn(self) -> Any: 13 | """Returns a connection object.""" 14 | pass 15 | 16 | @abstractmethod 17 | def sql(self, query: str) -> Any: 18 | """Runs a query. 19 | 20 | Args: 21 | query: client query. 22 | 23 | Returns: 24 | Set of records. 25 | """ 26 | pass 27 | 28 | @abstractmethod 29 | def get_schema(self, table: str, database: Optional[str] = None) -> Any: 30 | """Returns desired table schema. 31 | 32 | Attributes: 33 | table: desired table. 34 | 35 | Returns: 36 | A list of dictionaries in the format 37 | [{"column_name": "example1", type: "Spark_type"}, ...] 38 | 39 | """ 40 | pass 41 | -------------------------------------------------------------------------------- /butterfree/configs/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds configuration/setup for Butterfree components.""" 2 | -------------------------------------------------------------------------------- /butterfree/configs/db/__init__.py: -------------------------------------------------------------------------------- 1 | """This module holds database configurations to be used by clients.""" 2 | 3 | from butterfree.configs.db.abstract_config import AbstractWriteConfig 4 | from butterfree.configs.db.cassandra_config import CassandraConfig 5 | from butterfree.configs.db.delta import DeltaConfig 6 | from butterfree.configs.db.kafka_config import KafkaConfig 7 | from butterfree.configs.db.metastore_config import MetastoreConfig 8 | 9 | __all__ = [ 10 | "AbstractWriteConfig", 11 | "CassandraConfig", 12 | "KafkaConfig", 13 | "MetastoreConfig", 14 | "DeltaConfig", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/configs/db/abstract_config.py: -------------------------------------------------------------------------------- 1 | """Abstract classes for database configurations with spark.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, List 5 | 6 | 7 | class AbstractWriteConfig(ABC): 8 | """Abstract class for database write configurations with spark.""" 9 | 10 | @property 11 | @abstractmethod 12 | def database(self) -> str: 13 | """Database name.""" 14 | 15 | @property 16 | @abstractmethod 17 | def mode(self) -> Any: 18 | """Config option "mode" for spark write. 19 | 20 | Args: 21 | 22 | Returns: 23 | str: mode. 24 | 25 | """ 26 | 27 | @property 28 | @abstractmethod 29 | def format_(self) -> Any: 30 | """Config option "format" for spark write. 31 | 32 | Args: 33 | 34 | Returns: 35 | str: format. 36 | 37 | """ 38 | 39 | @abstractmethod 40 | def translate(self, schema: Any) -> List[Dict[Any, Any]]: 41 | """Translate feature set spark schema to the corresponding database. 42 | 43 | Args: 44 | schema: feature set schema 45 | 46 | Returns: 47 | Corresponding database schema. 
48 | 49 | """ 50 | -------------------------------------------------------------------------------- /butterfree/configs/environment.py: -------------------------------------------------------------------------------- 1 | """Holds functions for managing the running environment.""" 2 | 3 | import os 4 | from typing import Optional 5 | 6 | specification = { 7 | "ENVIRONMENT": "dev", 8 | "CASSANDRA_HOST": "test", 9 | "CASSANDRA_KEYSPACE": "test", 10 | "CASSANDRA_USERNAME": "test", 11 | "CASSANDRA_PASSWORD": "test", 12 | "FEATURE_STORE_S3_BUCKET": "test", 13 | "FEATURE_STORE_HISTORICAL_DATABASE": "test", 14 | "KAFKA_CONSUMER_CONNECTION_STRING": "test_host:1234,test_host2:1234", 15 | "STREAM_CHECKPOINT_PATH": None, 16 | "CASSANDRA_READ_CONSISTENCY_LEVEL": None, 17 | "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, 18 | "CASSANDRA_LOCAL_DC": None, 19 | } 20 | 21 | 22 | class UnspecifiedVariableError(RuntimeError): 23 | """Environment variables not set error. 24 | 25 | Attributes: 26 | variable_name: environment variable name. 27 | 28 | """ 29 | 30 | def __init__(self, variable_name: str): 31 | super().__init__( 32 | f'Variable "{variable_name}" is not listed in the environment' 33 | " specification\nUpdate the environment module" 34 | f' to include "{variable_name}"' 35 | ) 36 | 37 | 38 | def get_variable( 39 | variable_name: str, default_value: Optional[str] = None 40 | ) -> Optional[str]: 41 | """Gets an environment variable. 42 | 43 | The variable comes from it's explicitly declared value in the running 44 | environment or from the default value declared in specification or from the 45 | default_value. 46 | 47 | Args: 48 | variable_name: environment variable name. 49 | default_value: default value to use in case no value is set in the 50 | environment nor in the environment.yaml specification file. 
51 | 52 | Returns: 53 | The variable's string value 54 | 55 | """ 56 | try: 57 | spec_default = specification[variable_name] 58 | except KeyError: 59 | raise UnspecifiedVariableError(variable_name) 60 | return os.getenv(variable_name) or spec_default or default_value 61 | -------------------------------------------------------------------------------- /butterfree/constants/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds constant attributes that are common for Butterfree.""" 2 | 3 | from butterfree.constants.data_type import DataType 4 | 5 | __all__ = ["DataType"] 6 | -------------------------------------------------------------------------------- /butterfree/constants/columns.py: -------------------------------------------------------------------------------- 1 | """Holds common column names, constant through all Butterfree.""" 2 | 3 | from typing_extensions import Final 4 | 5 | TIMESTAMP_COLUMN: Final = "timestamp" 6 | PARTITION_YEAR: Final = "year" 7 | PARTITION_MONTH: Final = "month" 8 | PARTITION_DAY: Final = "day" 9 | -------------------------------------------------------------------------------- /butterfree/constants/data_type.py: -------------------------------------------------------------------------------- 1 | """DataType Enum Entity.""" 2 | 3 | from enum import Enum 4 | 5 | from pyspark.sql.types import ArrayType, BinaryType, BooleanType 6 | from pyspark.sql.types import DataType as PySparkDataType 7 | from pyspark.sql.types import ( 8 | DateType, 9 | DecimalType, 10 | DoubleType, 11 | FloatType, 12 | IntegerType, 13 | LongType, 14 | StringType, 15 | TimestampNTZType, 16 | TimestampType, 17 | ) 18 | from typing_extensions import final 19 | 20 | 21 | @final 22 | class DataType(Enum): 23 | """Holds constants for data types within Butterfree.""" 24 | 25 | TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ") 26 | TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") 27 | BINARY = (BinaryType(), "boolean", "BINARY") 28 | BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") 29 | DATE = (DateType(), "timestamp", "DATE") 30 | DECIMAL = (DecimalType(), "decimal", "DECIMAL") 31 | DOUBLE = (DoubleType(), "double", "DOUBLE") 32 | FLOAT = (FloatType(), "float", "FLOAT") 33 | INTEGER = (IntegerType(), "int", "INT") 34 | BIGINT = (LongType(), "bigint", "BIGINT") 35 | STRING = (StringType(), "text", "STRING") 36 | ARRAY_BIGINT = (ArrayType(LongType()), "frozen>", "ARRAY") 37 | ARRAY_STRING = (ArrayType(StringType()), "frozen>", "ARRAY") 38 | ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>", "ARRAY") 39 | 40 | def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None: 41 | self.spark = spark 42 | self.cassandra = cassandra 43 | self.spark_sql = spark_sql 44 | -------------------------------------------------------------------------------- /butterfree/constants/migrations.py: -------------------------------------------------------------------------------- 1 | """Migrations' Constants.""" 2 | 3 | from butterfree.constants import columns 4 | 5 | PARTITION_BY = [ 6 | {"column_name": columns.PARTITION_YEAR, "type": "INT"}, 7 | {"column_name": columns.PARTITION_MONTH, "type": "INT"}, 8 | {"column_name": columns.PARTITION_DAY, "type": "INT"}, 9 | ] 10 | -------------------------------------------------------------------------------- /butterfree/constants/spark_constants.py: -------------------------------------------------------------------------------- 1 | """Holds common spark constants, present through 
all Butterfree.""" 2 | 3 | from typing_extensions import Final 4 | 5 | # from spark.sql.shuffle.partitions default value 6 | DEFAULT_NUM_PARTITIONS: Final = 200 7 | 8 | # ratio between number of partitions per processor recommended (lower bound: 2) 9 | # refs: 10 | # https://github.com/vaquarkhan/Apache-Kafka-poc-and-notes/wiki/Apache-Spark-Join-guidelines-and-Performance-tuning 11 | PARTITION_PROCESSOR_RATIO: Final = 4 12 | -------------------------------------------------------------------------------- /butterfree/constants/window_definitions.py: -------------------------------------------------------------------------------- 1 | """Allowed windows units and lengths in seconds.""" 2 | 3 | ALLOWED_WINDOWS = { 4 | "second": 1, 5 | "seconds": 1, 6 | "minute": 60, 7 | "minutes": 60, 8 | "hour": 3600, 9 | "hours": 3600, 10 | "day": 86400, 11 | "days": 86400, 12 | "week": 604800, 13 | "weeks": 604800, 14 | "year": 29030400, 15 | "years": 29030400, 16 | } 17 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataframe optimization components regarding Butterfree.""" 2 | 3 | from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy 4 | from butterfree.dataframe_service.partitioning import extract_partition_values 5 | from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df 6 | 7 | __all__ = [ 8 | "extract_partition_values", 9 | "IncrementalStrategy", 10 | "repartition_df", 11 | "repartition_sort_df", 12 | ] 13 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/partitioning.py: -------------------------------------------------------------------------------- 1 | """Module defining partitioning methods.""" 2 | 3 | from typing import Any, Dict, List 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | def extract_partition_values( 9 | dataframe: DataFrame, partition_columns: List[str] 10 | ) -> List[Dict[str, Any]]: 11 | """Extract distinct partition values from a given dataframe. 12 | 13 | Args: 14 | dataframe: dataframe from where to extract partition values. 15 | partition_columns: name of partition columns presented on the dataframe. 16 | 17 | Returns: 18 | distinct partition values. 
19 | """ 20 | return [ 21 | row.asDict() 22 | for row in dataframe.select(*partition_columns).distinct().collect() 23 | ] 24 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/repartition.py: -------------------------------------------------------------------------------- 1 | """Module where there are repartition methods.""" 2 | 3 | from typing import List, Optional 4 | 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from butterfree.constants.spark_constants import ( 8 | DEFAULT_NUM_PARTITIONS, 9 | PARTITION_PROCESSOR_RATIO, 10 | ) 11 | 12 | 13 | def _num_partitions_definition( 14 | num_processors: Optional[int] = None, num_partitions: Optional[int] = None 15 | ) -> int: 16 | num_partitions = ( 17 | num_processors * PARTITION_PROCESSOR_RATIO 18 | if num_processors 19 | else num_partitions or DEFAULT_NUM_PARTITIONS 20 | ) 21 | 22 | return num_partitions 23 | 24 | 25 | def repartition_df( 26 | dataframe: DataFrame, 27 | partition_by: List[str], 28 | num_partitions: Optional[int] = None, 29 | num_processors: Optional[int] = None, 30 | ) -> DataFrame: 31 | """Partition the DataFrame. 32 | 33 | Args: 34 | dataframe: Spark DataFrame. 35 | partition_by: list of partitions. 36 | num_processors: number of processors. 37 | num_partitions: number of partitions. 38 | 39 | Returns: 40 | Partitioned dataframe. 41 | 42 | """ 43 | num_partitions = _num_partitions_definition(num_processors, num_partitions) 44 | return dataframe.repartition(num_partitions, *partition_by) 45 | 46 | 47 | def repartition_sort_df( 48 | dataframe: DataFrame, 49 | partition_by: List[str], 50 | order_by: List[str], 51 | num_processors: Optional[int] = None, 52 | num_partitions: Optional[int] = None, 53 | ) -> DataFrame: 54 | """Partition and Sort the DataFrame. 55 | 56 | Args: 57 | dataframe: Spark DataFrame. 58 | partition_by: list of columns to partition by. 59 | order_by: list of columns to order by. 60 | num_processors: number of processors. 61 | num_partitions: number of partitions. 62 | 63 | Returns: 64 | Partitioned and sorted dataframe. 
65 | 66 | """ 67 | num_partitions = _num_partitions_definition(num_processors, num_partitions) 68 | dataframe = repartition_df(dataframe, partition_by, num_partitions) 69 | return dataframe.sortWithinPartitions(*order_by) 70 | -------------------------------------------------------------------------------- /butterfree/extract/__init__.py: -------------------------------------------------------------------------------- 1 | """The Source Component of a Feature Set.""" 2 | 3 | from butterfree.extract.source import Source 4 | 5 | __all__ = ["Source"] 6 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/__init__.py: -------------------------------------------------------------------------------- 1 | """Pre Processing Components regarding Readers.""" 2 | 3 | from butterfree.extract.pre_processing.explode_json_column_transform import ( 4 | explode_json_column, 5 | ) 6 | from butterfree.extract.pre_processing.filter_transform import filter 7 | from butterfree.extract.pre_processing.forward_fill_transform import forward_fill 8 | from butterfree.extract.pre_processing.pivot_transform import pivot 9 | from butterfree.extract.pre_processing.replace_transform import replace 10 | 11 | __all__ = ["explode_json_column", "filter", "forward_fill", "pivot", "replace"] 12 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/explode_json_column_transform.py: -------------------------------------------------------------------------------- 1 | """Explode json column for dataframes.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame, StructType 4 | from pyspark.sql.functions import from_json, get_json_object 5 | 6 | JSON_TYPE_NAMES = ["array", "struct"] 7 | 8 | 9 | def explode_json_column( 10 | df: DataFrame, column: str, json_schema: StructType 11 | ) -> DataFrame: 12 | """Create new columns extracting properties from a JSON column. 13 | 14 | Example: 15 | 16 | >>> from pyspark import SparkContext 17 | >>> from pyspark.sql import session 18 | >>> from butterfree.testing.dataframe import create_df_from_collection 19 | >>> from butterfree.extract.pre_processing import explode_json_column 20 | >>> from pyspark.sql.types import ( 21 | ... ArrayType, 22 | ... IntegerType, 23 | ... StringType, 24 | ... StructField, 25 | ... StructType, 26 | ... ) 27 | >>> spark_context = SparkContext.getOrCreate() 28 | >>> spark_session = session.SparkSession(spark_context) 29 | >>> data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}] 30 | >>> df = create_df_from_collection(data, spark_context, spark_session) 31 | >>> df.collect() 32 | 33 | [Row(json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}')] 34 | 35 | >>> json_column_schema = StructType( 36 | ... [ 37 | ... StructField("a", IntegerType()), 38 | ... StructField("b", StringType()), 39 | ... StructField("c", IntegerType()), 40 | ... StructField("d", ArrayType(IntegerType())), 41 | ... ] 42 | >>> explode_json_column( 43 | ... df, column='json_column', json_schema=json_column_schema 44 | ... ).collect() 45 | 46 | [ 47 | Row( 48 | json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}', 49 | a=123, 50 | b='abc', 51 | c=123, 52 | d=[1, 2, 3] 53 | ) 54 | ] 55 | 56 | Args: 57 | df: input dataframe with the target JSON column. 58 | column: column name that is going to be exploded. 59 | json_schema: expected schema from that JSON column. 
60 | Not all "first layer" fields need to be mapped in the json_schema, 61 | just the desired columns. If there is any JSON field that is needed 62 | to be cast to a struct, the declared expected schema (a StructType) 63 | need to have the exact same schema as the presented record, if don't, 64 | the value in the resulting column will be null. 65 | 66 | Returns: 67 | dataframe with the new extracted columns from the JSON column. 68 | 69 | """ 70 | for field in json_schema: 71 | if field.dataType.typeName() in JSON_TYPE_NAMES: 72 | df = df.withColumn( 73 | field.name, 74 | from_json( 75 | get_json_object(df[column], "$.{}".format(field.name)), 76 | schema=field.dataType, # type: ignore 77 | ), 78 | ) 79 | else: # non-collection data types 80 | df = df.withColumn( 81 | field.name, 82 | get_json_object(df[column], "$.{}".format(field.name)).cast( 83 | field.dataType 84 | ), 85 | ) 86 | return df 87 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/filter_transform.py: -------------------------------------------------------------------------------- 1 | """Module where filter DataFrames coming from readers.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame 4 | 5 | 6 | def filter(dataframe: DataFrame, condition: str) -> DataFrame: 7 | """Filters DataFrame's rows using the given condition and value. 8 | 9 | Args: 10 | dataframe: Spark DataFrame. 11 | condition: SQL expression with column, operation and value 12 | to filter the dataframe. 13 | 14 | Returns: 15 | Filtered dataframe 16 | """ 17 | if not isinstance(condition, str): 18 | raise TypeError("condition should be string.") 19 | 20 | return dataframe.filter(condition) 21 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/replace_transform.py: -------------------------------------------------------------------------------- 1 | """Replace transformer for dataframes.""" 2 | 3 | from itertools import chain 4 | from typing import Dict 5 | 6 | from pyspark.sql.dataframe import DataFrame 7 | from pyspark.sql.functions import coalesce, col, create_map, lit 8 | 9 | 10 | def replace( 11 | dataframe: DataFrame, column: str, replace_dict: Dict[str, str] 12 | ) -> DataFrame: 13 | """Replace values of a string column in the dataframe using a dict. 14 | 15 | Example: 16 | 17 | >>> from butterfree.extract.pre_processing import replace 18 | ... from butterfree.testing.dataframe import ( 19 | ... assert_dataframe_equality, 20 | ... create_df_from_collection, 21 | ... ) 22 | >>> from pyspark import SparkContext 23 | >>> from pyspark.sql import session 24 | >>> spark_context = SparkContext.getOrCreate() 25 | >>> spark_session = session.SparkSession(spark_context) 26 | >>> input_data = [ 27 | ... {"id":1, "type": "a"}, {"id":2, "type": "b"}, {"id":3, "type": "c"} 28 | ... ] 29 | >>> input_df = create_df_from_collection(input_data, spark_context, spark_session) 30 | >>> input_df.collect() 31 | 32 | [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')] 33 | 34 | >>> replace_dict = {"a": "type_a", "b": "type_b"} 35 | >>> replace(input_df, "type", replace_dict).collect() 36 | 37 | [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')] 38 | 39 | Args: 40 | dataframe: data to be transformed. 41 | column: string column on the dataframe where to apply the replace. 42 | replace_dict: dict with values to be replaced. 43 | All mapped values must be string. 
44 | 45 | Returns: 46 | Dataframe with column values replaced. 47 | 48 | """ 49 | if (column not in dict(dataframe.dtypes)) or ( 50 | dict(dataframe.dtypes)[column] != "string" 51 | ): 52 | raise ValueError("column needs to be the name of a string column in dataframe") 53 | if (not isinstance(replace_dict, dict)) or ( 54 | not all(isinstance(value, str) for value in chain(*replace_dict.items())) 55 | ): 56 | raise ValueError( 57 | "replace_dict needs to be a Python dict with " 58 | "all keys and values as string values" 59 | ) 60 | 61 | mapping = create_map( 62 | [lit(value) for value in chain(*replace_dict.items())] # type: ignore 63 | ) 64 | return dataframe.withColumn(column, coalesce(mapping[col(column)], col(column))) 65 | -------------------------------------------------------------------------------- /butterfree/extract/readers/__init__.py: -------------------------------------------------------------------------------- 1 | """The Reader Component of a Source.""" 2 | 3 | from butterfree.extract.readers.file_reader import FileReader 4 | from butterfree.extract.readers.kafka_reader import KafkaReader 5 | from butterfree.extract.readers.table_reader import TableReader 6 | 7 | __all__ = ["FileReader", "KafkaReader", "TableReader"] 8 | -------------------------------------------------------------------------------- /butterfree/extract/readers/table_reader.py: -------------------------------------------------------------------------------- 1 | """TableReader entity.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | from butterfree.clients import SparkClient 8 | from butterfree.extract.readers.reader import Reader 9 | 10 | 11 | class TableReader(Reader): 12 | """Responsible for getting data from tables registered in the metastore. 13 | 14 | Attributes: 15 | id: unique string id used to register the reader as a view on the metastore. 16 | database: name of the metastore database/schema. 17 | table: name of the table. 18 | 19 | Example: 20 | Simple example regarding TableReader class instantiation. 21 | 22 | >>> from butterfree.extract.readers import TableReader 23 | >>> from butterfree.clients import SparkClient 24 | >>> from butterfree.extract.pre_processing import filter 25 | >>> spark_client = SparkClient() 26 | >>> table_reader = TableReader( 27 | ... id="table_reader_id", 28 | ... database="table_reader_db", 29 | ... table="table_reader_table" 30 | ... ) 31 | >>> df = table_reader.consume(spark_client) 32 | 33 | This last method will use the Spark Client, by default, to read 34 | the desired table, loading data into a dataframe, according to 35 | TableReader class arguments. 36 | 37 | It's also possible to define simple transformations within the 38 | reader's scope: 39 | 40 | >>> table_reader.with_(filter, condition="year = 2019").build(spark_client) 41 | 42 | In this case, however, a temp view will be created, containing 43 | the transformed data. 44 | 45 | """ 46 | 47 | __name__ = "Table Reader" 48 | 49 | def __init__(self, id: str, table: str, database: Optional[str] = None): 50 | super().__init__(id) 51 | if not isinstance(table, str): 52 | raise ValueError( 53 | "table needs to be a string with the name of the registered table" 54 | ) 55 | self.database = database 56 | self.table = table 57 | 58 | def consume(self, client: SparkClient) -> DataFrame: 59 | """Extract data from a table in Spark metastore. 60 | 61 | Args: 62 | client: client responsible for connecting to Spark session.
63 | 64 | Returns: 65 | Dataframe with all the data from the table. 66 | 67 | """ 68 | return client.read_table(self.table, self.database) 69 | -------------------------------------------------------------------------------- /butterfree/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds Hooks definitions.""" 2 | 3 | from butterfree.hooks.hook import Hook 4 | from butterfree.hooks.hookable_component import HookableComponent 5 | 6 | __all__ = ["Hook", "HookableComponent"] 7 | -------------------------------------------------------------------------------- /butterfree/hooks/hook.py: -------------------------------------------------------------------------------- 1 | """Hook abstract class entity.""" 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | class Hook(ABC): 9 | """Definition of a hook function to call on a Dataframe.""" 10 | 11 | @abstractmethod 12 | def run(self, dataframe: DataFrame) -> DataFrame: 13 | """Run interface for Hook. 14 | 15 | Args: 16 | dataframe: dataframe to use in the Hook. 17 | 18 | Returns: 19 | dataframe result from the Hook. 20 | """ 21 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds Schema Compatibility Hooks definitions.""" 2 | 3 | from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa 4 | CassandraTableSchemaCompatibilityHook, 5 | ) 6 | from butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook import ( # noqa 7 | SparkTableSchemaCompatibilityHook, 8 | ) 9 | 10 | __all__ = ["SparkTableSchemaCompatibilityHook", "CassandraTableSchemaCompatibilityHook"] 11 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | """Cassandra table schema compatibility Hook definition.""" 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from butterfree.clients import CassandraClient 6 | from butterfree.constants import DataType 7 | from butterfree.hooks.hook import Hook 8 | 9 | 10 | class CassandraTableSchemaCompatibilityHook(Hook): 11 | """Hook to verify the schema compatibility with a Cassandra's table. 12 | 13 | Verifies if all columns presented on the dataframe exists and are the same 14 | type on the target Cassandra's table. 15 | 16 | Attributes: 17 | cassandra_client: client to connect to Cassandra DB. 18 | table: table name. 19 | """ 20 | 21 | def __init__(self, cassandra_client: CassandraClient, table: str): 22 | self.cassandra_client = cassandra_client 23 | self.table = table 24 | 25 | def run(self, dataframe: DataFrame) -> DataFrame: 26 | """Check the schema compatibility from a given Dataframe. 27 | 28 | This method does not change anything on the Dataframe. 29 | 30 | Args: 31 | dataframe: dataframe to verify schema compatibility. 32 | 33 | Returns: 34 | unchanged dataframe. 35 | 36 | Raises: 37 | ValueError if the schemas are incompatible. 
38 | """ 39 | table_schema = self.cassandra_client.get_schema(self.table) 40 | type_cassandra = [ 41 | type.cassandra 42 | for field_id in range(len(dataframe.schema.fieldNames())) 43 | for type in DataType 44 | if dataframe.schema.fields.__getitem__(field_id).dataType == type.spark 45 | ] 46 | schema = [ 47 | {"column_name": f"{column}", "type": f"{type}"} 48 | for column, type in zip(dataframe.columns, type_cassandra) 49 | ] 50 | 51 | if not all([column in table_schema for column in schema]): 52 | raise ValueError( 53 | "There's a schema incompatibility " 54 | "between the defined dataframe and the Cassandra table.\n" 55 | f"Dataframe schema = {schema}" 56 | f"Target table schema = {table_schema}" 57 | ) 58 | return dataframe 59 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | """Spark table schema compatibility Hook definition.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | from butterfree.clients import SparkClient 8 | from butterfree.hooks.hook import Hook 9 | 10 | 11 | class SparkTableSchemaCompatibilityHook(Hook): 12 | """Hook to verify the schema compatibility with a Spark's table. 13 | 14 | Verifies if all columns presented on the dataframe exists and are the same 15 | type on the target Spark's table. 16 | 17 | Attributes: 18 | spark_client: client to connect to Spark's metastore. 19 | table: table name. 20 | database: database name. 21 | """ 22 | 23 | def __init__( 24 | self, spark_client: SparkClient, table: str, database: Optional[str] = None 25 | ): 26 | self.spark_client = spark_client 27 | self.table_expression = (f"`{database}`." if database else "") + f"`{table}`" 28 | 29 | def run(self, dataframe: DataFrame) -> DataFrame: 30 | """Check the schema compatibility from a given Dataframe. 31 | 32 | This method does not change anything on the Dataframe. 33 | 34 | Args: 35 | dataframe: dataframe to verify schema compatibility. 36 | 37 | Returns: 38 | unchanged dataframe. 39 | 40 | Raises: 41 | ValueError if the schemas are incompatible. 
42 | """ 43 | table_schema = self.spark_client.conn.table(self.table_expression).schema 44 | if not all([column in table_schema for column in dataframe.schema]): 45 | raise ValueError( 46 | "The dataframe has a schema incompatible with the defined table.\n" 47 | f"Dataframe schema = {dataframe.schema}" 48 | f"Target table schema = {table_schema}" 49 | ) 50 | return dataframe 51 | -------------------------------------------------------------------------------- /butterfree/load/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds the Sink component of a feature set pipeline.""" 2 | 3 | from butterfree.load.sink import Sink 4 | 5 | __all__ = ["Sink"] 6 | -------------------------------------------------------------------------------- /butterfree/load/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """Pre Processing Components regarding Readers.""" 2 | 3 | from butterfree.load.processing.json_transform import json_transform 4 | 5 | __all__ = ["json_transform"] 6 | -------------------------------------------------------------------------------- /butterfree/load/processing/json_transform.py: -------------------------------------------------------------------------------- 1 | """Json conversion for writers.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame 4 | from pyspark.sql.functions import struct, to_json 5 | 6 | 7 | def json_transform(dataframe: DataFrame) -> DataFrame: 8 | """Filters DataFrame's rows using the given condition and value. 9 | 10 | Args: 11 | dataframe: Spark DataFrame. 12 | 13 | Returns: 14 | Converted dataframe. 15 | """ 16 | return dataframe.select( 17 | to_json( 18 | struct([dataframe[column] for column in dataframe.columns]) # type: ignore 19 | ).alias("value") 20 | ) 21 | -------------------------------------------------------------------------------- /butterfree/load/writers/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds data loaders for historical and online feature store.""" 2 | 3 | from butterfree.load.writers.delta_feature_store_writer import DeltaFeatureStoreWriter 4 | from butterfree.load.writers.delta_writer import DeltaWriter 5 | from butterfree.load.writers.historical_feature_store_writer import ( 6 | HistoricalFeatureStoreWriter, 7 | ) 8 | from butterfree.load.writers.online_feature_store_writer import OnlineFeatureStoreWriter 9 | 10 | __all__ = [ 11 | "HistoricalFeatureStoreWriter", 12 | "OnlineFeatureStoreWriter", 13 | "DeltaWriter", 14 | "DeltaFeatureStoreWriter", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds available migrations.""" 2 | -------------------------------------------------------------------------------- /butterfree/migrations/database_migration/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds available database migrations.""" 2 | 3 | from butterfree.migrations.database_migration.cassandra_migration import ( 4 | CassandraMigration, 5 | ) 6 | from butterfree.migrations.database_migration.database_migration import Diff 7 | from butterfree.migrations.database_migration.metastore_migration import ( 8 | MetastoreMigration, 9 | ) 10 | 11 | __all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] 12 | 13 | 14 | ALLOWED_DATABASE = { 15 | 
"cassandra": CassandraMigration(), 16 | "metastore": MetastoreMigration(), 17 | } 18 | -------------------------------------------------------------------------------- /butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | """ETL Pipelines.""" 2 | 3 | from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline 4 | 5 | __all__ = ["FeatureSetPipeline"] 6 | -------------------------------------------------------------------------------- /butterfree/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """Reports module.""" 2 | 3 | from butterfree.reports.metadata import Metadata 4 | 5 | __all__ = ["Metadata"] 6 | -------------------------------------------------------------------------------- /butterfree/testing/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities to make testing of Butterfree tools easier.""" 2 | -------------------------------------------------------------------------------- /butterfree/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """The Transform Component of a Feature Set.""" 2 | 3 | from butterfree.transform.feature_set import FeatureSet 4 | 5 | __all__ = ["FeatureSet"] 6 | -------------------------------------------------------------------------------- /butterfree/transform/features/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all feature types to be part of a FeatureSet.""" 2 | 3 | from butterfree.transform.features.feature import Feature 4 | from butterfree.transform.features.key_feature import KeyFeature 5 | from butterfree.transform.features.timestamp_feature import TimestampFeature 6 | 7 | __all__ = ["Feature", "KeyFeature", "TimestampFeature"] 8 | -------------------------------------------------------------------------------- /butterfree/transform/features/key_feature.py: -------------------------------------------------------------------------------- 1 | """KeyFeature entity.""" 2 | 3 | from typing import Optional 4 | 5 | from butterfree.constants.data_type import DataType 6 | from butterfree.transform.features.feature import Feature 7 | from butterfree.transform.transformations import TransformComponent 8 | 9 | 10 | class KeyFeature(Feature): 11 | """Defines a KeyFeature. 12 | 13 | A FeatureSet must contain one or more KeyFeatures, which will be used as 14 | keys when storing the feature set dataframe as tables. The Feature Set may 15 | validate keys are unique for the latest state of a feature set. 16 | 17 | Attributes: 18 | name: key name. 19 | Can be use by the transformation to derive multiple key columns. 20 | description: brief explanation regarding the key. 21 | dtype: data type for the output column of this key. 22 | from_column: original column to build a key. 23 | Used when there is transformation or the transformation has no 24 | reference about the column to use for. 25 | transformation: transformation that will be applied to create this key. 26 | Keys can be derived by transformations over any data column. Like a 27 | location hash based on latitude and longitude. 
28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | name: str, 34 | description: str, 35 | dtype: DataType, 36 | from_column: Optional[str] = None, 37 | transformation: Optional[TransformComponent] = None, 38 | ) -> None: 39 | super(KeyFeature, self).__init__( 40 | name=name, 41 | description=description, 42 | dtype=dtype, 43 | from_column=from_column, 44 | transformation=transformation, 45 | ) 46 | -------------------------------------------------------------------------------- /butterfree/transform/features/timestamp_feature.py: -------------------------------------------------------------------------------- 1 | """TimestampFeature entity.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import to_timestamp 7 | 8 | from butterfree.constants import DataType 9 | from butterfree.constants.columns import TIMESTAMP_COLUMN 10 | from butterfree.transform.features import Feature 11 | from butterfree.transform.transformations import TransformComponent 12 | 13 | 14 | class TimestampFeature(Feature): 15 | """Defines a TimestampFeature. 16 | 17 | A FeatureSet must contain one TimestampFeature, which will be used as a time 18 | tag for the state of all features. By containing a timestamp feature, users 19 | may time travel over their features. The Feature Set may validate that the 20 | set of keys and timestamp are unique for a feature set. 21 | 22 | By defining a TimestampColumn, the feature set will always contain a data 23 | column called "timestamp" of TimestampType (spark dtype). 24 | 25 | Attributes: 26 | from_column: original column to build a "timestamp" feature column. 27 | Used when there is transformation or the transformation has no 28 | reference about the column to use for. 29 | If from_column is None, the FeatureSet will assume the input 30 | dataframe already has a data column called "timestamp". 31 | transformation: transformation that will be applied to create the 32 | "timestamp". Type casting will already happen when no transformation 33 | is given. But a timestamp can be derived from multiple columns, like 34 | year, month and day, for example. The transformation must always 35 | handle naming and typing. 36 | from_ms: true if timestamp column presents milliseconds time unit. A 37 | conversion is then performed. 38 | mask: specified timestamp format by the user. 39 | 40 | """ 41 | 42 | def __init__( 43 | self, 44 | dtype: Optional[DataType] = DataType.TIMESTAMP, 45 | from_column: Optional[str] = None, 46 | transformation: Optional[TransformComponent] = None, 47 | from_ms: bool = False, 48 | mask: Optional[str] = None, 49 | ) -> None: 50 | description = "Time tag for the state of all features." 51 | super(TimestampFeature, self).__init__( 52 | name=TIMESTAMP_COLUMN, 53 | description=description, 54 | from_column=from_column, 55 | dtype=dtype, 56 | transformation=transformation, 57 | ) 58 | self.from_ms = from_ms 59 | self.mask = mask 60 | 61 | def transform(self, dataframe: DataFrame) -> DataFrame: 62 | """Performs a transformation to the feature pipeline. 63 | 64 | Args: 65 | dataframe: input dataframe for the transformation. 66 | 67 | Returns: 68 | Transformed dataframe. 
69 | """ 70 | column_name = self.from_column if self.from_column else self.name 71 | 72 | ts_column = dataframe[column_name] 73 | if self.from_ms: 74 | ts_column = ts_column / 1000 75 | 76 | dataframe = dataframe.withColumn( 77 | column_name, to_timestamp(ts_column, self.mask) # type: ignore 78 | ) 79 | 80 | return super().transform(dataframe) 81 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all transformations to be used by Features. 2 | 3 | A transformation must inherit from a TransformComponent and handle data modification, 4 | renaming and cast types using parent's (a Feature) information. 5 | """ 6 | 7 | from butterfree.transform.transformations.aggregated_transform import ( 8 | AggregatedTransform, 9 | ) 10 | from butterfree.transform.transformations.custom_transform import CustomTransform 11 | from butterfree.transform.transformations.spark_function_transform import ( 12 | SparkFunctionTransform, 13 | ) 14 | from butterfree.transform.transformations.sql_expression_transform import ( 15 | SQLExpressionTransform, 16 | ) 17 | from butterfree.transform.transformations.stack_transform import StackTransform 18 | from butterfree.transform.transformations.transform_component import TransformComponent 19 | 20 | __all__ = [ 21 | "AggregatedTransform", 22 | "CustomTransform", 23 | "SparkFunctionTransform", 24 | "SQLExpressionTransform", 25 | "StackTransform", 26 | "TransformComponent", 27 | ] 28 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/transform_component.py: -------------------------------------------------------------------------------- 1 | """Transform Abstract Class.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, List 5 | 6 | from pyspark.sql import DataFrame 7 | 8 | 9 | class TransformComponent(ABC): 10 | """Defines an abstract class for Transform entities. 11 | 12 | Attributes: 13 | parent: parent transform component. 14 | """ 15 | 16 | def __init__(self) -> None: 17 | self._parent = None 18 | 19 | @property 20 | def parent(self) -> Any: 21 | """Parent transform component.""" 22 | return self._parent 23 | 24 | @parent.setter 25 | def parent(self, parent: None) -> None: 26 | self._parent = parent 27 | 28 | @property 29 | @abstractmethod 30 | def output_columns(self) -> List[str]: 31 | """Columns generated by the transformation.""" 32 | 33 | @abstractmethod 34 | def transform(self, dataframe: DataFrame) -> DataFrame: 35 | """Performs a transformation to the feature pipeline. 36 | 37 | Args: 38 | dataframe: input dataframe. 39 | 40 | Returns: 41 | Transformed dataframe. 42 | """ 43 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all transformations to be used by Features. 2 | 3 | A transformation must inherit from a TransformComponent and handle data modification, 4 | renaming and cast types using parent's (a Feature) information. 
5 | """ 6 | 7 | from butterfree.transform.transformations.user_defined_functions.mode import mode 8 | from butterfree.transform.transformations.user_defined_functions.most_frequent_set import ( # noqa 9 | most_frequent_set, 10 | ) 11 | 12 | __all__ = [ 13 | "mode", 14 | "most_frequent_set", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/mode.py: -------------------------------------------------------------------------------- 1 | """Method to compute mode aggregation.""" 2 | 3 | import pandas as pd 4 | from pyspark.sql.functions import pandas_udf 5 | from pyspark.sql.types import StringType 6 | 7 | 8 | @pandas_udf(StringType()) # type: ignore 9 | def mode(column: pd.Series) -> str: 10 | """Computes a mode aggregation. 11 | 12 | Attributes: 13 | column: desired data to be aggregated with mode. 14 | 15 | Example: 16 | It's necessary to declare the desired aggregation method, (average, 17 | standard deviation and count are currently supported, as it can be 18 | seen in __ALLOWED_AGGREGATIONS) and, finally, define the mode. 19 | 20 | >>> from pyspark import SparkContext 21 | >>> from pyspark.sql import session, Window 22 | >>> from pyspark.sql.functions import pandas_udf 23 | >>> from butterfree.transform\ 24 | ... .transformations.user_defined_functions import (mode) 25 | >>> sc = SparkContext.getOrCreate() 26 | >>> spark = session.SparkSession(sc) 27 | >>> df = spark.createDataFrame( 28 | >>> [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)], 29 | >>> ("id", "column")) 30 | >>> df.groupby("id").agg(mode("column")).show() 31 | +---+------------+ 32 | | id|mode(column)| 33 | +---+------------+ 34 | | 1| 1| 35 | | 2| 2| 36 | +---+------------+ 37 | >>> w = Window.partitionBy('id').rowsBetween( 38 | ... Window.unboundedPreceding, Window.unboundedFollowing) 39 | >>> df.withColumn('most_viewed', mode("column").over(w)).show() 40 | +---+------+-----------+ 41 | | id|column|most_viewed| 42 | +---+------+-----------+ 43 | | 1| 1| 1| 44 | | 1| 1| 1| 45 | | 2| 2| 2| 46 | | 2| 1| 2| 47 | | 2| 2| 2| 48 | +---+------+-----------+ 49 | 50 | This example shows the mode aggregation. It's important to notice, 51 | however, that if we want to used in fixed_windows or row_windows mode, 52 | we'd need unbounded windows. For that reason, mode is meant to be used 53 | just in rolling_windows mode, initially. We intend to make it available 54 | to others modes soon. 55 | 56 | """ 57 | return str(column.mode()[0]) 58 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/most_frequent_set.py: -------------------------------------------------------------------------------- 1 | """Method to compute most frequent set aggregation.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | from pyspark.sql.functions import pandas_udf 7 | from pyspark.sql.types import ArrayType, StringType 8 | 9 | 10 | @pandas_udf(ArrayType(StringType())) # type: ignore 11 | def most_frequent_set(column: pd.Series) -> Any: 12 | """Computes the most frequent set aggregation. 13 | 14 | Attributes: 15 | column: desired data to be aggregated with most frequent set aggregation. 16 | 17 | Example: 18 | It's necessary to declare the desired aggregation method, (average, 19 | standard deviation and count are currently supported, as it can be 20 | seen in __ALLOWED_AGGREGATIONS) and define the most frequent set aggregation. 
21 | 22 | >>> from pyspark import SparkContext 23 | >>> from pyspark.sql import session, Window 24 | >>> from butterfree.transform\ 25 | ... .transformations.user_defined_functions import (most_frequent_set) 26 | >>> sc = SparkContext.getOrCreate() 27 | >>> spark = session.SparkSession(sc) 28 | >>> df = spark.createDataFrame( 29 | >>> [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)], 30 | >>> ("id", "column")) 31 | >>> df.groupby("id").agg(most_frequent_set("column")).show() 32 | +---+-------------------------+ 33 | | id|most_frequent_set(column)| 34 | +---+-------------------------+ 35 | | 1| [1]| 36 | | 2| [2, 1]| 37 | +---+-------------------------+ 38 | >>> w = Window.partitionBy('id').rowsBetween( 39 | ... Window.unboundedPreceding, Window.unboundedFollowing) 40 | >>> df.withColumn( 41 | ... 'most_viewed', most_frequent_set("column").over(w) 42 | ... ).show() 43 | +---+------+-----------+ 44 | | id|column|most_viewed| 45 | +---+------+-----------+ 46 | | 1| 1| [1]| 47 | | 1| 1| [1]| 48 | | 2| 2| [2, 1]| 49 | | 2| 1| [2, 1]| 50 | | 2| 2| [2, 1]| 51 | +---+------+-----------+ 52 | 53 | This example shows the most frequent set aggregation. It returns a list with 54 | the most frequent values. It's important to notice, however, that if we want 55 | to use it in fixed_windows or row_windows mode, we'd need unbounded windows. 56 | For that reason, most_frequent_set is meant to be used only in rolling_windows 57 | mode, initially. We intend to make it available to other modes soon. 58 | 59 | """ 60 | return column.astype(str).value_counts().index.tolist() 61 | -------------------------------------------------------------------------------- /butterfree/transform/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """This module holds utils to be used by transformations.""" 2 | 3 | from butterfree.transform.utils.function import Function 4 | from butterfree.transform.utils.window_spec import Window 5 | 6 | __all__ = ["Window", "Function"] 7 | -------------------------------------------------------------------------------- /butterfree/transform/utils/date_range.py: -------------------------------------------------------------------------------- 1 | """Utils for date range generation.""" 2 | 3 | from datetime import datetime 4 | from typing import Optional, Union 5 | 6 | from pyspark.sql import DataFrame, functions 7 | 8 | from butterfree.clients import SparkClient 9 | from butterfree.constants import DataType 10 | from butterfree.constants.columns import TIMESTAMP_COLUMN 11 | 12 | 13 | def get_date_range( 14 | client: SparkClient, 15 | start_date: Union[str, datetime], 16 | end_date: Union[str, datetime], 17 | step: Optional[int] = None, 18 | ) -> DataFrame: 19 | """Create a date range dataframe. 20 | 21 | The dataframe returned by this method will contain a single column 22 | TIMESTAMP_COLUMN, of timestamp type, with dates between start and end. 23 | 24 | Args: 25 | client: a spark client. 26 | start_date: range beginning value (inclusive). 27 | end_date: range last value (exclusive). 28 | step: optional step, in seconds. 29 | 30 | Returns: 31 | A single column date range spark dataframe.
32 | """ 33 | day_in_seconds = 60 * 60 * 24 34 | step = step or day_in_seconds 35 | start_date = ( 36 | start_date if isinstance(start_date, str) else start_date.strftime("%Y-%m-%d") 37 | ) 38 | end_date = end_date if isinstance(end_date, str) else end_date.strftime("%Y-%m-%d") 39 | date_df = client.conn.createDataFrame( 40 | [(start_date, end_date)], ("start_date", "end_date") 41 | ).select( 42 | [ 43 | functions.col(c).cast(DataType.TIMESTAMP.spark).cast(DataType.BIGINT.spark) 44 | for c in ("start_date", "end_date") 45 | ] 46 | ) 47 | start_date, end_date = date_df.first() # type: ignore 48 | return client.conn.range( 49 | start_date, end_date + day_in_seconds, step # type: ignore 50 | ).select(functions.col("id").cast(DataType.TIMESTAMP.spark).alias(TIMESTAMP_COLUMN)) 51 | -------------------------------------------------------------------------------- /butterfree/transform/utils/function.py: -------------------------------------------------------------------------------- 1 | """Utils for custom or spark function to generation namedtuple.""" 2 | 3 | from typing import Callable 4 | 5 | from butterfree.constants import DataType 6 | 7 | 8 | class Function: 9 | """Define a class Function. 10 | 11 | Like a namedtuple: 12 | Function = namedtuple("Function", ["function", "data_type"]). 13 | 14 | Attributes: 15 | func: custom or spark functions, such as avg, std, count. 16 | For more information check spark functions: 17 | 'https://spark.apache.org/docs/2.3.1/api/python/_modules/pyspark/sql/functions.html' 18 | For custom functions, look the path: 19 | 'butterfree/transform/transformations/user_defined_functions'. 20 | data_type: data type for the output columns. 21 | """ 22 | 23 | def __init__(self, func: Callable, data_type: DataType): 24 | self.func = func 25 | self.data_type = data_type 26 | 27 | @property 28 | def func(self) -> Callable: 29 | """Function to be used in the transformation.""" 30 | return self._func 31 | 32 | @func.setter 33 | def func(self, value: Callable) -> None: 34 | """Definitions to be used in the transformation.""" 35 | if value is None: 36 | raise ValueError("Function must not be empty.") 37 | if callable(value) is False: 38 | raise TypeError("Function must be callable.") 39 | 40 | self._func = value 41 | 42 | @property 43 | def data_type(self) -> DataType: 44 | """Function to be used in the transformation.""" 45 | return self._data_type 46 | 47 | @data_type.setter 48 | def data_type(self, value: DataType) -> None: 49 | """Definitions to be used in the transformation.""" 50 | if not value: 51 | raise ValueError("DataType must not be empty.") 52 | if not isinstance(value, DataType): 53 | raise TypeError("Data type must be DataType.") 54 | 55 | self._data_type = value 56 | -------------------------------------------------------------------------------- /butterfree/validations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds dataframe validate for multiple destinations.""" 2 | 3 | from butterfree.validations.basic_validaton import BasicValidation 4 | 5 | __all__ = ["BasicValidation"] 6 | -------------------------------------------------------------------------------- /butterfree/validations/basic_validaton.py: -------------------------------------------------------------------------------- 1 | """Validation implementing basic checks over the dataframe.""" 2 | 3 | from typing import TYPE_CHECKING, Optional, Union 4 | 5 | if TYPE_CHECKING: 6 | from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame 
7 | 8 | from pyspark.sql.dataframe import DataFrame 9 | 10 | from butterfree.constants.columns import TIMESTAMP_COLUMN 11 | from butterfree.validations.validation import Validation 12 | 13 | 14 | class BasicValidation(Validation): 15 | """Basic validation suite for Feature Set's dataframe. 16 | 17 | Attributes: 18 | dataframe: object to be verified 19 | 20 | """ 21 | 22 | def __init__( 23 | self, dataframe: Optional[Union["ConnectDataFrame", DataFrame]] = None 24 | ): 25 | super().__init__(dataframe) 26 | 27 | def check(self) -> None: 28 | """Check basic validation properties about the dataframe. 29 | 30 | Raises: 31 | ValueError: if any of the verifications fail 32 | 33 | """ 34 | self.validate_column_ts() 35 | self.validate_df_is_empty() 36 | 37 | def validate_column_ts(self) -> None: 38 | """Check dataframe's ts column. 39 | 40 | Raises: 41 | ValueError: if dataframe don't have a column named ts. 42 | 43 | """ 44 | if not self.dataframe: 45 | raise ValueError("DataFrame can't be None.") 46 | if TIMESTAMP_COLUMN not in self.dataframe.columns: 47 | raise ValueError(f"DataFrame must have a '{TIMESTAMP_COLUMN}' column.") 48 | 49 | def _is_empty(self) -> bool: 50 | if hasattr(self.dataframe, "isEmpty"): 51 | # pyspark >= 3.4 52 | return self.dataframe.isEmpty() 53 | # pyspark < 3.4 54 | return self.dataframe.rdd.isEmpty() 55 | 56 | def validate_df_is_empty(self) -> None: 57 | """Check dataframe emptiness. 58 | 59 | Raises: 60 | ValueError: if dataframe is empty and is not streaming. 61 | 62 | """ 63 | 64 | if not self.dataframe: 65 | raise ValueError("DataFrame can't be None.") 66 | if (not self.dataframe.isStreaming) and self._is_empty(): 67 | raise ValueError("DataFrame can't be empty.") 68 | -------------------------------------------------------------------------------- /butterfree/validations/validation.py: -------------------------------------------------------------------------------- 1 | """Abstract Validation class.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | from pyspark.sql.dataframe import DataFrame 7 | 8 | 9 | class Validation(ABC): 10 | """Validate dataframe properties. 11 | 12 | Attributes: 13 | dataframe: data to be verified. 14 | 15 | """ 16 | 17 | def __init__(self, dataframe: Optional[DataFrame] = None): 18 | self.dataframe = dataframe 19 | 20 | def input(self, dataframe: DataFrame) -> "Validation": 21 | """Input a dataframe to check. 22 | 23 | Args: 24 | dataframe: data to check. 25 | 26 | """ 27 | self.dataframe = dataframe 28 | return self 29 | 30 | @abstractmethod 31 | def check(self) -> None: 32 | """Check validation properties about the dataframe. 33 | 34 | Raises: 35 | ValueError: if any of the verifications fail. 36 | 37 | """ 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark==0.6.0 2 | sphinx-rtd-theme==0.4.3 3 | sphinxemoji==0.1.6 4 | typing-extensions==3.7.4.2 5 | cmake==3.18.4 6 | h3==3.7.0 7 | pyarrow==16.1.0 8 | -------------------------------------------------------------------------------- /docs/source/butterfree.automated.rst: -------------------------------------------------------------------------------- 1 | butterfree.automated package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.automated.feature_set_creation 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. automodule:: butterfree.automated 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/butterfree.clients.rst: -------------------------------------------------------------------------------- 1 | butterfree.clients package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.clients.abstract\_client module 8 | ------------------------------------------ 9 | 10 | .. automodule:: butterfree.clients.abstract_client 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.clients.cassandra\_client module 16 | ------------------------------------------- 17 | 18 | .. automodule:: butterfree.clients.cassandra_client 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.clients.spark\_client module 24 | --------------------------------------- 25 | 26 | .. automodule:: butterfree.clients.spark_client 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. 
automodule:: butterfree.clients 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.configs.db.rst: -------------------------------------------------------------------------------- 1 | butterfree.configs.db package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.configs.db.abstract\_config module 8 | --------------------------------------------- 9 | 10 | .. automodule:: butterfree.configs.db.abstract_config 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.configs.db.cassandra\_config module 16 | ---------------------------------------------- 17 | 18 | .. automodule:: butterfree.configs.db.cassandra_config 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.configs.db.kafka\_config module 24 | ------------------------------------------ 25 | 26 | .. automodule:: butterfree.configs.db.kafka_config 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.configs.db.metastore\_config module 32 | ---------------------------------------------- 33 | 34 | .. automodule:: butterfree.configs.db.metastore_config 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: butterfree.configs.db 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/butterfree.configs.rst: -------------------------------------------------------------------------------- 1 | butterfree.configs package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.configs.db 11 | 12 | Submodules 13 | ---------- 14 | 15 | butterfree.configs.environment module 16 | ------------------------------------- 17 | 18 | .. automodule:: butterfree.configs.environment 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.configs.logger module 24 | -------------------------------- 25 | 26 | .. automodule:: butterfree.configs.logger 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.configs 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.constants.rst: -------------------------------------------------------------------------------- 1 | butterfree.constants package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.constants.columns module 8 | ----------------------------------- 9 | 10 | ..
automodule:: butterfree.constants.columns 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.constants.data\_type module 16 | -------------------------------------- 17 | 18 | .. automodule:: butterfree.constants.data_type 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.constants.migrations module 24 | -------------------------------------- 25 | 26 | .. automodule:: butterfree.constants.migrations 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.constants.spark\_constants module 32 | -------------------------------------------- 33 | 34 | .. automodule:: butterfree.constants.spark_constants 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.constants.window\_definitions module 40 | ----------------------------------------------- 41 | 42 | .. automodule:: butterfree.constants.window_definitions 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: butterfree.constants 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/source/butterfree.dataframe_service.rst: -------------------------------------------------------------------------------- 1 | butterfree.dataframe\_service package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.dataframe_service.incremental_strategy 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.dataframe_service.partitioning 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | 20 | .. automodule:: butterfree.dataframe_service.repartition 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Module contents 26 | --------------- 27 | 28 | ..
automodule:: butterfree.dataframe_service 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.pre_processing.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract.pre\_processing package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.extract.pre\_processing.explode\_json\_column\_transform module 8 | -------------------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.extract.pre_processing.explode_json_column_transform 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.extract.pre\_processing.filter\_transform module 16 | ----------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.extract.pre_processing.filter_transform 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.extract.pre\_processing.forward\_fill\_transform module 24 | ------------------------------------------------------------------ 25 | 26 | .. automodule:: butterfree.extract.pre_processing.forward_fill_transform 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.extract.pre\_processing.pivot\_transform module 32 | ---------------------------------------------------------- 33 | 34 | .. automodule:: butterfree.extract.pre_processing.pivot_transform 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.extract.pre\_processing.replace\_transform module 40 | ------------------------------------------------------------ 41 | 42 | .. automodule:: butterfree.extract.pre_processing.replace_transform 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: butterfree.extract.pre_processing 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.readers.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract.readers package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.extract.readers.file\_reader module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: butterfree.extract.readers.file_reader 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.extract.readers.kafka\_reader module 16 | ----------------------------------------------- 17 | 18 | .. automodule:: butterfree.extract.readers.kafka_reader 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.extract.readers.reader module 24 | ---------------------------------------- 25 | 26 | .. automodule:: butterfree.extract.readers.reader 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.extract.readers.table\_reader module 32 | ----------------------------------------------- 33 | 34 | .. automodule:: butterfree.extract.readers.table_reader 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: butterfree.extract.readers 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.extract.pre_processing 11 | butterfree.extract.readers 12 | 13 | Submodules 14 | ---------- 15 | 16 | butterfree.extract.source module 17 | -------------------------------- 18 | 19 | .. automodule:: butterfree.extract.source 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: butterfree.extract 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/butterfree.hooks.rst: -------------------------------------------------------------------------------- 1 | butterfree.hooks package 2 | ======================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.hooks.schema_compatibility 11 | 12 | Submodules 13 | ---------- 14 | 15 | 16 | .. automodule:: butterfree.hooks.hook 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | 22 | .. automodule:: butterfree.hooks.hookable_component 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | Module contents 28 | --------------- 29 | 30 | .. automodule:: butterfree.hooks 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | -------------------------------------------------------------------------------- /docs/source/butterfree.hooks.schema_compatibility.rst: -------------------------------------------------------------------------------- 1 | butterfree.hooks.schema\_compatibility package 2 | ============================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | Module contents 20 | --------------- 21 | 22 | .. automodule:: butterfree.hooks.schema_compatibility 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.processing.rst: -------------------------------------------------------------------------------- 1 | butterfree.load.processing package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.load.processing.json\_transform module 8 | ------------------------------------------------- 9 | 10 | .. automodule:: butterfree.load.processing.json_transform 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: butterfree.load.processing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.rst: -------------------------------------------------------------------------------- 1 | butterfree.load package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.load.processing 11 | butterfree.load.writers 12 | 13 | Submodules 14 | ---------- 15 | 16 | butterfree.load.sink module 17 | --------------------------- 18 | 19 | .. automodule:: butterfree.load.sink 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: butterfree.load 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.writers.rst: -------------------------------------------------------------------------------- 1 | butterfree.load.writers package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.load.writers.historical\_feature\_store\_writer module 8 | ----------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.load.writers.historical_feature_store_writer 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.load.writers.online\_feature\_store\_writer module 16 | ------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.load.writers.online_feature_store_writer 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.load.writers.writer module 24 | ------------------------------------- 25 | 26 | .. automodule:: butterfree.load.writers.writer 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.load.writers 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.migrations.database_migration.rst: -------------------------------------------------------------------------------- 1 | butterfree.migrations.database\_migration package 2 | ================================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.migrations.database_migration.cassandra_migration 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.migrations.database_migration.database_migration 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | 20 | .. automodule:: butterfree.migrations.database_migration.metastore_migration 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Module contents 26 | --------------- 27 | 28 | .. automodule:: butterfree.migrations.database_migration 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.migrations.rst: -------------------------------------------------------------------------------- 1 | butterfree.migrations package 2 | ============================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.migrations.database_migration 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. 
automodule:: butterfree.migrations 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/butterfree.pipelines.rst: -------------------------------------------------------------------------------- 1 | butterfree.pipelines package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.pipelines.feature\_set\_pipeline module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: butterfree.pipelines.feature_set_pipeline 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: butterfree.pipelines 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.reports.rst: -------------------------------------------------------------------------------- 1 | butterfree.reports package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.reports.metadata module 8 | ---------------------------------- 9 | 10 | .. automodule:: butterfree.reports.metadata 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: butterfree.reports 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.rst: -------------------------------------------------------------------------------- 1 | butterfree package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.automated 11 | butterfree.clients 12 | butterfree.configs 13 | butterfree.constants 14 | butterfree.dataframe_service 15 | butterfree.extract 16 | butterfree.hooks 17 | butterfree.load 18 | butterfree.migrations 19 | butterfree.pipelines 20 | butterfree.reports 21 | butterfree.testing 22 | butterfree.transform 23 | butterfree.validations 24 | 25 | Module contents 26 | --------------- 27 | 28 | .. automodule:: butterfree 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.testing.dataframe.rst: -------------------------------------------------------------------------------- 1 | butterfree.testing.dataframe package 2 | ==================================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: butterfree.testing.dataframe 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/source/butterfree.testing.rst: -------------------------------------------------------------------------------- 1 | butterfree.testing package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.testing.dataframe 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. 
automodule:: butterfree.testing 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.features.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.features package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.features.feature module 8 | -------------------------------------------- 9 | 10 | .. automodule:: butterfree.transform.features.feature 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.features.key\_feature module 16 | ------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.features.key_feature 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.features.timestamp\_feature module 24 | ------------------------------------------------------- 25 | 26 | .. automodule:: butterfree.transform.features.timestamp_feature 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.transform.features 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.transform.features 11 | butterfree.transform.transformations 12 | butterfree.transform.utils 13 | 14 | Submodules 15 | ---------- 16 | 17 | butterfree.transform.aggregated\_feature\_set module 18 | ---------------------------------------------------- 19 | 20 | .. automodule:: butterfree.transform.aggregated_feature_set 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | butterfree.transform.feature\_set module 26 | ---------------------------------------- 27 | 28 | .. automodule:: butterfree.transform.feature_set 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | Module contents 34 | --------------- 35 | 36 | .. automodule:: butterfree.transform 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.transformations.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.transformations package 2 | ============================================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.transform.transformations.user_defined_functions 11 | 12 | Submodules 13 | ---------- 14 | 15 | butterfree.transform.transformations.aggregated\_transform module 16 | ----------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.transformations.aggregated_transform 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.transformations.custom\_transform module 24 | ------------------------------------------------------------- 25 | 26 | .. 
automodule:: butterfree.transform.transformations.custom_transform 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.transform.transformations.h3\_transform module 32 | --------------------------------------------------------- 33 | 34 | .. automodule:: butterfree.transform.transformations.h3_transform 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.transform.transformations.spark\_function\_transform module 40 | ---------------------------------------------------------------------- 41 | 42 | .. automodule:: butterfree.transform.transformations.spark_function_transform 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | butterfree.transform.transformations.sql\_expression\_transform module 48 | ---------------------------------------------------------------------- 49 | 50 | .. automodule:: butterfree.transform.transformations.sql_expression_transform 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | butterfree.transform.transformations.stack\_transform module 56 | ------------------------------------------------------------ 57 | 58 | .. automodule:: butterfree.transform.transformations.stack_transform 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | butterfree.transform.transformations.transform\_component module 64 | ---------------------------------------------------------------- 65 | 66 | .. automodule:: butterfree.transform.transformations.transform_component 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: butterfree.transform.transformations 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.transformations.user_defined_functions.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.transformations.user\_defined\_functions package 2 | ===================================================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.transformations.user\_defined\_functions.mode module 8 | ------------------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.transform.transformations.user_defined_functions.mode 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.transformations.user\_defined\_functions.most\_frequent\_set module 16 | ---------------------------------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.transformations.user_defined_functions.most_frequent_set 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: butterfree.transform.transformations.user_defined_functions 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.utils.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.utils package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.utils.date\_range module 8 | --------------------------------------------- 9 | 10 | .. 
automodule:: butterfree.transform.utils.date_range 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.utils.function module 16 | ------------------------------------------ 17 | 18 | .. automodule:: butterfree.transform.utils.function 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.utils.window\_spec module 24 | ---------------------------------------------- 25 | 26 | .. automodule:: butterfree.transform.utils.window_spec 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.transform.utils 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.validations.rst: -------------------------------------------------------------------------------- 1 | butterfree.validations package 2 | ============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.validations.basic\_validaton module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: butterfree.validations.basic_validaton 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.validations.validation module 16 | ---------------------------------------- 17 | 18 | .. automodule:: butterfree.validations.validation 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: butterfree.validations 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/cli.md: -------------------------------------------------------------------------------- 1 | # Command-line Interface (CLI) 2 | 3 | Butterfree now has a command-line interface, introduced with the new automatic migration ability. 4 | 5 | As soon as you install butterfree, you can check what's available through butterfree's CLI with: 6 | 7 | ```shell 8 | $~ butterfree --help 9 | ``` 10 | 11 | ### Automated Database Schema Migration 12 | 13 | When developing your feature sets, you also need to prepare your database for the changes 14 | coming into your Feature Store. Normally, when creating a new feature set, you had 15 | to manually create a new table in Cassandra, and when adding a new feature to an existing 16 | feature set, you had to create a new column in Cassandra as well. 17 | 18 | Now, you can just use `butterfree migrate apply ...`: butterfree will scan your Python 19 | files, looking for classes that inherit from `butterfree.pipelines.FeatureSetPipeline`, 20 | then compare their schemas with the database schema where each feature set would be written. 21 | It will then prepare migration queries and run them against the databases. 22 | 23 | For more information, please check `butterfree migrate apply --help` :) 24 | 25 | ### Supported databases 26 | 27 | This functionality currently supports only the **Cassandra** database, which is the default 28 | storage for an Online Feature Store built with Butterfree. Nonetheless, it was made with 29 | the intent to be easily extended to other databases. 30 | 31 | Also, each database has its own rules for schema migration commands. Some changes may 32 | still require manual intervention.
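
As a quick starting point, you can explore the migration commands and their options straight from the built-in help (a short sketch; the exact options may vary between versions):

```shell
$~ butterfree migrate --help
$~ butterfree migrate apply --help
```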
-------------------------------------------------------------------------------- /docs/source/extract.md: -------------------------------------------------------------------------------- 1 | # Source 2 | 3 | Regarding the extract step, we can define a ```Source``` as a set of data sources whose raw data is joined together for the transform step. 4 | 5 | Currently, we support three different data sources or, as they're called within ```Butterfree```, ```readers```: 6 | 7 | * ```FileReader```: this reader loads data from a file, as the name suggests, and returns a dataframe. It can be instantiated as: 8 | 9 | ```python 10 | file_reader = FileReader( 11 | id="file_reader_id", 12 | path="data_path", 13 | format="json" 14 | ) 15 | ``` 16 | 17 | * ```TableReader```: this reader loads data from a table registered in the spark metastore and returns a dataframe. It can be instantiated as: 18 | 19 | ```python 20 | table_reader = TableReader( 21 | id="table_reader_id", 22 | database="table_reader_db", 23 | table="table_reader_table" 24 | ) 25 | ``` 26 | 27 | * ```KafkaReader```: this reader loads data from a kafka topic and returns a dataframe. It can be instantiated as: 28 | 29 | ```python 30 | kafka_reader = KafkaReader( 31 | id="kafka_reader_id", 32 | topic="topic", 33 | value_schema=value_schema, 34 | connection_string="host1:port,host2:port", 35 | ) 36 | ``` 37 | 38 | After defining all your data sources, it's important to write a query that defines the relation between them, something like this: 39 | 40 | ```python 41 | source = Source( 42 | readers=[ 43 | TableReader( 44 | id="table_reader_id", 45 | database="table_reader_db", 46 | table="table_reader_table", 47 | ), 48 | FileReader(id="file_reader_id", path="data_sample_path", format="json"), 49 | ], 50 | query=f"select a.*, b.feature2 " 51 | f"from table_reader_id a " 52 | f"inner join file_reader_id b on a.id = b.id ", 53 | ) 54 | ``` 55 | 56 | It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/master/butterfree/extract/pre_processing). -------------------------------------------------------------------------------- /docs/source/getstart.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Butterfree depends on **Python 3.7+** and it is **Spark 3.0 ready**. 4 | 5 | The [Python Package Index](https://quintoandar.github.io/python-package-server/) hosts a reference to a pip-installable module of this library; using it is as straightforward as including it in your project's requirements. 6 | 7 | ```bash 8 | pip install butterfree 9 | ``` 10 | 11 | Or after listing `butterfree` in your `requirements.txt` file: 12 | 13 | ```bash 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | ## Discovering Butterfree 18 | 19 | Welcome to **Discovering Butterfree** tutorial series!
Click on the following links to open the tutorials: 20 | 21 | **[#1 Feature Set Basics](https://github.com/quintoandar/butterfree/blob/master/examples/simple_feature_set/simple_feature_set.ipynb)** 22 | 23 | **[#2 Spark Functions and Window](https://github.com/quintoandar/butterfree/blob/master/examples/spark_function_and_window/spark_function_and_window.ipynb)** 24 | 25 | **[#3 Aggregated Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/aggregated_feature_set/aggregated_feature_set.ipynb)** 26 | 27 | **[#4 Streaming Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/streaming_feature_set/streaming_feature_set.ipynb)** -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Butterfree Docs 2 | =============== 3 | Made with |:heart:| by the **MLOps** team from `QuintoAndar `_. 4 | 5 | The library is centered on the following concepts: 6 | 7 | - **ETL**: central framework to create data pipelines. Spark-based Extract, Transform and Load modules ready to use. 8 | - **Declarative Feature Engineering**: care about what you want to compute and not how to code it. 9 | - **Feature Store Modeling**: the library easily provides everything you need to process and load data to your Feature Store. 10 | 11 | Navigation 12 | ^^^^^^^^^^ 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | home 18 | getstart 19 | extract 20 | transform 21 | load 22 | stream 23 | configuration 24 | modules 25 | cli 26 | -------------------------------------------------------------------------------- /docs/source/load.md: -------------------------------------------------------------------------------- 1 | # Sink 2 | 3 | The Load step is handled by the `Sink`, where we define the destinations for the feature set pipeline; that is, it is the process of recording the transformed data after the transformation step. 4 | 5 | Declaring the sink: 6 | ```python 7 | sink = Sink( 8 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 9 | ) 10 | ``` 11 | 12 | Currently, you can write your data into two types of `writers`: 13 | 14 | * `HistoricalFeatureStoreWriter`: The Historical Feature Store will write the data to an AWS S3 bucket. 15 | 16 | * `OnlineFeatureStoreWriter`: The Online Feature Store will write the data to a Cassandra database. 17 | 18 | If you declare your writers without a database configuration, they will use their default settings. But we can also define this configuration, such as: 19 | 20 | * `HistoricalFeatureStoreWriter`: 21 | ```python 22 | config = S3Config(bucket="my_bucket", mode="append", format_="parquet") 23 | writers = [HistoricalFeatureStoreWriter(db_config=config)] 24 | ``` 25 | 26 | * `OnlineFeatureStoreWriter`: 27 | ```python 28 | config = CassandraConfig( 29 | mode="overwrite", 30 | format_="org.apache.spark.sql.cassandra", 31 | keyspace="keyspace_name" 32 | ) 33 | writers = [OnlineFeatureStoreWriter(db_config=config)] 34 | ``` 35 | 36 | You can see the writers [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/load/writers) and database configuration [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/configs/db).
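
Putting the snippets above together, a complete sink declaration might look like the sketch below (the import paths are an assumption based on the package layout; adjust them to your version):

```python
from butterfree.configs.db import CassandraConfig
from butterfree.load import Sink
from butterfree.load.writers import (
    HistoricalFeatureStoreWriter,
    OnlineFeatureStoreWriter,
)

# Explicit Cassandra configuration for the online writer (same values as above).
cassandra_config = CassandraConfig(
    mode="overwrite",
    format_="org.apache.spark.sql.cassandra",
    keyspace="keyspace_name",
)

# The historical writer keeps its default settings; the online writer uses the config.
sink = Sink(
    writers=[
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(db_config=cassandra_config),
    ]
)
```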
37 | 38 | It's also important to highlight that our writers support a ```debug_mode``` option: 39 | ```python 40 | writers = [HistoricalFeatureStoreWriter(debug_mode=True), OnlineFeatureStoreWriter(debug_mode=True)] 41 | sink = Sink(writers=writers) 42 | ``` 43 | When ```debug_mode``` is set to ```True```, a temporary view will be created instead, so no data will actually be saved to either the historical or the online feature store. Feel free to check our [examples section](https://github.com/quintoandar/butterfree/tree/staging/examples) to learn more about how to use this mode. -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | API Specification 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | butterfree 8 | -------------------------------------------------------------------------------- /docs/source/stream.md: -------------------------------------------------------------------------------- 1 | # Streaming Feature Sets in Butterfree 2 | 3 | ## Introduction 4 | 5 | Spark enables us to deal with streaming processing in a very powerful way. For an introduction to all of Spark's streaming capabilities, you can read more at this [link](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html). Like core Spark, Butterfree also lets you declare pipelines that deal with streaming data. The best part is that the pipeline declaration is almost the same as for batch use-cases, so it isn't too complex to tackle this type of challenge using Butterfree tools. 6 | 7 | Streaming feature sets are the ones that have at least one streaming source of data declared in the `Readers` of a `FeatureSetPipeline`. The pipeline is considered a streaming job if it has at least one reader in streaming mode (`stream=True`). 8 | 9 | ## Readers 10 | 11 | Using readers in streaming mode will make use of Spark's `readStream` API instead of the normal `read`. That means it will produce a stream dataframe (`df.isStreaming == True`) instead of a normal Spark dataframe. 12 | 13 | The currently supported readers in stream mode are `FileReader` and `KafkaReader`. For more information about their specifications, read their docstrings, [here](https://github.com/quintoandar/butterfree/blob/master/butterfree/extract/readers/file_reader.py#L10) and [here](https://github.com/quintoandar/butterfree/blob/master/butterfree/extract/readers/kafka_reader.py#L12) respectively. 14 | 15 | ## Online Feature Store Writer 16 | `OnlineFeatureStoreWriter` is currently the only writer that supports streaming dataframes. It will write, in real time, performing upserts to Cassandra. It uses `df.writeStream` and the `foreachBatch` Spark functionality to do that. You can read more about `foreachBatch` [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch). 17 | 18 | ![](https://i.imgur.com/KoI1HuC.png) 19 | 20 | 21 | ### Debug Mode 22 | You can use the `OnlineFeatureStoreWriter` in debug mode (`debug_mode=True`) with streaming dataframes. Instead of trying to write to Cassandra, the data will be written to an in-memory table, so you can query this table to show the output as it is being calculated. Normally this functionality is used to test whether the defined features produce the expected results in real time.
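
A rough sketch of the streaming-specific pieces looks like this (parameter names and import paths are assumptions; check the docstrings linked above for the exact signatures):

```python
from butterfree.extract.readers import FileReader
from butterfree.load.writers import OnlineFeatureStoreWriter

# A reader in streaming mode: having at least one of these makes the whole
# pipeline a streaming job (Spark file streams usually also need an explicit schema).
stream_reader = FileReader(
    id="events",
    path="events/",
    format="json",
    stream=True,
)

# Writes the resulting stream with foreachBatch; debug_mode redirects the
# output to an in-memory table instead of Cassandra.
online_writer = OnlineFeatureStoreWriter(debug_mode=True)
```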
23 | 24 | ## Pipeline Run 25 | Differently from a batch run, a pipeline running with a streaming dataframe will not "finish to run". The pipeline will continue to get data from the streaming, process the data and save it to the defined sink sources. So when managing a job using this feature, an operation needs to be designed to support a continuously-up streaming job. -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Discovering Butterfree 2 | 3 | Welcome to **Discovering Butterfree** tutorial series! Click on the following links to open the tutorials: 4 | 5 | **[#1 Feature Set Basics](https://github.com/quintoandar/butterfree/blob/master/examples/simple_feature_set/simple_feature_set.ipynb)** 6 | 7 | **[#2 Spark Functions and Window](https://github.com/quintoandar/butterfree/blob/master/examples/spark_function_and_window/spark_function_and_window.ipynb)** 8 | 9 | **[#3 Aggregated Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/aggregated_feature_set/aggregated_feature_set.ipynb)** 10 | 11 | **[#4 Streaming Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/streaming_feature_set/streaming_feature_set.ipynb)** 12 | -------------------------------------------------------------------------------- /examples/data/listing_events.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "rent": 1300, "region_id": 1, "area": 50, "bedrooms": 1, "bathrooms": 1, "timestamp": 1588302000000} 2 | {"id": 1, "rent": 2000, "region_id": 1, "area": 50, "bedrooms": 1, "bathrooms": 1, "timestamp": 1588647600000} 3 | {"id": 2, "rent": 1500, "region_id": 2, "area": 100, "bedrooms": 2, "bathrooms": 1, "timestamp": 1588734000000} 4 | {"id": 2, "rent": 2500, "region_id": 2, "area": 100, "bedrooms": 2, "bathrooms": 1, "timestamp": 1589252400000} 5 | {"id": 3, "rent": 3000, "region_id": 3, "area": 150, "bedrooms": 2, "bathrooms": 2, "timestamp": 1589943600000} 6 | {"id": 4, "rent": 3200, "region_id": 4, "area": 175, "bedrooms": 2, "bathrooms": 2, "timestamp": 1589943600000} 7 | {"id": 5, "rent": 3200, "region_id": 5, "area": 250, "bedrooms": 3, "bathrooms": 3, "timestamp": 1590030000000} 8 | {"id": 6, "rent": 3200, "region_id": 6, "area": 225, "bedrooms": 2, "bathrooms": 3, "timestamp": 1590116400000} 9 | -------------------------------------------------------------------------------- /examples/data/region.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "city": "Cerulean", "lat": 73.44489, "lng": 31.75030, "region": "Kanto"} 2 | {"id": 2, "city": "Veridian", "lat": -9.43510, "lng": -167.11772, "region": "Kanto"} 3 | {"id": 3, "city": "Cinnabar", "lat": 29.73043, "lng": 117.66164, "region": "Kanto"} 4 | {"id": 4, "city": "Pallet", "lat": -52.95717, "lng": -81.15251, "region": "Kanto"} 5 | {"id": 5, "city": "Violet", "lat": -47.35798, "lng": -178.77255, "region": "Johto"} 6 | {"id": 6, "city": "Olivine", "lat": 51.72820, "lng": 46.21958, "region": "Johto"} -------------------------------------------------------------------------------- /examples/streaming_feature_set/events/20582255.json: -------------------------------------------------------------------------------- 1 | {"id": 0, "timestamp": 20582255, "payload": "{\"id_pokemon\": 1, \"pokeball\": \"Ultra\"}"} 
-------------------------------------------------------------------------------- /examples/streaming_feature_set/pokedex.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "name": "Geodude", "type": "Rock"} 2 | {"id": 2, "name": "Bulbasaur", "type": "Grass"} 3 | {"id": 3, "name": "Pikachu", "type": "Electric"} 4 | {"id": 4, "name": "Eevee", "type": "Normal"} 5 | {"id": 5, "name": "Oddish", "type": "Grass"} 6 | {"id": 6, "name": "Magikarp", "type": "Water"} 7 | -------------------------------------------------------------------------------- /examples/test_examples.py: -------------------------------------------------------------------------------- 1 | """Script to test all notebooks under examples/ folder.""" 2 | 3 | import os 4 | from pathlib import Path 5 | from subprocess import PIPE, Popen # noqa S404 6 | 7 | if __name__ == "__main__": 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | example_notebook_paths = [ 10 | str(path) 11 | for path in list(Path(dir_path).rglob("*.ipynb")) 12 | if ".ipynb_checkpoints" not in str(path) 13 | ] 14 | 15 | print("\n>>> Notebook Examples Tests") 16 | errors = [] 17 | for path in example_notebook_paths: 18 | print(f" >>> Running {path}") 19 | 20 | p = Popen( # noqa S607, S603 21 | [ 22 | "jupyter", 23 | "nbconvert", 24 | "--to", 25 | "notebook", 26 | "--inplace", 27 | "--no-prompt", 28 | "--execute", 29 | "--log-level='ERROR'", 30 | path, 31 | ], 32 | stdout=PIPE, 33 | stderr=PIPE, 34 | ) 35 | 36 | _, error = p.communicate() 37 | if p.returncode != 0: 38 | errors.append({"notebook": path, "error": error}) 39 | print(" >>> Error in execution!\n") 40 | else: 41 | print(" >>> Successful execution\n") 42 | 43 | if errors: 44 | print(">>> Errors in the following notebooks:") 45 | for run in errors: 46 | print("\n >>>", run["notebook"]) 47 | print(run["error"].decode("utf-8")) 48 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | ignore_missing_imports = True 4 | disallow_untyped_calls = False 5 | disallow_untyped_defs = True 6 | disallow_incomplete_defs = True 7 | warn_redundant_casts = True 8 | show_error_codes = True 9 | show_error_context = True 10 | disable_error_code = attr-defined, list-item, operator 11 | pretty = True 12 | 13 | [mypy-butterfree.pipelines.*] 14 | ignore_errors = True 15 | 16 | [mypy-butterfree.load.*] 17 | ignore_errors = True 18 | 19 | [mypy-butterfree.transform.*] 20 | ignore_errors = True 21 | 22 | [mypy-butterfree.extract.*] 23 | ignore_errors = True 24 | 25 | [mypy-butterfree.config.*] 26 | ignore_errors = True 27 | 28 | [mypy-butterfree.clients.*] 29 | ignore_errors = True 30 | 31 | [mypy-butterfree.configs.*] 32 | ignore_errors = True 33 | 34 | [mypy-butterfree.dataframe_service.*] 35 | ignore_errors = True 36 | 37 | [mypy-butterfree.validations.*] 38 | ignore_errors = True 39 | 40 | [mypy-butterfree.migrations.*] 41 | ignore_errors = True 42 | 43 | [mypy-butterfree.testing.*] 44 | ignore_errors = True 45 | 46 | [mypy-butterfree.hooks.*] 47 | ignore_errors = True 48 | 49 | [mypy-butterfree._cli.*] 50 | ignore_errors = True 51 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "packages": { 3 | ".": { 4 | "changelog-path": "CHANGELOG.md", 5 | 
"release-type": "python", 6 | "bump-minor-pre-major": false, 7 | "bump-patch-for-minor-pre-major": false, 8 | "include-component-in-tag": false, 9 | "include-v-in-tag": false, 10 | "extra-files": [ 11 | "setup.py", 12 | "butterfree/__init__.py" 13 | ] 14 | } 15 | }, 16 | "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json" 17 | } 18 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | h3==3.7.7 2 | jupyter==1.0.0 3 | twine==3.1.1 4 | mypy==1.10.0 5 | sphinx==6.2.1 6 | sphinxemoji==0.3.1 7 | sphinx-rtd-theme==1.3.0 8 | recommonmark==0.7.1 9 | pyarrow==16.1.0 10 | setuptools==70.0.0 11 | wheel==0.43.0 12 | -------------------------------------------------------------------------------- /requirements.lint.txt: -------------------------------------------------------------------------------- 1 | black==24.3.0 2 | flake8==4.0.1 3 | flake8-isort==4.1.1 4 | flake8-docstrings==1.5.0 5 | flake8-bugbear==20.1.0 6 | flake8-bandit==2.1.2 7 | bandit==1.7.2 8 | -------------------------------------------------------------------------------- /requirements.test.txt: -------------------------------------------------------------------------------- 1 | pytest==5.3.2 2 | pytest-cov==2.8.1 3 | pytest-xdist==1.31.0 4 | pytest-mock==2.0.0 5 | pytest-spark==0.6.0 6 | pyspark[connect]==3.5.1 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cassandra-driver==3.24.0 2 | mdutils>=1.2.2,<2.0 3 | pandas>=0.24,<2.0 4 | parameters-validation>=1.1.5,<2.0 5 | pyspark==3.5.1 6 | typer==0.4.2 7 | typing-extensions>3.7.4,<5 8 | boto3==1.35.* 9 | numpy==1.26.4 10 | delta-spark==3.2.0 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | docstring-convention = google 3 | max-line-length = 88 4 | max-complexity = 12 5 | ignore = W503, E203, D203, D401, D107, S101, D105, D100, W605, D202, D212, D104, E261 6 | exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* 7 | per-file-ignores = 8 | # We will not check for docstrings or the use of asserts in tests 9 | tests/*:D,S101 10 | setup.py:D,S101 11 | 12 | [isort] 13 | profile = black 14 | line_length = 88 15 | known_first_party = butterfree 16 | default_section = THIRDPARTY 17 | multi_line_output = 3 18 | indent = ' ' 19 | skip_glob = pip 20 | include_trailing_comma = True 21 | 22 | [tool:pytest] 23 | spark_options = 24 | spark.sql.session.timeZone: UTC 25 | spark.driver.bindAddress: 127.0.0.1 26 | spark.sql.legacy.timeParserPolicy: LEGACY 27 | spark.sql.legacy.createHiveTableByDefault: false 28 | 29 | [mypy] 30 | # suppress errors about unsatisfied imports 31 | ignore_missing_imports=True 32 | 33 | # be strict 34 | warn_return_any = True 35 | strict_optional = True 36 | warn_no_return = True 37 | warn_redundant_casts = True 38 | warn_unused_ignores = True 39 | disallow_any_generics = True 40 | 41 | disallow_untyped_defs = True 42 | check_untyped_defs = True 43 | disallow_untyped_calls = True 44 | 45 | [build_sphinx] 46 | all-files = 1 47 | source-dir = docs/source 48 | build-dir = docs/build 49 | warning-is-error = 0 50 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | __package_name__ = "butterfree" 4 | __version__ = "1.7.2" # x-release-please-version 5 | __repository_url__ = "https://github.com/quintoandar/butterfree" 6 | 7 | with open("requirements.txt") as f: 8 | requirements = f.read().splitlines() 9 | 10 | with open("README.md") as f: 11 | long_description = f.read() 12 | 13 | setup( 14 | name=__package_name__, 15 | description="A tool for building feature stores - Transform your raw data " 16 | "into beautiful features.", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | keywords="feature store sets ETL", 20 | version=__version__, 21 | url=__repository_url__, 22 | packages=find_packages( 23 | exclude=( 24 | "docs", 25 | "tests", 26 | "tests.*", 27 | "pipenv", 28 | "env", 29 | "examples", 30 | "htmlcov", 31 | ".pytest_cache", 32 | ) 33 | ), 34 | license="Copyright", 35 | author="QuintoAndar", 36 | install_requires=requirements, 37 | extras_require={"h3": ["h3>=3.7.4,<4"]}, 38 | python_requires=">=3.9, <4", 39 | entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, 40 | include_package_data=True, 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | INPUT_PATH = path.join(path.dirname(path.abspath(__file__)), "input") 4 | OUTPUT_PATH = path.join(path.dirname(path.abspath(__file__)), "output") 5 | -------------------------------------------------------------------------------- /tests/integration/butterfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/extract/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture() 7 | def target_df_table_reader(spark_context, spark_session): 8 | data = [ 9 | {"id": 1, "feature1": 100}, 10 | {"id": 2, "feature1": 200}, 11 | {"id": 3, "feature1": 300}, 12 | {"id": 4, "feature1": 400}, 13 | {"id": 5, "feature1": 500}, 14 | {"id": 6, "feature1": 600}, 15 | ] 16 | return spark_session.read.json(spark_context.parallelize(data, 1)) 17 | 18 | 19 | @pytest.fixture() 20 | def target_df_source(spark_context, spark_session): 21 | data = [ 22 | {"id": 1, "feature1": 100, "feature2": 200}, 23 | {"id": 2, "feature1": 200, "feature2": 400}, 24 | {"id": 3, "feature1": 300, "feature2": 600}, 
25 | {"id": 4, "feature1": 400, "feature2": 800}, 26 | {"id": 5, "feature1": 500, "feature2": 1000}, 27 | {"id": 6, "feature1": 600, "feature2": 1200}, 28 | ] 29 | return spark_session.read.json(spark_context.parallelize(data, 1)) 30 | 31 | 32 | @pytest.fixture() 33 | def spark_client_mock(): 34 | return Mock() 35 | -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/test_source.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from butterfree.clients import SparkClient 6 | from butterfree.extract import Source 7 | from butterfree.extract.readers import FileReader, TableReader 8 | from tests.integration import INPUT_PATH 9 | 10 | 11 | def create_temp_view(dataframe: DataFrame, name): 12 | dataframe.createOrReplaceTempView(name) 13 | 14 | 15 | def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): 16 | spark.sql(f"drop schema if exists {table_reader_db} cascade") 17 | spark.sql(f"create database {table_reader_db}") 18 | spark.sql(f"use {table_reader_db}") 19 | spark.sql( 20 | f"create table {table_reader_db}.{table_reader_table} " # noqa 21 | f"as select * from {table_reader_id}" # noqa 22 | ) 23 | 24 | 25 | def compare_dataframes( 26 | actual_df: DataFrame, expected_df: DataFrame, columns_sort: List[str] = None 27 | ): 28 | if not columns_sort: 29 | columns_sort = actual_df.schema.fieldNames() 30 | return sorted(actual_df.select(*columns_sort).collect()) == sorted( 31 | expected_df.select(*columns_sort).collect() 32 | ) 33 | 34 | 35 | class TestSource: 36 | def test_source( 37 | self, 38 | target_df_source, 39 | target_df_table_reader, 40 | spark_session, 41 | ): 42 | # given 43 | spark_client = SparkClient() 44 | 45 | table_reader_id = "a_test_source" 46 | table_reader_db = "db" 47 | table_reader_table = "table_test_source" 48 | 49 | create_temp_view(dataframe=target_df_table_reader, name=table_reader_id) 50 | create_db_and_table( 51 | spark=spark_session, 52 | table_reader_id=table_reader_id, 53 | table_reader_db=table_reader_db, 54 | table_reader_table=table_reader_table, 55 | ) 56 | 57 | file_reader_id = "b_test_source" 58 | data_sample_path = INPUT_PATH + "/data.json" 59 | 60 | # when 61 | source = Source( 62 | readers=[ 63 | TableReader( 64 | id=table_reader_id, 65 | database=table_reader_db, 66 | table=table_reader_table, 67 | ), 68 | FileReader(id=file_reader_id, path=data_sample_path, format="json"), 69 | ], 70 | query=f"select a.*, b.feature2 " # noqa 71 | f"from {table_reader_id} a " # noqa 72 | f"inner join {file_reader_id} b on a.id = b.id ", # noqa 73 | eager_evaluation=False, 74 | ) 75 | 76 | result_df = source.construct(client=spark_client) 77 | target_df = target_df_source 78 | 79 | # then 80 | assert ( 81 | compare_dataframes( 82 | actual_df=result_df, 83 | expected_df=target_df, 84 | columns_sort=result_df.columns, 85 | ) 86 | is True 87 | ) 88 | -------------------------------------------------------------------------------- /tests/integration/butterfree/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/load/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/load/conftest.py: 
-------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | from butterfree.constants import DataType, columns 4 | from butterfree.transform import FeatureSet 5 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 6 | 7 | 8 | @fixture 9 | def input_dataframe(spark_context, spark_session): 10 | data = [ 11 | { 12 | "id": 1, 13 | "timestamp": "2019-12-01", 14 | "feature": 100, 15 | columns.PARTITION_YEAR: 2019, 16 | columns.PARTITION_MONTH: 12, 17 | columns.PARTITION_DAY: 1, 18 | }, 19 | { 20 | "id": 2, 21 | "timestamp": "2020-01-01", 22 | "feature": 200, 23 | columns.PARTITION_YEAR: 2020, 24 | columns.PARTITION_MONTH: 1, 25 | columns.PARTITION_DAY: 1, 26 | }, 27 | { 28 | "id": 1, 29 | "timestamp": "2020-02-01", 30 | "feature": 110, 31 | columns.PARTITION_YEAR: 2020, 32 | columns.PARTITION_MONTH: 2, 33 | columns.PARTITION_DAY: 1, 34 | }, 35 | { 36 | "id": 1, 37 | "timestamp": "2020-02-02", 38 | "feature": 120, 39 | columns.PARTITION_YEAR: 2020, 40 | columns.PARTITION_MONTH: 2, 41 | columns.PARTITION_DAY: 2, 42 | }, 43 | ] 44 | return spark_session.read.json(spark_context.parallelize(data, 1)) 45 | 46 | 47 | @fixture 48 | def feature_set(): 49 | key_features = [ 50 | KeyFeature(name="id", description="Description", dtype=DataType.INTEGER) 51 | ] 52 | ts_feature = TimestampFeature(from_column="timestamp") 53 | features = [ 54 | Feature(name="feature", description="Description", dtype=DataType.INTEGER), 55 | ] 56 | return FeatureSet( 57 | "test_sink_feature_set", 58 | "test_sink_entity", 59 | "description", 60 | keys=key_features, 61 | timestamp=ts_feature, 62 | features=features, 63 | ) 64 | -------------------------------------------------------------------------------- /tests/integration/butterfree/load/test_sink.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from unittest.mock import Mock 3 | 4 | from butterfree.clients import SparkClient 5 | from butterfree.load import Sink 6 | from butterfree.load.writers import ( 7 | HistoricalFeatureStoreWriter, 8 | OnlineFeatureStoreWriter, 9 | ) 10 | 11 | 12 | def test_sink(input_dataframe, feature_set): 13 | # arrange 14 | client = SparkClient() 15 | client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") 16 | feature_set_df = feature_set.construct(input_dataframe, client) 17 | target_latest_df = OnlineFeatureStoreWriter.filter_latest( 18 | feature_set_df, id_columns=[key.name for key in feature_set.keys] 19 | ) 20 | columns_sort = feature_set_df.schema.fieldNames() 21 | 22 | # setup historical writer 23 | s3config = Mock() 24 | s3config.mode = "overwrite" 25 | s3config.format_ = "parquet" 26 | s3config.get_options = Mock( 27 | return_value={ 28 | "path": "test_folder/historical/entity/feature_set", 29 | "mode": "overwrite", 30 | } 31 | ) 32 | s3config.get_path_with_partitions = Mock( 33 | return_value="spark-warehouse/test.db/test_folder/historical/entity/feature_set" 34 | ) 35 | 36 | historical_writer = HistoricalFeatureStoreWriter( 37 | db_config=s3config, interval_mode=True 38 | ) 39 | 40 | # setup online writer 41 | # TODO: Change for CassandraConfig when Cassandra for test is ready 42 | online_config = Mock() 43 | online_config.mode = "overwrite" 44 | online_config.format_ = "parquet" 45 | online_config.get_options = Mock( 46 | return_value={"path": "test_folder/online/entity/feature_set"} 47 | ) 48 | online_writer = OnlineFeatureStoreWriter(db_config=online_config) 49 | 50 | 
writers = [historical_writer, online_writer] 51 | sink = Sink(writers) 52 | 53 | # act 54 | client.sql("CREATE DATABASE IF NOT EXISTS {}".format(historical_writer.database)) 55 | sink.flush(feature_set, feature_set_df, client) 56 | 57 | # get historical results 58 | historical_result_df = client.read( 59 | s3config.format_, 60 | path=s3config.get_path_with_partitions(feature_set.name, feature_set_df), 61 | ) 62 | 63 | # get online results 64 | online_result_df = client.read( 65 | online_config.format_, **online_config.get_options(feature_set.name) 66 | ) 67 | 68 | # assert 69 | # assert historical results 70 | assert sorted(feature_set_df.select(*columns_sort).collect()) == sorted( 71 | historical_result_df.select(*columns_sort).collect() 72 | ) 73 | 74 | # assert online results 75 | assert sorted(target_latest_df.select(*columns_sort).collect()) == sorted( 76 | online_result_df.select(*columns_sort).collect() 77 | ) 78 | 79 | # tear down 80 | shutil.rmtree("test_folder") 81 | -------------------------------------------------------------------------------- /tests/integration/butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/pipelines/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/transform/__init__.py -------------------------------------------------------------------------------- /tests/integration/input/data.json: -------------------------------------------------------------------------------- 1 | {"feature2":200,"id":1} 2 | {"feature2":400,"id":2} 3 | {"feature2":600,"id":3} 4 | {"feature2":800,"id":4} 5 | {"feature2":1000,"id":5} 6 | {"feature2":1200,"id":6} 7 | -------------------------------------------------------------------------------- /tests/mocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/entities/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/first/__init__.py: -------------------------------------------------------------------------------- 1 | from .first_pipeline import FirstPipeline 2 | 3 | __all__ = ["FirstPipeline"] 4 | -------------------------------------------------------------------------------- /tests/mocks/entities/first/first_pipeline.py: -------------------------------------------------------------------------------- 1 | from butterfree.constants.data_type import DataType 2 | from butterfree.extract import Source 3 | from butterfree.extract.readers import TableReader 4 | from butterfree.load import Sink 5 | from butterfree.load.writers import ( 6 | HistoricalFeatureStoreWriter, 7 | OnlineFeatureStoreWriter, 8 | ) 9 | from 
butterfree.pipelines import FeatureSetPipeline 10 | from butterfree.transform import FeatureSet 11 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 12 | 13 | 14 | class FirstPipeline(FeatureSetPipeline): 15 | def __init__(self): 16 | super(FirstPipeline, self).__init__( 17 | source=Source( 18 | readers=[ 19 | TableReader( 20 | id="t", 21 | database="db", 22 | table="table", 23 | ) 24 | ], 25 | query=f"select * from t", # noqa 26 | ), 27 | feature_set=FeatureSet( 28 | name="first", 29 | entity="entity", 30 | description="description", 31 | features=[ 32 | Feature( 33 | name="feature1", 34 | description="test", 35 | dtype=DataType.FLOAT, 36 | ), 37 | Feature( 38 | name="feature2", 39 | description="another test", 40 | dtype=DataType.STRING, 41 | ), 42 | ], 43 | keys=[ 44 | KeyFeature( 45 | name="id", 46 | description="identifier", 47 | dtype=DataType.BIGINT, 48 | ) 49 | ], 50 | timestamp=TimestampFeature(), 51 | ), 52 | sink=Sink( 53 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /tests/mocks/entities/second/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/entities/second/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/second/deeper/__init__.py: -------------------------------------------------------------------------------- 1 | from .second_pipeline import SecondPipeline 2 | 3 | __all__ = ["SecondPipeline"] 4 | -------------------------------------------------------------------------------- /tests/mocks/entities/second/deeper/second_pipeline.py: -------------------------------------------------------------------------------- 1 | from butterfree.constants.data_type import DataType 2 | from butterfree.extract import Source 3 | from butterfree.extract.readers import TableReader 4 | from butterfree.load import Sink 5 | from butterfree.load.writers import ( 6 | HistoricalFeatureStoreWriter, 7 | OnlineFeatureStoreWriter, 8 | ) 9 | from butterfree.pipelines import FeatureSetPipeline 10 | from butterfree.transform import FeatureSet 11 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 12 | 13 | 14 | class SecondPipeline(FeatureSetPipeline): 15 | def __init__(self): 16 | super(SecondPipeline, self).__init__( 17 | source=Source( 18 | readers=[ 19 | TableReader( 20 | id="t", 21 | database="db", 22 | table="table", 23 | ) 24 | ], 25 | query=f"select * from t", # noqa 26 | ), 27 | feature_set=FeatureSet( 28 | name="second", 29 | entity="entity", 30 | description="description", 31 | features=[ 32 | Feature( 33 | name="feature1", 34 | description="test", 35 | dtype=DataType.STRING, 36 | ), 37 | Feature( 38 | name="feature2", 39 | description="another test", 40 | dtype=DataType.FLOAT, 41 | ), 42 | ], 43 | keys=[ 44 | KeyFeature( 45 | name="id", 46 | description="identifier", 47 | dtype=DataType.BIGINT, 48 | ) 49 | ], 50 | timestamp=TimestampFeature(), 51 | ), 52 | sink=Sink( 53 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/_cli/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/_cli/test_migrate.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | from typer.testing import CliRunner 4 | 5 | from butterfree._cli import migrate 6 | from butterfree._cli.main import app 7 | from butterfree.migrations.database_migration import CassandraMigration 8 | from butterfree.pipelines import FeatureSetPipeline 9 | 10 | runner = CliRunner() 11 | 12 | 13 | class TestMigrate: 14 | def test_migrate_success(self, mocker): 15 | mocker.patch.object(migrate.Migrate, "run") 16 | all_fs = migrate.migrate("tests/mocks/entities/") 17 | assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) 18 | assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] 19 | 20 | def test_migrate_run_methods(self, mocker): 21 | mocker.patch.object(CassandraMigration, "apply_migration") 22 | mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") 23 | 24 | all_fs = migrate.migrate("tests/mocks/entities/", False, False) 25 | 26 | assert CassandraMigration.apply_migration.call_count == 2 27 | 28 | cassandra_pairs = [ 29 | call(pipe.feature_set, pipe.sink.writers[1], False) for pipe in all_fs 30 | ] 31 | CassandraMigration.apply_migration.assert_has_calls( 32 | cassandra_pairs, any_order=True 33 | ) 34 | migrate.Migrate._send_logs_to_s3.assert_called_once() 35 | 36 | def test_app_cli(self): 37 | result = runner.invoke(app, "migrate") 38 | assert result.exit_code == 0 39 | 40 | def test_app_migrate(self, mocker): 41 | mocker.patch.object(migrate.Migrate, "run") 42 | result = runner.invoke(app, ["migrate", "apply", "tests/mocks/entities/"]) 43 | assert result.exit_code == 0 44 | -------------------------------------------------------------------------------- /tests/unit/butterfree/automated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/automated/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/automated/test_feature_set_creation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from butterfree.automated.feature_set_creation import FeatureSetCreation 5 | 6 | 7 | class TestFeatureSetCreation(unittest.TestCase): 8 | def setUp(self): 9 | self.feature_set_creation = FeatureSetCreation() 10 | 11 | def test_get_features_with_regex(self): 12 | sql_query = "SELECT column1, column2 FROM table1" 13 | 
expected_features = ["column1", "column2"] 14 | 15 | features = self.feature_set_creation._get_features_with_regex(sql_query) 16 | 17 | self.assertEqual(features, expected_features) 18 | 19 | def test_get_data_type(self): 20 | field_name = "column1" 21 | df_mock = MagicMock() 22 | df_mock.schema.jsonValue.return_value = { 23 | "fields": [{"name": "column1", "type": "string"}] 24 | } 25 | 26 | data_type = self.feature_set_creation._get_data_type(field_name, df_mock) 27 | 28 | self.assertEqual(data_type, ".STRING") 29 | -------------------------------------------------------------------------------- /tests/unit/butterfree/clients/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/clients/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/clients/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | from unittest.mock import Mock 3 | 4 | import pytest 5 | from pyspark import SparkContext 6 | from pyspark.sql import DataFrame, SparkSession 7 | from pyspark.sql.streaming import StreamingQuery 8 | 9 | from butterfree.clients import CassandraClient 10 | 11 | 12 | @pytest.fixture() 13 | def target_df(spark_context: SparkContext, spark_session: SparkSession) -> DataFrame: 14 | data = [{"col1": "value", "col2": 123}] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) # type: ignore 16 | 17 | 18 | @pytest.fixture() 19 | def mocked_spark_read() -> Mock: 20 | mock = Mock() 21 | mock.readStream = mock 22 | mock.read = mock 23 | mock.format.return_value = mock 24 | mock.options.return_value = mock 25 | return mock 26 | 27 | 28 | @pytest.fixture() 29 | def mocked_spark_write() -> Mock: 30 | mock = Mock() 31 | mock.dataframe = mock 32 | mock.write = mock 33 | return mock 34 | 35 | 36 | @pytest.fixture() 37 | def mocked_stream_df() -> Mock: 38 | mock = Mock() 39 | mock.isStreaming = True 40 | mock.writeStream = mock 41 | mock.trigger.return_value = mock 42 | mock.outputMode.return_value = mock 43 | mock.option.return_value = mock 44 | mock.foreachBatch.return_value = mock 45 | mock.start.return_value = Mock(spec=StreamingQuery) 46 | return mock 47 | 48 | 49 | @pytest.fixture() 50 | def mock_spark_sql() -> Mock: 51 | mock = Mock() 52 | mock.sql = mock 53 | return mock 54 | 55 | 56 | @pytest.fixture 57 | def cassandra_client() -> CassandraClient: 58 | return CassandraClient(host=["mock"], keyspace="dummy_keyspace") 59 | 60 | 61 | @pytest.fixture 62 | def cassandra_feature_set() -> List[Dict[str, Any]]: 63 | return [ 64 | {"feature1": "value1", "feature2": 10.5}, 65 | {"feature1": "value1", "feature2": 10}, 66 | ] 67 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/configs/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/configs/db/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | from butterfree.configs.db import CassandraConfig, KafkaConfig, MetastoreConfig 4 | 5 | 6 | @fixture 7 | def cassandra_config(monkeypatch): 8 | monkeypatch.setenv("CASSANDRA_KEYSPACE", "test") 9 | monkeypatch.setenv("CASSANDRA_HOST", "test") 10 | monkeypatch.setenv("CASSANDRA_PASSWORD", "test") 11 | monkeypatch.setenv("CASSANDRA_USERNAME", "test") 12 | 13 | return CassandraConfig() 14 | 15 | 16 | @fixture 17 | def kafka_config(monkeypatch): 18 | monkeypatch.setenv("KAFKA_CONNECTION_STRING", "test") 19 | 20 | return KafkaConfig() 21 | 22 | 23 | @fixture 24 | def metastore_config(monkeypatch): 25 | monkeypatch.setenv("FEATURE_STORE_S3_BUCKET", "test") 26 | 27 | return MetastoreConfig() 28 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/test_metastore_config.py: -------------------------------------------------------------------------------- 1 | from butterfree.configs import environment 2 | 3 | 4 | class TestMetastoreConfig: 5 | def test_mode(self, metastore_config): 6 | # expecting 7 | default = "overwrite" 8 | assert metastore_config.mode == default 9 | 10 | # given 11 | metastore_config.mode = None 12 | # then 13 | assert metastore_config.mode == default 14 | 15 | def test_mode_custom(self, metastore_config): 16 | # given 17 | mode = "append" 18 | metastore_config.mode = mode 19 | 20 | # then 21 | assert metastore_config.mode == mode 22 | 23 | def test_format(self, metastore_config): 24 | # expecting 25 | default = "parquet" 26 | assert metastore_config.format_ == default 27 | 28 | # given 29 | metastore_config.format_ = None 30 | # then 31 | assert metastore_config.format_ == default 32 | 33 | def test_format_custom(self, metastore_config): 34 | # given 35 | format_ = "json" 36 | metastore_config.format_ = format_ 37 | 38 | # then 39 | assert metastore_config.format_ == format_ 40 | 41 | def test_path(self, metastore_config): 42 | # expecting 43 | default = environment.get_variable("FEATURE_STORE_S3_BUCKET") 44 | assert metastore_config.path == default 45 | 46 | def test_path_custom(self, metastore_config): 47 | # given 48 | bucket = "test" 49 | metastore_config.path = bucket 50 | 51 | # then 52 | assert metastore_config.path == bucket 53 | 54 | def test_file_system(self, metastore_config): 55 | # expecting 56 | default = "s3a" 57 | assert metastore_config.file_system == default 58 | 59 | def test_file_system_custom(self, metastore_config): 60 | # given 61 | file_system = "dbfs" 62 | metastore_config.file_system = file_system 63 | 64 | # then 65 | assert metastore_config.file_system == file_system 66 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/test_environment.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.configs import environment 4 | 5 | 6 | def test_get_variable_success(monkeypatch): 7 | # given 8 | specified_variable = "specified_variable" 9 | effective_value = "effective_value" 10 | monkeypatch.setenv(specified_variable, effective_value) 11 | 
environment.specification[specified_variable] = "spec_default_value" 12 | 13 | # when 14 | return_value = environment.get_variable(specified_variable, "anything") 15 | 16 | # then 17 | assert return_value == effective_value 18 | 19 | 20 | def test_get_variable_from_spec_default(monkeypatch): 21 | # given 22 | specified_variable = "specified_variable" 23 | spec_default_value = "default_value" 24 | monkeypatch.setenv(specified_variable, "overwrite") 25 | monkeypatch.delenv(specified_variable) 26 | environment.specification[specified_variable] = spec_default_value 27 | 28 | # when 29 | return_value = environment.get_variable(specified_variable, "anything") 30 | 31 | # then 32 | assert return_value == spec_default_value 33 | 34 | 35 | def test_get_variable_default(monkeypatch): 36 | # given 37 | default = "default_value" 38 | variable = "environment_variable" 39 | environment.specification[variable] = None 40 | monkeypatch.setenv(variable, "overwrite") 41 | monkeypatch.delenv(variable) 42 | 43 | # when 44 | return_value = environment.get_variable(variable, default) 45 | 46 | # then 47 | assert return_value == default 48 | 49 | 50 | def test_get_variable_out_of_spec_fails(monkeypatch): 51 | # given 52 | not_specified_variable = "not_specified_variable" 53 | monkeypatch.setenv(not_specified_variable, "anything") 54 | if not_specified_variable in environment.specification: 55 | del environment.specification[not_specified_variable] 56 | 57 | # then 58 | with pytest.raises( 59 | environment.UnspecifiedVariableError, match="not listed in the environment" 60 | ): 61 | environment.get_variable(not_specified_variable, "anything") 62 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/dataframe_service/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/conftest.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture() 8 | def input_df(spark_context, spark_session): 9 | start = datetime.datetime(year=1970, month=1, day=1) 10 | end = datetime.datetime(year=2020, month=12, day=31) 11 | random_dates = [ 12 | ( 13 | lambda: start 14 | + datetime.timedelta( 15 | seconds=random.randint( # noqa: S311 16 | 0, int((end - start).total_seconds()) 17 | ) 18 | ) 19 | )() 20 | .date() 21 | .isoformat() 22 | for _ in range(10000) 23 | ] 24 | data = [{"timestamp": date} for date in random_dates] 25 | return spark_session.read.json( 26 | spark_context.parallelize(data, 1), schema="timestamp timestamp" 27 | ) 28 | 29 | 30 | @pytest.fixture() 31 | def test_partitioning_input_df(spark_context, spark_session): 32 | data = [ 33 | {"feature": 1, "year": 2009, "month": 8, "day": 20}, 34 | {"feature": 2, "year": 2009, "month": 8, "day": 20}, 35 | {"feature": 3, "year": 2020, "month": 8, "day": 20}, 36 | {"feature": 4, "year": 2020, "month": 9, "day": 20}, 37 | {"feature": 5, "year": 2020, "month": 9, "day": 20}, 38 | {"feature": 6, "year": 2020, "month": 8, "day": 20}, 39 | {"feature": 7, "year": 2020, "month": 8, "day": 21}, 40 | ] 41 | return spark_session.read.json(spark_context.parallelize(data, 1)) 42 | 
-------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_incremental_srategy.py: -------------------------------------------------------------------------------- 1 | from butterfree.dataframe_service import IncrementalStrategy 2 | 3 | 4 | class TestIncrementalStrategy: 5 | def test_from_milliseconds(self): 6 | # arrange 7 | incremental_strategy = IncrementalStrategy().from_milliseconds("ts") 8 | target_expression = "date(from_unixtime(ts/ 1000.0)) >= date('2020-01-01')" 9 | 10 | # act 11 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 12 | 13 | # assert 14 | assert target_expression.split() == result_expression.split() 15 | 16 | def test_from_string(self): 17 | # arrange 18 | incremental_strategy = IncrementalStrategy().from_string( 19 | "dt", mask="dd/MM/yyyy" 20 | ) 21 | target_expression = "date(to_date(dt, 'dd/MM/yyyy')) >= date('2020-01-01')" 22 | 23 | # act 24 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 25 | 26 | # assert 27 | assert target_expression.split() == result_expression.split() 28 | 29 | def test_from_year_month_day_partitions(self): 30 | # arrange 31 | incremental_strategy = IncrementalStrategy().from_year_month_day_partitions( 32 | year_column="y", month_column="m", day_column="d" 33 | ) 34 | target_expression = ( 35 | "date(concat(string(y), " 36 | "'-', string(m), " 37 | "'-', string(d))) >= date('2020-01-01')" 38 | ) 39 | 40 | # act 41 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 42 | 43 | # assert 44 | assert target_expression.split() == result_expression.split() 45 | 46 | def test_get_expression_with_just_end_date(self): 47 | # arrange 48 | incremental_strategy = IncrementalStrategy(column="dt") 49 | target_expression = "date(dt) <= date('2020-01-01')" 50 | 51 | # act 52 | result_expression = incremental_strategy.get_expression(end_date="2020-01-01") 53 | 54 | # assert 55 | assert target_expression.split() == result_expression.split() 56 | 57 | def test_get_expression_with_start_and_end_date(self): 58 | # arrange 59 | incremental_strategy = IncrementalStrategy(column="dt") 60 | target_expression = ( 61 | "date(dt) >= date('2019-12-30') and date(dt) <= date('2020-01-01')" 62 | ) 63 | 64 | # act 65 | result_expression = incremental_strategy.get_expression( 66 | start_date="2019-12-30", end_date="2020-01-01" 67 | ) 68 | 69 | # assert 70 | assert target_expression.split() == result_expression.split() 71 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_partitioning.py: -------------------------------------------------------------------------------- 1 | from butterfree.dataframe_service import extract_partition_values 2 | 3 | 4 | class TestPartitioning: 5 | def test_extract_partition_values(self, test_partitioning_input_df): 6 | # arrange 7 | target_values = [ 8 | {"year": 2009, "month": 8, "day": 20}, 9 | {"year": 2020, "month": 8, "day": 20}, 10 | {"year": 2020, "month": 9, "day": 20}, 11 | {"year": 2020, "month": 8, "day": 21}, 12 | ] 13 | 14 | # act 15 | result_values = extract_partition_values( 16 | test_partitioning_input_df, partition_columns=["year", "month", "day"] 17 | ) 18 | 19 | # assert 20 | assert result_values == target_values 21 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_repartition.py: 
-------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import spark_partition_id 2 | 3 | from butterfree.dataframe_service import repartition_df, repartition_sort_df 4 | 5 | 6 | class TestRepartition: 7 | def test_repartition_df(self, input_df): 8 | result_df = repartition_df(dataframe=input_df, partition_by=["timestamp"]) 9 | 10 | # Only one partition id, meaning data is not partitioned 11 | assert input_df.select(spark_partition_id()).distinct().count() == 1 12 | # Desired number of partitions 13 | assert result_df.select(spark_partition_id()).distinct().count() == 200 14 | 15 | def test_repartition_df_partitions(self, input_df): 16 | result_df = repartition_df( 17 | dataframe=input_df, partition_by=["timestamp"], num_partitions=50 18 | ) 19 | 20 | # Only one partition id, meaning data is not partitioned 21 | assert input_df.select(spark_partition_id()).distinct().count() == 1 22 | # Desired number of partitions 23 | assert result_df.select(spark_partition_id()).distinct().count() == 50 24 | 25 | def test_repartition_sort_df(self, input_df): 26 | result_df = repartition_sort_df( 27 | dataframe=input_df, partition_by=["timestamp"], order_by=["timestamp"] 28 | ) 29 | 30 | # Only one partition id, meaning data is not partitioned 31 | assert input_df.select(spark_partition_id()).distinct().count() == 1 32 | # Desired number of partitions 33 | assert result_df.select(spark_partition_id()).distinct().count() == 200 34 | 35 | def test_repartition_sort_df_processors(self, input_df): 36 | result_df = repartition_sort_df( 37 | dataframe=input_df, 38 | partition_by=["timestamp"], 39 | order_by=["timestamp"], 40 | num_processors=3, 41 | ) 42 | 43 | # Only one partition id, meaning data is not partitioned 44 | assert input_df.select(spark_partition_id()).distinct().count() == 1 45 | # Desired number of partitions 46 | assert result_df.select(spark_partition_id()).distinct().count() == 12 47 | 48 | def test_repartition_sort_df_processors_partitions(self, input_df): 49 | result_df = repartition_sort_df( 50 | dataframe=input_df, 51 | partition_by=["timestamp"], 52 | order_by=["timestamp"], 53 | num_partitions=50, 54 | ) 55 | 56 | # Only one partition id, meaning data is not partitioned 57 | assert input_df.select(spark_partition_id()).distinct().count() == 1 58 | # Desired number of partitions 59 | assert result_df.select(spark_partition_id()).distinct().count() == 50 60 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | from pyspark.sql.functions import col, to_date 5 | 6 | from butterfree.constants.columns import TIMESTAMP_COLUMN 7 | 8 | 9 | @pytest.fixture() 10 | def column_target_df(spark_context, spark_session): 11 | data = [{"new_col1": "value", "new_col2": 123}] 12 | return spark_session.read.json(spark_context.parallelize(data, 1)) 13 | 14 | 15 | @pytest.fixture() 16 | def target_df(spark_context, spark_session): 17 | data = [{"col1": "value", "col2": 123}] 18 | return 
spark_session.read.json(spark_context.parallelize(data, 1)) 19 | 20 | 21 | @pytest.fixture() 22 | def incremental_source_df(spark_context, spark_session): 23 | data = [ 24 | { 25 | "id": 1, 26 | "feature": 100, 27 | "date_str": "28/07/2020", 28 | "milliseconds": 1595894400000, 29 | "year": 2020, 30 | "month": 7, 31 | "day": 28, 32 | }, 33 | { 34 | "id": 1, 35 | "feature": 110, 36 | "date_str": "29/07/2020", 37 | "milliseconds": 1595980800000, 38 | "year": 2020, 39 | "month": 7, 40 | "day": 29, 41 | }, 42 | { 43 | "id": 1, 44 | "feature": 120, 45 | "date_str": "30/07/2020", 46 | "milliseconds": 1596067200000, 47 | "year": 2020, 48 | "month": 7, 49 | "day": 30, 50 | }, 51 | { 52 | "id": 2, 53 | "feature": 150, 54 | "date_str": "31/07/2020", 55 | "milliseconds": 1596153600000, 56 | "year": 2020, 57 | "month": 7, 58 | "day": 31, 59 | }, 60 | { 61 | "id": 2, 62 | "feature": 200, 63 | "date_str": "01/08/2020", 64 | "milliseconds": 1596240000000, 65 | "year": 2020, 66 | "month": 8, 67 | "day": 1, 68 | }, 69 | ] 70 | return spark_session.read.json(spark_context.parallelize(data, 1)).withColumn( 71 | "date", to_date(col("date_str"), "dd/MM/yyyy") 72 | ) 73 | 74 | 75 | @pytest.fixture() 76 | def spark_client(): 77 | return Mock() 78 | 79 | 80 | @pytest.fixture 81 | def feature_set_dataframe(spark_context, spark_session): 82 | data = [ 83 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100, "test": "fail"}, 84 | {"id": 2, TIMESTAMP_COLUMN: 0, "feature": 200, "test": "running"}, 85 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"}, 86 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"}, 87 | ] 88 | return spark_session.read.json(spark_context.parallelize(data, 1)) 89 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/pre_processing/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | import pytest 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | @pytest.fixture() 9 | def input_df(spark_context, spark_session): 10 | data = [ 11 | {"id": 1, "ts": "2016-04-11 11:31:11", "pivot_column": 1, "has_feature": 1}, 12 | {"id": 1, "ts": "2016-04-11 11:44:12", "pivot_column": 2, "has_feature": 0}, 13 | {"id": 1, "ts": "2016-04-11 11:46:24", "pivot_column": 3, "has_feature": 1}, 14 | {"id": 1, "ts": "2016-04-11 12:03:21", "pivot_column": 4, "has_feature": 0}, 15 | {"id": 1, "ts": "2016-04-11 13:46:24", "pivot_column": 3, "has_feature": None}, 16 | ] 17 | df = spark_session.read.json( 18 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 19 | ) 20 | return df 21 | 22 | 23 | @pytest.fixture() 24 | def pivot_df(spark_context, spark_session): 25 | data = [ 26 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 27 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": None, "2": 0, "3": None, "4": None}, 28 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": None, "2": None, "3": 1, "4": None}, 29 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": None, "2": None, "3": None, "4": 0}, 30 | { 31 | "id": 1, 32 | "ts": "2016-04-11 13:46:24", 33 | "1": 
None, 34 | "2": None, 35 | "3": None, 36 | "4": None, 37 | }, 38 | ] 39 | df = spark_session.read.json( 40 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 41 | ) 42 | return df.orderBy("ts") 43 | 44 | 45 | @pytest.fixture() 46 | def pivot_ffill_df(spark_context, spark_session): 47 | data = [ 48 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 49 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": 1, "2": 0, "3": None, "4": None}, 50 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": 1, "2": 0, "3": 1, "4": None}, 51 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": 1, "2": 0, "3": 1, "4": 0}, 52 | {"id": 1, "ts": "2016-04-11 13:46:24", "1": 1, "2": 0, "3": 1, "4": 0}, 53 | ] 54 | df = spark_session.read.json( 55 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 56 | ) 57 | return df.orderBy("ts") 58 | 59 | 60 | @pytest.fixture() 61 | def pivot_ffill_mock_df(spark_context, spark_session): 62 | data = [ 63 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 64 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": 1, "2": 0, "3": None, "4": None}, 65 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": 1, "2": 0, "3": 1, "4": None}, 66 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": 1, "2": 0, "3": 1, "4": 0}, 67 | {"id": 1, "ts": "2016-04-11 13:46:24", "1": 1, "2": 0, "3": None, "4": 0}, 68 | ] 69 | df = spark_session.read.json( 70 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 71 | ) 72 | return df.orderBy("ts") 73 | 74 | 75 | def compare_dataframes( 76 | actual_df: DataFrame, expected_df: DataFrame, columns_sort: List[str] = None 77 | ): 78 | if not columns_sort: 79 | columns_sort = actual_df.schema.fieldNames() 80 | return sorted(actual_df.select(*columns_sort).collect()) == sorted( 81 | expected_df.select(*columns_sort).collect() 82 | ) 83 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_explode_json_column.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ( 2 | ArrayType, 3 | IntegerType, 4 | StringType, 5 | StructField, 6 | StructType, 7 | ) 8 | 9 | from butterfree.extract.pre_processing import explode_json_column 10 | from butterfree.testing.dataframe import ( 11 | assert_dataframe_equality, 12 | create_df_from_collection, 13 | ) 14 | 15 | 16 | def test_explode_json_column(spark_context, spark_session): 17 | # arrange 18 | input_data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}] 19 | target_data = [ 20 | { 21 | "json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}', 22 | "a": 123, 23 | "b": "abc", 24 | "c": 123, 25 | "d": [1, 2, 3], 26 | } 27 | ] 28 | 29 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 30 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 31 | 32 | json_column_schema = StructType( 33 | [ 34 | StructField("a", IntegerType()), 35 | StructField("b", StringType()), 36 | StructField("c", IntegerType()), 37 | StructField("d", ArrayType(IntegerType())), 38 | ] 39 | ) 40 | 41 | # act 42 | output_df = explode_json_column(input_df, "json_column", json_column_schema) 43 | 44 | # arrange 45 | assert_dataframe_equality(target_df, output_df) 46 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_filter_transform.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.constants.columns import TIMESTAMP_COLUMN 4 | from butterfree.extract.pre_processing import filter 5 | from butterfree.extract.readers import FileReader 6 | 7 | 8 | class TestFilterDataFrame: 9 | def test_filter(self, feature_set_dataframe, spark_context, spark_session): 10 | # given 11 | file_reader = FileReader("test", "path/to/file", "format") 12 | 13 | file_reader.with_( 14 | transformer=filter, 15 | condition="test not in ('fail') and feature in (110, 120)", 16 | ) 17 | 18 | # when 19 | result_df = file_reader._apply_transformations(feature_set_dataframe) 20 | 21 | target_data = [ 22 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"}, 23 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"}, 24 | ] 25 | target_df = spark_session.read.json(spark_context.parallelize(target_data, 1)) 26 | 27 | # then 28 | assert result_df.collect() == target_df.collect() 29 | 30 | @pytest.mark.parametrize( 31 | "condition", 32 | [None, 100], 33 | ) 34 | def test_filter_with_invalidations( 35 | self, feature_set_dataframe, condition, spark_context, spark_session 36 | ): 37 | # given 38 | file_reader = FileReader("test", "path/to/file", "format") 39 | 40 | file_reader.with_(transformer=filter, condition=condition) 41 | 42 | # then 43 | with pytest.raises(TypeError): 44 | file_reader._apply_transformations(feature_set_dataframe) 45 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_forward_fill.py: -------------------------------------------------------------------------------- 1 | from butterfree.extract.pre_processing import forward_fill 2 | 3 | 4 | class TestForwardFillTransform: 5 | def test_forward_fill_transform(self, input_df): 6 | # given 7 | result_df = forward_fill( 8 | dataframe=input_df, 9 | partition_by=["id", "pivot_column"], 10 | order_by="ts", 11 | fill_column="has_feature", 12 | ) 13 | 14 | # assert 15 | assert all( 16 | [r.has_feature == 1 for r in result_df.filter("pivot_column = 3").collect()] 17 | ) 18 | 19 | def test_forward_fill_transform_id_partition(self, input_df): 20 | # given 21 | result_df = forward_fill( 22 | dataframe=input_df, 23 | partition_by=["id"], 24 | order_by="ts", 25 | fill_column="has_feature", 26 | ) 27 | 28 | # assert 29 | assert ( 30 | result_df.filter("pivot_column = 3").orderBy("ts").collect()[-1].has_feature 31 | == 0 32 | ) 33 | 34 | def test_forward_fill_transform_new_column(self, input_df): 35 | # given 36 | result_df = forward_fill( 37 | dataframe=input_df, 38 | partition_by=["id"], 39 | order_by="ts", 40 | fill_column="has_feature", 41 | filled_column="has_feature_filled", 42 | ) 43 | 44 | # assert 45 | assert "has_feature_filled" in result_df.columns 46 | assert result_df.filter("has_feature_filled is null").count() == 0 47 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.functions import first 3 | 4 | from butterfree.extract.pre_processing import pivot 5 | from butterfree.extract.readers import FileReader 6 | 7 | from .conftest import compare_dataframes 8 | 9 | 10 | class TestPivotTransform: 11 | def test_pivot_transformation( 12 | self, 13 | input_df, 14 | pivot_df, 15 | ): 16 | result_df = pivot( 17 | 
dataframe=input_df, 18 | group_by_columns=["id", "ts"], 19 | pivot_column="pivot_column", 20 | agg_column="has_feature", 21 | aggregation=first, 22 | ) 23 | 24 | # assert 25 | assert compare_dataframes( 26 | actual_df=result_df, 27 | expected_df=pivot_df, 28 | ) 29 | 30 | def test_pivot_transformation_with_forward_fill( 31 | self, 32 | input_df, 33 | pivot_ffill_df, 34 | ): 35 | result_df = pivot( 36 | dataframe=input_df, 37 | group_by_columns=["id", "ts"], 38 | pivot_column="pivot_column", 39 | agg_column="has_feature", 40 | aggregation=first, 41 | with_forward_fill=True, 42 | ) 43 | 44 | # assert 45 | assert compare_dataframes( 46 | actual_df=result_df, 47 | expected_df=pivot_ffill_df, 48 | ) 49 | 50 | def test_pivot_transformation_with_forward_fill_and_mock( 51 | self, 52 | input_df, 53 | pivot_ffill_mock_df, 54 | ): 55 | result_df = pivot( 56 | dataframe=input_df, 57 | group_by_columns=["id", "ts"], 58 | pivot_column="pivot_column", 59 | agg_column="has_feature", 60 | aggregation=first, 61 | mock_value=-1, 62 | mock_type="int", 63 | with_forward_fill=True, 64 | ) 65 | 66 | # assert 67 | assert compare_dataframes( 68 | actual_df=result_df, 69 | expected_df=pivot_ffill_mock_df, 70 | ) 71 | 72 | def test_pivot_transformation_mock_without_type( 73 | self, 74 | input_df, 75 | pivot_ffill_mock_df, 76 | ): 77 | with pytest.raises(AttributeError): 78 | _ = pivot( 79 | dataframe=input_df, 80 | group_by_columns=["id", "ts"], 81 | pivot_column="pivot_column", 82 | agg_column="has_feature", 83 | aggregation=first, 84 | mock_value=-1, 85 | with_forward_fill=True, 86 | ) 87 | 88 | def test_apply_pivot_transformation(self, input_df, pivot_df): 89 | # arrange 90 | file_reader = FileReader("test", "path/to/file", "format") 91 | file_reader.with_( 92 | transformer=pivot, 93 | group_by_columns=["id", "ts"], 94 | pivot_column="pivot_column", 95 | agg_column="has_feature", 96 | aggregation=first, 97 | ) 98 | 99 | # act 100 | result_df = file_reader._apply_transformations(input_df) 101 | 102 | # assert 103 | assert compare_dataframes( 104 | actual_df=result_df, 105 | expected_df=pivot_df, 106 | ) 107 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_replace_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.extract.pre_processing import replace 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | 9 | 10 | class TestReplaceTransform: 11 | def test_replace(self, spark_context, spark_session): 12 | # arrange 13 | input_data = [ 14 | {"id": 1, "type": "a"}, 15 | {"id": 2, "type": "b"}, 16 | {"id": 3, "type": "c"}, 17 | ] 18 | target_data = [ 19 | {"id": 1, "type": "type_a"}, 20 | {"id": 2, "type": "type_b"}, 21 | {"id": 3, "type": "c"}, 22 | ] 23 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 24 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 25 | replace_dict = {"a": "type_a", "b": "type_b"} 26 | 27 | # act 28 | result_df = replace(input_df, "type", replace_dict) 29 | 30 | # assert 31 | assert_dataframe_equality(target_df, result_df) 32 | 33 | @pytest.mark.parametrize( 34 | "input_data, column, replace_dict", 35 | [ 36 | ([{"column": "a"}], "not_column", {"a": "type_a"}), 37 | ([{"column": 123}], "column", {"a": "type_a"}), 38 | ([{"column": "a"}], "column", "not dict"), 39 | ([{"column": "a"}], "column", {"a": 
1}), 40 | ], 41 | ) 42 | def test_replace_with_invalid_args( 43 | self, input_data, column, replace_dict, spark_context, spark_session 44 | ): 45 | # arrange 46 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 47 | 48 | # act and assert 49 | with pytest.raises(ValueError): 50 | replace(input_df, column, replace_dict) 51 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/readers/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/file-reader-test.csv: -------------------------------------------------------------------------------- 1 | "A","B","C" 2 | 10,10.2,"Test1" 3 | 100,100.3,"Test2" 4 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/file-reader-test.json: -------------------------------------------------------------------------------- 1 | {"A":10,"B":10.2,"C":"Test1"} 2 | {"A":100,"B":100.2,"C":"Test2"} -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/test_table_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.extract.readers import TableReader 4 | 5 | 6 | class TestTableReader: 7 | @pytest.mark.parametrize( 8 | "database, table", 9 | [ 10 | ("database", 123), 11 | ( 12 | 123, 13 | None, 14 | ), 15 | ], 16 | ) 17 | def test_init_invalid_params(self, database, table): 18 | # act and assert 19 | with pytest.raises(ValueError): 20 | TableReader("id", table, database) 21 | 22 | def test_consume(self, spark_client, target_df): 23 | # arrange 24 | database = "test_database" 25 | table = "test_table" 26 | spark_client.read_table.return_value = target_df 27 | table_reader = TableReader("test", table, database) 28 | 29 | # act 30 | output_df = table_reader.consume(spark_client) 31 | 32 | # assert 33 | spark_client.read_table.assert_called_once_with(table, database) 34 | assert target_df.collect() == output_df.collect() 35 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/test_source.py: -------------------------------------------------------------------------------- 1 | from butterfree.clients import SparkClient 2 | from butterfree.extract import Source 3 | 4 | 5 | class TestSource: 6 | def test_construct(self, mocker, target_df): 7 | # given 8 | spark_client = SparkClient() 9 | 10 | reader_id = "a_source" 11 | reader = mocker.stub(reader_id) 12 | reader.build = mocker.stub("build") 13 | reader.build.side_effect = target_df.createOrReplaceTempView(reader_id) 14 | 15 | # when 16 | source_selector = Source( 17 | readers=[reader], 18 | query=f"select * from {reader_id}", # noqa 19 | ) 20 | 21 | result_df = source_selector.construct(spark_client) 22 | 23 | assert result_df.collect() == target_df.collect() 24 | 25 | def test_is_cached(self, mocker, target_df): 26 | # given 27 | spark_client = SparkClient() 28 | 29 | reader_id = "a_source" 30 | reader = mocker.stub(reader_id) 31 | reader.build = mocker.stub("build") 32 | reader.build.side_effect = target_df.createOrReplaceTempView(reader_id) 33 | 34 | # when 35 | 
source_selector = Source( 36 | readers=[reader], 37 | query=f"select * from {reader_id}", # noqa 38 | ) 39 | 40 | result_df = source_selector.construct(spark_client) 41 | 42 | assert result_df.is_cached 43 | -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/hooks/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/hooks/schema_compatibility/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | 5 | from butterfree.clients import CassandraClient 6 | from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook 7 | 8 | 9 | class TestCassandraTableSchemaCompatibilityHook: 10 | def test_run_compatible_schema(self, spark_session): 11 | cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") 12 | 13 | cassandra_client.sql = MagicMock( # type: ignore 14 | return_value=[ 15 | {"column_name": "feature1", "type": "text"}, 16 | {"column_name": "feature2", "type": "int"}, 17 | ] 18 | ) 19 | 20 | table = "table" 21 | 22 | input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") 23 | 24 | hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) 25 | 26 | # act and assert 27 | assert hook.run(input_dataframe) == input_dataframe 28 | 29 | def test_run_incompatible_schema(self, spark_session): 30 | cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") 31 | 32 | cassandra_client.sql = MagicMock( # type: ignore 33 | return_value=[ 34 | {"column_name": "feature1", "type": "text"}, 35 | {"column_name": "feature2", "type": "bigint"}, 36 | ] 37 | ) 38 | 39 | table = "table" 40 | 41 | input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") 42 | 43 | hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) 44 | 45 | # act and assert 46 | with pytest.raises( 47 | ValueError, match="There's a schema incompatibility between" 48 | ): 49 | hook.run(input_dataframe) 50 | -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.clients import SparkClient 4 | from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook 5 | 6 | 7 | class TestSparkTableSchemaCompatibilityHook: 8 | @pytest.mark.parametrize( 9 | "table, database, target_table_expression", 10 | [("table", "database", "`database`.`table`"), ("table", None, "`table`")], 11 | ) 12 | def test_build_table_expression(self, table, database, target_table_expression): 13 | # arrange 14 | spark_client = SparkClient() 15 | 
16 | # act 17 | result_table_expression = SparkTableSchemaCompatibilityHook( 18 | spark_client, table, database 19 | ).table_expression 20 | 21 | # assert 22 | assert target_table_expression == result_table_expression 23 | 24 | def test_run_compatible_schema(self, spark_session): 25 | # arrange 26 | spark_client = SparkClient() 27 | target_table = spark_session.sql( 28 | "select 1 as feature_a, 'abc' as feature_b, true as other_feature" 29 | ) 30 | input_dataframe = spark_session.sql("select 1 as feature_a, 'abc' as feature_b") 31 | target_table.registerTempTable("test") 32 | 33 | hook = SparkTableSchemaCompatibilityHook(spark_client, "test") 34 | 35 | # act and assert 36 | assert hook.run(input_dataframe) == input_dataframe 37 | 38 | def test_run_incompatible_schema(self, spark_session): 39 | # arrange 40 | spark_client = SparkClient() 41 | target_table = spark_session.sql( 42 | "select 1 as feature_a, 'abc' as feature_b, true as other_feature" 43 | ) 44 | input_dataframe = spark_session.sql( 45 | "select 1 as feature_a, 'abc' as feature_b, true as unregisted_column" 46 | ) 47 | target_table.registerTempTable("test") 48 | 49 | hook = SparkTableSchemaCompatibilityHook(spark_client, "test") 50 | 51 | # act and assert 52 | with pytest.raises(ValueError, match="The dataframe has a schema incompatible"): 53 | hook.run(input_dataframe) 54 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/processing/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def input_df(spark_context, spark_session): 6 | data = [ 7 | {"id": 1, "ts": "2016-04-11 11:31:11"}, 8 | {"id": 1, "ts": "2016-04-11 11:44:12"}, 9 | {"id": 1, "ts": "2016-04-11 11:46:24"}, 10 | {"id": 1, "ts": "2016-04-11 12:03:21"}, 11 | {"id": 1, "ts": "2016-04-11 13:46:24"}, 12 | ] 13 | return spark_session.read.json(spark_context.parallelize(data, 1)) 14 | 15 | 16 | @pytest.fixture() 17 | def json_df(spark_context, spark_session): 18 | data = [ 19 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:31:11\\"}"}', 20 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:44:12\\"}"}', 21 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:46:24\\"}"}', 22 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 12:03:21\\"}"}', 23 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 13:46:24\\"}"}', 24 | ] 25 | return spark_session.read.json(spark_context.parallelize(data, 1)) 26 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/test_json_transform.py: -------------------------------------------------------------------------------- 1 | from butterfree.load.processing import json_transform 2 | 3 | 4 | class TestJsonTransform: 5 | def 
test_json_transformation( 6 | self, 7 | input_df, 8 | json_df, 9 | ): 10 | result_df = json_transform(dataframe=input_df) 11 | 12 | # assert 13 | assert sorted(result_df.collect()) == sorted(json_df.collect()) 14 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/writers/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/migrations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/migrations/database_migration/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/conftest.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ( 2 | ArrayType, 3 | DoubleType, 4 | FloatType, 5 | LongType, 6 | StringType, 7 | TimestampType, 8 | ) 9 | from pytest import fixture 10 | 11 | from butterfree.constants import DataType 12 | from butterfree.transform import FeatureSet 13 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 14 | 15 | 16 | @fixture 17 | def db_schema(): 18 | return [ 19 | {"column_name": "id", "type": LongType(), "primary_key": True}, 20 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, 21 | { 22 | "column_name": "feature1__avg_over_1_week_rolling_windows", 23 | "type": DoubleType(), 24 | "primary_key": False, 25 | }, 26 | { 27 | "column_name": "feature1__avg_over_2_days_rolling_windows", 28 | "type": DoubleType(), 29 | "primary_key": False, 30 | }, 31 | ] 32 | 33 | 34 | @fixture 35 | def fs_schema(): 36 | return [ 37 | {"column_name": "id", "type": LongType(), "primary_key": True}, 38 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, 39 | {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, 40 | { 41 | "column_name": "array_feature", 42 | "type": ArrayType(StringType(), True), 43 | "primary_key": False, 44 | }, 45 | { 46 | "column_name": "feature1__avg_over_1_week_rolling_windows", 47 | "type": FloatType(), 48 | "primary_key": False, 49 | }, 50 | ] 51 | 52 | 53 | @fixture 54 | def feature_set(): 55 | feature_set = FeatureSet( 56 | name="feature_set", 57 | entity="entity", 58 | description="description", 59 | features=[ 60 | Feature( 61 | name="feature_float", 62 | description="test", 63 | dtype=DataType.FLOAT, 64 | ), 65 | ], 66 | keys=[ 67 | KeyFeature( 68 | name="id", 69 | description="The device ID", 70 | dtype=DataType.BIGINT, 71 | ) 72 | ], 73 | timestamp=TimestampFeature(), 74 | ) 75 | 76 | return feature_set 77 | -------------------------------------------------------------------------------- 
/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py: -------------------------------------------------------------------------------- 1 | from butterfree.migrations.database_migration import CassandraMigration 2 | 3 | 4 | class TestCassandraMigration: 5 | def test_queries(self, fs_schema, db_schema): 6 | cassandra_migration = CassandraMigration() 7 | expected_query = [ 8 | "ALTER TABLE table_name ADD (new_feature FloatType);", 9 | "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", 10 | "ALTER TABLE table_name ALTER " 11 | "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", 12 | ] 13 | query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) 14 | 15 | assert query == expected_query 16 | 17 | def test_queries_on_entity(self, fs_schema, db_schema): 18 | cassandra_migration = CassandraMigration() 19 | expected_query = [ 20 | "ALTER TABLE table_name ADD (new_feature FloatType);", 21 | "ALTER TABLE table_name ALTER " 22 | "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", 23 | ] 24 | query = cassandra_migration.create_query( 25 | fs_schema, "table_name", db_schema, True 26 | ) 27 | 28 | assert query == expected_query 29 | 30 | def test_create_table_query(self, fs_schema): 31 | 32 | cassandra_migration = CassandraMigration() 33 | expected_query = [ 34 | "CREATE TABLE test.table_name " 35 | "(id LongType, timestamp TimestampType, new_feature FloatType, " 36 | "array_feature ArrayType(StringType(), True), " 37 | "feature1__avg_over_1_week_rolling_windows FloatType, " 38 | "PRIMARY KEY (id, timestamp));" 39 | ] 40 | 41 | query = cassandra_migration.create_query(fs_schema, "table_name") 42 | 43 | assert query == expected_query 44 | -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/test_database_migration.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType 2 | 3 | from butterfree.load.writers import HistoricalFeatureStoreWriter 4 | from butterfree.migrations.database_migration import CassandraMigration, Diff 5 | 6 | 7 | class TestDatabaseMigration: 8 | def test__get_diff_empty(self, mocker, db_schema): 9 | fs_schema = [ 10 | {"column_name": "id", "type": LongType(), "primary_key": True}, 11 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, 12 | { 13 | "column_name": "feature1__avg_over_1_week_rolling_windows", 14 | "type": DoubleType(), 15 | "primary_key": False, 16 | }, 17 | { 18 | "column_name": "feature1__avg_over_2_days_rolling_windows", 19 | "type": DoubleType(), 20 | "primary_key": False, 21 | }, 22 | ] 23 | m = CassandraMigration() 24 | m._client = mocker.stub("client") 25 | diff = m._get_diff(fs_schema, db_schema) 26 | assert not diff 27 | 28 | def test__get_diff(self, mocker, db_schema): 29 | fs_schema = [ 30 | {"column_name": "id", "type": LongType(), "primary_key": True}, 31 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, 32 | {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, 33 | { 34 | "column_name": "feature1__avg_over_1_week_rolling_windows", 35 | "type": FloatType(), 36 | "primary_key": False, 37 | }, 38 | ] 39 | expected_diff = { 40 | Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=None), 41 | Diff("new_feature", kind=Diff.Kind.ADD, value=FloatType()), 42 | Diff( 43 | "feature1__avg_over_2_days_rolling_windows", 44 |
kind=Diff.Kind.DROP, 45 | value=None, 46 | ), 47 | Diff( 48 | "feature1__avg_over_1_week_rolling_windows", 49 | kind=Diff.Kind.ALTER_TYPE, 50 | value=FloatType(), 51 | ), 52 | } 53 | 54 | m = CassandraMigration() 55 | m._client = mocker.stub("client") 56 | diff = m._get_diff(fs_schema, db_schema) 57 | assert diff == expected_diff 58 | 59 | def test_apply_migration(self, feature_set, mocker): 60 | # given 61 | m = CassandraMigration() 62 | m.apply_migration = mocker.stub("apply_migration") 63 | 64 | # when 65 | m.apply_migration(feature_set, HistoricalFeatureStoreWriter()) 66 | 67 | # then 68 | m.apply_migration.assert_called_once() 69 | -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py: -------------------------------------------------------------------------------- 1 | from butterfree.migrations.database_migration import MetastoreMigration 2 | 3 | 4 | class TestMetastoreMigration: 5 | def test_queries(self, fs_schema, db_schema): 6 | metastore_migration = MetastoreMigration() 7 | 8 | expected_query = [ 9 | "ALTER TABLE test.table_name ADD IF NOT EXISTS " 10 | "columns (new_feature FloatType);", 11 | "ALTER TABLE table_name DROP IF EXISTS " 12 | "(feature1__avg_over_2_days_rolling_windows None);", 13 | "ALTER TABLE table_name ALTER COLUMN " 14 | "feature1__avg_over_1_week_rolling_windows FloatType;", 15 | ] 16 | 17 | query = metastore_migration.create_query(fs_schema, "table_name", db_schema) 18 | 19 | assert query == expected_query 20 | 21 | def test_queries_on_entity(self, fs_schema, db_schema): 22 | metastore_migration = MetastoreMigration() 23 | 24 | expected_query = [ 25 | "ALTER TABLE test.table_name ADD IF NOT EXISTS " 26 | "columns (new_feature FloatType);", 27 | "ALTER TABLE table_name ALTER COLUMN " 28 | "feature1__avg_over_1_week_rolling_windows FloatType;", 29 | ] 30 | 31 | query = metastore_migration.create_query( 32 | fs_schema, "table_name", db_schema, True 33 | ) 34 | 35 | assert query == expected_query 36 | 37 | def test_create_table_query(self, fs_schema): 38 | 39 | metastore_migration = MetastoreMigration() 40 | 41 | expected_query = [ 42 | "CREATE TABLE IF NOT EXISTS test.table_name " 43 | "(id LongType, timestamp TimestampType, new_feature FloatType) " 44 | "PARTITIONED BY (year INT, month INT, day INT);" 45 | ] 46 | 47 | query = metastore_migration.create_query(fs_schema, "table_name") 48 | 49 | assert query == expected_query 50 | -------------------------------------------------------------------------------- /tests/unit/butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/pipelines/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/pipelines/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from pyspark.sql import functions 4 | from pytest import fixture 5 | 6 | from butterfree.clients import SparkClient 7 | from butterfree.constants import DataType 8 | from butterfree.constants.columns import TIMESTAMP_COLUMN 9 | from butterfree.extract import Source 10 | from butterfree.extract.readers import TableReader 11 | from butterfree.load import Sink 12 | from butterfree.load.writers import HistoricalFeatureStoreWriter 13 | from
butterfree.pipelines import FeatureSetPipeline 14 | from butterfree.transform import FeatureSet 15 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 16 | from butterfree.transform.transformations import SparkFunctionTransform 17 | from butterfree.transform.utils import Function 18 | 19 | 20 | @fixture() 21 | def feature_set_pipeline(): 22 | test_pipeline = FeatureSetPipeline( 23 | spark_client=SparkClient(), 24 | source=Mock( 25 | spec=Source, 26 | readers=[ 27 | TableReader( 28 | id="source_a", 29 | database="db", 30 | table="table", 31 | ) 32 | ], 33 | query="select * from source_a", 34 | ), 35 | feature_set=Mock( 36 | spec=FeatureSet, 37 | name="feature_set", 38 | entity="entity", 39 | description="description", 40 | keys=[ 41 | KeyFeature( 42 | name="user_id", 43 | description="The user's Main ID or device ID", 44 | dtype=DataType.INTEGER, 45 | ) 46 | ], 47 | timestamp=TimestampFeature(from_column="ts"), 48 | features=[ 49 | Feature( 50 | name="listing_page_viewed__rent_per_month", 51 | description="Average of something.", 52 | transformation=SparkFunctionTransform( 53 | functions=[ 54 | Function(functions.avg, DataType.FLOAT), 55 | Function(functions.stddev_pop, DataType.FLOAT), 56 | ], 57 | ).with_window( 58 | partition_by="user_id", 59 | order_by=TIMESTAMP_COLUMN, 60 | window_definition=["7 days", "2 weeks"], 61 | mode="fixed_windows", 62 | ), 63 | ), 64 | ], 65 | ), 66 | sink=Mock( 67 | spec=Sink, 68 | writers=[HistoricalFeatureStoreWriter(db_config=None)], 69 | ), 70 | ) 71 | 72 | return test_pipeline 73 | -------------------------------------------------------------------------------- /tests/unit/butterfree/reports/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/reports/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/testing/dataframe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/testing/dataframe/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/testing/dataframe/test_dataframe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.functions import col, from_unixtime 3 | 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | 9 | 10 | def test_assert_dataframe_equality(spark_context, spark_session): 11 | # arrange 12 | data1 = [ 13 | {"ts": 1582911000000, "flag": 1, "value": 1234.0}, 14 | {"ts": 1577923200000, "flag": 0, "value": 123.0}, 15 | ] 16 | data2 = [ 17 | {"ts": "2020-01-02T00:00:00+00:00", "flag": "false", "value": 123}, 18 | {"ts": "2020-02-28T17:30:00+00:00", "flag": "true", "value": 1234}, 19 | ] # same data declared in different formats and in different order 20 | 21 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 22 | df1 = ( 23 | df1.withColumn("ts", from_unixtime(col("ts") / 1000.0).cast("timestamp")) 24 | .withColumn("flag", col("flag").cast("boolean")) 25 | .withColumn("value", col("flag").cast("integer")) 26 | ) 27 | 28 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 29 | df2 = ( 30 | 
df2.withColumn("ts", col("ts").cast("timestamp")) 31 | .withColumn("flag", col("flag").cast("boolean")) 32 | .withColumn("value", col("flag").cast("integer")) 33 | ) 34 | 35 | # act and assert 36 | assert_dataframe_equality(df1, df2) 37 | 38 | 39 | def test_assert_dataframe_equality_different_values(spark_context, spark_session): 40 | # arrange 41 | data1 = [ 42 | {"value": "abc"}, 43 | {"value": "cba"}, 44 | ] 45 | data2 = [ 46 | {"value": "abc"}, 47 | {"value": "different value"}, 48 | ] 49 | 50 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 51 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 52 | 53 | # act and assert 54 | with pytest.raises(AssertionError, match="DataFrames have different values:"): 55 | assert_dataframe_equality(df1, df2) 56 | 57 | 58 | def test_assert_dataframe_equality_different_shapes(spark_context, spark_session): 59 | # arrange 60 | data1 = [ 61 | {"value": "abc"}, 62 | {"value": "cba"}, 63 | {"value": "cba"}, 64 | ] 65 | data2 = [ 66 | {"value": "abc"}, 67 | {"value": "cba"}, 68 | ] 69 | 70 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 71 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 72 | 73 | # act and assert 74 | with pytest.raises(AssertionError, match="DataFrame shape mismatch:"): 75 | assert_dataframe_equality(df1, df2) 76 | 77 | 78 | def test_create_df_from_collection(spark_context, spark_session): 79 | # arrange 80 | input_data = [{"json_column": '{"abc": 123}', "a": 123, "b": "abc"}] 81 | 82 | # act 83 | output_df = create_df_from_collection(input_data, spark_context, spark_session) 84 | target_df = spark_session.sql( 85 | "select 123 as a, 'abc' as b, replace(" 86 | "to_json(named_struct('abc', 123)), ':', ': ') as json_column" 87 | ) # generate the same data but with SparkSQL directly to df 88 | 89 | # arrange 90 | assert_dataframe_equality(target_df, output_df) 91 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/features/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from pytest import fixture 4 | 5 | from butterfree.constants.columns import TIMESTAMP_COLUMN 6 | from butterfree.transform.features import Feature 7 | 8 | 9 | @fixture 10 | def feature_set_dataframe(spark_context, spark_session): 11 | data = [ 12 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100}, 13 | {"id": 2, TIMESTAMP_COLUMN: 1, "feature": 200}, 14 | ] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) 16 | 17 | 18 | @fixture 19 | def feature_set_dataframe_ms_from_column(spark_context, spark_session): 20 | data = [ 21 | {"id": 1, "ts": 1581542311112, "feature": 100}, 22 | {"id": 2, "ts": 1581542322223, "feature": 200}, 23 
| ] 24 | return spark_session.read.json(spark_context.parallelize(data, 1)) 25 | 26 | 27 | @fixture 28 | def feature_set_dataframe_ms(spark_context, spark_session): 29 | data = [ 30 | {"id": 1, TIMESTAMP_COLUMN: 1581542311112, "feature": 100}, 31 | {"id": 2, TIMESTAMP_COLUMN: 1581542322223, "feature": 200}, 32 | ] 33 | return spark_session.read.json(spark_context.parallelize(data, 1)) 34 | 35 | 36 | @fixture 37 | def feature_set_dataframe_small_time_diff(spark_context, spark_session): 38 | data = [ 39 | {"id": 1, TIMESTAMP_COLUMN: 1581542311001, "feature": 100}, 40 | {"id": 2, TIMESTAMP_COLUMN: 1581542311002, "feature": 200}, 41 | ] 42 | return spark_session.read.json(spark_context.parallelize(data, 1)) 43 | 44 | 45 | @fixture 46 | def feature_set_dataframe_date(spark_context, spark_session): 47 | data = [ 48 | {"id": 1, TIMESTAMP_COLUMN: "2020-02-07T00:00:00", "feature": 100}, 49 | {"id": 2, TIMESTAMP_COLUMN: "2020-02-08T00:00:00", "feature": 200}, 50 | ] 51 | return spark_session.read.json(spark_context.parallelize(data, 1)) 52 | 53 | 54 | @fixture 55 | def mocked_feature(): 56 | return Mock(spec=Feature) 57 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/test_key_feature.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from butterfree.constants import DataType 4 | from butterfree.transform.features import KeyFeature 5 | 6 | 7 | class TestKeyFeature: 8 | def test_args_without_transformation(self): 9 | 10 | test_key = KeyFeature( 11 | name="id", 12 | from_column="origin", 13 | description="unit test", 14 | dtype=DataType.INTEGER, 15 | ) 16 | 17 | assert test_key.name == "id" 18 | assert test_key.from_column == "origin" 19 | assert test_key.description == "unit test" 20 | 21 | def test_args_with_transformation(self): 22 | 23 | test_key = KeyFeature( 24 | name="id", 25 | from_column="origin", 26 | description="unit test", 27 | dtype=DataType.INTEGER, 28 | transformation=Mock(), 29 | ) 30 | assert test_key.name == "id" 31 | assert test_key.from_column == "origin" 32 | assert test_key.description == "unit test" 33 | assert test_key.transformation 34 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/transformations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_custom_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import functions as F 3 | 4 | from butterfree.constants import DataType 5 | from butterfree.constants.columns import TIMESTAMP_COLUMN 6 | from butterfree.transform.features import Feature 7 | from butterfree.transform.transformations import CustomTransform 8 | 9 | 10 | def divide(df, parent_feature, column1, column2): 11 | name = parent_feature.get_output_columns()[0] 12 | df = df.withColumn(name, F.col(column1) / F.col(column2)) 13 | return df 14 | 15 | 16 | class TestCustomTransform: 17 | def test_feature_transform(self, feature_set_dataframe): 18 | 19 | test_feature = Feature( 20 | name="feature", 21 | description="unit test", 22 | 
dtype=DataType.BIGINT, 23 | transformation=CustomTransform( 24 | transformer=divide, 25 | column1="feature1", 26 | column2="feature2", 27 | ), 28 | ) 29 | 30 | df = test_feature.transform(feature_set_dataframe) 31 | 32 | assert all( 33 | [ 34 | a == b 35 | for a, b in zip( 36 | df.columns, 37 | ["feature1", "feature2", "id", TIMESTAMP_COLUMN, "feature"], 38 | ) 39 | ] 40 | ) 41 | 42 | def test_output_columns(self, feature_set_dataframe): 43 | 44 | test_feature = Feature( 45 | name="feature", 46 | description="unit test", 47 | dtype=DataType.BIGINT, 48 | transformation=CustomTransform( 49 | transformer=divide, 50 | column1="feature1", 51 | column2="feature2", 52 | ), 53 | ) 54 | 55 | df_columns = test_feature.get_output_columns() 56 | 57 | assert isinstance(df_columns, list) 58 | assert df_columns == ["feature"] 59 | 60 | def test_custom_transform_output(self, feature_set_dataframe): 61 | test_feature = Feature( 62 | name="feature", 63 | description="unit test", 64 | dtype=DataType.BIGINT, 65 | transformation=CustomTransform( 66 | transformer=divide, 67 | column1="feature1", 68 | column2="feature2", 69 | ), 70 | ) 71 | 72 | df = test_feature.transform(feature_set_dataframe).collect() 73 | 74 | assert df[0]["feature"] == 1 75 | assert df[1]["feature"] == 1 76 | assert df[2]["feature"] == 1 77 | assert df[3]["feature"] == 1 78 | 79 | def test_blank_transformer(self, feature_set_dataframe): 80 | with pytest.raises(ValueError): 81 | Feature( 82 | name="feature", 83 | description="unit test", 84 | dtype=DataType.BIGINT, 85 | transformation=CustomTransform(transformer=None), 86 | ) 87 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_h3_transform.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from butterfree.constants.data_type import DataType 6 | from butterfree.testing.dataframe import assert_dataframe_equality 7 | from butterfree.transform.features import Feature, KeyFeature 8 | from butterfree.transform.transformations.h3_transform import H3HashTransform 9 | 10 | 11 | class TestH3Transform: 12 | def test_feature_transform(self, h3_input_df, h3_target_df): 13 | # arrange 14 | test_feature = Feature( 15 | name="new_feature", 16 | description="unit test", 17 | dtype=DataType.STRING, 18 | transformation=H3HashTransform( 19 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 20 | lat_column="lat", 21 | lng_column="lng", 22 | ), 23 | ) 24 | 25 | # act 26 | output_df = test_feature.transform(h3_input_df) 27 | 28 | # assert 29 | assert_dataframe_equality(output_df, h3_target_df) 30 | 31 | def test_output_columns(self): 32 | # arrange 33 | h3_feature = Feature( 34 | name="new_feature", 35 | description="unit test", 36 | dtype=DataType.STRING, 37 | transformation=H3HashTransform( 38 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 39 | lat_column="lat", 40 | lng_column="lng", 41 | ), 42 | ) 43 | target_columns = [ 44 | "lat_lng__h3_hash__6", 45 | "lat_lng__h3_hash__7", 46 | "lat_lng__h3_hash__8", 47 | "lat_lng__h3_hash__9", 48 | "lat_lng__h3_hash__10", 49 | "lat_lng__h3_hash__11", 50 | "lat_lng__h3_hash__12", 51 | ] 52 | 53 | # act 54 | output_columns = h3_feature.get_output_columns() 55 | 56 | # assert 57 | assert sorted(output_columns) == sorted(target_columns) 58 | 59 | def test_import_error(self): 60 | import sys 61 | 62 | with patch.dict(sys.modules, h3=None): 63 | modules = [m for m in sys.modules if 
m.startswith("butterfree")] 64 | for m in modules: 65 | del sys.modules[m] 66 | with pytest.raises(ModuleNotFoundError, match="you must install"): 67 | from butterfree.transform.transformations.h3_transform import ( # noqa; noqa 68 | H3HashTransform, 69 | ) 70 | 71 | def test_with_stack(self, h3_input_df, h3_with_stack_target_df): 72 | # arrange 73 | test_feature = KeyFeature( 74 | name="id", 75 | description="unit test", 76 | dtype=DataType.STRING, 77 | transformation=H3HashTransform( 78 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 79 | lat_column="lat", 80 | lng_column="lng", 81 | ).with_stack(), 82 | ) 83 | 84 | # act 85 | output_df = test_feature.transform(h3_input_df) 86 | 87 | # assert 88 | assert_dataframe_equality(h3_with_stack_target_df, output_df) 89 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_stack_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.constants import DataType 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | from butterfree.transform.features import Feature, KeyFeature 9 | from butterfree.transform.transformations import StackTransform 10 | 11 | 12 | class TestSQLExpressionTransform: 13 | 14 | input_data = [ 15 | {"feature": 100, "id_a": 1, "id_b": 2}, 16 | {"feature": 120, "id_a": 3, "id_b": 4}, 17 | ] 18 | 19 | def test_feature_transform(self, spark_context, spark_session): 20 | # arrange 21 | target_data = [ 22 | {"id": 1, "feature": 100, "id_a": 1, "id_b": 2}, 23 | {"id": 2, "feature": 100, "id_a": 1, "id_b": 2}, 24 | {"id": 3, "feature": 120, "id_a": 3, "id_b": 4}, 25 | {"id": 4, "feature": 120, "id_a": 3, "id_b": 4}, 26 | ] 27 | input_df = create_df_from_collection( 28 | self.input_data, spark_context, spark_session 29 | ) 30 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 31 | 32 | feature_using_names = KeyFeature( 33 | name="id", 34 | description="id_a and id_b stacked in a single column.", 35 | dtype=DataType.INTEGER, 36 | transformation=StackTransform("id_*"), 37 | ) 38 | 39 | # act 40 | result_df_1 = feature_using_names.transform(input_df) 41 | 42 | # assert 43 | assert_dataframe_equality(target_df, result_df_1) 44 | 45 | def test_columns_not_in_dataframe(self, spark_context, spark_session): 46 | # arrange 47 | input_df = create_df_from_collection( 48 | self.input_data, spark_context, spark_session 49 | ) 50 | 51 | feature = Feature( 52 | name="id", 53 | description="stack transformation", 54 | dtype=DataType.STRING, 55 | transformation=StackTransform("id_c", "id_d"), 56 | ) 57 | 58 | # act and assert 59 | with pytest.raises(ValueError, match="Columns not found, columns in df: "): 60 | feature.transform(input_df) 61 | 62 | @pytest.mark.parametrize( 63 | "is_regex, pattern, column", 64 | [ 65 | (False, "id_a", "id_a"), 66 | (False, "id_*", "id_a"), 67 | (False, "*_a", "id_a"), 68 | (False, "id*a", "id_a"), 69 | (False, "!id_b", "id_a"), 70 | (True, "id.*", "id_a"), 71 | (True, "id_[a-z]*", "id_column"), 72 | ], 73 | ) 74 | def test__matches_pattern(self, is_regex, pattern, column): 75 | # arrange 76 | transform = StackTransform(is_regex=is_regex) 77 | 78 | # act 79 | result = transform._matches_pattern(pattern, column) 80 | 81 | # assert 82 | assert result 83 | -------------------------------------------------------------------------------- 
/tests/unit/butterfree/transform/transformations/test_transform_component.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from unittest.mock import patch 3 | 4 | import pytest 5 | 6 | from butterfree.transform.transformations import TransformComponent 7 | 8 | 9 | class TestTransformComponent(TestCase): 10 | def test_cannot_instantiate(self): 11 | with pytest.raises(TypeError): 12 | TransformComponent() 13 | 14 | @patch.multiple(TransformComponent, __abstractmethods__=set()) 15 | def test_parent(self): 16 | with pytest.raises(TypeError): 17 | feature_component = TransformComponent() 18 | feature_component.parent() 19 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/transformations/user_defined_functions/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | 4 | @fixture 5 | def feature_set_dataframe(spark_context, spark_session): 6 | data = [ 7 | {"id": 1, "feature1": 100}, 8 | {"id": 1, "feature1": 100}, 9 | {"id": 1, "feature1": 200}, 10 | {"id": 1, "feature1": 200}, 11 | {"id": 1, "feature1": 200}, 12 | {"id": 1, "feature1": 300}, 13 | {"id": 1, "feature1": 300}, 14 | {"id": 1, "feature1": 300}, 15 | {"id": 1, "feature1": 300}, 16 | {"id": 1, "feature1": 300}, 17 | {"id": 2, "feature1": 100}, 18 | {"id": 2, "feature1": 100}, 19 | {"id": 2, "feature1": 200}, 20 | {"id": 2, "feature1": 200}, 21 | {"id": 2, "feature1": 200}, 22 | {"id": 2, "feature1": 300}, 23 | {"id": 2, "feature1": 300}, 24 | {"id": 2, "feature1": 300}, 25 | {"id": 2, "feature1": 300}, 26 | {"id": 2, "feature1": 300}, 27 | ] 28 | return spark_session.read.json(spark_context.parallelize(data, 1)) 29 | 30 | 31 | @fixture 32 | def feature_set_custom_dataframe(spark_context, spark_session): 33 | data = [ 34 | {"id": 1, "feature1": "abc"}, 35 | {"id": 1, "feature1": "abc"}, 36 | {"id": 1, "feature1": "abc"}, 37 | {"id": 1, "feature1": "def"}, 38 | {"id": 1, "feature1": "def"}, 39 | {"id": 2, "feature1": "def"}, 40 | {"id": 2, "feature1": "def"}, 41 | {"id": 2, "feature1": "def"}, 42 | {"id": 2, "feature1": "abc"}, 43 | {"id": 2, "feature1": "abc"}, 44 | ] 45 | return spark_session.read.json(spark_context.parallelize(data, 1)) 46 | 47 | 48 | @fixture 49 | def mode_target_df(spark_context, spark_session): 50 | data = [ 51 | {"id": 1, "mode(feature1)": "300"}, 52 | {"id": 2, "mode(feature1)": "300"}, 53 | ] 54 | return spark_session.read.json(spark_context.parallelize(data, 1)) 55 | 56 | 57 | @fixture 58 | def most_frequent_set_target_df(spark_context, spark_session): 59 | data = [ 60 | {"id": 1, "most_frequent_set(feature1)": ["300", "200", "100"]}, 61 | {"id": 2, "most_frequent_set(feature1)": ["300", "200", "100"]}, 62 | ] 63 | return spark_session.read.json(spark_context.parallelize(data, 1)) 64 | 65 | 66 | @fixture 67 | def most_frequent_set_str_target_df(spark_context, spark_session): 68 | data = [ 69 | {"id": 1, "most_frequent_set(feature1)": ["abc", "def"]}, 70 | {"id": 2, 
"most_frequent_set(feature1)": ["def", "abc"]}, 71 | ] 72 | return spark_session.read.json(spark_context.parallelize(data, 1)) 73 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/test_mode.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StringType 2 | 3 | from butterfree.testing.dataframe import assert_dataframe_equality 4 | from butterfree.transform.transformations.user_defined_functions import mode 5 | 6 | 7 | def test_mode_output(feature_set_dataframe, mode_target_df): 8 | output_df = feature_set_dataframe.groupby("id").agg(mode("feature1")) 9 | 10 | assert_dataframe_equality(output_df, mode_target_df) 11 | 12 | 13 | def test_mode_output_type(feature_set_dataframe, mode_target_df): 14 | output_df = feature_set_dataframe.groupby("id").agg(mode("feature1")) 15 | 16 | assert isinstance(output_df.schema["mode(feature1)"].dataType, StringType) 17 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/test_most_frequent.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ArrayType 2 | 3 | from butterfree.testing.dataframe import assert_dataframe_equality 4 | from butterfree.transform.transformations.user_defined_functions import ( 5 | most_frequent_set, 6 | ) 7 | 8 | 9 | def test_most_frequent_set_output(feature_set_dataframe, most_frequent_set_target_df): 10 | output_df = feature_set_dataframe.groupby("id").agg(most_frequent_set("feature1")) 11 | 12 | assert_dataframe_equality(output_df, most_frequent_set_target_df) 13 | 14 | 15 | def test_most_frequent_set_str_input( 16 | feature_set_custom_dataframe, most_frequent_set_str_target_df 17 | ): 18 | output_df = feature_set_custom_dataframe.groupby("id").agg( 19 | most_frequent_set("feature1") 20 | ) 21 | 22 | assert_dataframe_equality(output_df, most_frequent_set_str_target_df) 23 | 24 | 25 | def test_most_frequent_set_output_type( 26 | feature_set_dataframe, most_frequent_set_target_df 27 | ): 28 | output_df = feature_set_dataframe.groupby("id").agg(most_frequent_set("feature1")) 29 | 30 | assert isinstance( 31 | output_df.schema["most_frequent_set(feature1)"].dataType, ArrayType 32 | ) 33 | -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/validations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/conftest.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StringType, StructField, StructType 2 | from pytest import fixture 3 | 4 | from butterfree.constants.columns import TIMESTAMP_COLUMN 5 | 6 | 7 | @fixture 8 | def feature_set_dataframe(spark_context, spark_session): 9 | data = [ 10 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100}, 11 | {"id": 2, TIMESTAMP_COLUMN: 0, "feature": 200}, 12 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110}, 13 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120}, 14 | ] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) 16 | 17 | 18 | 
@fixture 19 | def feature_set_without_ts(spark_context, spark_session): 20 | data = [ 21 | {"id": 1, "feature": 100}, 22 | {"id": 2, "feature": 200}, 23 | {"id": 1, "feature": 110}, 24 | {"id": 1, "feature": 120}, 25 | ] 26 | return spark_session.read.json(spark_context.parallelize(data, 1)) 27 | 28 | 29 | @fixture 30 | def feature_set_empty(spark_context, spark_session): 31 | 32 | field = [StructField("field1", StringType(), True)] 33 | schema = StructType(field) 34 | 35 | return spark_session.createDataFrame(spark_context.emptyRDD(), schema) 36 | -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/test_basic_validation.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from butterfree.validations import BasicValidation 8 | 9 | 10 | def test_validate_without_column_ts(feature_set_without_ts): 11 | check = BasicValidation(feature_set_without_ts) 12 | 13 | with pytest.raises(ValueError): 14 | check.validate_column_ts() 15 | 16 | 17 | def test_validate_df_is_empty_with_none_dataframe(): 18 | validation = BasicValidation(None) 19 | 20 | with pytest.raises(ValueError, match="DataFrame can't be None."): 21 | validation.validate_df_is_empty() 22 | 23 | 24 | def test_validate_df_is_empty_with_empty_dataframe(spark_session): 25 | df = spark_session.createDataFrame([], "id INT") 26 | validation = BasicValidation(df) 27 | 28 | with pytest.raises(ValueError, match="DataFrame can't be empty."): 29 | validation.validate_df_is_empty() 30 | 31 | 32 | def test_validate_df_is_empty_with_non_empty_dataframe(spark_session): 33 | df = spark_session.createDataFrame([(1,)], "id INT") 34 | validation = BasicValidation(df) 35 | validation.validate_df_is_empty() 36 | 37 | 38 | # If it's DBR < 13.3 (spark < 3.4.1) it will break. Every ConnectDataFrame has isEmpty 39 | @pytest.mark.parametrize( 40 | "is_empty, has_is_empty, dataframe_type", 41 | [ 42 | (True, True, DataFrame), 43 | (False, True, DataFrame), 44 | (True, False, DataFrame), 45 | (False, False, DataFrame), 46 | # This module `pyspark.sql.connect.dataframe.DataFrame` always has isEmpty 47 | # However, it does not have `rdd` 48 | (True, True, ConnectDataFrame), 49 | (False, True, ConnectDataFrame), 50 | ], 51 | ) 52 | def test_is_empty_permutations(is_empty, has_is_empty, dataframe_type): 53 | df = MagicMock(spec=dataframe_type) 54 | 55 | if has_is_empty: 56 | df.isEmpty.return_value = is_empty 57 | else: 58 | delattr(df, "isEmpty") 59 | df.rdd.isEmpty.return_value = is_empty 60 | 61 | validation = BasicValidation(df) 62 | assert validation._is_empty() == is_empty 63 | --------------------------------------------------------------------------------