├── .github ├── CODEOWNERS ├── pull-request-template.md └── workflows │ ├── release-please.yml │ └── test.yml ├── .gitignore ├── .release-please-manifest.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── WORKFLOW.md ├── butterfree ├── __init__.py ├── _cli │ ├── __init__.py │ ├── main.py │ └── migrate.py ├── automated │ ├── __init__.py │ └── feature_set_creation.py ├── clients │ ├── __init__.py │ ├── abstract_client.py │ ├── cassandra_client.py │ └── spark_client.py ├── configs │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ ├── abstract_config.py │ │ ├── cassandra_config.py │ │ ├── delta.py │ │ ├── kafka_config.py │ │ └── metastore_config.py │ └── environment.py ├── constants │ ├── __init__.py │ ├── columns.py │ ├── data_type.py │ ├── migrations.py │ ├── spark_constants.py │ └── window_definitions.py ├── dataframe_service │ ├── __init__.py │ ├── incremental_strategy.py │ ├── partitioning.py │ └── repartition.py ├── extract │ ├── __init__.py │ ├── pre_processing │ │ ├── __init__.py │ │ ├── explode_json_column_transform.py │ │ ├── filter_transform.py │ │ ├── forward_fill_transform.py │ │ ├── pivot_transform.py │ │ └── replace_transform.py │ ├── readers │ │ ├── __init__.py │ │ ├── file_reader.py │ │ ├── kafka_reader.py │ │ ├── reader.py │ │ └── table_reader.py │ └── source.py ├── hooks │ ├── __init__.py │ ├── hook.py │ ├── hookable_component.py │ └── schema_compatibility │ │ ├── __init__.py │ │ ├── cassandra_table_schema_compatibility_hook.py │ │ └── spark_table_schema_compatibility_hook.py ├── load │ ├── __init__.py │ ├── processing │ │ ├── __init__.py │ │ └── json_transform.py │ ├── sink.py │ └── writers │ │ ├── __init__.py │ │ ├── delta_feature_store_writer.py │ │ ├── delta_writer.py │ │ ├── historical_feature_store_writer.py │ │ ├── online_feature_store_writer.py │ │ └── writer.py ├── migrations │ ├── __init__.py │ └── database_migration │ │ ├── __init__.py │ │ ├── cassandra_migration.py │ │ ├── database_migration.py │ │ └── metastore_migration.py ├── pipelines │ ├── __init__.py │ └── feature_set_pipeline.py ├── reports │ ├── __init__.py │ └── metadata.py ├── testing │ ├── __init__.py │ └── dataframe │ │ └── __init__.py ├── transform │ ├── __init__.py │ ├── aggregated_feature_set.py │ ├── feature_set.py │ ├── features │ │ ├── __init__.py │ │ ├── feature.py │ │ ├── key_feature.py │ │ └── timestamp_feature.py │ ├── transformations │ │ ├── __init__.py │ │ ├── aggregated_transform.py │ │ ├── custom_transform.py │ │ ├── h3_transform.py │ │ ├── spark_function_transform.py │ │ ├── sql_expression_transform.py │ │ ├── stack_transform.py │ │ ├── transform_component.py │ │ └── user_defined_functions │ │ │ ├── __init__.py │ │ │ ├── mode.py │ │ │ └── most_frequent_set.py │ └── utils │ │ ├── __init__.py │ │ ├── date_range.py │ │ ├── function.py │ │ └── window_spec.py └── validations │ ├── __init__.py │ ├── basic_validaton.py │ └── validation.py ├── docs ├── Makefile ├── index.html ├── make.bat ├── requirements.txt └── source │ ├── butterfree.automated.rst │ ├── butterfree.clients.rst │ ├── butterfree.configs.db.rst │ ├── butterfree.configs.rst │ ├── butterfree.constants.rst │ ├── butterfree.dataframe_service.rst │ ├── butterfree.extract.pre_processing.rst │ ├── butterfree.extract.readers.rst │ ├── butterfree.extract.rst │ ├── butterfree.hooks.rst │ ├── butterfree.hooks.schema_compatibility.rst │ ├── butterfree.load.processing.rst │ ├── butterfree.load.rst │ ├── butterfree.load.writers.rst │ ├── butterfree.migrations.database_migration.rst │ ├── 
butterfree.migrations.rst │ ├── butterfree.pipelines.rst │ ├── butterfree.reports.rst │ ├── butterfree.rst │ ├── butterfree.testing.dataframe.rst │ ├── butterfree.testing.rst │ ├── butterfree.transform.features.rst │ ├── butterfree.transform.rst │ ├── butterfree.transform.transformations.rst │ ├── butterfree.transform.transformations.user_defined_functions.rst │ ├── butterfree.transform.utils.rst │ ├── butterfree.validations.rst │ ├── cli.md │ ├── conf.py │ ├── configuration.md │ ├── extract.md │ ├── getstart.md │ ├── home.md │ ├── index.rst │ ├── load.md │ ├── modules.rst │ ├── stream.md │ └── transform.md ├── examples ├── README.md ├── aggregated_feature_set │ └── aggregated_feature_set.ipynb ├── data │ ├── listing_events.json │ └── region.json ├── interval_runs │ └── interval_runs.ipynb ├── simple_feature_set │ └── simple_feature_set.ipynb ├── spark_function_and_window │ └── spark_function_and_window.ipynb ├── streaming_feature_set │ ├── events │ │ └── 20582255.json │ ├── pokedex.json │ └── streaming_feature_set.ipynb └── test_examples.py ├── mypy.ini ├── release-please-config.json ├── requirements.dev.txt ├── requirements.lint.txt ├── requirements.test.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── integration ├── __init__.py ├── butterfree │ ├── __init__.py │ ├── extract │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_source.py │ ├── load │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_sink.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_feature_set_pipeline.py │ └── transform │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_aggregated_feature_set.py │ │ └── test_feature_set.py └── input │ └── data.json ├── mocks ├── __init__.py └── entities │ ├── __init__.py │ ├── first │ ├── __init__.py │ └── first_pipeline.py │ └── second │ ├── __init__.py │ └── deeper │ ├── __init__.py │ └── second_pipeline.py └── unit ├── __init__.py └── butterfree ├── __init__.py ├── _cli ├── __init__.py └── test_migrate.py ├── automated ├── __init__.py └── test_feature_set_creation.py ├── clients ├── __init__.py ├── conftest.py ├── test_cassandra_client.py └── test_spark_client.py ├── configs ├── __init__.py ├── db │ ├── __init__.py │ ├── conftest.py │ ├── test_cassandra_config.py │ ├── test_kafka_config.py │ └── test_metastore_config.py └── test_environment.py ├── dataframe_service ├── __init__.py ├── conftest.py ├── test_incremental_srategy.py ├── test_partitioning.py └── test_repartition.py ├── extract ├── __init__.py ├── conftest.py ├── pre_processing │ ├── __init__.py │ ├── conftest.py │ ├── test_explode_json_column.py │ ├── test_filter_transform.py │ ├── test_forward_fill.py │ ├── test_pivot_transform.py │ └── test_replace_transform.py ├── readers │ ├── __init__.py │ ├── file-reader-test.csv │ ├── file-reader-test.json │ ├── test_file_reader.py │ ├── test_kafka_reader.py │ ├── test_reader.py │ └── test_table_reader.py └── test_source.py ├── hooks ├── __init__.py ├── schema_compatibility │ ├── __init__.py │ ├── test_cassandra_table_schema_compatibility_hook.py │ └── test_spark_table_schema_compatibility_hook.py └── test_hookable_component.py ├── load ├── __init__.py ├── conftest.py ├── processing │ ├── __init__.py │ ├── conftest.py │ └── test_json_transform.py ├── test_sink.py └── writers │ ├── __init__.py │ ├── test_delta_writer.py │ ├── test_historical_feature_store_writer.py │ └── test_online_feature_store_writer.py ├── migrations ├── __init__.py └── database_migration │ ├── __init__.py │ ├── conftest.py │ ├── 
test_cassandra_migration.py │ ├── test_database_migration.py │ └── test_metastore_migration.py ├── pipelines ├── __init__.py ├── conftest.py └── test_feature_set_pipeline.py ├── reports ├── __init__.py └── test_metadata.py ├── testing └── dataframe │ ├── __init__.py │ └── test_dataframe.py ├── transform ├── __init__.py ├── conftest.py ├── features │ ├── __init__.py │ ├── conftest.py │ ├── test_feature.py │ ├── test_key_feature.py │ └── test_timestamp_feature.py ├── test_aggregated_feature_set.py ├── test_feature_set.py └── transformations │ ├── __init__.py │ ├── conftest.py │ ├── test_aggregated_transform.py │ ├── test_custom_transform.py │ ├── test_h3_transform.py │ ├── test_spark_function_transform.py │ ├── test_sql_expression_transform.py │ ├── test_stack_transform.py │ ├── test_transform_component.py │ └── user_defined_functions │ ├── __init__.py │ ├── conftest.py │ ├── test_mode.py │ └── test_most_frequent.py └── validations ├── __init__.py ├── conftest.py └── test_basic_validation.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @quintoandar/mlcops -------------------------------------------------------------------------------- /.github/pull-request-template.md: -------------------------------------------------------------------------------- 1 | ## Why? :open_book: 2 | _Replace me for a cool overview of why this PR is being created. You can 3 | refer to the Jira task or Github issue here too. Never forget to put the 4 | tag of a related Jira task in the title._ 5 | 6 | ## What? :wrench: 7 | _Replace me for a detailed explanation of what is being modified._ 8 | _Want to add some awesome bullet points?_ 9 | - _First changes;_ 10 | - _Second changes;_ 11 | - _..._ 12 | 13 | _How about some cool checkboxes?_ 14 | - [X] _First changes;_ 15 | - [X] _Second changes;_ 16 | - [ ] _..._ 17 | 18 | ## Type of change 19 | Please delete options that are not relevant. 20 | 21 | - [ ] Bug fix (non-breaking change which fixes an issue) 22 | - [ ] New feature (non-breaking change which adds functionality) 23 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 24 | - [ ] This change requires a documentation update 25 | - [ ] Release 26 | 27 | ## How everything was tested? :straight_ruler: 28 | _Have you achieved all the acceptance criteria? How?_ 29 | _Is there any alternative flow in the testing process that you want to describe?_ 30 | 31 | ## Checklist 32 | - [ ] My code follows the style guidelines of this project (docstrings, type hinting and linter compliance); 33 | - [ ] I have performed a self-review of my own code; 34 | - [ ] I have made corresponding changes to the documentation; 35 | - [ ] I have added tests that prove my fix is effective or that my feature works; 36 | - [ ] New and existing unit tests pass locally with my changes; 37 | - [ ] Add labels to distinguish the type of pull request. Available labels are `bug`, `enhancement`, `feature`, and `review`. 
38 | 39 | ## Attention Points :warning: 40 | _Replace me for what the reviewer will need to pay attention to in the PR or just to cover any concerns after the merge._ 41 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - staging 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | 10 | name: release-please 11 | 12 | jobs: 13 | release-please: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: googleapis/release-please-action@v4 17 | id: release 18 | with: 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | target-branch: staging 21 | config-file: release-please-config.json 22 | manifest-file: .release-please-manifest.json 23 | 24 | - uses: actions/checkout@v4 25 | if: ${{ steps.release.outputs.release_created }} 26 | 27 | - uses: actions/setup-python@v5 28 | if: ${{ steps.release.outputs.release_created }} 29 | with: 30 | python-version: '3.9' 31 | 32 | - name: Install dependencies 33 | if: ${{ steps.release.outputs.release_created }} 34 | run: make ci-install 35 | 36 | - name: Build package 37 | if: ${{ steps.release.outputs.release_created }} 38 | run: make package 39 | 40 | - name: Publish release to pypi.org 41 | if: ${{ steps.release.outputs.release_created }} 42 | env: 43 | PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 44 | PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 45 | run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* 46 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "Test" 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - staging 7 | - hotfix/** 8 | pull_request: 9 | 10 | jobs: 11 | Pipeline: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.9' 19 | 20 | - uses: actions/setup-java@v4 21 | with: 22 | java-version: '17' 23 | distribution: microsoft 24 | 25 | - uses: vemonet/setup-spark@v1 26 | with: 27 | spark-version: '3.5.1' 28 | hadoop-version: '3' 29 | 30 | - name: Install dependencies 31 | run: make ci-install 32 | 33 | - name: Style check 34 | run: PYTHONPATH=./pip/deps make style-check 35 | 36 | - name: Quality check 37 | run: PYTHONPATH=./pip/deps make quality-check 38 | 39 | - name: Static Type check 40 | run: PYTHONPATH=./pip/deps make type-check 41 | 42 | - name: Tests 43 | run: PYTHONPATH=./pip/deps make tests 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | *cov.xml 50 | test_folder/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # PyBuilder 68 | target/ 69 | pip/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # pytest 105 | .pytest_cache/ 106 | 107 | # PyCharm's Workspace 108 | .idea/ 109 | 110 | # Auto Generated: SHOULD NOT BE VERSIONED 111 | .version 112 | .package_name 113 | .repository_url 114 | .commit_hash 115 | *cov/ 116 | 117 | # VSCode Workspace 118 | spark-warehouse/ 119 | .vscode/ 120 | init/ 121 | 122 | # integration tests artifacts 123 | metastore_db/ 124 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | ".": "1.7.2" 3 | } 4 | -------------------------------------------------------------------------------- /butterfree/__init__.py: -------------------------------------------------------------------------------- 1 | """Module docstring example, following Google's docstring style.""" 2 | 3 | __version__ = "1.7.2" # x-release-please-version 4 | -------------------------------------------------------------------------------- /butterfree/_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/butterfree/_cli/__init__.py -------------------------------------------------------------------------------- /butterfree/_cli/main.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from butterfree._cli import migrate 4 | 5 | app = typer.Typer(no_args_is_help=True) 6 | app.add_typer(migrate.app, name="migrate") 7 | 8 | if __name__ == "__main__": 9 | app() 10 | -------------------------------------------------------------------------------- /butterfree/automated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/butterfree/automated/__init__.py -------------------------------------------------------------------------------- /butterfree/clients/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds connection clients.""" 2 | 3 | from butterfree.clients.abstract_client import AbstractClient 4 | from butterfree.clients.cassandra_client import CassandraClient 5 | from butterfree.clients.spark_client import SparkClient 6 | 7 | __all__ = ["SparkClient", "CassandraClient", "AbstractClient"] 8 | 
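A minimal usage sketch of the clients exported above, assuming only what appears elsewhere in this dump: SparkClient() takes no required arguments and exposes read_table(table, database), the call made by TableReader.consume further below; the table and database names here are hypothetical.

from butterfree.clients import SparkClient

spark_client = SparkClient()  # client wrapping the Spark connection
# read a metastore table into a DataFrame; names are placeholders
df = spark_client.read_table("listing_events", "feature_store")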
-------------------------------------------------------------------------------- /butterfree/clients/abstract_client.py: -------------------------------------------------------------------------------- 1 | """Abstract class for database clients.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Optional 5 | 6 | 7 | class AbstractClient(ABC): 8 | """Abstract base class for database clients.""" 9 | 10 | @property 11 | @abstractmethod 12 | def conn(self) -> Any: 13 | """Returns a connection object.""" 14 | pass 15 | 16 | @abstractmethod 17 | def sql(self, query: str) -> Any: 18 | """Runs a query. 19 | 20 | Args: 21 | query: client query. 22 | 23 | Returns: 24 | Set of records. 25 | """ 26 | pass 27 | 28 | @abstractmethod 29 | def get_schema(self, table: str, database: Optional[str] = None) -> Any: 30 | """Returns desired table schema. 31 | 32 | Attributes: 33 | table: desired table. 34 | 35 | Returns: 36 | A list of dictionaries in the format 37 | [{"column_name": "example1", type: "Spark_type"}, ...] 38 | 39 | """ 40 | pass 41 | -------------------------------------------------------------------------------- /butterfree/configs/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds configuration/setup for Butterfree components.""" 2 | -------------------------------------------------------------------------------- /butterfree/configs/db/__init__.py: -------------------------------------------------------------------------------- 1 | """This module holds database configurations to be used by clients.""" 2 | 3 | from butterfree.configs.db.abstract_config import AbstractWriteConfig 4 | from butterfree.configs.db.cassandra_config import CassandraConfig 5 | from butterfree.configs.db.delta import DeltaConfig 6 | from butterfree.configs.db.kafka_config import KafkaConfig 7 | from butterfree.configs.db.metastore_config import MetastoreConfig 8 | 9 | __all__ = [ 10 | "AbstractWriteConfig", 11 | "CassandraConfig", 12 | "KafkaConfig", 13 | "MetastoreConfig", 14 | "DeltaConfig", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/configs/db/abstract_config.py: -------------------------------------------------------------------------------- 1 | """Abstract classes for database configurations with spark.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, List 5 | 6 | 7 | class AbstractWriteConfig(ABC): 8 | """Abstract class for database write configurations with spark.""" 9 | 10 | @property 11 | @abstractmethod 12 | def database(self) -> str: 13 | """Database name.""" 14 | 15 | @property 16 | @abstractmethod 17 | def mode(self) -> Any: 18 | """Config option "mode" for spark write. 19 | 20 | Args: 21 | 22 | Returns: 23 | str: mode. 24 | 25 | """ 26 | 27 | @property 28 | @abstractmethod 29 | def format_(self) -> Any: 30 | """Config option "format" for spark write. 31 | 32 | Args: 33 | 34 | Returns: 35 | str: format. 36 | 37 | """ 38 | 39 | @abstractmethod 40 | def translate(self, schema: Any) -> List[Dict[Any, Any]]: 41 | """Translate feature set spark schema to the corresponding database. 42 | 43 | Args: 44 | schema: feature set schema 45 | 46 | Returns: 47 | Corresponding database schema. 
48 | 49 | """ 50 | -------------------------------------------------------------------------------- /butterfree/configs/environment.py: -------------------------------------------------------------------------------- 1 | """Holds functions for managing the running environment.""" 2 | 3 | import os 4 | from typing import Optional 5 | 6 | specification = { 7 | "ENVIRONMENT": "dev", 8 | "CASSANDRA_HOST": "test", 9 | "CASSANDRA_KEYSPACE": "test", 10 | "CASSANDRA_USERNAME": "test", 11 | "CASSANDRA_PASSWORD": "test", 12 | "FEATURE_STORE_S3_BUCKET": "test", 13 | "FEATURE_STORE_HISTORICAL_DATABASE": "test", 14 | "KAFKA_CONSUMER_CONNECTION_STRING": "test_host:1234,test_host2:1234", 15 | "STREAM_CHECKPOINT_PATH": None, 16 | "CASSANDRA_READ_CONSISTENCY_LEVEL": None, 17 | "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, 18 | "CASSANDRA_LOCAL_DC": None, 19 | } 20 | 21 | 22 | class UnspecifiedVariableError(RuntimeError): 23 | """Environment variables not set error. 24 | 25 | Attributes: 26 | variable_name: environment variable name. 27 | 28 | """ 29 | 30 | def __init__(self, variable_name: str): 31 | super().__init__( 32 | f'Variable "{variable_name}" is not listed in the environment' 33 | " specification\nUpdate the environment module" 34 | f' to include "{variable_name}"' 35 | ) 36 | 37 | 38 | def get_variable( 39 | variable_name: str, default_value: Optional[str] = None 40 | ) -> Optional[str]: 41 | """Gets an environment variable. 42 | 43 | The variable comes from it's explicitly declared value in the running 44 | environment or from the default value declared in specification or from the 45 | default_value. 46 | 47 | Args: 48 | variable_name: environment variable name. 49 | default_value: default value to use in case no value is set in the 50 | environment nor in the environment.yaml specification file. 
51 | 52 | Returns: 53 | The variable's string value 54 | 55 | """ 56 | try: 57 | spec_default = specification[variable_name] 58 | except KeyError: 59 | raise UnspecifiedVariableError(variable_name) 60 | return os.getenv(variable_name) or spec_default or default_value 61 | -------------------------------------------------------------------------------- /butterfree/constants/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds constant attributes that are common for Butterfree.""" 2 | 3 | from butterfree.constants.data_type import DataType 4 | 5 | __all__ = ["DataType"] 6 | -------------------------------------------------------------------------------- /butterfree/constants/columns.py: -------------------------------------------------------------------------------- 1 | """Holds common column names, constant through all Butterfree.""" 2 | 3 | from typing_extensions import Final 4 | 5 | TIMESTAMP_COLUMN: Final = "timestamp" 6 | PARTITION_YEAR: Final = "year" 7 | PARTITION_MONTH: Final = "month" 8 | PARTITION_DAY: Final = "day" 9 | -------------------------------------------------------------------------------- /butterfree/constants/data_type.py: -------------------------------------------------------------------------------- 1 | """DataType Enum Entity.""" 2 | 3 | from enum import Enum 4 | 5 | from pyspark.sql.types import ArrayType, BinaryType, BooleanType 6 | from pyspark.sql.types import DataType as PySparkDataType 7 | from pyspark.sql.types import ( 8 | DateType, 9 | DecimalType, 10 | DoubleType, 11 | FloatType, 12 | IntegerType, 13 | LongType, 14 | StringType, 15 | TimestampNTZType, 16 | TimestampType, 17 | ) 18 | from typing_extensions import final 19 | 20 | 21 | @final 22 | class DataType(Enum): 23 | """Holds constants for data types within Butterfree.""" 24 | 25 | TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ") 26 | TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") 27 | BINARY = (BinaryType(), "boolean", "BINARY") 28 | BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") 29 | DATE = (DateType(), "timestamp", "DATE") 30 | DECIMAL = (DecimalType(), "decimal", "DECIMAL") 31 | DOUBLE = (DoubleType(), "double", "DOUBLE") 32 | FLOAT = (FloatType(), "float", "FLOAT") 33 | INTEGER = (IntegerType(), "int", "INT") 34 | BIGINT = (LongType(), "bigint", "BIGINT") 35 | STRING = (StringType(), "text", "STRING") 36 | ARRAY_BIGINT = (ArrayType(LongType()), "frozen>", "ARRAY") 37 | ARRAY_STRING = (ArrayType(StringType()), "frozen>", "ARRAY") 38 | ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>", "ARRAY") 39 | 40 | def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None: 41 | self.spark = spark 42 | self.cassandra = cassandra 43 | self.spark_sql = spark_sql 44 | -------------------------------------------------------------------------------- /butterfree/constants/migrations.py: -------------------------------------------------------------------------------- 1 | """Migrations' Constants.""" 2 | 3 | from butterfree.constants import columns 4 | 5 | PARTITION_BY = [ 6 | {"column_name": columns.PARTITION_YEAR, "type": "INT"}, 7 | {"column_name": columns.PARTITION_MONTH, "type": "INT"}, 8 | {"column_name": columns.PARTITION_DAY, "type": "INT"}, 9 | ] 10 | -------------------------------------------------------------------------------- /butterfree/constants/spark_constants.py: -------------------------------------------------------------------------------- 1 | """Holds common spark constants, present through 
all Butterfree.""" 2 | 3 | from typing_extensions import Final 4 | 5 | # from spark.sql.shuffle.partitions default value 6 | DEFAULT_NUM_PARTITIONS: Final = 200 7 | 8 | # ratio between number of partitions per processor recommended (lower bound: 2) 9 | # refs: 10 | # https://github.com/vaquarkhan/Apache-Kafka-poc-and-notes/wiki/Apache-Spark-Join-guidelines-and-Performance-tuning 11 | PARTITION_PROCESSOR_RATIO: Final = 4 12 | -------------------------------------------------------------------------------- /butterfree/constants/window_definitions.py: -------------------------------------------------------------------------------- 1 | """Allowed windows units and lengths in seconds.""" 2 | 3 | ALLOWED_WINDOWS = { 4 | "second": 1, 5 | "seconds": 1, 6 | "minute": 60, 7 | "minutes": 60, 8 | "hour": 3600, 9 | "hours": 3600, 10 | "day": 86400, 11 | "days": 86400, 12 | "week": 604800, 13 | "weeks": 604800, 14 | "year": 29030400, 15 | "years": 29030400, 16 | } 17 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataframe optimization components regarding Butterfree.""" 2 | 3 | from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy 4 | from butterfree.dataframe_service.partitioning import extract_partition_values 5 | from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df 6 | 7 | __all__ = [ 8 | "extract_partition_values", 9 | "IncrementalStrategy", 10 | "repartition_df", 11 | "repartition_sort_df", 12 | ] 13 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/partitioning.py: -------------------------------------------------------------------------------- 1 | """Module defining partitioning methods.""" 2 | 3 | from typing import Any, Dict, List 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | def extract_partition_values( 9 | dataframe: DataFrame, partition_columns: List[str] 10 | ) -> List[Dict[str, Any]]: 11 | """Extract distinct partition values from a given dataframe. 12 | 13 | Args: 14 | dataframe: dataframe from where to extract partition values. 15 | partition_columns: name of partition columns presented on the dataframe. 16 | 17 | Returns: 18 | distinct partition values. 
19 | """ 20 | return [ 21 | row.asDict() 22 | for row in dataframe.select(*partition_columns).distinct().collect() 23 | ] 24 | -------------------------------------------------------------------------------- /butterfree/dataframe_service/repartition.py: -------------------------------------------------------------------------------- 1 | """Module where there are repartition methods.""" 2 | 3 | from typing import List, Optional 4 | 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from butterfree.constants.spark_constants import ( 8 | DEFAULT_NUM_PARTITIONS, 9 | PARTITION_PROCESSOR_RATIO, 10 | ) 11 | 12 | 13 | def _num_partitions_definition( 14 | num_processors: Optional[int] = None, num_partitions: Optional[int] = None 15 | ) -> int: 16 | num_partitions = ( 17 | num_processors * PARTITION_PROCESSOR_RATIO 18 | if num_processors 19 | else num_partitions or DEFAULT_NUM_PARTITIONS 20 | ) 21 | 22 | return num_partitions 23 | 24 | 25 | def repartition_df( 26 | dataframe: DataFrame, 27 | partition_by: List[str], 28 | num_partitions: Optional[int] = None, 29 | num_processors: Optional[int] = None, 30 | ) -> DataFrame: 31 | """Partition the DataFrame. 32 | 33 | Args: 34 | dataframe: Spark DataFrame. 35 | partition_by: list of partitions. 36 | num_processors: number of processors. 37 | num_partitions: number of partitions. 38 | 39 | Returns: 40 | Partitioned dataframe. 41 | 42 | """ 43 | num_partitions = _num_partitions_definition(num_processors, num_partitions) 44 | return dataframe.repartition(num_partitions, *partition_by) 45 | 46 | 47 | def repartition_sort_df( 48 | dataframe: DataFrame, 49 | partition_by: List[str], 50 | order_by: List[str], 51 | num_processors: Optional[int] = None, 52 | num_partitions: Optional[int] = None, 53 | ) -> DataFrame: 54 | """Partition and Sort the DataFrame. 55 | 56 | Args: 57 | dataframe: Spark DataFrame. 58 | partition_by: list of columns to partition by. 59 | order_by: list of columns to order by. 60 | num_processors: number of processors. 61 | num_partitions: number of partitions. 62 | 63 | Returns: 64 | Partitioned and sorted dataframe. 
65 | 66 | """ 67 | num_partitions = _num_partitions_definition(num_processors, num_partitions) 68 | dataframe = repartition_df(dataframe, partition_by, num_partitions) 69 | return dataframe.sortWithinPartitions(*order_by) 70 | -------------------------------------------------------------------------------- /butterfree/extract/__init__.py: -------------------------------------------------------------------------------- 1 | """The Source Component of a Feature Set.""" 2 | 3 | from butterfree.extract.source import Source 4 | 5 | __all__ = ["Source"] 6 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/__init__.py: -------------------------------------------------------------------------------- 1 | """Pre Processing Components regarding Readers.""" 2 | 3 | from butterfree.extract.pre_processing.explode_json_column_transform import ( 4 | explode_json_column, 5 | ) 6 | from butterfree.extract.pre_processing.filter_transform import filter 7 | from butterfree.extract.pre_processing.forward_fill_transform import forward_fill 8 | from butterfree.extract.pre_processing.pivot_transform import pivot 9 | from butterfree.extract.pre_processing.replace_transform import replace 10 | 11 | __all__ = ["explode_json_column", "filter", "forward_fill", "pivot", "replace"] 12 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/explode_json_column_transform.py: -------------------------------------------------------------------------------- 1 | """Explode json column for dataframes.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame, StructType 4 | from pyspark.sql.functions import from_json, get_json_object 5 | 6 | JSON_TYPE_NAMES = ["array", "struct"] 7 | 8 | 9 | def explode_json_column( 10 | df: DataFrame, column: str, json_schema: StructType 11 | ) -> DataFrame: 12 | """Create new columns extracting properties from a JSON column. 13 | 14 | Example: 15 | 16 | >>> from pyspark import SparkContext 17 | >>> from pyspark.sql import session 18 | >>> from butterfree.testing.dataframe import create_df_from_collection 19 | >>> from butterfree.extract.pre_processing import explode_json_column 20 | >>> from pyspark.sql.types import ( 21 | ... ArrayType, 22 | ... IntegerType, 23 | ... StringType, 24 | ... StructField, 25 | ... StructType, 26 | ... ) 27 | >>> spark_context = SparkContext.getOrCreate() 28 | >>> spark_session = session.SparkSession(spark_context) 29 | >>> data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}] 30 | >>> df = create_df_from_collection(data, spark_context, spark_session) 31 | >>> df.collect() 32 | 33 | [Row(json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}')] 34 | 35 | >>> json_column_schema = StructType( 36 | ... [ 37 | ... StructField("a", IntegerType()), 38 | ... StructField("b", StringType()), 39 | ... StructField("c", IntegerType()), 40 | ... StructField("d", ArrayType(IntegerType())), 41 | ... ] 42 | >>> explode_json_column( 43 | ... df, column='json_column', json_schema=json_column_schema 44 | ... ).collect() 45 | 46 | [ 47 | Row( 48 | json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}', 49 | a=123, 50 | b='abc', 51 | c=123, 52 | d=[1, 2, 3] 53 | ) 54 | ] 55 | 56 | Args: 57 | df: input dataframe with the target JSON column. 58 | column: column name that is going to be exploded. 59 | json_schema: expected schema from that JSON column. 
60 | Not all "first layer" fields need to be mapped in the json_schema, 61 | just the desired columns. If there is any JSON field that is needed 62 | to be cast to a struct, the declared expected schema (a StructType) 63 | need to have the exact same schema as the presented record, if don't, 64 | the value in the resulting column will be null. 65 | 66 | Returns: 67 | dataframe with the new extracted columns from the JSON column. 68 | 69 | """ 70 | for field in json_schema: 71 | if field.dataType.typeName() in JSON_TYPE_NAMES: 72 | df = df.withColumn( 73 | field.name, 74 | from_json( 75 | get_json_object(df[column], "$.{}".format(field.name)), 76 | schema=field.dataType, # type: ignore 77 | ), 78 | ) 79 | else: # non-collection data types 80 | df = df.withColumn( 81 | field.name, 82 | get_json_object(df[column], "$.{}".format(field.name)).cast( 83 | field.dataType 84 | ), 85 | ) 86 | return df 87 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/filter_transform.py: -------------------------------------------------------------------------------- 1 | """Module where filter DataFrames coming from readers.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame 4 | 5 | 6 | def filter(dataframe: DataFrame, condition: str) -> DataFrame: 7 | """Filters DataFrame's rows using the given condition and value. 8 | 9 | Args: 10 | dataframe: Spark DataFrame. 11 | condition: SQL expression with column, operation and value 12 | to filter the dataframe. 13 | 14 | Returns: 15 | Filtered dataframe 16 | """ 17 | if not isinstance(condition, str): 18 | raise TypeError("condition should be string.") 19 | 20 | return dataframe.filter(condition) 21 | -------------------------------------------------------------------------------- /butterfree/extract/pre_processing/replace_transform.py: -------------------------------------------------------------------------------- 1 | """Replace transformer for dataframes.""" 2 | 3 | from itertools import chain 4 | from typing import Dict 5 | 6 | from pyspark.sql.dataframe import DataFrame 7 | from pyspark.sql.functions import coalesce, col, create_map, lit 8 | 9 | 10 | def replace( 11 | dataframe: DataFrame, column: str, replace_dict: Dict[str, str] 12 | ) -> DataFrame: 13 | """Replace values of a string column in the dataframe using a dict. 14 | 15 | Example: 16 | 17 | >>> from butterfree.extract.pre_processing import replace 18 | ... from butterfree.testing.dataframe import ( 19 | ... assert_dataframe_equality, 20 | ... create_df_from_collection, 21 | ... ) 22 | >>> from pyspark import SparkContext 23 | >>> from pyspark.sql import session 24 | >>> spark_context = SparkContext.getOrCreate() 25 | >>> spark_session = session.SparkSession(spark_context) 26 | >>> input_data = [ 27 | ... {"id":1, "type": "a"}, {"id":2, "type": "b"}, {"id":3, "type": "c"} 28 | ... ] 29 | >>> input_df = create_df_from_collection(input_data, spark_context, spark_session) 30 | >>> input_df.collect() 31 | 32 | [Row(id=1, type='a'), Row(id=2, type='b'), Row(id=3, type='c')] 33 | 34 | >>> replace_dict = {"a": "type_a", "b": "type_b"} 35 | >>> replace(input_df, "type", replace_dict).collect() 36 | 37 | [Row(id=1, type='type_a'), Row(id=2, type='type_b'), Row(id=3, type='c')] 38 | 39 | Args: 40 | dataframe: data to be transformed. 41 | column: string column on the dataframe where to apply the replace. 42 | replace_dict: dict with values to be replaced. 43 | All mapped values must be string. 
44 | 45 | Returns: 46 | Dataframe with column values replaced. 47 | 48 | """ 49 | if (column not in dict(dataframe.dtypes)) or ( 50 | dict(dataframe.dtypes)[column] != "string" 51 | ): 52 | raise ValueError("column needs to be the name of a string column in dataframe") 53 | if (not isinstance(replace_dict, dict)) or ( 54 | not all(isinstance(value, str) for value in chain(*replace_dict.items())) 55 | ): 56 | raise ValueError( 57 | "replace_dict needs to be a Python dict with " 58 | "all keys and values as string values" 59 | ) 60 | 61 | mapping = create_map( 62 | [lit(value) for value in chain(*replace_dict.items())] # type: ignore 63 | ) 64 | return dataframe.withColumn(column, coalesce(mapping[col(column)], col(column))) 65 | -------------------------------------------------------------------------------- /butterfree/extract/readers/__init__.py: -------------------------------------------------------------------------------- 1 | """The Reader Component of a Source.""" 2 | 3 | from butterfree.extract.readers.file_reader import FileReader 4 | from butterfree.extract.readers.kafka_reader import KafkaReader 5 | from butterfree.extract.readers.table_reader import TableReader 6 | 7 | __all__ = ["FileReader", "KafkaReader", "TableReader"] 8 | -------------------------------------------------------------------------------- /butterfree/extract/readers/table_reader.py: -------------------------------------------------------------------------------- 1 | """TableReader entity.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | from butterfree.clients import SparkClient 8 | from butterfree.extract.readers.reader import Reader 9 | 10 | 11 | class TableReader(Reader): 12 | """Responsible for getting data from tables registered in the metastore. 13 | 14 | Attributes: 15 | id: unique string id used to register the reader as a view on the metastore. 16 | database: name of the metastore database/schema. 17 | table: name of the table. 18 | 19 | Example: 20 | Simple example regarding TableReader class instantiation. 21 | 22 | >>> from butterfree.extract.readers import TableReader 23 | >>> from butterfree.clients import SparkClient 24 | >>> from butterfree.extract.pre_processing import filter 25 | >>> spark_client = SparkClient() 26 | >>> table_reader = TableReader( 27 | ... id="table_reader_id", 28 | ... database="table_reader_db", 29 | ... table="table_reader_table" 30 | ... ) 31 | >>> df = table_reader.consume(spark_client) 32 | 33 | This last method will use the Spark Client, by default, to read 34 | the desired table, loading data into a dataframe, according to 35 | TableReader class arguments. 36 | 37 | It's also possible to define simple transformations within the 38 | reader's scope: 39 | 40 | >>> table_reader.with_(filter, condition="year = 2019").build(spark_client) 41 | 42 | In this case, however, a temp view will be created, containing 43 | the transformed data. 44 | 45 | """ 46 | 47 | __name__ = "Table Reader" 48 | 49 | def __init__(self, id: str, table: str, database: Optional[str] = None): 50 | super().__init__(id) 51 | if not isinstance(table, str): 52 | raise ValueError( 53 | "table needs to be a string with the name of the registered table" 54 | ) 55 | self.database = database 56 | self.table = table 57 | 58 | def consume(self, client: SparkClient) -> DataFrame: 59 | """Extract data from a table in Spark metastore. 60 | 61 | Args: 62 | client: client responsible for connecting to Spark session.
63 | 64 | Returns: 65 | Dataframe with all the data from the table. 66 | 67 | """ 68 | return client.read_table(self.table, self.database) 69 | -------------------------------------------------------------------------------- /butterfree/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds Hooks definitions.""" 2 | 3 | from butterfree.hooks.hook import Hook 4 | from butterfree.hooks.hookable_component import HookableComponent 5 | 6 | __all__ = ["Hook", "HookableComponent"] 7 | -------------------------------------------------------------------------------- /butterfree/hooks/hook.py: -------------------------------------------------------------------------------- 1 | """Hook abstract class entity.""" 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | class Hook(ABC): 9 | """Definition of a hook function to call on a Dataframe.""" 10 | 11 | @abstractmethod 12 | def run(self, dataframe: DataFrame) -> DataFrame: 13 | """Run interface for Hook. 14 | 15 | Args: 16 | dataframe: dataframe to use in the Hook. 17 | 18 | Returns: 19 | dataframe result from the Hook. 20 | """ 21 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds Schema Compatibility Hooks definitions.""" 2 | 3 | from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa 4 | CassandraTableSchemaCompatibilityHook, 5 | ) 6 | from butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook import ( # noqa 7 | SparkTableSchemaCompatibilityHook, 8 | ) 9 | 10 | __all__ = ["SparkTableSchemaCompatibilityHook", "CassandraTableSchemaCompatibilityHook"] 11 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | """Cassandra table schema compatibility Hook definition.""" 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from butterfree.clients import CassandraClient 6 | from butterfree.constants import DataType 7 | from butterfree.hooks.hook import Hook 8 | 9 | 10 | class CassandraTableSchemaCompatibilityHook(Hook): 11 | """Hook to verify the schema compatibility with a Cassandra's table. 12 | 13 | Verifies if all columns presented on the dataframe exists and are the same 14 | type on the target Cassandra's table. 15 | 16 | Attributes: 17 | cassandra_client: client to connect to Cassandra DB. 18 | table: table name. 19 | """ 20 | 21 | def __init__(self, cassandra_client: CassandraClient, table: str): 22 | self.cassandra_client = cassandra_client 23 | self.table = table 24 | 25 | def run(self, dataframe: DataFrame) -> DataFrame: 26 | """Check the schema compatibility from a given Dataframe. 27 | 28 | This method does not change anything on the Dataframe. 29 | 30 | Args: 31 | dataframe: dataframe to verify schema compatibility. 32 | 33 | Returns: 34 | unchanged dataframe. 35 | 36 | Raises: 37 | ValueError if the schemas are incompatible. 
38 | """ 39 | table_schema = self.cassandra_client.get_schema(self.table) 40 | type_cassandra = [ 41 | type.cassandra 42 | for field_id in range(len(dataframe.schema.fieldNames())) 43 | for type in DataType 44 | if dataframe.schema.fields.__getitem__(field_id).dataType == type.spark 45 | ] 46 | schema = [ 47 | {"column_name": f"{column}", "type": f"{type}"} 48 | for column, type in zip(dataframe.columns, type_cassandra) 49 | ] 50 | 51 | if not all([column in table_schema for column in schema]): 52 | raise ValueError( 53 | "There's a schema incompatibility " 54 | "between the defined dataframe and the Cassandra table.\n" 55 | f"Dataframe schema = {schema}" 56 | f"Target table schema = {table_schema}" 57 | ) 58 | return dataframe 59 | -------------------------------------------------------------------------------- /butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | """Spark table schema compatibility Hook definition.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | 7 | from butterfree.clients import SparkClient 8 | from butterfree.hooks.hook import Hook 9 | 10 | 11 | class SparkTableSchemaCompatibilityHook(Hook): 12 | """Hook to verify the schema compatibility with a Spark's table. 13 | 14 | Verifies if all columns presented on the dataframe exists and are the same 15 | type on the target Spark's table. 16 | 17 | Attributes: 18 | spark_client: client to connect to Spark's metastore. 19 | table: table name. 20 | database: database name. 21 | """ 22 | 23 | def __init__( 24 | self, spark_client: SparkClient, table: str, database: Optional[str] = None 25 | ): 26 | self.spark_client = spark_client 27 | self.table_expression = (f"`{database}`." if database else "") + f"`{table}`" 28 | 29 | def run(self, dataframe: DataFrame) -> DataFrame: 30 | """Check the schema compatibility from a given Dataframe. 31 | 32 | This method does not change anything on the Dataframe. 33 | 34 | Args: 35 | dataframe: dataframe to verify schema compatibility. 36 | 37 | Returns: 38 | unchanged dataframe. 39 | 40 | Raises: 41 | ValueError if the schemas are incompatible. 
42 | """ 43 | table_schema = self.spark_client.conn.table(self.table_expression).schema 44 | if not all([column in table_schema for column in dataframe.schema]): 45 | raise ValueError( 46 | "The dataframe has a schema incompatible with the defined table.\n" 47 | f"Dataframe schema = {dataframe.schema}" 48 | f"Target table schema = {table_schema}" 49 | ) 50 | return dataframe 51 | -------------------------------------------------------------------------------- /butterfree/load/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds the Sink component of a feature set pipeline.""" 2 | 3 | from butterfree.load.sink import Sink 4 | 5 | __all__ = ["Sink"] 6 | -------------------------------------------------------------------------------- /butterfree/load/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """Pre Processing Components regarding Readers.""" 2 | 3 | from butterfree.load.processing.json_transform import json_transform 4 | 5 | __all__ = ["json_transform"] 6 | -------------------------------------------------------------------------------- /butterfree/load/processing/json_transform.py: -------------------------------------------------------------------------------- 1 | """Json conversion for writers.""" 2 | 3 | from pyspark.sql.dataframe import DataFrame 4 | from pyspark.sql.functions import struct, to_json 5 | 6 | 7 | def json_transform(dataframe: DataFrame) -> DataFrame: 8 | """Filters DataFrame's rows using the given condition and value. 9 | 10 | Args: 11 | dataframe: Spark DataFrame. 12 | 13 | Returns: 14 | Converted dataframe. 15 | """ 16 | return dataframe.select( 17 | to_json( 18 | struct([dataframe[column] for column in dataframe.columns]) # type: ignore 19 | ).alias("value") 20 | ) 21 | -------------------------------------------------------------------------------- /butterfree/load/writers/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds data loaders for historical and online feature store.""" 2 | 3 | from butterfree.load.writers.delta_feature_store_writer import DeltaFeatureStoreWriter 4 | from butterfree.load.writers.delta_writer import DeltaWriter 5 | from butterfree.load.writers.historical_feature_store_writer import ( 6 | HistoricalFeatureStoreWriter, 7 | ) 8 | from butterfree.load.writers.online_feature_store_writer import OnlineFeatureStoreWriter 9 | 10 | __all__ = [ 11 | "HistoricalFeatureStoreWriter", 12 | "OnlineFeatureStoreWriter", 13 | "DeltaWriter", 14 | "DeltaFeatureStoreWriter", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds available migrations.""" 2 | -------------------------------------------------------------------------------- /butterfree/migrations/database_migration/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds available database migrations.""" 2 | 3 | from butterfree.migrations.database_migration.cassandra_migration import ( 4 | CassandraMigration, 5 | ) 6 | from butterfree.migrations.database_migration.database_migration import Diff 7 | from butterfree.migrations.database_migration.metastore_migration import ( 8 | MetastoreMigration, 9 | ) 10 | 11 | __all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] 12 | 13 | 14 | ALLOWED_DATABASE = { 15 | 
"cassandra": CassandraMigration(), 16 | "metastore": MetastoreMigration(), 17 | } 18 | -------------------------------------------------------------------------------- /butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | """ETL Pipelines.""" 2 | 3 | from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline 4 | 5 | __all__ = ["FeatureSetPipeline"] 6 | -------------------------------------------------------------------------------- /butterfree/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """Reports module.""" 2 | 3 | from butterfree.reports.metadata import Metadata 4 | 5 | __all__ = ["Metadata"] 6 | -------------------------------------------------------------------------------- /butterfree/testing/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities to make testing of Butterfree tools easier.""" 2 | -------------------------------------------------------------------------------- /butterfree/transform/__init__.py: -------------------------------------------------------------------------------- 1 | """The Transform Component of a Feature Set.""" 2 | 3 | from butterfree.transform.feature_set import FeatureSet 4 | 5 | __all__ = ["FeatureSet"] 6 | -------------------------------------------------------------------------------- /butterfree/transform/features/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all feature types to be part of a FeatureSet.""" 2 | 3 | from butterfree.transform.features.feature import Feature 4 | from butterfree.transform.features.key_feature import KeyFeature 5 | from butterfree.transform.features.timestamp_feature import TimestampFeature 6 | 7 | __all__ = ["Feature", "KeyFeature", "TimestampFeature"] 8 | -------------------------------------------------------------------------------- /butterfree/transform/features/key_feature.py: -------------------------------------------------------------------------------- 1 | """KeyFeature entity.""" 2 | 3 | from typing import Optional 4 | 5 | from butterfree.constants.data_type import DataType 6 | from butterfree.transform.features.feature import Feature 7 | from butterfree.transform.transformations import TransformComponent 8 | 9 | 10 | class KeyFeature(Feature): 11 | """Defines a KeyFeature. 12 | 13 | A FeatureSet must contain one or more KeyFeatures, which will be used as 14 | keys when storing the feature set dataframe as tables. The Feature Set may 15 | validate keys are unique for the latest state of a feature set. 16 | 17 | Attributes: 18 | name: key name. 19 | Can be use by the transformation to derive multiple key columns. 20 | description: brief explanation regarding the key. 21 | dtype: data type for the output column of this key. 22 | from_column: original column to build a key. 23 | Used when there is transformation or the transformation has no 24 | reference about the column to use for. 25 | transformation: transformation that will be applied to create this key. 26 | Keys can be derived by transformations over any data column. Like a 27 | location hash based on latitude and longitude. 
28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | name: str, 34 | description: str, 35 | dtype: DataType, 36 | from_column: Optional[str] = None, 37 | transformation: Optional[TransformComponent] = None, 38 | ) -> None: 39 | super(KeyFeature, self).__init__( 40 | name=name, 41 | description=description, 42 | dtype=dtype, 43 | from_column=from_column, 44 | transformation=transformation, 45 | ) 46 | -------------------------------------------------------------------------------- /butterfree/transform/features/timestamp_feature.py: -------------------------------------------------------------------------------- 1 | """TimestampFeature entity.""" 2 | 3 | from typing import Optional 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import to_timestamp 7 | 8 | from butterfree.constants import DataType 9 | from butterfree.constants.columns import TIMESTAMP_COLUMN 10 | from butterfree.transform.features import Feature 11 | from butterfree.transform.transformations import TransformComponent 12 | 13 | 14 | class TimestampFeature(Feature): 15 | """Defines a TimestampFeature. 16 | 17 | A FeatureSet must contain one TimestampFeature, which will be used as a time 18 | tag for the state of all features. By containing a timestamp feature, users 19 | may time travel over their features. The Feature Set may validate that the 20 | set of keys and timestamp are unique for a feature set. 21 | 22 | By defining a TimestampColumn, the feature set will always contain a data 23 | column called "timestamp" of TimestampType (spark dtype). 24 | 25 | Attributes: 26 | from_column: original column to build a "timestamp" feature column. 27 | Used when there is transformation or the transformation has no 28 | reference about the column to use for. 29 | If from_column is None, the FeatureSet will assume the input 30 | dataframe already has a data column called "timestamp". 31 | transformation: transformation that will be applied to create the 32 | "timestamp". Type casting will already happen when no transformation 33 | is given. But a timestamp can be derived from multiple columns, like 34 | year, month and day, for example. The transformation must always 35 | handle naming and typing. 36 | from_ms: true if timestamp column presents milliseconds time unit. A 37 | conversion is then performed. 38 | mask: specified timestamp format by the user. 39 | 40 | """ 41 | 42 | def __init__( 43 | self, 44 | dtype: Optional[DataType] = DataType.TIMESTAMP, 45 | from_column: Optional[str] = None, 46 | transformation: Optional[TransformComponent] = None, 47 | from_ms: bool = False, 48 | mask: Optional[str] = None, 49 | ) -> None: 50 | description = "Time tag for the state of all features." 51 | super(TimestampFeature, self).__init__( 52 | name=TIMESTAMP_COLUMN, 53 | description=description, 54 | from_column=from_column, 55 | dtype=dtype, 56 | transformation=transformation, 57 | ) 58 | self.from_ms = from_ms 59 | self.mask = mask 60 | 61 | def transform(self, dataframe: DataFrame) -> DataFrame: 62 | """Performs a transformation to the feature pipeline. 63 | 64 | Args: 65 | dataframe: input dataframe for the transformation. 66 | 67 | Returns: 68 | Transformed dataframe. 
69 | """ 70 | column_name = self.from_column if self.from_column else self.name 71 | 72 | ts_column = dataframe[column_name] 73 | if self.from_ms: 74 | ts_column = ts_column / 1000 75 | 76 | dataframe = dataframe.withColumn( 77 | column_name, to_timestamp(ts_column, self.mask) # type: ignore 78 | ) 79 | 80 | return super().transform(dataframe) 81 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all transformations to be used by Features. 2 | 3 | A transformation must inherit from a TransformComponent and handle data modification, 4 | renaming and cast types using parent's (a Feature) information. 5 | """ 6 | 7 | from butterfree.transform.transformations.aggregated_transform import ( 8 | AggregatedTransform, 9 | ) 10 | from butterfree.transform.transformations.custom_transform import CustomTransform 11 | from butterfree.transform.transformations.spark_function_transform import ( 12 | SparkFunctionTransform, 13 | ) 14 | from butterfree.transform.transformations.sql_expression_transform import ( 15 | SQLExpressionTransform, 16 | ) 17 | from butterfree.transform.transformations.stack_transform import StackTransform 18 | from butterfree.transform.transformations.transform_component import TransformComponent 19 | 20 | __all__ = [ 21 | "AggregatedTransform", 22 | "CustomTransform", 23 | "SparkFunctionTransform", 24 | "SQLExpressionTransform", 25 | "StackTransform", 26 | "TransformComponent", 27 | ] 28 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/transform_component.py: -------------------------------------------------------------------------------- 1 | """Transform Abstract Class.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, List 5 | 6 | from pyspark.sql import DataFrame 7 | 8 | 9 | class TransformComponent(ABC): 10 | """Defines an abstract class for Transform entities. 11 | 12 | Attributes: 13 | parent: parent transform component. 14 | """ 15 | 16 | def __init__(self) -> None: 17 | self._parent = None 18 | 19 | @property 20 | def parent(self) -> Any: 21 | """Parent transform component.""" 22 | return self._parent 23 | 24 | @parent.setter 25 | def parent(self, parent: None) -> None: 26 | self._parent = parent 27 | 28 | @property 29 | @abstractmethod 30 | def output_columns(self) -> List[str]: 31 | """Columns generated by the transformation.""" 32 | 33 | @abstractmethod 34 | def transform(self, dataframe: DataFrame) -> DataFrame: 35 | """Performs a transformation to the feature pipeline. 36 | 37 | Args: 38 | dataframe: input dataframe. 39 | 40 | Returns: 41 | Transformed dataframe. 42 | """ 43 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds all transformations to be used by Features. 2 | 3 | A transformation must inherit from a TransformComponent and handle data modification, 4 | renaming and cast types using parent's (a Feature) information. 
5 | """ 6 | 7 | from butterfree.transform.transformations.user_defined_functions.mode import mode 8 | from butterfree.transform.transformations.user_defined_functions.most_frequent_set import ( # noqa 9 | most_frequent_set, 10 | ) 11 | 12 | __all__ = [ 13 | "mode", 14 | "most_frequent_set", 15 | ] 16 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/mode.py: -------------------------------------------------------------------------------- 1 | """Method to compute mode aggregation.""" 2 | 3 | import pandas as pd 4 | from pyspark.sql.functions import pandas_udf 5 | from pyspark.sql.types import StringType 6 | 7 | 8 | @pandas_udf(StringType()) # type: ignore 9 | def mode(column: pd.Series) -> str: 10 | """Computes a mode aggregation. 11 | 12 | Attributes: 13 | column: desired data to be aggregated with mode. 14 | 15 | Example: 16 | It's necessary to declare the desired aggregation method, (average, 17 | standard deviation and count are currently supported, as it can be 18 | seen in __ALLOWED_AGGREGATIONS) and, finally, define the mode. 19 | 20 | >>> from pyspark import SparkContext 21 | >>> from pyspark.sql import session, Window 22 | >>> from pyspark.sql.functions import pandas_udf 23 | >>> from butterfree.transform\ 24 | ... .transformations.user_defined_functions import (mode) 25 | >>> sc = SparkContext.getOrCreate() 26 | >>> spark = session.SparkSession(sc) 27 | >>> df = spark.createDataFrame( 28 | >>> [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)], 29 | >>> ("id", "column")) 30 | >>> df.groupby("id").agg(mode("column")).show() 31 | +---+------------+ 32 | | id|mode(column)| 33 | +---+------------+ 34 | | 1| 1| 35 | | 2| 2| 36 | +---+------------+ 37 | >>> w = Window.partitionBy('id').rowsBetween( 38 | ... Window.unboundedPreceding, Window.unboundedFollowing) 39 | >>> df.withColumn('most_viewed', mode("column").over(w)).show() 40 | +---+------+-----------+ 41 | | id|column|most_viewed| 42 | +---+------+-----------+ 43 | | 1| 1| 1| 44 | | 1| 1| 1| 45 | | 2| 2| 2| 46 | | 2| 1| 2| 47 | | 2| 2| 2| 48 | +---+------+-----------+ 49 | 50 | This example shows the mode aggregation. It's important to notice, 51 | however, that if we want to used in fixed_windows or row_windows mode, 52 | we'd need unbounded windows. For that reason, mode is meant to be used 53 | just in rolling_windows mode, initially. We intend to make it available 54 | to others modes soon. 55 | 56 | """ 57 | return str(column.mode()[0]) 58 | -------------------------------------------------------------------------------- /butterfree/transform/transformations/user_defined_functions/most_frequent_set.py: -------------------------------------------------------------------------------- 1 | """Method to compute most frequent set aggregation.""" 2 | 3 | from typing import Any 4 | 5 | import pandas as pd 6 | from pyspark.sql.functions import pandas_udf 7 | from pyspark.sql.types import ArrayType, StringType 8 | 9 | 10 | @pandas_udf(ArrayType(StringType())) # type: ignore 11 | def most_frequent_set(column: pd.Series) -> Any: 12 | """Computes the most frequent set aggregation. 13 | 14 | Attributes: 15 | column: desired data to be aggregated with most frequent set aggregation. 16 | 17 | Example: 18 | It's necessary to declare the desired aggregation method, (average, 19 | standard deviation and count are currently supported, as it can be 20 | seen in __ALLOWED_AGGREGATIONS) and define the most frequent set aggregation. 
21 | 22 | >>> from pyspark import SparkContext 23 | >>> from pyspark.sql import session, Window 24 | >>> from butterfree.transform\ 25 | ... .transformations.user_defined_functions import (most_frequent_set) 26 | >>> sc = SparkContext.getOrCreate() 27 | >>> spark = session.SparkSession(sc) 28 | >>> df = spark.createDataFrame( 29 | >>> [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2)], 30 | >>> ("id", "column")) 31 | >>> df.groupby("id").agg(most_frequent_set("column")).show() 32 | +---+-------------------------+ 33 | | id|most_frequent_set(column)| 34 | +---+-------------------------+ 35 | | 1| [1]| 36 | | 2| [2, 1]| 37 | +---+-------------------------+ 38 | >>> w = Window.partitionBy('id').rowsBetween( 39 | ... Window.unboundedPreceding, Window.unboundedFollowing) 40 | >>> df.withColumn( 41 | ... 'most_viewed', most_frequent_set("column").over(w) 42 | ... ).show() 43 | +---+------+-----------+ 44 | | id|column|most_viewed| 45 | +---+------+-----------+ 46 | | 1| 1| [1]| 47 | | 1| 1| [1]| 48 | | 2| 2| [2, 1]| 49 | | 2| 1| [2, 1]| 50 | | 2| 2| [2, 1]| 51 | +---+------+-----------+ 52 | 53 | This example shows the most frequent set aggregation. It returns a list with 54 | the most frequent values. It's important to notice, however, that if we want 55 | to use it in fixed_windows or row_windows mode, we'd need unbounded windows. 56 | For that reason, most_frequent_set is meant to be used only in rolling_windows 57 | mode, initially. We intend to make it available to other modes soon. 58 | 59 | """ 60 | return column.astype(str).value_counts().index.tolist() 61 | -------------------------------------------------------------------------------- /butterfree/transform/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """This module holds utils to be used by transformations.""" 2 | 3 | from butterfree.transform.utils.function import Function 4 | from butterfree.transform.utils.window_spec import Window 5 | 6 | __all__ = ["Window", "Function"] 7 | -------------------------------------------------------------------------------- /butterfree/transform/utils/date_range.py: -------------------------------------------------------------------------------- 1 | """Utils for date range generation.""" 2 | 3 | from datetime import datetime 4 | from typing import Optional, Union 5 | 6 | from pyspark.sql import DataFrame, functions 7 | 8 | from butterfree.clients import SparkClient 9 | from butterfree.constants import DataType 10 | from butterfree.constants.columns import TIMESTAMP_COLUMN 11 | 12 | 13 | def get_date_range( 14 | client: SparkClient, 15 | start_date: Union[str, datetime], 16 | end_date: Union[str, datetime], 17 | step: Optional[int] = None, 18 | ) -> DataFrame: 19 | """Create a date range dataframe. 20 | 21 | The dataframe returned by this method will contain a single column 22 | TIMESTAMP_COLUMN, of timestamp type, with dates between start and end. 23 | 24 | Args: 25 | client: a spark client. 26 | start_date: range beginning value (inclusive). 27 | end_date: range last value (exclusive). 28 | step: optional step, in seconds. 29 | 30 | Returns: 31 | A single column date range spark dataframe.
32 | """ 33 | day_in_seconds = 60 * 60 * 24 34 | step = step or day_in_seconds 35 | start_date = ( 36 | start_date if isinstance(start_date, str) else start_date.strftime("%Y-%m-%d") 37 | ) 38 | end_date = end_date if isinstance(end_date, str) else end_date.strftime("%Y-%m-%d") 39 | date_df = client.conn.createDataFrame( 40 | [(start_date, end_date)], ("start_date", "end_date") 41 | ).select( 42 | [ 43 | functions.col(c).cast(DataType.TIMESTAMP.spark).cast(DataType.BIGINT.spark) 44 | for c in ("start_date", "end_date") 45 | ] 46 | ) 47 | start_date, end_date = date_df.first() # type: ignore 48 | return client.conn.range( 49 | start_date, end_date + day_in_seconds, step # type: ignore 50 | ).select(functions.col("id").cast(DataType.TIMESTAMP.spark).alias(TIMESTAMP_COLUMN)) 51 | -------------------------------------------------------------------------------- /butterfree/transform/utils/function.py: -------------------------------------------------------------------------------- 1 | """Utils for custom or spark function to generation namedtuple.""" 2 | 3 | from typing import Callable 4 | 5 | from butterfree.constants import DataType 6 | 7 | 8 | class Function: 9 | """Define a class Function. 10 | 11 | Like a namedtuple: 12 | Function = namedtuple("Function", ["function", "data_type"]). 13 | 14 | Attributes: 15 | func: custom or spark functions, such as avg, std, count. 16 | For more information check spark functions: 17 | 'https://spark.apache.org/docs/2.3.1/api/python/_modules/pyspark/sql/functions.html' 18 | For custom functions, look the path: 19 | 'butterfree/transform/transformations/user_defined_functions'. 20 | data_type: data type for the output columns. 21 | """ 22 | 23 | def __init__(self, func: Callable, data_type: DataType): 24 | self.func = func 25 | self.data_type = data_type 26 | 27 | @property 28 | def func(self) -> Callable: 29 | """Function to be used in the transformation.""" 30 | return self._func 31 | 32 | @func.setter 33 | def func(self, value: Callable) -> None: 34 | """Definitions to be used in the transformation.""" 35 | if value is None: 36 | raise ValueError("Function must not be empty.") 37 | if callable(value) is False: 38 | raise TypeError("Function must be callable.") 39 | 40 | self._func = value 41 | 42 | @property 43 | def data_type(self) -> DataType: 44 | """Function to be used in the transformation.""" 45 | return self._data_type 46 | 47 | @data_type.setter 48 | def data_type(self, value: DataType) -> None: 49 | """Definitions to be used in the transformation.""" 50 | if not value: 51 | raise ValueError("DataType must not be empty.") 52 | if not isinstance(value, DataType): 53 | raise TypeError("Data type must be DataType.") 54 | 55 | self._data_type = value 56 | -------------------------------------------------------------------------------- /butterfree/validations/__init__.py: -------------------------------------------------------------------------------- 1 | """Holds dataframe validate for multiple destinations.""" 2 | 3 | from butterfree.validations.basic_validaton import BasicValidation 4 | 5 | __all__ = ["BasicValidation"] 6 | -------------------------------------------------------------------------------- /butterfree/validations/basic_validaton.py: -------------------------------------------------------------------------------- 1 | """Validation implementing basic checks over the dataframe.""" 2 | 3 | from typing import TYPE_CHECKING, Optional, Union 4 | 5 | if TYPE_CHECKING: 6 | from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame 
7 | 8 | from pyspark.sql.dataframe import DataFrame 9 | 10 | from butterfree.constants.columns import TIMESTAMP_COLUMN 11 | from butterfree.validations.validation import Validation 12 | 13 | 14 | class BasicValidation(Validation): 15 | """Basic validation suite for Feature Set's dataframe. 16 | 17 | Attributes: 18 | dataframe: object to be verified 19 | 20 | """ 21 | 22 | def __init__( 23 | self, dataframe: Optional[Union["ConnectDataFrame", DataFrame]] = None 24 | ): 25 | super().__init__(dataframe) 26 | 27 | def check(self) -> None: 28 | """Check basic validation properties about the dataframe. 29 | 30 | Raises: 31 | ValueError: if any of the verifications fail 32 | 33 | """ 34 | self.validate_column_ts() 35 | self.validate_df_is_empty() 36 | 37 | def validate_column_ts(self) -> None: 38 | """Check dataframe's ts column. 39 | 40 | Raises: 41 | ValueError: if dataframe don't have a column named ts. 42 | 43 | """ 44 | if not self.dataframe: 45 | raise ValueError("DataFrame can't be None.") 46 | if TIMESTAMP_COLUMN not in self.dataframe.columns: 47 | raise ValueError(f"DataFrame must have a '{TIMESTAMP_COLUMN}' column.") 48 | 49 | def _is_empty(self) -> bool: 50 | if hasattr(self.dataframe, "isEmpty"): 51 | # pyspark >= 3.4 52 | return self.dataframe.isEmpty() 53 | # pyspark < 3.4 54 | return self.dataframe.rdd.isEmpty() 55 | 56 | def validate_df_is_empty(self) -> None: 57 | """Check dataframe emptiness. 58 | 59 | Raises: 60 | ValueError: if dataframe is empty and is not streaming. 61 | 62 | """ 63 | 64 | if not self.dataframe: 65 | raise ValueError("DataFrame can't be None.") 66 | if (not self.dataframe.isStreaming) and self._is_empty(): 67 | raise ValueError("DataFrame can't be empty.") 68 | -------------------------------------------------------------------------------- /butterfree/validations/validation.py: -------------------------------------------------------------------------------- 1 | """Abstract Validation class.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | from pyspark.sql.dataframe import DataFrame 7 | 8 | 9 | class Validation(ABC): 10 | """Validate dataframe properties. 11 | 12 | Attributes: 13 | dataframe: data to be verified. 14 | 15 | """ 16 | 17 | def __init__(self, dataframe: Optional[DataFrame] = None): 18 | self.dataframe = dataframe 19 | 20 | def input(self, dataframe: DataFrame) -> "Validation": 21 | """Input a dataframe to check. 22 | 23 | Args: 24 | dataframe: data to check. 25 | 26 | """ 27 | self.dataframe = dataframe 28 | return self 29 | 30 | @abstractmethod 31 | def check(self) -> None: 32 | """Check validation properties about the dataframe. 33 | 34 | Raises: 35 | ValueError: if any of the verifications fail. 36 | 37 | """ 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark==0.6.0 2 | sphinx-rtd-theme==0.4.3 3 | sphinxemoji==0.1.6 4 | typing-extensions==3.7.4.2 5 | cmake==3.18.4 6 | h3==3.7.0 7 | pyarrow==16.1.0 8 | -------------------------------------------------------------------------------- /docs/source/butterfree.automated.rst: -------------------------------------------------------------------------------- 1 | butterfree.automated package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.automated.feature_set_creation 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. automodule:: butterfree.automated 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/butterfree.clients.rst: -------------------------------------------------------------------------------- 1 | butterfree.clients package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.clients.abstract\_client module 8 | ------------------------------------------ 9 | 10 | .. automodule:: butterfree.clients.abstract_client 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.clients.cassandra\_client module 16 | ------------------------------------------- 17 | 18 | .. automodule:: butterfree.clients.cassandra_client 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.clients.spark\_client module 24 | --------------------------------------- 25 | 26 | .. automodule:: butterfree.clients.spark_client 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. 
automodule:: butterfree.clients 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.configs.db.rst: -------------------------------------------------------------------------------- 1 | butterfree.configs.db package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.configs.db.abstract\_config module 8 | --------------------------------------------- 9 | 10 | .. automodule:: butterfree.configs.db.abstract_config 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.configs.db.cassandra\_config module 16 | ---------------------------------------------- 17 | 18 | .. automodule:: butterfree.configs.db.cassandra_config 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.configs.db.kafka\_config module 24 | ------------------------------------------ 25 | 26 | .. automodule:: butterfree.configs.db.kafka_config 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.configs.db.metastore\_config module 32 | ---------------------------------------------- 33 | 34 | .. automodule:: butterfree.configs.db.metastore_config 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: butterfree.configs.db 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/butterfree.configs.rst: -------------------------------------------------------------------------------- 1 | butterfree.configs package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.configs.db 11 | 12 | Submodules 13 | ---------- 14 | 15 | butterfree.configs.environment module 16 | ------------------------------------- 17 | 18 | .. automodule:: butterfree.configs.environment 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.configs.logger module 24 | -------------------------------- 25 | 26 | .. automodule:: butterfree.configs.logger 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.configs 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.constants.rst: -------------------------------------------------------------------------------- 1 | butterfree.constants package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.constants.columns module 8 | ----------------------------------- 9 | 10 | ..
automodule:: butterfree.constants.columns 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.constants.data\_type module 16 | -------------------------------------- 17 | 18 | .. automodule:: butterfree.constants.data_type 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.constants.migrations module 24 | -------------------------------------- 25 | 26 | .. automodule:: butterfree.constants.migrations 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.constants.spark\_constants module 32 | -------------------------------------------- 33 | 34 | .. automodule:: butterfree.constants.spark_constants 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.constants.window\_definitions module 40 | ----------------------------------------------- 41 | 42 | .. automodule:: butterfree.constants.window_definitions 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: butterfree.constants 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/source/butterfree.dataframe_service.rst: -------------------------------------------------------------------------------- 1 | butterfree.dataframe\_service package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.dataframe_service.incremental_strategy 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.dataframe_service.partitioning 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | 20 | .. automodule:: butterfree.dataframe_service.repartition 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Module contents 26 | --------------- 27 | 28 | ..
automodule:: butterfree.dataframe_service 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.pre_processing.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract.pre\_processing package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.extract.pre\_processing.explode\_json\_column\_transform module 8 | -------------------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.extract.pre_processing.explode_json_column_transform 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.extract.pre\_processing.filter\_transform module 16 | ----------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.extract.pre_processing.filter_transform 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.extract.pre\_processing.forward\_fill\_transform module 24 | ------------------------------------------------------------------ 25 | 26 | .. automodule:: butterfree.extract.pre_processing.forward_fill_transform 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.extract.pre\_processing.pivot\_transform module 32 | ---------------------------------------------------------- 33 | 34 | .. automodule:: butterfree.extract.pre_processing.pivot_transform 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.extract.pre\_processing.replace\_transform module 40 | ------------------------------------------------------------ 41 | 42 | .. automodule:: butterfree.extract.pre_processing.replace_transform 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: butterfree.extract.pre_processing 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.readers.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract.readers package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.extract.readers.file\_reader module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: butterfree.extract.readers.file_reader 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.extract.readers.kafka\_reader module 16 | ----------------------------------------------- 17 | 18 | .. automodule:: butterfree.extract.readers.kafka_reader 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.extract.readers.reader module 24 | ---------------------------------------- 25 | 26 | .. automodule:: butterfree.extract.readers.reader 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.extract.readers.table\_reader module 32 | ----------------------------------------------- 33 | 34 | .. automodule:: butterfree.extract.readers.table_reader 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: butterfree.extract.readers 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/butterfree.extract.rst: -------------------------------------------------------------------------------- 1 | butterfree.extract package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.extract.pre_processing 11 | butterfree.extract.readers 12 | 13 | Submodules 14 | ---------- 15 | 16 | butterfree.extract.source module 17 | -------------------------------- 18 | 19 | .. automodule:: butterfree.extract.source 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: butterfree.extract 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/butterfree.hooks.rst: -------------------------------------------------------------------------------- 1 | butterfree.hooks package 2 | ======================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.hooks.schema_compatibility 11 | 12 | Submodules 13 | ---------- 14 | 15 | 16 | .. automodule:: butterfree.hooks.hook 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | 22 | .. automodule:: butterfree.hooks.hookable_component 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | Module contents 28 | --------------- 29 | 30 | .. automodule:: butterfree.hooks 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | -------------------------------------------------------------------------------- /docs/source/butterfree.hooks.schema_compatibility.rst: -------------------------------------------------------------------------------- 1 | butterfree.hooks.schema\_compatibility package 2 | ============================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | Module contents 20 | --------------- 21 | 22 | .. automodule:: butterfree.hooks.schema_compatibility 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.processing.rst: -------------------------------------------------------------------------------- 1 | butterfree.load.processing package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.load.processing.json\_transform module 8 | ------------------------------------------------- 9 | 10 | .. automodule:: butterfree.load.processing.json_transform 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: butterfree.load.processing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.rst: -------------------------------------------------------------------------------- 1 | butterfree.load package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.load.processing 11 | butterfree.load.writers 12 | 13 | Submodules 14 | ---------- 15 | 16 | butterfree.load.sink module 17 | --------------------------- 18 | 19 | .. automodule:: butterfree.load.sink 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: butterfree.load 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/butterfree.load.writers.rst: -------------------------------------------------------------------------------- 1 | butterfree.load.writers package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.load.writers.historical\_feature\_store\_writer module 8 | ----------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.load.writers.historical_feature_store_writer 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.load.writers.online\_feature\_store\_writer module 16 | ------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.load.writers.online_feature_store_writer 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.load.writers.writer module 24 | ------------------------------------- 25 | 26 | .. automodule:: butterfree.load.writers.writer 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.load.writers 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.migrations.database_migration.rst: -------------------------------------------------------------------------------- 1 | butterfree.migrations.database\_migration package 2 | ================================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | 8 | .. automodule:: butterfree.migrations.database_migration.cassandra_migration 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | 14 | .. automodule:: butterfree.migrations.database_migration.database_migration 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | 20 | .. automodule:: butterfree.migrations.database_migration.metastore_migration 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Module contents 26 | --------------- 27 | 28 | .. automodule:: butterfree.migrations.database_migration 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.migrations.rst: -------------------------------------------------------------------------------- 1 | butterfree.migrations package 2 | ============================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.migrations.database_migration 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. 
automodule:: butterfree.migrations 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/butterfree.pipelines.rst: -------------------------------------------------------------------------------- 1 | butterfree.pipelines package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.pipelines.feature\_set\_pipeline module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: butterfree.pipelines.feature_set_pipeline 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: butterfree.pipelines 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.reports.rst: -------------------------------------------------------------------------------- 1 | butterfree.reports package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.reports.metadata module 8 | ---------------------------------- 9 | 10 | .. automodule:: butterfree.reports.metadata 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: butterfree.reports 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/butterfree.rst: -------------------------------------------------------------------------------- 1 | butterfree package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.automated 11 | butterfree.clients 12 | butterfree.configs 13 | butterfree.constants 14 | butterfree.dataframe_service 15 | butterfree.extract 16 | butterfree.hooks 17 | butterfree.load 18 | butterfree.migrations 19 | butterfree.pipelines 20 | butterfree.reports 21 | butterfree.testing 22 | butterfree.transform 23 | butterfree.validations 24 | 25 | Module contents 26 | --------------- 27 | 28 | .. automodule:: butterfree 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/butterfree.testing.dataframe.rst: -------------------------------------------------------------------------------- 1 | butterfree.testing.dataframe package 2 | ==================================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: butterfree.testing.dataframe 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/source/butterfree.testing.rst: -------------------------------------------------------------------------------- 1 | butterfree.testing package 2 | ========================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.testing.dataframe 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. 
automodule:: butterfree.testing 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.features.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.features package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.features.feature module 8 | -------------------------------------------- 9 | 10 | .. automodule:: butterfree.transform.features.feature 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.features.key\_feature module 16 | ------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.features.key_feature 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.features.timestamp\_feature module 24 | ------------------------------------------------------- 25 | 26 | .. automodule:: butterfree.transform.features.timestamp_feature 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.transform.features 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform package 2 | ============================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.transform.features 11 | butterfree.transform.transformations 12 | butterfree.transform.utils 13 | 14 | Submodules 15 | ---------- 16 | 17 | butterfree.transform.aggregated\_feature\_set module 18 | ---------------------------------------------------- 19 | 20 | .. automodule:: butterfree.transform.aggregated_feature_set 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | butterfree.transform.feature\_set module 26 | ---------------------------------------- 27 | 28 | .. automodule:: butterfree.transform.feature_set 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | Module contents 34 | --------------- 35 | 36 | .. automodule:: butterfree.transform 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.transformations.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.transformations package 2 | ============================================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | butterfree.transform.transformations.user_defined_functions 11 | 12 | Submodules 13 | ---------- 14 | 15 | butterfree.transform.transformations.aggregated\_transform module 16 | ----------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.transformations.aggregated_transform 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.transformations.custom\_transform module 24 | ------------------------------------------------------------- 25 | 26 | .. 
automodule:: butterfree.transform.transformations.custom_transform 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | butterfree.transform.transformations.h3\_transform module 32 | --------------------------------------------------------- 33 | 34 | .. automodule:: butterfree.transform.transformations.h3_transform 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | butterfree.transform.transformations.spark\_function\_transform module 40 | ---------------------------------------------------------------------- 41 | 42 | .. automodule:: butterfree.transform.transformations.spark_function_transform 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | butterfree.transform.transformations.sql\_expression\_transform module 48 | ---------------------------------------------------------------------- 49 | 50 | .. automodule:: butterfree.transform.transformations.sql_expression_transform 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | butterfree.transform.transformations.stack\_transform module 56 | ------------------------------------------------------------ 57 | 58 | .. automodule:: butterfree.transform.transformations.stack_transform 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | butterfree.transform.transformations.transform\_component module 64 | ---------------------------------------------------------------- 65 | 66 | .. automodule:: butterfree.transform.transformations.transform_component 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: butterfree.transform.transformations 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.transformations.user_defined_functions.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.transformations.user\_defined\_functions package 2 | ===================================================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.transformations.user\_defined\_functions.mode module 8 | ------------------------------------------------------------------------- 9 | 10 | .. automodule:: butterfree.transform.transformations.user_defined_functions.mode 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.transformations.user\_defined\_functions.most\_frequent\_set module 16 | ---------------------------------------------------------------------------------------- 17 | 18 | .. automodule:: butterfree.transform.transformations.user_defined_functions.most_frequent_set 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: butterfree.transform.transformations.user_defined_functions 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/butterfree.transform.utils.rst: -------------------------------------------------------------------------------- 1 | butterfree.transform.utils package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.transform.utils.date\_range module 8 | --------------------------------------------- 9 | 10 | .. 
automodule:: butterfree.transform.utils.date_range 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.transform.utils.function module 16 | ------------------------------------------ 17 | 18 | .. automodule:: butterfree.transform.utils.function 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | butterfree.transform.utils.window\_spec module 24 | ---------------------------------------------- 25 | 26 | .. automodule:: butterfree.transform.utils.window_spec 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: butterfree.transform.utils 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/butterfree.validations.rst: -------------------------------------------------------------------------------- 1 | butterfree.validations package 2 | ============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | butterfree.validations.basic\_validaton module 8 | ---------------------------------------------- 9 | 10 | .. automodule:: butterfree.validations.basic_validaton 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | butterfree.validations.validation module 16 | ---------------------------------------- 17 | 18 | .. automodule:: butterfree.validations.validation 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: butterfree.validations 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/cli.md: -------------------------------------------------------------------------------- 1 | # Command-line Interface (CLI) 2 | 3 | Butterfree now has a command-line interface, introduced with the new automatic migration ability. 4 | 5 | As soon as you install butterfree, you can check what's available through butterfree's CLI with: 6 | 7 | ```shell 8 | $~ butterfree --help 9 | ``` 10 | 11 | ### Automated Database Schema Migration 12 | 13 | When developing your feature sets, you also need to prepare your database for the changes 14 | coming into your Feature Store. Normally, when creating a new feature set, you had 15 | to manually create a new table in Cassandra, and when adding a new feature to an existing 16 | feature set, you had to create a new column in Cassandra as well. 17 | 18 | Now, you can just use `butterfree migrate apply ...`: butterfree will scan your Python 19 | files, looking for classes that inherit from `butterfree.pipelines.FeatureSetPipeline`, 20 | then compare their schemas with the database schema where each feature set would be written. 21 | It will then prepare migration queries and run them against the databases. 22 | 23 | For more information, please check `butterfree migrate apply --help` :) 24 | 25 | ### Supported databases 26 | 27 | This functionality currently supports only the **Cassandra** database, which is the default 28 | storage for an Online Feature Store built with Butterfree. Nonetheless, it was made with 29 | the intent to be easily extended to other databases. 30 | 31 | Also, each database has its own rules for schema migration commands. Some changes may 32 | still require manual intervention.
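
As a quick starting point, you can explore the migration commands and their options straight from the built-in help (a short sketch; the exact options may vary between versions):

```shell
$~ butterfree migrate --help
$~ butterfree migrate apply --help
```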
-------------------------------------------------------------------------------- /docs/source/extract.md: -------------------------------------------------------------------------------- 1 | # Source 2 | 3 | Regarding the extract step, we can define a ```Source``` as a set of data sources whose raw data is joined together for the transform step. 4 | 5 | Currently, we support three different data sources or, as they're called within ```Butterfree```, ```readers```: 6 | 7 | * ```FileReader```: this reader loads data from a file, as the name suggests, and returns a dataframe. It can be instantiated as: 8 | 9 | ```python 10 | file_reader = FileReader( 11 | id="file_reader_id", 12 | path="data_path", 13 | format="json" 14 | ) 15 | ``` 16 | 17 | * ```TableReader```: this reader loads data from a table registered in the spark metastore and returns a dataframe. It can be instantiated as: 18 | 19 | ```python 20 | table_reader = TableReader( 21 | id="table_reader_id", 22 | database="table_reader_db", 23 | table="table_reader_table" 24 | ) 25 | ``` 26 | 27 | * ```KafkaReader```: this reader loads data from a kafka topic and returns a dataframe. It can be instantiated as: 28 | 29 | ```python 30 | kafka_reader = KafkaReader( 31 | id="kafka_reader_id", 32 | topic="topic", 33 | value_schema=value_schema, 34 | connection_string="host1:port,host2:port", 35 | ) 36 | ``` 37 | 38 | After defining all your data sources, it's important to write a query that defines the relation between them, something like this: 39 | 40 | ```python 41 | source = Source( 42 | readers=[ 43 | TableReader( 44 | id="table_reader_id", 45 | database="table_reader_db", 46 | table="table_reader_table", 47 | ), 48 | FileReader(id="file_reader_id", path="data_sample_path", format="json"), 49 | ], 50 | query=f"select a.*, b.feature2 " 51 | f"from table_reader_id a " 52 | f"inner join file_reader_id b on a.id = b.id ", 53 | ) 54 | ``` 55 | 56 | It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/master/butterfree/extract/pre_processing). -------------------------------------------------------------------------------- /docs/source/getstart.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Butterfree depends on **Python 3.7+** and it is **Spark 3.0 ready**. 4 | 5 | The [Python Package Index](https://quintoandar.github.io/python-package-server/) hosts a reference to a pip-installable module of this library; using it is as straightforward as including it in your project's requirements. 6 | 7 | ```bash 8 | pip install butterfree 9 | ``` 10 | 11 | Or after listing `butterfree` in your `requirements.txt` file: 12 | 13 | ```bash 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | ## Discovering Butterfree 18 | 19 | Welcome to **Discovering Butterfree** tutorial series!
Click on the following links to open the tutorials: 20 | 21 | **[#1 Feature Set Basics](https://github.com/quintoandar/butterfree/blob/master/examples/simple_feature_set/simple_feature_set.ipynb)** 22 | 23 | **[#2 Spark Functions and Window](https://github.com/quintoandar/butterfree/blob/master/examples/spark_function_and_window/spark_function_and_window.ipynb)** 24 | 25 | **[#3 Aggregated Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/aggregated_feature_set/aggregated_feature_set.ipynb)** 26 | 27 | **[#4 Streaming Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/streaming_feature_set/streaming_feature_set.ipynb)** -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Butterfree Docs 2 | =============== 3 | Made with |:heart:| by the **MLOps** team from `QuintoAndar `_. 4 | 5 | The library is centered on the following concepts: 6 | 7 | - **ETL**: central framework to create data pipelines. Spark-based Extract, Transform and Load modules ready to use. 8 | - **Declarative Feature Engineering**: care about what you want to compute and not how to code it. 9 | - **Feature Store Modeling**: the library easily provides everything you need to process and load data to your Feature Store. 10 | 11 | Navigation 12 | ^^^^^^^^^^ 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | home 18 | getstart 19 | extract 20 | transform 21 | load 22 | stream 23 | configuration 24 | modules 25 | cli 26 | -------------------------------------------------------------------------------- /docs/source/load.md: -------------------------------------------------------------------------------- 1 | # Sink 2 | 3 | The Load step is handled by the `Sink`, where we define the destinations for the feature set pipeline; that is, it is the process of recording the transformed data after the transformation step. 4 | 5 | Declaring the sink: 6 | ```python 7 | sink = Sink( 8 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 9 | ) 10 | ``` 11 | 12 | Currently, you can write your data into two types of `writers`: 13 | 14 | * `HistoricalFeatureStoreWriter`: The Historical Feature Store will write the data to an AWS S3 bucket. 15 | 16 | * `OnlineFeatureStoreWriter`: The Online Feature Store will write the data to a Cassandra database. 17 | 18 | If you declare your writers without a database configuration, they will use their default settings. But we can also define this configuration, such as: 19 | 20 | * `HistoricalFeatureStoreWriter`: 21 | ```python 22 | config = S3Config(bucket="my_bucket", mode="append", format_="parquet") 23 | writers = [HistoricalFeatureStoreWriter(db_config=config)] 24 | ``` 25 | 26 | * `OnlineFeatureStoreWriter`: 27 | ```python 28 | config = CassandraConfig( 29 | mode="overwrite", 30 | format_="org.apache.spark.sql.cassandra", 31 | keyspace="keyspace_name" 32 | ) 33 | writers = [OnlineFeatureStoreWriter(db_config=config)] 34 | ``` 35 | 36 | You can see the writers [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/load/writers) and database configuration [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/configs/db).
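
Putting the snippets above together, a complete sink declaration might look like the sketch below (the import paths are an assumption based on the package layout; adjust them to your version):

```python
from butterfree.configs.db import CassandraConfig
from butterfree.load import Sink
from butterfree.load.writers import (
    HistoricalFeatureStoreWriter,
    OnlineFeatureStoreWriter,
)

# Explicit Cassandra configuration for the online writer (same values as above).
cassandra_config = CassandraConfig(
    mode="overwrite",
    format_="org.apache.spark.sql.cassandra",
    keyspace="keyspace_name",
)

# The historical writer keeps its default settings; the online writer uses the config.
sink = Sink(
    writers=[
        HistoricalFeatureStoreWriter(),
        OnlineFeatureStoreWriter(db_config=cassandra_config),
    ]
)
```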
37 | 38 | It's also important to highlight that our writers support a ```debug_mode``` option: 39 | ```python 40 | writers = [HistoricalFeatureStoreWriter(debug_mode=True), OnlineFeatureStoreWriter(debug_mode=True)] 41 | sink = Sink(writers=writers) 42 | ``` 43 | When ```debug_mode``` is set to ```True```, a temporary view will be created instead, so no data will actually be saved to either the historical or the online feature store. Feel free to check our [examples section](https://github.com/quintoandar/butterfree/tree/staging/examples) to learn more about how to use this mode. -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | API Specification 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | butterfree 8 | -------------------------------------------------------------------------------- /docs/source/stream.md: -------------------------------------------------------------------------------- 1 | # Streaming Feature Sets in Butterfree 2 | 3 | ## Introduction 4 | 5 | Spark enables us to deal with streaming processing in a very powerful way. For an introduction to all of Spark's streaming capabilities, you can read more at this [link](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html). Like core Spark, Butterfree also lets you declare pipelines that deal with streaming data. The best part is that the pipeline declaration is almost the same as for batch use-cases, so it isn't too complex to tackle this type of challenge using Butterfree tools. 6 | 7 | Streaming feature sets are the ones that have at least one streaming source of data declared in the `Readers` of a `FeatureSetPipeline`. The pipeline is considered a streaming job if it has at least one reader in streaming mode (`stream=True`). 8 | 9 | ## Readers 10 | 11 | Using readers in streaming mode will make use of Spark's `readStream` API instead of the normal `read`. That means it will produce a stream dataframe (`df.isStreaming == True`) instead of a normal Spark dataframe. 12 | 13 | The currently supported readers in stream mode are `FileReader` and `KafkaReader`. For more information about their specifications, read their docstrings, [here](https://github.com/quintoandar/butterfree/blob/master/butterfree/extract/readers/file_reader.py#L10) and [here](https://github.com/quintoandar/butterfree/blob/master/butterfree/extract/readers/kafka_reader.py#L12) respectively. 14 | 15 | ## Online Feature Store Writer 16 | `OnlineFeatureStoreWriter` is currently the only writer that supports streaming dataframes. It will write, in real time, performing upserts to Cassandra. It uses `df.writeStream` and the `foreachBatch` Spark functionality to do that. You can read more about `foreachBatch` [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch). 17 | 18 | ![](https://i.imgur.com/KoI1HuC.png) 19 | 20 | 21 | ### Debug Mode 22 | You can use the `OnlineFeatureStoreWriter` in debug mode (`debug_mode=True`) with streaming dataframes. Instead of trying to write to Cassandra, the data will be written to an in-memory table, so you can query this table to show the output as it is being calculated. Normally this functionality is used to test whether the defined features produce the expected results in real time.
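
A rough sketch of the streaming-specific pieces looks like this (parameter names and import paths are assumptions; check the docstrings linked above for the exact signatures):

```python
from butterfree.extract.readers import FileReader
from butterfree.load.writers import OnlineFeatureStoreWriter

# A reader in streaming mode: having at least one of these makes the whole
# pipeline a streaming job (Spark file streams usually also need an explicit schema).
stream_reader = FileReader(
    id="events",
    path="events/",
    format="json",
    stream=True,
)

# Writes the resulting stream with foreachBatch; debug_mode redirects the
# output to an in-memory table instead of Cassandra.
online_writer = OnlineFeatureStoreWriter(debug_mode=True)
```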
23 | 24 | ## Pipeline Run 25 | Differently from a batch run, a pipeline running with a streaming dataframe will not "finish to run". The pipeline will continue to get data from the streaming, process the data and save it to the defined sink sources. So when managing a job using this feature, an operation needs to be designed to support a continuously-up streaming job. -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Discovering Butterfree 2 | 3 | Welcome to **Discovering Butterfree** tutorial series! Click on the following links to open the tutorials: 4 | 5 | **[#1 Feature Set Basics](https://github.com/quintoandar/butterfree/blob/master/examples/simple_feature_set/simple_feature_set.ipynb)** 6 | 7 | **[#2 Spark Functions and Window](https://github.com/quintoandar/butterfree/blob/master/examples/spark_function_and_window/spark_function_and_window.ipynb)** 8 | 9 | **[#3 Aggregated Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/aggregated_feature_set/aggregated_feature_set.ipynb)** 10 | 11 | **[#4 Streaming Feature Set](https://github.com/quintoandar/butterfree/blob/master/examples/streaming_feature_set/streaming_feature_set.ipynb)** 12 | -------------------------------------------------------------------------------- /examples/data/listing_events.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "rent": 1300, "region_id": 1, "area": 50, "bedrooms": 1, "bathrooms": 1, "timestamp": 1588302000000} 2 | {"id": 1, "rent": 2000, "region_id": 1, "area": 50, "bedrooms": 1, "bathrooms": 1, "timestamp": 1588647600000} 3 | {"id": 2, "rent": 1500, "region_id": 2, "area": 100, "bedrooms": 2, "bathrooms": 1, "timestamp": 1588734000000} 4 | {"id": 2, "rent": 2500, "region_id": 2, "area": 100, "bedrooms": 2, "bathrooms": 1, "timestamp": 1589252400000} 5 | {"id": 3, "rent": 3000, "region_id": 3, "area": 150, "bedrooms": 2, "bathrooms": 2, "timestamp": 1589943600000} 6 | {"id": 4, "rent": 3200, "region_id": 4, "area": 175, "bedrooms": 2, "bathrooms": 2, "timestamp": 1589943600000} 7 | {"id": 5, "rent": 3200, "region_id": 5, "area": 250, "bedrooms": 3, "bathrooms": 3, "timestamp": 1590030000000} 8 | {"id": 6, "rent": 3200, "region_id": 6, "area": 225, "bedrooms": 2, "bathrooms": 3, "timestamp": 1590116400000} 9 | -------------------------------------------------------------------------------- /examples/data/region.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "city": "Cerulean", "lat": 73.44489, "lng": 31.75030, "region": "Kanto"} 2 | {"id": 2, "city": "Veridian", "lat": -9.43510, "lng": -167.11772, "region": "Kanto"} 3 | {"id": 3, "city": "Cinnabar", "lat": 29.73043, "lng": 117.66164, "region": "Kanto"} 4 | {"id": 4, "city": "Pallet", "lat": -52.95717, "lng": -81.15251, "region": "Kanto"} 5 | {"id": 5, "city": "Violet", "lat": -47.35798, "lng": -178.77255, "region": "Johto"} 6 | {"id": 6, "city": "Olivine", "lat": 51.72820, "lng": 46.21958, "region": "Johto"} -------------------------------------------------------------------------------- /examples/streaming_feature_set/events/20582255.json: -------------------------------------------------------------------------------- 1 | {"id": 0, "timestamp": 20582255, "payload": "{\"id_pokemon\": 1, \"pokeball\": \"Ultra\"}"} 
-------------------------------------------------------------------------------- /examples/streaming_feature_set/pokedex.json: -------------------------------------------------------------------------------- 1 | {"id": 1, "name": "Geodude", "type": "Rock"} 2 | {"id": 2, "name": "Bulbasaur", "type": "Grass"} 3 | {"id": 3, "name": "Pikachu", "type": "Electric"} 4 | {"id": 4, "name": "Eevee", "type": "Normal"} 5 | {"id": 5, "name": "Oddish", "type": "Grass"} 6 | {"id": 6, "name": "Magikarp", "type": "Water"} 7 | -------------------------------------------------------------------------------- /examples/test_examples.py: -------------------------------------------------------------------------------- 1 | """Script to test all notebooks under examples/ folder.""" 2 | 3 | import os 4 | from pathlib import Path 5 | from subprocess import PIPE, Popen # noqa S404 6 | 7 | if __name__ == "__main__": 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | example_notebook_paths = [ 10 | str(path) 11 | for path in list(Path(dir_path).rglob("*.ipynb")) 12 | if ".ipynb_checkpoints" not in str(path) 13 | ] 14 | 15 | print("\n>>> Notebook Examples Tests") 16 | errors = [] 17 | for path in example_notebook_paths: 18 | print(f" >>> Running {path}") 19 | 20 | p = Popen( # noqa S607, S603 21 | [ 22 | "jupyter", 23 | "nbconvert", 24 | "--to", 25 | "notebook", 26 | "--inplace", 27 | "--no-prompt", 28 | "--execute", 29 | "--log-level='ERROR'", 30 | path, 31 | ], 32 | stdout=PIPE, 33 | stderr=PIPE, 34 | ) 35 | 36 | _, error = p.communicate() 37 | if p.returncode != 0: 38 | errors.append({"notebook": path, "error": error}) 39 | print(" >>> Error in execution!\n") 40 | else: 41 | print(" >>> Successful execution\n") 42 | 43 | if errors: 44 | print(">>> Errors in the following notebooks:") 45 | for run in errors: 46 | print("\n >>>", run["notebook"]) 47 | print(run["error"].decode("utf-8")) 48 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.9 3 | ignore_missing_imports = True 4 | disallow_untyped_calls = False 5 | disallow_untyped_defs = True 6 | disallow_incomplete_defs = True 7 | warn_redundant_casts = True 8 | show_error_codes = True 9 | show_error_context = True 10 | disable_error_code = attr-defined, list-item, operator 11 | pretty = True 12 | 13 | [mypy-butterfree.pipelines.*] 14 | ignore_errors = True 15 | 16 | [mypy-butterfree.load.*] 17 | ignore_errors = True 18 | 19 | [mypy-butterfree.transform.*] 20 | ignore_errors = True 21 | 22 | [mypy-butterfree.extract.*] 23 | ignore_errors = True 24 | 25 | [mypy-butterfree.config.*] 26 | ignore_errors = True 27 | 28 | [mypy-butterfree.clients.*] 29 | ignore_errors = True 30 | 31 | [mypy-butterfree.configs.*] 32 | ignore_errors = True 33 | 34 | [mypy-butterfree.dataframe_service.*] 35 | ignore_errors = True 36 | 37 | [mypy-butterfree.validations.*] 38 | ignore_errors = True 39 | 40 | [mypy-butterfree.migrations.*] 41 | ignore_errors = True 42 | 43 | [mypy-butterfree.testing.*] 44 | ignore_errors = True 45 | 46 | [mypy-butterfree.hooks.*] 47 | ignore_errors = True 48 | 49 | [mypy-butterfree._cli.*] 50 | ignore_errors = True 51 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "packages": { 3 | ".": { 4 | "changelog-path": "CHANGELOG.md", 5 | 
"release-type": "python", 6 | "bump-minor-pre-major": false, 7 | "bump-patch-for-minor-pre-major": false, 8 | "include-component-in-tag": false, 9 | "include-v-in-tag": false, 10 | "extra-files": [ 11 | "setup.py", 12 | "butterfree/__init__.py" 13 | ] 14 | } 15 | }, 16 | "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json" 17 | } 18 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | h3==3.7.7 2 | jupyter==1.0.0 3 | twine==3.1.1 4 | mypy==1.10.0 5 | sphinx==6.2.1 6 | sphinxemoji==0.3.1 7 | sphinx-rtd-theme==1.3.0 8 | recommonmark==0.7.1 9 | pyarrow==16.1.0 10 | setuptools==70.0.0 11 | wheel==0.43.0 12 | -------------------------------------------------------------------------------- /requirements.lint.txt: -------------------------------------------------------------------------------- 1 | black==24.3.0 2 | flake8==4.0.1 3 | flake8-isort==4.1.1 4 | flake8-docstrings==1.5.0 5 | flake8-bugbear==20.1.0 6 | flake8-bandit==2.1.2 7 | bandit==1.7.2 8 | -------------------------------------------------------------------------------- /requirements.test.txt: -------------------------------------------------------------------------------- 1 | pytest==5.3.2 2 | pytest-cov==2.8.1 3 | pytest-xdist==1.31.0 4 | pytest-mock==2.0.0 5 | pytest-spark==0.6.0 6 | pyspark[connect]==3.5.1 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cassandra-driver==3.24.0 2 | mdutils>=1.2.2,<2.0 3 | pandas>=0.24,<2.0 4 | parameters-validation>=1.1.5,<2.0 5 | pyspark==3.5.1 6 | typer==0.4.2 7 | typing-extensions>3.7.4,<5 8 | boto3==1.35.* 9 | numpy==1.26.4 10 | delta-spark==3.2.0 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | docstring-convention = google 3 | max-line-length = 88 4 | max-complexity = 12 5 | ignore = W503, E203, D203, D401, D107, S101, D105, D100, W605, D202, D212, D104, E261 6 | exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* 7 | per-file-ignores = 8 | # We will not check for docstrings or the use of asserts in tests 9 | tests/*:D,S101 10 | setup.py:D,S101 11 | 12 | [isort] 13 | profile = black 14 | line_length = 88 15 | known_first_party = butterfree 16 | default_section = THIRDPARTY 17 | multi_line_output = 3 18 | indent = ' ' 19 | skip_glob = pip 20 | include_trailing_comma = True 21 | 22 | [tool:pytest] 23 | spark_options = 24 | spark.sql.session.timeZone: UTC 25 | spark.driver.bindAddress: 127.0.0.1 26 | spark.sql.legacy.timeParserPolicy: LEGACY 27 | spark.sql.legacy.createHiveTableByDefault: false 28 | 29 | [mypy] 30 | # suppress errors about unsatisfied imports 31 | ignore_missing_imports=True 32 | 33 | # be strict 34 | warn_return_any = True 35 | strict_optional = True 36 | warn_no_return = True 37 | warn_redundant_casts = True 38 | warn_unused_ignores = True 39 | disallow_any_generics = True 40 | 41 | disallow_untyped_defs = True 42 | check_untyped_defs = True 43 | disallow_untyped_calls = True 44 | 45 | [build_sphinx] 46 | all-files = 1 47 | source-dir = docs/source 48 | build-dir = docs/build 49 | warning-is-error = 0 50 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | __package_name__ = "butterfree" 4 | __version__ = "1.7.2" # x-release-please-version 5 | __repository_url__ = "https://github.com/quintoandar/butterfree" 6 | 7 | with open("requirements.txt") as f: 8 | requirements = f.read().splitlines() 9 | 10 | with open("README.md") as f: 11 | long_description = f.read() 12 | 13 | setup( 14 | name=__package_name__, 15 | description="A tool for building feature stores - Transform your raw data " 16 | "into beautiful features.", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | keywords="feature store sets ETL", 20 | version=__version__, 21 | url=__repository_url__, 22 | packages=find_packages( 23 | exclude=( 24 | "docs", 25 | "tests", 26 | "tests.*", 27 | "pipenv", 28 | "env", 29 | "examples", 30 | "htmlcov", 31 | ".pytest_cache", 32 | ) 33 | ), 34 | license="Copyright", 35 | author="QuintoAndar", 36 | install_requires=requirements, 37 | extras_require={"h3": ["h3>=3.7.4,<4"]}, 38 | python_requires=">=3.9, <4", 39 | entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, 40 | include_package_data=True, 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | INPUT_PATH = path.join(path.dirname(path.abspath(__file__)), "input") 4 | OUTPUT_PATH = path.join(path.dirname(path.abspath(__file__)), "output") 5 | -------------------------------------------------------------------------------- /tests/integration/butterfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/extract/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture() 7 | def target_df_table_reader(spark_context, spark_session): 8 | data = [ 9 | {"id": 1, "feature1": 100}, 10 | {"id": 2, "feature1": 200}, 11 | {"id": 3, "feature1": 300}, 12 | {"id": 4, "feature1": 400}, 13 | {"id": 5, "feature1": 500}, 14 | {"id": 6, "feature1": 600}, 15 | ] 16 | return spark_session.read.json(spark_context.parallelize(data, 1)) 17 | 18 | 19 | @pytest.fixture() 20 | def target_df_source(spark_context, spark_session): 21 | data = [ 22 | {"id": 1, "feature1": 100, "feature2": 200}, 23 | {"id": 2, "feature1": 200, "feature2": 400}, 24 | {"id": 3, "feature1": 300, "feature2": 600}, 
25 | {"id": 4, "feature1": 400, "feature2": 800}, 26 | {"id": 5, "feature1": 500, "feature2": 1000}, 27 | {"id": 6, "feature1": 600, "feature2": 1200}, 28 | ] 29 | return spark_session.read.json(spark_context.parallelize(data, 1)) 30 | 31 | 32 | @pytest.fixture() 33 | def spark_client_mock(): 34 | return Mock() 35 | -------------------------------------------------------------------------------- /tests/integration/butterfree/extract/test_source.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from butterfree.clients import SparkClient 6 | from butterfree.extract import Source 7 | from butterfree.extract.readers import FileReader, TableReader 8 | from tests.integration import INPUT_PATH 9 | 10 | 11 | def create_temp_view(dataframe: DataFrame, name): 12 | dataframe.createOrReplaceTempView(name) 13 | 14 | 15 | def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): 16 | spark.sql(f"drop schema if exists {table_reader_db} cascade") 17 | spark.sql(f"create database {table_reader_db}") 18 | spark.sql(f"use {table_reader_db}") 19 | spark.sql( 20 | f"create table {table_reader_db}.{table_reader_table} " # noqa 21 | f"as select * from {table_reader_id}" # noqa 22 | ) 23 | 24 | 25 | def compare_dataframes( 26 | actual_df: DataFrame, expected_df: DataFrame, columns_sort: List[str] = None 27 | ): 28 | if not columns_sort: 29 | columns_sort = actual_df.schema.fieldNames() 30 | return sorted(actual_df.select(*columns_sort).collect()) == sorted( 31 | expected_df.select(*columns_sort).collect() 32 | ) 33 | 34 | 35 | class TestSource: 36 | def test_source( 37 | self, 38 | target_df_source, 39 | target_df_table_reader, 40 | spark_session, 41 | ): 42 | # given 43 | spark_client = SparkClient() 44 | 45 | table_reader_id = "a_test_source" 46 | table_reader_db = "db" 47 | table_reader_table = "table_test_source" 48 | 49 | create_temp_view(dataframe=target_df_table_reader, name=table_reader_id) 50 | create_db_and_table( 51 | spark=spark_session, 52 | table_reader_id=table_reader_id, 53 | table_reader_db=table_reader_db, 54 | table_reader_table=table_reader_table, 55 | ) 56 | 57 | file_reader_id = "b_test_source" 58 | data_sample_path = INPUT_PATH + "/data.json" 59 | 60 | # when 61 | source = Source( 62 | readers=[ 63 | TableReader( 64 | id=table_reader_id, 65 | database=table_reader_db, 66 | table=table_reader_table, 67 | ), 68 | FileReader(id=file_reader_id, path=data_sample_path, format="json"), 69 | ], 70 | query=f"select a.*, b.feature2 " # noqa 71 | f"from {table_reader_id} a " # noqa 72 | f"inner join {file_reader_id} b on a.id = b.id ", # noqa 73 | eager_evaluation=False, 74 | ) 75 | 76 | result_df = source.construct(client=spark_client) 77 | target_df = target_df_source 78 | 79 | # then 80 | assert ( 81 | compare_dataframes( 82 | actual_df=result_df, 83 | expected_df=target_df, 84 | columns_sort=result_df.columns, 85 | ) 86 | is True 87 | ) 88 | -------------------------------------------------------------------------------- /tests/integration/butterfree/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/load/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/load/conftest.py: 
-------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | from butterfree.constants import DataType, columns 4 | from butterfree.transform import FeatureSet 5 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 6 | 7 | 8 | @fixture 9 | def input_dataframe(spark_context, spark_session): 10 | data = [ 11 | { 12 | "id": 1, 13 | "timestamp": "2019-12-01", 14 | "feature": 100, 15 | columns.PARTITION_YEAR: 2019, 16 | columns.PARTITION_MONTH: 12, 17 | columns.PARTITION_DAY: 1, 18 | }, 19 | { 20 | "id": 2, 21 | "timestamp": "2020-01-01", 22 | "feature": 200, 23 | columns.PARTITION_YEAR: 2020, 24 | columns.PARTITION_MONTH: 1, 25 | columns.PARTITION_DAY: 1, 26 | }, 27 | { 28 | "id": 1, 29 | "timestamp": "2020-02-01", 30 | "feature": 110, 31 | columns.PARTITION_YEAR: 2020, 32 | columns.PARTITION_MONTH: 2, 33 | columns.PARTITION_DAY: 1, 34 | }, 35 | { 36 | "id": 1, 37 | "timestamp": "2020-02-02", 38 | "feature": 120, 39 | columns.PARTITION_YEAR: 2020, 40 | columns.PARTITION_MONTH: 2, 41 | columns.PARTITION_DAY: 2, 42 | }, 43 | ] 44 | return spark_session.read.json(spark_context.parallelize(data, 1)) 45 | 46 | 47 | @fixture 48 | def feature_set(): 49 | key_features = [ 50 | KeyFeature(name="id", description="Description", dtype=DataType.INTEGER) 51 | ] 52 | ts_feature = TimestampFeature(from_column="timestamp") 53 | features = [ 54 | Feature(name="feature", description="Description", dtype=DataType.INTEGER), 55 | ] 56 | return FeatureSet( 57 | "test_sink_feature_set", 58 | "test_sink_entity", 59 | "description", 60 | keys=key_features, 61 | timestamp=ts_feature, 62 | features=features, 63 | ) 64 | -------------------------------------------------------------------------------- /tests/integration/butterfree/load/test_sink.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from unittest.mock import Mock 3 | 4 | from butterfree.clients import SparkClient 5 | from butterfree.load import Sink 6 | from butterfree.load.writers import ( 7 | HistoricalFeatureStoreWriter, 8 | OnlineFeatureStoreWriter, 9 | ) 10 | 11 | 12 | def test_sink(input_dataframe, feature_set): 13 | # arrange 14 | client = SparkClient() 15 | client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") 16 | feature_set_df = feature_set.construct(input_dataframe, client) 17 | target_latest_df = OnlineFeatureStoreWriter.filter_latest( 18 | feature_set_df, id_columns=[key.name for key in feature_set.keys] 19 | ) 20 | columns_sort = feature_set_df.schema.fieldNames() 21 | 22 | # setup historical writer 23 | s3config = Mock() 24 | s3config.mode = "overwrite" 25 | s3config.format_ = "parquet" 26 | s3config.get_options = Mock( 27 | return_value={ 28 | "path": "test_folder/historical/entity/feature_set", 29 | "mode": "overwrite", 30 | } 31 | ) 32 | s3config.get_path_with_partitions = Mock( 33 | return_value="spark-warehouse/test.db/test_folder/historical/entity/feature_set" 34 | ) 35 | 36 | historical_writer = HistoricalFeatureStoreWriter( 37 | db_config=s3config, interval_mode=True 38 | ) 39 | 40 | # setup online writer 41 | # TODO: Change for CassandraConfig when Cassandra for test is ready 42 | online_config = Mock() 43 | online_config.mode = "overwrite" 44 | online_config.format_ = "parquet" 45 | online_config.get_options = Mock( 46 | return_value={"path": "test_folder/online/entity/feature_set"} 47 | ) 48 | online_writer = OnlineFeatureStoreWriter(db_config=online_config) 49 | 50 | 
writers = [historical_writer, online_writer] 51 | sink = Sink(writers) 52 | 53 | # act 54 | client.sql("CREATE DATABASE IF NOT EXISTS {}".format(historical_writer.database)) 55 | sink.flush(feature_set, feature_set_df, client) 56 | 57 | # get historical results 58 | historical_result_df = client.read( 59 | s3config.format_, 60 | path=s3config.get_path_with_partitions(feature_set.name, feature_set_df), 61 | ) 62 | 63 | # get online results 64 | online_result_df = client.read( 65 | online_config.format_, **online_config.get_options(feature_set.name) 66 | ) 67 | 68 | # assert 69 | # assert historical results 70 | assert sorted(feature_set_df.select(*columns_sort).collect()) == sorted( 71 | historical_result_df.select(*columns_sort).collect() 72 | ) 73 | 74 | # assert online results 75 | assert sorted(target_latest_df.select(*columns_sort).collect()) == sorted( 76 | online_result_df.select(*columns_sort).collect() 77 | ) 78 | 79 | # tear down 80 | shutil.rmtree("test_folder") 81 | -------------------------------------------------------------------------------- /tests/integration/butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/pipelines/__init__.py -------------------------------------------------------------------------------- /tests/integration/butterfree/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/integration/butterfree/transform/__init__.py -------------------------------------------------------------------------------- /tests/integration/input/data.json: -------------------------------------------------------------------------------- 1 | {"feature2":200,"id":1} 2 | {"feature2":400,"id":2} 3 | {"feature2":600,"id":3} 4 | {"feature2":800,"id":4} 5 | {"feature2":1000,"id":5} 6 | {"feature2":1200,"id":6} 7 | -------------------------------------------------------------------------------- /tests/mocks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/entities/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/first/__init__.py: -------------------------------------------------------------------------------- 1 | from .first_pipeline import FirstPipeline 2 | 3 | __all__ = ["FirstPipeline"] 4 | -------------------------------------------------------------------------------- /tests/mocks/entities/first/first_pipeline.py: -------------------------------------------------------------------------------- 1 | from butterfree.constants.data_type import DataType 2 | from butterfree.extract import Source 3 | from butterfree.extract.readers import TableReader 4 | from butterfree.load import Sink 5 | from butterfree.load.writers import ( 6 | HistoricalFeatureStoreWriter, 7 | OnlineFeatureStoreWriter, 8 | ) 9 | from 
butterfree.pipelines import FeatureSetPipeline 10 | from butterfree.transform import FeatureSet 11 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 12 | 13 | 14 | class FirstPipeline(FeatureSetPipeline): 15 | def __init__(self): 16 | super(FirstPipeline, self).__init__( 17 | source=Source( 18 | readers=[ 19 | TableReader( 20 | id="t", 21 | database="db", 22 | table="table", 23 | ) 24 | ], 25 | query=f"select * from t", # noqa 26 | ), 27 | feature_set=FeatureSet( 28 | name="first", 29 | entity="entity", 30 | description="description", 31 | features=[ 32 | Feature( 33 | name="feature1", 34 | description="test", 35 | dtype=DataType.FLOAT, 36 | ), 37 | Feature( 38 | name="feature2", 39 | description="another test", 40 | dtype=DataType.STRING, 41 | ), 42 | ], 43 | keys=[ 44 | KeyFeature( 45 | name="id", 46 | description="identifier", 47 | dtype=DataType.BIGINT, 48 | ) 49 | ], 50 | timestamp=TimestampFeature(), 51 | ), 52 | sink=Sink( 53 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /tests/mocks/entities/second/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/mocks/entities/second/__init__.py -------------------------------------------------------------------------------- /tests/mocks/entities/second/deeper/__init__.py: -------------------------------------------------------------------------------- 1 | from .second_pipeline import SecondPipeline 2 | 3 | __all__ = ["SecondPipeline"] 4 | -------------------------------------------------------------------------------- /tests/mocks/entities/second/deeper/second_pipeline.py: -------------------------------------------------------------------------------- 1 | from butterfree.constants.data_type import DataType 2 | from butterfree.extract import Source 3 | from butterfree.extract.readers import TableReader 4 | from butterfree.load import Sink 5 | from butterfree.load.writers import ( 6 | HistoricalFeatureStoreWriter, 7 | OnlineFeatureStoreWriter, 8 | ) 9 | from butterfree.pipelines import FeatureSetPipeline 10 | from butterfree.transform import FeatureSet 11 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 12 | 13 | 14 | class SecondPipeline(FeatureSetPipeline): 15 | def __init__(self): 16 | super(SecondPipeline, self).__init__( 17 | source=Source( 18 | readers=[ 19 | TableReader( 20 | id="t", 21 | database="db", 22 | table="table", 23 | ) 24 | ], 25 | query=f"select * from t", # noqa 26 | ), 27 | feature_set=FeatureSet( 28 | name="second", 29 | entity="entity", 30 | description="description", 31 | features=[ 32 | Feature( 33 | name="feature1", 34 | description="test", 35 | dtype=DataType.STRING, 36 | ), 37 | Feature( 38 | name="feature2", 39 | description="another test", 40 | dtype=DataType.FLOAT, 41 | ), 42 | ], 43 | keys=[ 44 | KeyFeature( 45 | name="id", 46 | description="identifier", 47 | dtype=DataType.BIGINT, 48 | ) 49 | ], 50 | timestamp=TimestampFeature(), 51 | ), 52 | sink=Sink( 53 | writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/_cli/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/_cli/test_migrate.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import call 2 | 3 | from typer.testing import CliRunner 4 | 5 | from butterfree._cli import migrate 6 | from butterfree._cli.main import app 7 | from butterfree.migrations.database_migration import CassandraMigration 8 | from butterfree.pipelines import FeatureSetPipeline 9 | 10 | runner = CliRunner() 11 | 12 | 13 | class TestMigrate: 14 | def test_migrate_success(self, mocker): 15 | mocker.patch.object(migrate.Migrate, "run") 16 | all_fs = migrate.migrate("tests/mocks/entities/") 17 | assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) 18 | assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] 19 | 20 | def test_migrate_run_methods(self, mocker): 21 | mocker.patch.object(CassandraMigration, "apply_migration") 22 | mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") 23 | 24 | all_fs = migrate.migrate("tests/mocks/entities/", False, False) 25 | 26 | assert CassandraMigration.apply_migration.call_count == 2 27 | 28 | cassandra_pairs = [ 29 | call(pipe.feature_set, pipe.sink.writers[1], False) for pipe in all_fs 30 | ] 31 | CassandraMigration.apply_migration.assert_has_calls( 32 | cassandra_pairs, any_order=True 33 | ) 34 | migrate.Migrate._send_logs_to_s3.assert_called_once() 35 | 36 | def test_app_cli(self): 37 | result = runner.invoke(app, "migrate") 38 | assert result.exit_code == 0 39 | 40 | def test_app_migrate(self, mocker): 41 | mocker.patch.object(migrate.Migrate, "run") 42 | result = runner.invoke(app, ["migrate", "apply", "tests/mocks/entities/"]) 43 | assert result.exit_code == 0 44 | -------------------------------------------------------------------------------- /tests/unit/butterfree/automated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/automated/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/automated/test_feature_set_creation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from butterfree.automated.feature_set_creation import FeatureSetCreation 5 | 6 | 7 | class TestFeatureSetCreation(unittest.TestCase): 8 | def setUp(self): 9 | self.feature_set_creation = FeatureSetCreation() 10 | 11 | def test_get_features_with_regex(self): 12 | sql_query = "SELECT column1, column2 FROM table1" 13 | 
expected_features = ["column1", "column2"] 14 | 15 | features = self.feature_set_creation._get_features_with_regex(sql_query) 16 | 17 | self.assertEqual(features, expected_features) 18 | 19 | def test_get_data_type(self): 20 | field_name = "column1" 21 | df_mock = MagicMock() 22 | df_mock.schema.jsonValue.return_value = { 23 | "fields": [{"name": "column1", "type": "string"}] 24 | } 25 | 26 | data_type = self.feature_set_creation._get_data_type(field_name, df_mock) 27 | 28 | self.assertEqual(data_type, ".STRING") 29 | -------------------------------------------------------------------------------- /tests/unit/butterfree/clients/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/clients/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/clients/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | from unittest.mock import Mock 3 | 4 | import pytest 5 | from pyspark import SparkContext 6 | from pyspark.sql import DataFrame, SparkSession 7 | from pyspark.sql.streaming import StreamingQuery 8 | 9 | from butterfree.clients import CassandraClient 10 | 11 | 12 | @pytest.fixture() 13 | def target_df(spark_context: SparkContext, spark_session: SparkSession) -> DataFrame: 14 | data = [{"col1": "value", "col2": 123}] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) # type: ignore 16 | 17 | 18 | @pytest.fixture() 19 | def mocked_spark_read() -> Mock: 20 | mock = Mock() 21 | mock.readStream = mock 22 | mock.read = mock 23 | mock.format.return_value = mock 24 | mock.options.return_value = mock 25 | return mock 26 | 27 | 28 | @pytest.fixture() 29 | def mocked_spark_write() -> Mock: 30 | mock = Mock() 31 | mock.dataframe = mock 32 | mock.write = mock 33 | return mock 34 | 35 | 36 | @pytest.fixture() 37 | def mocked_stream_df() -> Mock: 38 | mock = Mock() 39 | mock.isStreaming = True 40 | mock.writeStream = mock 41 | mock.trigger.return_value = mock 42 | mock.outputMode.return_value = mock 43 | mock.option.return_value = mock 44 | mock.foreachBatch.return_value = mock 45 | mock.start.return_value = Mock(spec=StreamingQuery) 46 | return mock 47 | 48 | 49 | @pytest.fixture() 50 | def mock_spark_sql() -> Mock: 51 | mock = Mock() 52 | mock.sql = mock 53 | return mock 54 | 55 | 56 | @pytest.fixture 57 | def cassandra_client() -> CassandraClient: 58 | return CassandraClient(host=["mock"], keyspace="dummy_keyspace") 59 | 60 | 61 | @pytest.fixture 62 | def cassandra_feature_set() -> List[Dict[str, Any]]: 63 | return [ 64 | {"feature1": "value1", "feature2": 10.5}, 65 | {"feature1": "value1", "feature2": 10}, 66 | ] 67 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/configs/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/configs/db/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | from butterfree.configs.db import CassandraConfig, KafkaConfig, MetastoreConfig 4 | 5 | 6 | @fixture 7 | def cassandra_config(monkeypatch): 8 | monkeypatch.setenv("CASSANDRA_KEYSPACE", "test") 9 | monkeypatch.setenv("CASSANDRA_HOST", "test") 10 | monkeypatch.setenv("CASSANDRA_PASSWORD", "test") 11 | monkeypatch.setenv("CASSANDRA_USERNAME", "test") 12 | 13 | return CassandraConfig() 14 | 15 | 16 | @fixture 17 | def kafka_config(monkeypatch): 18 | monkeypatch.setenv("KAFKA_CONNECTION_STRING", "test") 19 | 20 | return KafkaConfig() 21 | 22 | 23 | @fixture 24 | def metastore_config(monkeypatch): 25 | monkeypatch.setenv("FEATURE_STORE_S3_BUCKET", "test") 26 | 27 | return MetastoreConfig() 28 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/db/test_metastore_config.py: -------------------------------------------------------------------------------- 1 | from butterfree.configs import environment 2 | 3 | 4 | class TestMetastoreConfig: 5 | def test_mode(self, metastore_config): 6 | # expecting 7 | default = "overwrite" 8 | assert metastore_config.mode == default 9 | 10 | # given 11 | metastore_config.mode = None 12 | # then 13 | assert metastore_config.mode == default 14 | 15 | def test_mode_custom(self, metastore_config): 16 | # given 17 | mode = "append" 18 | metastore_config.mode = mode 19 | 20 | # then 21 | assert metastore_config.mode == mode 22 | 23 | def test_format(self, metastore_config): 24 | # expecting 25 | default = "parquet" 26 | assert metastore_config.format_ == default 27 | 28 | # given 29 | metastore_config.format_ = None 30 | # then 31 | assert metastore_config.format_ == default 32 | 33 | def test_format_custom(self, metastore_config): 34 | # given 35 | format_ = "json" 36 | metastore_config.format_ = format_ 37 | 38 | # then 39 | assert metastore_config.format_ == format_ 40 | 41 | def test_path(self, metastore_config): 42 | # expecting 43 | default = environment.get_variable("FEATURE_STORE_S3_BUCKET") 44 | assert metastore_config.path == default 45 | 46 | def test_path_custom(self, metastore_config): 47 | # given 48 | bucket = "test" 49 | metastore_config.path = bucket 50 | 51 | # then 52 | assert metastore_config.path == bucket 53 | 54 | def test_file_system(self, metastore_config): 55 | # expecting 56 | default = "s3a" 57 | assert metastore_config.file_system == default 58 | 59 | def test_file_system_custom(self, metastore_config): 60 | # given 61 | file_system = "dbfs" 62 | metastore_config.file_system = file_system 63 | 64 | # then 65 | assert metastore_config.file_system == file_system 66 | -------------------------------------------------------------------------------- /tests/unit/butterfree/configs/test_environment.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.configs import environment 4 | 5 | 6 | def test_get_variable_success(monkeypatch): 7 | # given 8 | specified_variable = "specified_variable" 9 | effective_value = "effective_value" 10 | monkeypatch.setenv(specified_variable, effective_value) 11 | 
environment.specification[specified_variable] = "spec_default_value" 12 | 13 | # when 14 | return_value = environment.get_variable(specified_variable, "anything") 15 | 16 | # then 17 | assert return_value == effective_value 18 | 19 | 20 | def test_get_variable_from_spec_default(monkeypatch): 21 | # given 22 | specified_variable = "specified_variable" 23 | spec_default_value = "default_value" 24 | monkeypatch.setenv(specified_variable, "overwrite") 25 | monkeypatch.delenv(specified_variable) 26 | environment.specification[specified_variable] = spec_default_value 27 | 28 | # when 29 | return_value = environment.get_variable(specified_variable, "anything") 30 | 31 | # then 32 | assert return_value == spec_default_value 33 | 34 | 35 | def test_get_variable_default(monkeypatch): 36 | # given 37 | default = "default_value" 38 | variable = "environment_variable" 39 | environment.specification[variable] = None 40 | monkeypatch.setenv(variable, "overwrite") 41 | monkeypatch.delenv(variable) 42 | 43 | # when 44 | return_value = environment.get_variable(variable, default) 45 | 46 | # then 47 | assert return_value == default 48 | 49 | 50 | def test_get_variable_out_of_spec_fails(monkeypatch): 51 | # given 52 | not_specified_variable = "not_specified_variable" 53 | monkeypatch.setenv(not_specified_variable, "anything") 54 | if not_specified_variable in environment.specification: 55 | del environment.specification[not_specified_variable] 56 | 57 | # then 58 | with pytest.raises( 59 | environment.UnspecifiedVariableError, match="not listed in the environment" 60 | ): 61 | environment.get_variable(not_specified_variable, "anything") 62 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/dataframe_service/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/conftest.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture() 8 | def input_df(spark_context, spark_session): 9 | start = datetime.datetime(year=1970, month=1, day=1) 10 | end = datetime.datetime(year=2020, month=12, day=31) 11 | random_dates = [ 12 | ( 13 | lambda: start 14 | + datetime.timedelta( 15 | seconds=random.randint( # noqa: S311 16 | 0, int((end - start).total_seconds()) 17 | ) 18 | ) 19 | )() 20 | .date() 21 | .isoformat() 22 | for _ in range(10000) 23 | ] 24 | data = [{"timestamp": date} for date in random_dates] 25 | return spark_session.read.json( 26 | spark_context.parallelize(data, 1), schema="timestamp timestamp" 27 | ) 28 | 29 | 30 | @pytest.fixture() 31 | def test_partitioning_input_df(spark_context, spark_session): 32 | data = [ 33 | {"feature": 1, "year": 2009, "month": 8, "day": 20}, 34 | {"feature": 2, "year": 2009, "month": 8, "day": 20}, 35 | {"feature": 3, "year": 2020, "month": 8, "day": 20}, 36 | {"feature": 4, "year": 2020, "month": 9, "day": 20}, 37 | {"feature": 5, "year": 2020, "month": 9, "day": 20}, 38 | {"feature": 6, "year": 2020, "month": 8, "day": 20}, 39 | {"feature": 7, "year": 2020, "month": 8, "day": 21}, 40 | ] 41 | return spark_session.read.json(spark_context.parallelize(data, 1)) 42 | 
-------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_incremental_srategy.py: -------------------------------------------------------------------------------- 1 | from butterfree.dataframe_service import IncrementalStrategy 2 | 3 | 4 | class TestIncrementalStrategy: 5 | def test_from_milliseconds(self): 6 | # arrange 7 | incremental_strategy = IncrementalStrategy().from_milliseconds("ts") 8 | target_expression = "date(from_unixtime(ts/ 1000.0)) >= date('2020-01-01')" 9 | 10 | # act 11 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 12 | 13 | # assert 14 | assert target_expression.split() == result_expression.split() 15 | 16 | def test_from_string(self): 17 | # arrange 18 | incremental_strategy = IncrementalStrategy().from_string( 19 | "dt", mask="dd/MM/yyyy" 20 | ) 21 | target_expression = "date(to_date(dt, 'dd/MM/yyyy')) >= date('2020-01-01')" 22 | 23 | # act 24 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 25 | 26 | # assert 27 | assert target_expression.split() == result_expression.split() 28 | 29 | def test_from_year_month_day_partitions(self): 30 | # arrange 31 | incremental_strategy = IncrementalStrategy().from_year_month_day_partitions( 32 | year_column="y", month_column="m", day_column="d" 33 | ) 34 | target_expression = ( 35 | "date(concat(string(y), " 36 | "'-', string(m), " 37 | "'-', string(d))) >= date('2020-01-01')" 38 | ) 39 | 40 | # act 41 | result_expression = incremental_strategy.get_expression(start_date="2020-01-01") 42 | 43 | # assert 44 | assert target_expression.split() == result_expression.split() 45 | 46 | def test_get_expression_with_just_end_date(self): 47 | # arrange 48 | incremental_strategy = IncrementalStrategy(column="dt") 49 | target_expression = "date(dt) <= date('2020-01-01')" 50 | 51 | # act 52 | result_expression = incremental_strategy.get_expression(end_date="2020-01-01") 53 | 54 | # assert 55 | assert target_expression.split() == result_expression.split() 56 | 57 | def test_get_expression_with_start_and_end_date(self): 58 | # arrange 59 | incremental_strategy = IncrementalStrategy(column="dt") 60 | target_expression = ( 61 | "date(dt) >= date('2019-12-30') and date(dt) <= date('2020-01-01')" 62 | ) 63 | 64 | # act 65 | result_expression = incremental_strategy.get_expression( 66 | start_date="2019-12-30", end_date="2020-01-01" 67 | ) 68 | 69 | # assert 70 | assert target_expression.split() == result_expression.split() 71 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_partitioning.py: -------------------------------------------------------------------------------- 1 | from butterfree.dataframe_service import extract_partition_values 2 | 3 | 4 | class TestPartitioning: 5 | def test_extract_partition_values(self, test_partitioning_input_df): 6 | # arrange 7 | target_values = [ 8 | {"year": 2009, "month": 8, "day": 20}, 9 | {"year": 2020, "month": 8, "day": 20}, 10 | {"year": 2020, "month": 9, "day": 20}, 11 | {"year": 2020, "month": 8, "day": 21}, 12 | ] 13 | 14 | # act 15 | result_values = extract_partition_values( 16 | test_partitioning_input_df, partition_columns=["year", "month", "day"] 17 | ) 18 | 19 | # assert 20 | assert result_values == target_values 21 | -------------------------------------------------------------------------------- /tests/unit/butterfree/dataframe_service/test_repartition.py: 
-------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import spark_partition_id 2 | 3 | from butterfree.dataframe_service import repartition_df, repartition_sort_df 4 | 5 | 6 | class TestRepartition: 7 | def test_repartition_df(self, input_df): 8 | result_df = repartition_df(dataframe=input_df, partition_by=["timestamp"]) 9 | 10 | # Only one partition id, meaning data is not partitioned 11 | assert input_df.select(spark_partition_id()).distinct().count() == 1 12 | # Desired number of partitions 13 | assert result_df.select(spark_partition_id()).distinct().count() == 200 14 | 15 | def test_repartition_df_partitions(self, input_df): 16 | result_df = repartition_df( 17 | dataframe=input_df, partition_by=["timestamp"], num_partitions=50 18 | ) 19 | 20 | # Only one partition id, meaning data is not partitioned 21 | assert input_df.select(spark_partition_id()).distinct().count() == 1 22 | # Desired number of partitions 23 | assert result_df.select(spark_partition_id()).distinct().count() == 50 24 | 25 | def test_repartition_sort_df(self, input_df): 26 | result_df = repartition_sort_df( 27 | dataframe=input_df, partition_by=["timestamp"], order_by=["timestamp"] 28 | ) 29 | 30 | # Only one partition id, meaning data is not partitioned 31 | assert input_df.select(spark_partition_id()).distinct().count() == 1 32 | # Desired number of partitions 33 | assert result_df.select(spark_partition_id()).distinct().count() == 200 34 | 35 | def test_repartition_sort_df_processors(self, input_df): 36 | result_df = repartition_sort_df( 37 | dataframe=input_df, 38 | partition_by=["timestamp"], 39 | order_by=["timestamp"], 40 | num_processors=3, 41 | ) 42 | 43 | # Only one partition id, meaning data is not partitioned 44 | assert input_df.select(spark_partition_id()).distinct().count() == 1 45 | # Desired number of partitions 46 | assert result_df.select(spark_partition_id()).distinct().count() == 12 47 | 48 | def test_repartition_sort_df_processors_partitions(self, input_df): 49 | result_df = repartition_sort_df( 50 | dataframe=input_df, 51 | partition_by=["timestamp"], 52 | order_by=["timestamp"], 53 | num_partitions=50, 54 | ) 55 | 56 | # Only one partition id, meaning data is not partitioned 57 | assert input_df.select(spark_partition_id()).distinct().count() == 1 58 | # Desired number of partitions 59 | assert result_df.select(spark_partition_id()).distinct().count() == 50 60 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pytest 4 | from pyspark.sql.functions import col, to_date 5 | 6 | from butterfree.constants.columns import TIMESTAMP_COLUMN 7 | 8 | 9 | @pytest.fixture() 10 | def column_target_df(spark_context, spark_session): 11 | data = [{"new_col1": "value", "new_col2": 123}] 12 | return spark_session.read.json(spark_context.parallelize(data, 1)) 13 | 14 | 15 | @pytest.fixture() 16 | def target_df(spark_context, spark_session): 17 | data = [{"col1": "value", "col2": 123}] 18 | return 
spark_session.read.json(spark_context.parallelize(data, 1)) 19 | 20 | 21 | @pytest.fixture() 22 | def incremental_source_df(spark_context, spark_session): 23 | data = [ 24 | { 25 | "id": 1, 26 | "feature": 100, 27 | "date_str": "28/07/2020", 28 | "milliseconds": 1595894400000, 29 | "year": 2020, 30 | "month": 7, 31 | "day": 28, 32 | }, 33 | { 34 | "id": 1, 35 | "feature": 110, 36 | "date_str": "29/07/2020", 37 | "milliseconds": 1595980800000, 38 | "year": 2020, 39 | "month": 7, 40 | "day": 29, 41 | }, 42 | { 43 | "id": 1, 44 | "feature": 120, 45 | "date_str": "30/07/2020", 46 | "milliseconds": 1596067200000, 47 | "year": 2020, 48 | "month": 7, 49 | "day": 30, 50 | }, 51 | { 52 | "id": 2, 53 | "feature": 150, 54 | "date_str": "31/07/2020", 55 | "milliseconds": 1596153600000, 56 | "year": 2020, 57 | "month": 7, 58 | "day": 31, 59 | }, 60 | { 61 | "id": 2, 62 | "feature": 200, 63 | "date_str": "01/08/2020", 64 | "milliseconds": 1596240000000, 65 | "year": 2020, 66 | "month": 8, 67 | "day": 1, 68 | }, 69 | ] 70 | return spark_session.read.json(spark_context.parallelize(data, 1)).withColumn( 71 | "date", to_date(col("date_str"), "dd/MM/yyyy") 72 | ) 73 | 74 | 75 | @pytest.fixture() 76 | def spark_client(): 77 | return Mock() 78 | 79 | 80 | @pytest.fixture 81 | def feature_set_dataframe(spark_context, spark_session): 82 | data = [ 83 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100, "test": "fail"}, 84 | {"id": 2, TIMESTAMP_COLUMN: 0, "feature": 200, "test": "running"}, 85 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"}, 86 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"}, 87 | ] 88 | return spark_session.read.json(spark_context.parallelize(data, 1)) 89 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/pre_processing/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | import pytest 5 | from pyspark.sql import DataFrame 6 | 7 | 8 | @pytest.fixture() 9 | def input_df(spark_context, spark_session): 10 | data = [ 11 | {"id": 1, "ts": "2016-04-11 11:31:11", "pivot_column": 1, "has_feature": 1}, 12 | {"id": 1, "ts": "2016-04-11 11:44:12", "pivot_column": 2, "has_feature": 0}, 13 | {"id": 1, "ts": "2016-04-11 11:46:24", "pivot_column": 3, "has_feature": 1}, 14 | {"id": 1, "ts": "2016-04-11 12:03:21", "pivot_column": 4, "has_feature": 0}, 15 | {"id": 1, "ts": "2016-04-11 13:46:24", "pivot_column": 3, "has_feature": None}, 16 | ] 17 | df = spark_session.read.json( 18 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 19 | ) 20 | return df 21 | 22 | 23 | @pytest.fixture() 24 | def pivot_df(spark_context, spark_session): 25 | data = [ 26 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 27 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": None, "2": 0, "3": None, "4": None}, 28 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": None, "2": None, "3": 1, "4": None}, 29 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": None, "2": None, "3": None, "4": 0}, 30 | { 31 | "id": 1, 32 | "ts": "2016-04-11 13:46:24", 33 | "1": 
None, 34 | "2": None, 35 | "3": None, 36 | "4": None, 37 | }, 38 | ] 39 | df = spark_session.read.json( 40 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 41 | ) 42 | return df.orderBy("ts") 43 | 44 | 45 | @pytest.fixture() 46 | def pivot_ffill_df(spark_context, spark_session): 47 | data = [ 48 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 49 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": 1, "2": 0, "3": None, "4": None}, 50 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": 1, "2": 0, "3": 1, "4": None}, 51 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": 1, "2": 0, "3": 1, "4": 0}, 52 | {"id": 1, "ts": "2016-04-11 13:46:24", "1": 1, "2": 0, "3": 1, "4": 0}, 53 | ] 54 | df = spark_session.read.json( 55 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 56 | ) 57 | return df.orderBy("ts") 58 | 59 | 60 | @pytest.fixture() 61 | def pivot_ffill_mock_df(spark_context, spark_session): 62 | data = [ 63 | {"id": 1, "ts": "2016-04-11 11:31:11", "1": 1, "2": None, "3": None, "4": None}, 64 | {"id": 1, "ts": "2016-04-11 11:44:12", "1": 1, "2": 0, "3": None, "4": None}, 65 | {"id": 1, "ts": "2016-04-11 11:46:24", "1": 1, "2": 0, "3": 1, "4": None}, 66 | {"id": 1, "ts": "2016-04-11 12:03:21", "1": 1, "2": 0, "3": 1, "4": 0}, 67 | {"id": 1, "ts": "2016-04-11 13:46:24", "1": 1, "2": 0, "3": None, "4": 0}, 68 | ] 69 | df = spark_session.read.json( 70 | spark_context.parallelize(data).map(lambda x: json.dumps(x)) 71 | ) 72 | return df.orderBy("ts") 73 | 74 | 75 | def compare_dataframes( 76 | actual_df: DataFrame, expected_df: DataFrame, columns_sort: List[str] = None 77 | ): 78 | if not columns_sort: 79 | columns_sort = actual_df.schema.fieldNames() 80 | return sorted(actual_df.select(*columns_sort).collect()) == sorted( 81 | expected_df.select(*columns_sort).collect() 82 | ) 83 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_explode_json_column.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ( 2 | ArrayType, 3 | IntegerType, 4 | StringType, 5 | StructField, 6 | StructType, 7 | ) 8 | 9 | from butterfree.extract.pre_processing import explode_json_column 10 | from butterfree.testing.dataframe import ( 11 | assert_dataframe_equality, 12 | create_df_from_collection, 13 | ) 14 | 15 | 16 | def test_explode_json_column(spark_context, spark_session): 17 | # arrange 18 | input_data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}] 19 | target_data = [ 20 | { 21 | "json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}', 22 | "a": 123, 23 | "b": "abc", 24 | "c": 123, 25 | "d": [1, 2, 3], 26 | } 27 | ] 28 | 29 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 30 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 31 | 32 | json_column_schema = StructType( 33 | [ 34 | StructField("a", IntegerType()), 35 | StructField("b", StringType()), 36 | StructField("c", IntegerType()), 37 | StructField("d", ArrayType(IntegerType())), 38 | ] 39 | ) 40 | 41 | # act 42 | output_df = explode_json_column(input_df, "json_column", json_column_schema) 43 | 44 | # arrange 45 | assert_dataframe_equality(target_df, output_df) 46 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_filter_transform.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.constants.columns import TIMESTAMP_COLUMN 4 | from butterfree.extract.pre_processing import filter 5 | from butterfree.extract.readers import FileReader 6 | 7 | 8 | class TestFilterDataFrame: 9 | def test_filter(self, feature_set_dataframe, spark_context, spark_session): 10 | # given 11 | file_reader = FileReader("test", "path/to/file", "format") 12 | 13 | file_reader.with_( 14 | transformer=filter, 15 | condition="test not in ('fail') and feature in (110, 120)", 16 | ) 17 | 18 | # when 19 | result_df = file_reader._apply_transformations(feature_set_dataframe) 20 | 21 | target_data = [ 22 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110, "test": "pass"}, 23 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120, "test": "pass"}, 24 | ] 25 | target_df = spark_session.read.json(spark_context.parallelize(target_data, 1)) 26 | 27 | # then 28 | assert result_df.collect() == target_df.collect() 29 | 30 | @pytest.mark.parametrize( 31 | "condition", 32 | [None, 100], 33 | ) 34 | def test_filter_with_invalidations( 35 | self, feature_set_dataframe, condition, spark_context, spark_session 36 | ): 37 | # given 38 | file_reader = FileReader("test", "path/to/file", "format") 39 | 40 | file_reader.with_(transformer=filter, condition=condition) 41 | 42 | # then 43 | with pytest.raises(TypeError): 44 | file_reader._apply_transformations(feature_set_dataframe) 45 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_forward_fill.py: -------------------------------------------------------------------------------- 1 | from butterfree.extract.pre_processing import forward_fill 2 | 3 | 4 | class TestForwardFillTransform: 5 | def test_forward_fill_transform(self, input_df): 6 | # given 7 | result_df = forward_fill( 8 | dataframe=input_df, 9 | partition_by=["id", "pivot_column"], 10 | order_by="ts", 11 | fill_column="has_feature", 12 | ) 13 | 14 | # assert 15 | assert all( 16 | [r.has_feature == 1 for r in result_df.filter("pivot_column = 3").collect()] 17 | ) 18 | 19 | def test_forward_fill_transform_id_partition(self, input_df): 20 | # given 21 | result_df = forward_fill( 22 | dataframe=input_df, 23 | partition_by=["id"], 24 | order_by="ts", 25 | fill_column="has_feature", 26 | ) 27 | 28 | # assert 29 | assert ( 30 | result_df.filter("pivot_column = 3").orderBy("ts").collect()[-1].has_feature 31 | == 0 32 | ) 33 | 34 | def test_forward_fill_transform_new_column(self, input_df): 35 | # given 36 | result_df = forward_fill( 37 | dataframe=input_df, 38 | partition_by=["id"], 39 | order_by="ts", 40 | fill_column="has_feature", 41 | filled_column="has_feature_filled", 42 | ) 43 | 44 | # assert 45 | assert "has_feature_filled" in result_df.columns 46 | assert result_df.filter("has_feature_filled is null").count() == 0 47 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.functions import first 3 | 4 | from butterfree.extract.pre_processing import pivot 5 | from butterfree.extract.readers import FileReader 6 | 7 | from .conftest import compare_dataframes 8 | 9 | 10 | class TestPivotTransform: 11 | def test_pivot_transformation( 12 | self, 13 | input_df, 14 | pivot_df, 15 | ): 16 | result_df = pivot( 17 | 
dataframe=input_df, 18 | group_by_columns=["id", "ts"], 19 | pivot_column="pivot_column", 20 | agg_column="has_feature", 21 | aggregation=first, 22 | ) 23 | 24 | # assert 25 | assert compare_dataframes( 26 | actual_df=result_df, 27 | expected_df=pivot_df, 28 | ) 29 | 30 | def test_pivot_transformation_with_forward_fill( 31 | self, 32 | input_df, 33 | pivot_ffill_df, 34 | ): 35 | result_df = pivot( 36 | dataframe=input_df, 37 | group_by_columns=["id", "ts"], 38 | pivot_column="pivot_column", 39 | agg_column="has_feature", 40 | aggregation=first, 41 | with_forward_fill=True, 42 | ) 43 | 44 | # assert 45 | assert compare_dataframes( 46 | actual_df=result_df, 47 | expected_df=pivot_ffill_df, 48 | ) 49 | 50 | def test_pivot_transformation_with_forward_fill_and_mock( 51 | self, 52 | input_df, 53 | pivot_ffill_mock_df, 54 | ): 55 | result_df = pivot( 56 | dataframe=input_df, 57 | group_by_columns=["id", "ts"], 58 | pivot_column="pivot_column", 59 | agg_column="has_feature", 60 | aggregation=first, 61 | mock_value=-1, 62 | mock_type="int", 63 | with_forward_fill=True, 64 | ) 65 | 66 | # assert 67 | assert compare_dataframes( 68 | actual_df=result_df, 69 | expected_df=pivot_ffill_mock_df, 70 | ) 71 | 72 | def test_pivot_transformation_mock_without_type( 73 | self, 74 | input_df, 75 | pivot_ffill_mock_df, 76 | ): 77 | with pytest.raises(AttributeError): 78 | _ = pivot( 79 | dataframe=input_df, 80 | group_by_columns=["id", "ts"], 81 | pivot_column="pivot_column", 82 | agg_column="has_feature", 83 | aggregation=first, 84 | mock_value=-1, 85 | with_forward_fill=True, 86 | ) 87 | 88 | def test_apply_pivot_transformation(self, input_df, pivot_df): 89 | # arrange 90 | file_reader = FileReader("test", "path/to/file", "format") 91 | file_reader.with_( 92 | transformer=pivot, 93 | group_by_columns=["id", "ts"], 94 | pivot_column="pivot_column", 95 | agg_column="has_feature", 96 | aggregation=first, 97 | ) 98 | 99 | # act 100 | result_df = file_reader._apply_transformations(input_df) 101 | 102 | # assert 103 | assert compare_dataframes( 104 | actual_df=result_df, 105 | expected_df=pivot_df, 106 | ) 107 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/pre_processing/test_replace_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.extract.pre_processing import replace 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | 9 | 10 | class TestReplaceTransform: 11 | def test_replace(self, spark_context, spark_session): 12 | # arrange 13 | input_data = [ 14 | {"id": 1, "type": "a"}, 15 | {"id": 2, "type": "b"}, 16 | {"id": 3, "type": "c"}, 17 | ] 18 | target_data = [ 19 | {"id": 1, "type": "type_a"}, 20 | {"id": 2, "type": "type_b"}, 21 | {"id": 3, "type": "c"}, 22 | ] 23 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 24 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 25 | replace_dict = {"a": "type_a", "b": "type_b"} 26 | 27 | # act 28 | result_df = replace(input_df, "type", replace_dict) 29 | 30 | # assert 31 | assert_dataframe_equality(target_df, result_df) 32 | 33 | @pytest.mark.parametrize( 34 | "input_data, column, replace_dict", 35 | [ 36 | ([{"column": "a"}], "not_column", {"a": "type_a"}), 37 | ([{"column": 123}], "column", {"a": "type_a"}), 38 | ([{"column": "a"}], "column", "not dict"), 39 | ([{"column": "a"}], "column", {"a": 
1}), 40 | ], 41 | ) 42 | def test_replace_with_invalid_args( 43 | self, input_data, column, replace_dict, spark_context, spark_session 44 | ): 45 | # arrange 46 | input_df = create_df_from_collection(input_data, spark_context, spark_session) 47 | 48 | # act and assert 49 | with pytest.raises(ValueError): 50 | replace(input_df, column, replace_dict) 51 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/extract/readers/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/file-reader-test.csv: -------------------------------------------------------------------------------- 1 | "A","B","C" 2 | 10,10.2,"Test1" 3 | 100,100.3,"Test2" 4 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/file-reader-test.json: -------------------------------------------------------------------------------- 1 | {"A":10,"B":10.2,"C":"Test1"} 2 | {"A":100,"B":100.2,"C":"Test2"} -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/readers/test_table_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.extract.readers import TableReader 4 | 5 | 6 | class TestTableReader: 7 | @pytest.mark.parametrize( 8 | "database, table", 9 | [ 10 | ("database", 123), 11 | ( 12 | 123, 13 | None, 14 | ), 15 | ], 16 | ) 17 | def test_init_invalid_params(self, database, table): 18 | # act and assert 19 | with pytest.raises(ValueError): 20 | TableReader("id", table, database) 21 | 22 | def test_consume(self, spark_client, target_df): 23 | # arrange 24 | database = "test_database" 25 | table = "test_table" 26 | spark_client.read_table.return_value = target_df 27 | table_reader = TableReader("test", table, database) 28 | 29 | # act 30 | output_df = table_reader.consume(spark_client) 31 | 32 | # assert 33 | spark_client.read_table.assert_called_once_with(table, database) 34 | assert target_df.collect() == output_df.collect() 35 | -------------------------------------------------------------------------------- /tests/unit/butterfree/extract/test_source.py: -------------------------------------------------------------------------------- 1 | from butterfree.clients import SparkClient 2 | from butterfree.extract import Source 3 | 4 | 5 | class TestSource: 6 | def test_construct(self, mocker, target_df): 7 | # given 8 | spark_client = SparkClient() 9 | 10 | reader_id = "a_source" 11 | reader = mocker.stub(reader_id) 12 | reader.build = mocker.stub("build") 13 | reader.build.side_effect = target_df.createOrReplaceTempView(reader_id) 14 | 15 | # when 16 | source_selector = Source( 17 | readers=[reader], 18 | query=f"select * from {reader_id}", # noqa 19 | ) 20 | 21 | result_df = source_selector.construct(spark_client) 22 | 23 | assert result_df.collect() == target_df.collect() 24 | 25 | def test_is_cached(self, mocker, target_df): 26 | # given 27 | spark_client = SparkClient() 28 | 29 | reader_id = "a_source" 30 | reader = mocker.stub(reader_id) 31 | reader.build = mocker.stub("build") 32 | reader.build.side_effect = target_df.createOrReplaceTempView(reader_id) 33 | 34 | # when 35 | 
source_selector = Source( 36 | readers=[reader], 37 | query=f"select * from {reader_id}", # noqa 38 | ) 39 | 40 | result_df = source_selector.construct(spark_client) 41 | 42 | assert result_df.is_cached 43 | -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/hooks/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/hooks/schema_compatibility/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | 5 | from butterfree.clients import CassandraClient 6 | from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook 7 | 8 | 9 | class TestCassandraTableSchemaCompatibilityHook: 10 | def test_run_compatible_schema(self, spark_session): 11 | cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") 12 | 13 | cassandra_client.sql = MagicMock( # type: ignore 14 | return_value=[ 15 | {"column_name": "feature1", "type": "text"}, 16 | {"column_name": "feature2", "type": "int"}, 17 | ] 18 | ) 19 | 20 | table = "table" 21 | 22 | input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") 23 | 24 | hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) 25 | 26 | # act and assert 27 | assert hook.run(input_dataframe) == input_dataframe 28 | 29 | def test_run_incompatible_schema(self, spark_session): 30 | cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") 31 | 32 | cassandra_client.sql = MagicMock( # type: ignore 33 | return_value=[ 34 | {"column_name": "feature1", "type": "text"}, 35 | {"column_name": "feature2", "type": "bigint"}, 36 | ] 37 | ) 38 | 39 | table = "table" 40 | 41 | input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") 42 | 43 | hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) 44 | 45 | # act and assert 46 | with pytest.raises( 47 | ValueError, match="There's a schema incompatibility between" 48 | ): 49 | hook.run(input_dataframe) 50 | -------------------------------------------------------------------------------- /tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.clients import SparkClient 4 | from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook 5 | 6 | 7 | class TestSparkTableSchemaCompatibilityHook: 8 | @pytest.mark.parametrize( 9 | "table, database, target_table_expression", 10 | [("table", "database", "`database`.`table`"), ("table", None, "`table`")], 11 | ) 12 | def test_build_table_expression(self, table, database, target_table_expression): 13 | # arrange 14 | spark_client = SparkClient() 15 | 
16 | # act 17 | result_table_expression = SparkTableSchemaCompatibilityHook( 18 | spark_client, table, database 19 | ).table_expression 20 | 21 | # assert 22 | assert target_table_expression == result_table_expression 23 | 24 | def test_run_compatible_schema(self, spark_session): 25 | # arrange 26 | spark_client = SparkClient() 27 | target_table = spark_session.sql( 28 | "select 1 as feature_a, 'abc' as feature_b, true as other_feature" 29 | ) 30 | input_dataframe = spark_session.sql("select 1 as feature_a, 'abc' as feature_b") 31 | target_table.registerTempTable("test") 32 | 33 | hook = SparkTableSchemaCompatibilityHook(spark_client, "test") 34 | 35 | # act and assert 36 | assert hook.run(input_dataframe) == input_dataframe 37 | 38 | def test_run_incompatible_schema(self, spark_session): 39 | # arrange 40 | spark_client = SparkClient() 41 | target_table = spark_session.sql( 42 | "select 1 as feature_a, 'abc' as feature_b, true as other_feature" 43 | ) 44 | input_dataframe = spark_session.sql( 45 | "select 1 as feature_a, 'abc' as feature_b, true as unregisted_column" 46 | ) 47 | target_table.registerTempTable("test") 48 | 49 | hook = SparkTableSchemaCompatibilityHook(spark_client, "test") 50 | 51 | # act and assert 52 | with pytest.raises(ValueError, match="The dataframe has a schema incompatible"): 53 | hook.run(input_dataframe) 54 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/processing/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def input_df(spark_context, spark_session): 6 | data = [ 7 | {"id": 1, "ts": "2016-04-11 11:31:11"}, 8 | {"id": 1, "ts": "2016-04-11 11:44:12"}, 9 | {"id": 1, "ts": "2016-04-11 11:46:24"}, 10 | {"id": 1, "ts": "2016-04-11 12:03:21"}, 11 | {"id": 1, "ts": "2016-04-11 13:46:24"}, 12 | ] 13 | return spark_session.read.json(spark_context.parallelize(data, 1)) 14 | 15 | 16 | @pytest.fixture() 17 | def json_df(spark_context, spark_session): 18 | data = [ 19 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:31:11\\"}"}', 20 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:44:12\\"}"}', 21 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 11:46:24\\"}"}', 22 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 12:03:21\\"}"}', 23 | '{"value":"{\\"id\\":1,\\"ts\\":\\"2016-04-11 13:46:24\\"}"}', 24 | ] 25 | return spark_session.read.json(spark_context.parallelize(data, 1)) 26 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/processing/test_json_transform.py: -------------------------------------------------------------------------------- 1 | from butterfree.load.processing import json_transform 2 | 3 | 4 | class TestJsonTransform: 5 | def 
test_json_transformation( 6 | self, 7 | input_df, 8 | json_df, 9 | ): 10 | result_df = json_transform(dataframe=input_df) 11 | 12 | # assert 13 | assert sorted(result_df.collect()) == sorted(json_df.collect()) 14 | -------------------------------------------------------------------------------- /tests/unit/butterfree/load/writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/load/writers/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/migrations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/migrations/database_migration/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/conftest.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ( 2 | ArrayType, 3 | DoubleType, 4 | FloatType, 5 | LongType, 6 | StringType, 7 | TimestampType, 8 | ) 9 | from pytest import fixture 10 | 11 | from butterfree.constants import DataType 12 | from butterfree.transform import FeatureSet 13 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 14 | 15 | 16 | @fixture 17 | def db_schema(): 18 | return [ 19 | {"column_name": "id", "type": LongType(), "primary_key": True}, 20 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, 21 | { 22 | "column_name": "feature1__avg_over_1_week_rolling_windows", 23 | "type": DoubleType(), 24 | "primary_key": False, 25 | }, 26 | { 27 | "column_name": "feature1__avg_over_2_days_rolling_windows", 28 | "type": DoubleType(), 29 | "primary_key": False, 30 | }, 31 | ] 32 | 33 | 34 | @fixture 35 | def fs_schema(): 36 | return [ 37 | {"column_name": "id", "type": LongType(), "primary_key": True}, 38 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, 39 | {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, 40 | { 41 | "column_name": "array_feature", 42 | "type": ArrayType(StringType(), True), 43 | "primary_key": False, 44 | }, 45 | { 46 | "column_name": "feature1__avg_over_1_week_rolling_windows", 47 | "type": FloatType(), 48 | "primary_key": False, 49 | }, 50 | ] 51 | 52 | 53 | @fixture 54 | def feature_set(): 55 | feature_set = FeatureSet( 56 | name="feature_set", 57 | entity="entity", 58 | description="description", 59 | features=[ 60 | Feature( 61 | name="feature_float", 62 | description="test", 63 | dtype=DataType.FLOAT, 64 | ), 65 | ], 66 | keys=[ 67 | KeyFeature( 68 | name="id", 69 | description="The device ID", 70 | dtype=DataType.BIGINT, 71 | ) 72 | ], 73 | timestamp=TimestampFeature(), 74 | ) 75 | 76 | return feature_set 77 | -------------------------------------------------------------------------------- 
/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py: -------------------------------------------------------------------------------- 1 | from butterfree.migrations.database_migration import CassandraMigration 2 | 3 | 4 | class TestCassandraMigration: 5 | def test_queries(self, fs_schema, db_schema): 6 | cassandra_migration = CassandraMigration() 7 | expected_query = [ 8 | "ALTER TABLE table_name ADD (new_feature FloatType);", 9 | "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", 10 | "ALTER TABLE table_name ALTER " 11 | "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", 12 | ] 13 | query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) 14 | 15 | assert query == expected_query 16 | 17 | def test_queries_on_entity(self, fs_schema, db_schema): 18 | cassandra_migration = CassandraMigration() 19 | expected_query = [ 20 | "ALTER TABLE table_name ADD (new_feature FloatType);", 21 | "ALTER TABLE table_name ALTER " 22 | "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", 23 | ] 24 | query = cassandra_migration.create_query( 25 | fs_schema, "table_name", db_schema, True 26 | ) 27 | 28 | assert query == expected_query 29 | 30 | def test_create_table_query(self, fs_schema): 31 | 32 | cassandra_migration = CassandraMigration() 33 | expected_query = [ 34 | "CREATE TABLE test.table_name " 35 | "(id LongType, timestamp TimestampType, new_feature FloatType, " 36 | "array_feature ArrayType(StringType(), True), " 37 | "feature1__avg_over_1_week_rolling_windows FloatType, " 38 | "PRIMARY KEY (id, timestamp));" 39 | ] 40 | 41 | query = cassandra_migration.create_query(fs_schema, "table_name") 42 | 43 | assert query == expected_query 44 | -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/test_database_migration.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType 2 | 3 | from butterfree.load.writers import HistoricalFeatureStoreWriter 4 | from butterfree.migrations.database_migration import CassandraMigration, Diff 5 | 6 | 7 | class TestDatabaseMigration: 8 | def test__get_diff_empty(self, mocker, db_schema): 9 | fs_schema = [ 10 | {"column_name": "id", "type": LongType(), "primary_key": True}, 11 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, 12 | { 13 | "column_name": "feature1__avg_over_1_week_rolling_windows", 14 | "type": DoubleType(), 15 | "primary_key": False, 16 | }, 17 | { 18 | "column_name": "feature1__avg_over_2_days_rolling_windows", 19 | "type": DoubleType(), 20 | "primary_key": False, 21 | }, 22 | ] 23 | m = CassandraMigration() 24 | m._client = mocker.stub("client") 25 | diff = m._get_diff(fs_schema, db_schema) 26 | assert not diff 27 | 28 | def test__get_diff(self, mocker, db_schema): 29 | fs_schema = [ 30 | {"column_name": "id", "type": LongType(), "primary_key": True}, 31 | {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, 32 | {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, 33 | { 34 | "column_name": "feature1__avg_over_1_week_rolling_windows", 35 | "type": FloatType(), 36 | "primary_key": False, 37 | }, 38 | ] 39 | expected_diff = { 40 | Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=None), 41 | Diff("new_feature", kind=Diff.Kind.ADD, value=FloatType()), 42 | Diff( 43 | "feature1__avg_over_2_days_rolling_windows", 44 |
kind=Diff.Kind.DROP, 45 | value=None, 46 | ), 47 | Diff( 48 | "feature1__avg_over_1_week_rolling_windows", 49 | kind=Diff.Kind.ALTER_TYPE, 50 | value=FloatType(), 51 | ), 52 | } 53 | 54 | m = CassandraMigration() 55 | m._client = mocker.stub("client") 56 | diff = m._get_diff(fs_schema, db_schema) 57 | assert diff == expected_diff 58 | 59 | def test_apply_migration(self, feature_set, mocker): 60 | # given 61 | m = CassandraMigration() 62 | m.apply_migration = mocker.stub("apply_migration") 63 | 64 | # when 65 | m.apply_migration(feature_set, HistoricalFeatureStoreWriter()) 66 | 67 | # then 68 | m.apply_migration.assert_called_once() 69 | -------------------------------------------------------------------------------- /tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py: -------------------------------------------------------------------------------- 1 | from butterfree.migrations.database_migration import MetastoreMigration 2 | 3 | 4 | class TestMetastoreMigration: 5 | def test_queries(self, fs_schema, db_schema): 6 | metastore_migration = MetastoreMigration() 7 | 8 | expected_query = [ 9 | "ALTER TABLE test.table_name ADD IF NOT EXISTS " 10 | "columns (new_feature FloatType);", 11 | "ALTER TABLE table_name DROP IF EXISTS " 12 | "(feature1__avg_over_2_days_rolling_windows None);", 13 | "ALTER TABLE table_name ALTER COLUMN " 14 | "feature1__avg_over_1_week_rolling_windows FloatType;", 15 | ] 16 | 17 | query = metastore_migration.create_query(fs_schema, "table_name", db_schema) 18 | 19 | assert query == expected_query 20 | 21 | def test_queries_on_entity(self, fs_schema, db_schema): 22 | metastore_migration = MetastoreMigration() 23 | 24 | expected_query = [ 25 | "ALTER TABLE test.table_name ADD IF NOT EXISTS " 26 | "columns (new_feature FloatType);", 27 | "ALTER TABLE table_name ALTER COLUMN " 28 | "feature1__avg_over_1_week_rolling_windows FloatType;", 29 | ] 30 | 31 | query = metastore_migration.create_query( 32 | fs_schema, "table_name", db_schema, True 33 | ) 34 | 35 | assert query == expected_query 36 | 37 | def test_create_table_query(self, fs_schema): 38 | 39 | metastore_migration = MetastoreMigration() 40 | 41 | expected_query = [ 42 | "CREATE TABLE IF NOT EXISTS test.table_name " 43 | "(id LongType, timestamp TimestampType, new_feature FloatType) " 44 | "PARTITIONED BY (year INT, month INT, day INT);" 45 | ] 46 | 47 | query = metastore_migration.create_query(fs_schema, "table_name") 48 | 49 | assert query == expected_query 50 | -------------------------------------------------------------------------------- /tests/unit/butterfree/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/pipelines/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/pipelines/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from pyspark.sql import functions 4 | from pytest import fixture 5 | 6 | from butterfree.clients import SparkClient 7 | from butterfree.constants import DataType 8 | from butterfree.constants.columns import TIMESTAMP_COLUMN 9 | from butterfree.extract import Source 10 | from butterfree.extract.readers import TableReader 11 | from butterfree.load import Sink 12 | from butterfree.load.writers import HistoricalFeatureStoreWriter 13 | from
butterfree.pipelines import FeatureSetPipeline 14 | from butterfree.transform import FeatureSet 15 | from butterfree.transform.features import Feature, KeyFeature, TimestampFeature 16 | from butterfree.transform.transformations import SparkFunctionTransform 17 | from butterfree.transform.utils import Function 18 | 19 | 20 | @fixture() 21 | def feature_set_pipeline(): 22 | test_pipeline = FeatureSetPipeline( 23 | spark_client=SparkClient(), 24 | source=Mock( 25 | spec=Source, 26 | readers=[ 27 | TableReader( 28 | id="source_a", 29 | database="db", 30 | table="table", 31 | ) 32 | ], 33 | query="select * from source_a", 34 | ), 35 | feature_set=Mock( 36 | spec=FeatureSet, 37 | name="feature_set", 38 | entity="entity", 39 | description="description", 40 | keys=[ 41 | KeyFeature( 42 | name="user_id", 43 | description="The user's Main ID or device ID", 44 | dtype=DataType.INTEGER, 45 | ) 46 | ], 47 | timestamp=TimestampFeature(from_column="ts"), 48 | features=[ 49 | Feature( 50 | name="listing_page_viewed__rent_per_month", 51 | description="Average of something.", 52 | transformation=SparkFunctionTransform( 53 | functions=[ 54 | Function(functions.avg, DataType.FLOAT), 55 | Function(functions.stddev_pop, DataType.FLOAT), 56 | ], 57 | ).with_window( 58 | partition_by="user_id", 59 | order_by=TIMESTAMP_COLUMN, 60 | window_definition=["7 days", "2 weeks"], 61 | mode="fixed_windows", 62 | ), 63 | ), 64 | ], 65 | ), 66 | sink=Mock( 67 | spec=Sink, 68 | writers=[HistoricalFeatureStoreWriter(db_config=None)], 69 | ), 70 | ) 71 | 72 | return test_pipeline 73 | -------------------------------------------------------------------------------- /tests/unit/butterfree/reports/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/reports/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/testing/dataframe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/testing/dataframe/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/testing/dataframe/test_dataframe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql.functions import col, from_unixtime 3 | 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | 9 | 10 | def test_assert_dataframe_equality(spark_context, spark_session): 11 | # arrange 12 | data1 = [ 13 | {"ts": 1582911000000, "flag": 1, "value": 1234.0}, 14 | {"ts": 1577923200000, "flag": 0, "value": 123.0}, 15 | ] 16 | data2 = [ 17 | {"ts": "2020-01-02T00:00:00+00:00", "flag": "false", "value": 123}, 18 | {"ts": "2020-02-28T17:30:00+00:00", "flag": "true", "value": 1234}, 19 | ] # same data declared in different formats and in different order 20 | 21 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 22 | df1 = ( 23 | df1.withColumn("ts", from_unixtime(col("ts") / 1000.0).cast("timestamp")) 24 | .withColumn("flag", col("flag").cast("boolean")) 25 | .withColumn("value", col("flag").cast("integer")) 26 | ) 27 | 28 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 29 | df2 = ( 30 | 
df2.withColumn("ts", col("ts").cast("timestamp")) 31 | .withColumn("flag", col("flag").cast("boolean")) 32 | .withColumn("value", col("flag").cast("integer")) 33 | ) 34 | 35 | # act and assert 36 | assert_dataframe_equality(df1, df2) 37 | 38 | 39 | def test_assert_dataframe_equality_different_values(spark_context, spark_session): 40 | # arrange 41 | data1 = [ 42 | {"value": "abc"}, 43 | {"value": "cba"}, 44 | ] 45 | data2 = [ 46 | {"value": "abc"}, 47 | {"value": "different value"}, 48 | ] 49 | 50 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 51 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 52 | 53 | # act and assert 54 | with pytest.raises(AssertionError, match="DataFrames have different values:"): 55 | assert_dataframe_equality(df1, df2) 56 | 57 | 58 | def test_assert_dataframe_equality_different_shapes(spark_context, spark_session): 59 | # arrange 60 | data1 = [ 61 | {"value": "abc"}, 62 | {"value": "cba"}, 63 | {"value": "cba"}, 64 | ] 65 | data2 = [ 66 | {"value": "abc"}, 67 | {"value": "cba"}, 68 | ] 69 | 70 | df1 = spark_session.read.json(spark_context.parallelize(data1, 1)) 71 | df2 = spark_session.read.json(spark_context.parallelize(data2, 1)) 72 | 73 | # act and assert 74 | with pytest.raises(AssertionError, match="DataFrame shape mismatch:"): 75 | assert_dataframe_equality(df1, df2) 76 | 77 | 78 | def test_create_df_from_collection(spark_context, spark_session): 79 | # arrange 80 | input_data = [{"json_column": '{"abc": 123}', "a": 123, "b": "abc"}] 81 | 82 | # act 83 | output_df = create_df_from_collection(input_data, spark_context, spark_session) 84 | target_df = spark_session.sql( 85 | "select 123 as a, 'abc' as b, replace(" 86 | "to_json(named_struct('abc', 123)), ':', ': ') as json_column" 87 | ) # generate the same data but with SparkSQL directly to df 88 | 89 | # arrange 90 | assert_dataframe_equality(target_df, output_df) 91 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/features/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from pytest import fixture 4 | 5 | from butterfree.constants.columns import TIMESTAMP_COLUMN 6 | from butterfree.transform.features import Feature 7 | 8 | 9 | @fixture 10 | def feature_set_dataframe(spark_context, spark_session): 11 | data = [ 12 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100}, 13 | {"id": 2, TIMESTAMP_COLUMN: 1, "feature": 200}, 14 | ] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) 16 | 17 | 18 | @fixture 19 | def feature_set_dataframe_ms_from_column(spark_context, spark_session): 20 | data = [ 21 | {"id": 1, "ts": 1581542311112, "feature": 100}, 22 | {"id": 2, "ts": 1581542322223, "feature": 200}, 23 
| ] 24 | return spark_session.read.json(spark_context.parallelize(data, 1)) 25 | 26 | 27 | @fixture 28 | def feature_set_dataframe_ms(spark_context, spark_session): 29 | data = [ 30 | {"id": 1, TIMESTAMP_COLUMN: 1581542311112, "feature": 100}, 31 | {"id": 2, TIMESTAMP_COLUMN: 1581542322223, "feature": 200}, 32 | ] 33 | return spark_session.read.json(spark_context.parallelize(data, 1)) 34 | 35 | 36 | @fixture 37 | def feature_set_dataframe_small_time_diff(spark_context, spark_session): 38 | data = [ 39 | {"id": 1, TIMESTAMP_COLUMN: 1581542311001, "feature": 100}, 40 | {"id": 2, TIMESTAMP_COLUMN: 1581542311002, "feature": 200}, 41 | ] 42 | return spark_session.read.json(spark_context.parallelize(data, 1)) 43 | 44 | 45 | @fixture 46 | def feature_set_dataframe_date(spark_context, spark_session): 47 | data = [ 48 | {"id": 1, TIMESTAMP_COLUMN: "2020-02-07T00:00:00", "feature": 100}, 49 | {"id": 2, TIMESTAMP_COLUMN: "2020-02-08T00:00:00", "feature": 200}, 50 | ] 51 | return spark_session.read.json(spark_context.parallelize(data, 1)) 52 | 53 | 54 | @fixture 55 | def mocked_feature(): 56 | return Mock(spec=Feature) 57 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/features/test_key_feature.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from butterfree.constants import DataType 4 | from butterfree.transform.features import KeyFeature 5 | 6 | 7 | class TestKeyFeature: 8 | def test_args_without_transformation(self): 9 | 10 | test_key = KeyFeature( 11 | name="id", 12 | from_column="origin", 13 | description="unit test", 14 | dtype=DataType.INTEGER, 15 | ) 16 | 17 | assert test_key.name == "id" 18 | assert test_key.from_column == "origin" 19 | assert test_key.description == "unit test" 20 | 21 | def test_args_with_transformation(self): 22 | 23 | test_key = KeyFeature( 24 | name="id", 25 | from_column="origin", 26 | description="unit test", 27 | dtype=DataType.INTEGER, 28 | transformation=Mock(), 29 | ) 30 | assert test_key.name == "id" 31 | assert test_key.from_column == "origin" 32 | assert test_key.description == "unit test" 33 | assert test_key.transformation 34 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/transformations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_custom_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import functions as F 3 | 4 | from butterfree.constants import DataType 5 | from butterfree.constants.columns import TIMESTAMP_COLUMN 6 | from butterfree.transform.features import Feature 7 | from butterfree.transform.transformations import CustomTransform 8 | 9 | 10 | def divide(df, parent_feature, column1, column2): 11 | name = parent_feature.get_output_columns()[0] 12 | df = df.withColumn(name, F.col(column1) / F.col(column2)) 13 | return df 14 | 15 | 16 | class TestCustomTransform: 17 | def test_feature_transform(self, feature_set_dataframe): 18 | 19 | test_feature = Feature( 20 | name="feature", 21 | description="unit test", 22 | 
dtype=DataType.BIGINT, 23 | transformation=CustomTransform( 24 | transformer=divide, 25 | column1="feature1", 26 | column2="feature2", 27 | ), 28 | ) 29 | 30 | df = test_feature.transform(feature_set_dataframe) 31 | 32 | assert all( 33 | [ 34 | a == b 35 | for a, b in zip( 36 | df.columns, 37 | ["feature1", "feature2", "id", TIMESTAMP_COLUMN, "feature"], 38 | ) 39 | ] 40 | ) 41 | 42 | def test_output_columns(self, feature_set_dataframe): 43 | 44 | test_feature = Feature( 45 | name="feature", 46 | description="unit test", 47 | dtype=DataType.BIGINT, 48 | transformation=CustomTransform( 49 | transformer=divide, 50 | column1="feature1", 51 | column2="feature2", 52 | ), 53 | ) 54 | 55 | df_columns = test_feature.get_output_columns() 56 | 57 | assert isinstance(df_columns, list) 58 | assert df_columns == ["feature"] 59 | 60 | def test_custom_transform_output(self, feature_set_dataframe): 61 | test_feature = Feature( 62 | name="feature", 63 | description="unit test", 64 | dtype=DataType.BIGINT, 65 | transformation=CustomTransform( 66 | transformer=divide, 67 | column1="feature1", 68 | column2="feature2", 69 | ), 70 | ) 71 | 72 | df = test_feature.transform(feature_set_dataframe).collect() 73 | 74 | assert df[0]["feature"] == 1 75 | assert df[1]["feature"] == 1 76 | assert df[2]["feature"] == 1 77 | assert df[3]["feature"] == 1 78 | 79 | def test_blank_transformer(self, feature_set_dataframe): 80 | with pytest.raises(ValueError): 81 | Feature( 82 | name="feature", 83 | description="unit test", 84 | dtype=DataType.BIGINT, 85 | transformation=CustomTransform(transformer=None), 86 | ) 87 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_h3_transform.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from butterfree.constants.data_type import DataType 6 | from butterfree.testing.dataframe import assert_dataframe_equality 7 | from butterfree.transform.features import Feature, KeyFeature 8 | from butterfree.transform.transformations.h3_transform import H3HashTransform 9 | 10 | 11 | class TestH3Transform: 12 | def test_feature_transform(self, h3_input_df, h3_target_df): 13 | # arrange 14 | test_feature = Feature( 15 | name="new_feature", 16 | description="unit test", 17 | dtype=DataType.STRING, 18 | transformation=H3HashTransform( 19 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 20 | lat_column="lat", 21 | lng_column="lng", 22 | ), 23 | ) 24 | 25 | # act 26 | output_df = test_feature.transform(h3_input_df) 27 | 28 | # assert 29 | assert_dataframe_equality(output_df, h3_target_df) 30 | 31 | def test_output_columns(self): 32 | # arrange 33 | h3_feature = Feature( 34 | name="new_feature", 35 | description="unit test", 36 | dtype=DataType.STRING, 37 | transformation=H3HashTransform( 38 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 39 | lat_column="lat", 40 | lng_column="lng", 41 | ), 42 | ) 43 | target_columns = [ 44 | "lat_lng__h3_hash__6", 45 | "lat_lng__h3_hash__7", 46 | "lat_lng__h3_hash__8", 47 | "lat_lng__h3_hash__9", 48 | "lat_lng__h3_hash__10", 49 | "lat_lng__h3_hash__11", 50 | "lat_lng__h3_hash__12", 51 | ] 52 | 53 | # act 54 | output_columns = h3_feature.get_output_columns() 55 | 56 | # assert 57 | assert sorted(output_columns) == sorted(target_columns) 58 | 59 | def test_import_error(self): 60 | import sys 61 | 62 | with patch.dict(sys.modules, h3=None): 63 | modules = [m for m in sys.modules if 
m.startswith("butterfree")] 64 | for m in modules: 65 | del sys.modules[m] 66 | with pytest.raises(ModuleNotFoundError, match="you must install"): 67 | from butterfree.transform.transformations.h3_transform import ( # noqa; noqa 68 | H3HashTransform, 69 | ) 70 | 71 | def test_with_stack(self, h3_input_df, h3_with_stack_target_df): 72 | # arrange 73 | test_feature = KeyFeature( 74 | name="id", 75 | description="unit test", 76 | dtype=DataType.STRING, 77 | transformation=H3HashTransform( 78 | h3_resolutions=[6, 7, 8, 9, 10, 11, 12], 79 | lat_column="lat", 80 | lng_column="lng", 81 | ).with_stack(), 82 | ) 83 | 84 | # act 85 | output_df = test_feature.transform(h3_input_df) 86 | 87 | # assert 88 | assert_dataframe_equality(h3_with_stack_target_df, output_df) 89 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/test_stack_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from butterfree.constants import DataType 4 | from butterfree.testing.dataframe import ( 5 | assert_dataframe_equality, 6 | create_df_from_collection, 7 | ) 8 | from butterfree.transform.features import Feature, KeyFeature 9 | from butterfree.transform.transformations import StackTransform 10 | 11 | 12 | class TestSQLExpressionTransform: 13 | 14 | input_data = [ 15 | {"feature": 100, "id_a": 1, "id_b": 2}, 16 | {"feature": 120, "id_a": 3, "id_b": 4}, 17 | ] 18 | 19 | def test_feature_transform(self, spark_context, spark_session): 20 | # arrange 21 | target_data = [ 22 | {"id": 1, "feature": 100, "id_a": 1, "id_b": 2}, 23 | {"id": 2, "feature": 100, "id_a": 1, "id_b": 2}, 24 | {"id": 3, "feature": 120, "id_a": 3, "id_b": 4}, 25 | {"id": 4, "feature": 120, "id_a": 3, "id_b": 4}, 26 | ] 27 | input_df = create_df_from_collection( 28 | self.input_data, spark_context, spark_session 29 | ) 30 | target_df = create_df_from_collection(target_data, spark_context, spark_session) 31 | 32 | feature_using_names = KeyFeature( 33 | name="id", 34 | description="id_a and id_b stacked in a single column.", 35 | dtype=DataType.INTEGER, 36 | transformation=StackTransform("id_*"), 37 | ) 38 | 39 | # act 40 | result_df_1 = feature_using_names.transform(input_df) 41 | 42 | # assert 43 | assert_dataframe_equality(target_df, result_df_1) 44 | 45 | def test_columns_not_in_dataframe(self, spark_context, spark_session): 46 | # arrange 47 | input_df = create_df_from_collection( 48 | self.input_data, spark_context, spark_session 49 | ) 50 | 51 | feature = Feature( 52 | name="id", 53 | description="stack transformation", 54 | dtype=DataType.STRING, 55 | transformation=StackTransform("id_c", "id_d"), 56 | ) 57 | 58 | # act and assert 59 | with pytest.raises(ValueError, match="Columns not found, columns in df: "): 60 | feature.transform(input_df) 61 | 62 | @pytest.mark.parametrize( 63 | "is_regex, pattern, column", 64 | [ 65 | (False, "id_a", "id_a"), 66 | (False, "id_*", "id_a"), 67 | (False, "*_a", "id_a"), 68 | (False, "id*a", "id_a"), 69 | (False, "!id_b", "id_a"), 70 | (True, "id.*", "id_a"), 71 | (True, "id_[a-z]*", "id_column"), 72 | ], 73 | ) 74 | def test__matches_pattern(self, is_regex, pattern, column): 75 | # arrange 76 | transform = StackTransform(is_regex=is_regex) 77 | 78 | # act 79 | result = transform._matches_pattern(pattern, column) 80 | 81 | # assert 82 | assert result 83 | -------------------------------------------------------------------------------- 
/tests/unit/butterfree/transform/transformations/test_transform_component.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from unittest.mock import patch 3 | 4 | import pytest 5 | 6 | from butterfree.transform.transformations import TransformComponent 7 | 8 | 9 | class TestTransformComponent(TestCase): 10 | def test_cannot_instantiate(self): 11 | with pytest.raises(TypeError): 12 | TransformComponent() 13 | 14 | @patch.multiple(TransformComponent, __abstractmethods__=set()) 15 | def test_parent(self): 16 | with pytest.raises(TypeError): 17 | feature_component = TransformComponent() 18 | feature_component.parent() 19 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/transform/transformations/user_defined_functions/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | 4 | @fixture 5 | def feature_set_dataframe(spark_context, spark_session): 6 | data = [ 7 | {"id": 1, "feature1": 100}, 8 | {"id": 1, "feature1": 100}, 9 | {"id": 1, "feature1": 200}, 10 | {"id": 1, "feature1": 200}, 11 | {"id": 1, "feature1": 200}, 12 | {"id": 1, "feature1": 300}, 13 | {"id": 1, "feature1": 300}, 14 | {"id": 1, "feature1": 300}, 15 | {"id": 1, "feature1": 300}, 16 | {"id": 1, "feature1": 300}, 17 | {"id": 2, "feature1": 100}, 18 | {"id": 2, "feature1": 100}, 19 | {"id": 2, "feature1": 200}, 20 | {"id": 2, "feature1": 200}, 21 | {"id": 2, "feature1": 200}, 22 | {"id": 2, "feature1": 300}, 23 | {"id": 2, "feature1": 300}, 24 | {"id": 2, "feature1": 300}, 25 | {"id": 2, "feature1": 300}, 26 | {"id": 2, "feature1": 300}, 27 | ] 28 | return spark_session.read.json(spark_context.parallelize(data, 1)) 29 | 30 | 31 | @fixture 32 | def feature_set_custom_dataframe(spark_context, spark_session): 33 | data = [ 34 | {"id": 1, "feature1": "abc"}, 35 | {"id": 1, "feature1": "abc"}, 36 | {"id": 1, "feature1": "abc"}, 37 | {"id": 1, "feature1": "def"}, 38 | {"id": 1, "feature1": "def"}, 39 | {"id": 2, "feature1": "def"}, 40 | {"id": 2, "feature1": "def"}, 41 | {"id": 2, "feature1": "def"}, 42 | {"id": 2, "feature1": "abc"}, 43 | {"id": 2, "feature1": "abc"}, 44 | ] 45 | return spark_session.read.json(spark_context.parallelize(data, 1)) 46 | 47 | 48 | @fixture 49 | def mode_target_df(spark_context, spark_session): 50 | data = [ 51 | {"id": 1, "mode(feature1)": "300"}, 52 | {"id": 2, "mode(feature1)": "300"}, 53 | ] 54 | return spark_session.read.json(spark_context.parallelize(data, 1)) 55 | 56 | 57 | @fixture 58 | def most_frequent_set_target_df(spark_context, spark_session): 59 | data = [ 60 | {"id": 1, "most_frequent_set(feature1)": ["300", "200", "100"]}, 61 | {"id": 2, "most_frequent_set(feature1)": ["300", "200", "100"]}, 62 | ] 63 | return spark_session.read.json(spark_context.parallelize(data, 1)) 64 | 65 | 66 | @fixture 67 | def most_frequent_set_str_target_df(spark_context, spark_session): 68 | data = [ 69 | {"id": 1, "most_frequent_set(feature1)": ["abc", "def"]}, 70 | {"id": 2, 
"most_frequent_set(feature1)": ["def", "abc"]}, 71 | ] 72 | return spark_session.read.json(spark_context.parallelize(data, 1)) 73 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/test_mode.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StringType 2 | 3 | from butterfree.testing.dataframe import assert_dataframe_equality 4 | from butterfree.transform.transformations.user_defined_functions import mode 5 | 6 | 7 | def test_mode_output(feature_set_dataframe, mode_target_df): 8 | output_df = feature_set_dataframe.groupby("id").agg(mode("feature1")) 9 | 10 | assert_dataframe_equality(output_df, mode_target_df) 11 | 12 | 13 | def test_mode_output_type(feature_set_dataframe, mode_target_df): 14 | output_df = feature_set_dataframe.groupby("id").agg(mode("feature1")) 15 | 16 | assert isinstance(output_df.schema["mode(feature1)"].dataType, StringType) 17 | -------------------------------------------------------------------------------- /tests/unit/butterfree/transform/transformations/user_defined_functions/test_most_frequent.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import ArrayType 2 | 3 | from butterfree.testing.dataframe import assert_dataframe_equality 4 | from butterfree.transform.transformations.user_defined_functions import ( 5 | most_frequent_set, 6 | ) 7 | 8 | 9 | def test_most_frequent_set_output(feature_set_dataframe, most_frequent_set_target_df): 10 | output_df = feature_set_dataframe.groupby("id").agg(most_frequent_set("feature1")) 11 | 12 | assert_dataframe_equality(output_df, most_frequent_set_target_df) 13 | 14 | 15 | def test_most_frequent_set_str_input( 16 | feature_set_custom_dataframe, most_frequent_set_str_target_df 17 | ): 18 | output_df = feature_set_custom_dataframe.groupby("id").agg( 19 | most_frequent_set("feature1") 20 | ) 21 | 22 | assert_dataframe_equality(output_df, most_frequent_set_str_target_df) 23 | 24 | 25 | def test_most_frequent_set_output_type( 26 | feature_set_dataframe, most_frequent_set_target_df 27 | ): 28 | output_df = feature_set_dataframe.groupby("id").agg(most_frequent_set("feature1")) 29 | 30 | assert isinstance( 31 | output_df.schema["most_frequent_set(feature1)"].dataType, ArrayType 32 | ) 33 | -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quintoandar/butterfree/a411f1032f40f2bb15e05652811a0a28bd0b5ed2/tests/unit/butterfree/validations/__init__.py -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/conftest.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StringType, StructField, StructType 2 | from pytest import fixture 3 | 4 | from butterfree.constants.columns import TIMESTAMP_COLUMN 5 | 6 | 7 | @fixture 8 | def feature_set_dataframe(spark_context, spark_session): 9 | data = [ 10 | {"id": 1, TIMESTAMP_COLUMN: 0, "feature": 100}, 11 | {"id": 2, TIMESTAMP_COLUMN: 0, "feature": 200}, 12 | {"id": 1, TIMESTAMP_COLUMN: 1, "feature": 110}, 13 | {"id": 1, TIMESTAMP_COLUMN: 2, "feature": 120}, 14 | ] 15 | return spark_session.read.json(spark_context.parallelize(data, 1)) 16 | 17 | 18 | 
@fixture 19 | def feature_set_without_ts(spark_context, spark_session): 20 | data = [ 21 | {"id": 1, "feature": 100}, 22 | {"id": 2, "feature": 200}, 23 | {"id": 1, "feature": 110}, 24 | {"id": 1, "feature": 120}, 25 | ] 26 | return spark_session.read.json(spark_context.parallelize(data, 1)) 27 | 28 | 29 | @fixture 30 | def feature_set_empty(spark_context, spark_session): 31 | 32 | field = [StructField("field1", StringType(), True)] 33 | schema = StructType(field) 34 | 35 | return spark_session.createDataFrame(spark_context.emptyRDD(), schema) 36 | -------------------------------------------------------------------------------- /tests/unit/butterfree/validations/test_basic_validation.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from butterfree.validations import BasicValidation 8 | 9 | 10 | def test_validate_without_column_ts(feature_set_without_ts): 11 | check = BasicValidation(feature_set_without_ts) 12 | 13 | with pytest.raises(ValueError): 14 | check.validate_column_ts() 15 | 16 | 17 | def test_validate_df_is_empty_with_none_dataframe(): 18 | validation = BasicValidation(None) 19 | 20 | with pytest.raises(ValueError, match="DataFrame can't be None."): 21 | validation.validate_df_is_empty() 22 | 23 | 24 | def test_validate_df_is_empty_with_empty_dataframe(spark_session): 25 | df = spark_session.createDataFrame([], "id INT") 26 | validation = BasicValidation(df) 27 | 28 | with pytest.raises(ValueError, match="DataFrame can't be empty."): 29 | validation.validate_df_is_empty() 30 | 31 | 32 | def test_validate_df_is_empty_with_non_empty_dataframe(spark_session): 33 | df = spark_session.createDataFrame([(1,)], "id INT") 34 | validation = BasicValidation(df) 35 | validation.validate_df_is_empty() 36 | 37 | 38 | # If it's DBR < 13.3 (spark < 3.4.1) it will break. Every ConnectDataFrame has isEmpty 39 | @pytest.mark.parametrize( 40 | "is_empty, has_is_empty, dataframe_type", 41 | [ 42 | (True, True, DataFrame), 43 | (False, True, DataFrame), 44 | (True, False, DataFrame), 45 | (False, False, DataFrame), 46 | # This module `pyspark.sql.connect.dataframe.DataFrame` always has isEmpty 47 | # However, it does not have `rdd` 48 | (True, True, ConnectDataFrame), 49 | (False, True, ConnectDataFrame), 50 | ], 51 | ) 52 | def test_is_empty_permutations(is_empty, has_is_empty, dataframe_type): 53 | df = MagicMock(spec=dataframe_type) 54 | 55 | if has_is_empty: 56 | df.isEmpty.return_value = is_empty 57 | else: 58 | delattr(df, "isEmpty") 59 | df.rdd.isEmpty.return_value = is_empty 60 | 61 | validation = BasicValidation(df) 62 | assert validation._is_empty() == is_empty 63 | --------------------------------------------------------------------------------