├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── python-publish.yml │ └── run-tests-in-docker.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── check-engine-lib ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── checkengine ├── _constraints │ ├── _Constraint.py │ ├── _NotNull.py │ ├── _Numbers.py │ ├── _OneOf.py │ ├── _StatColumn.py │ ├── _TextLength.py │ ├── _TextRegex.py │ └── _Unique.py └── validate_df.py ├── conftest.py ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── spark ├── AssertResult.py ├── __init__.py ├── assert_df.py └── spark_session.py ├── test_between_integer.py ├── test_max_integer.py ├── test_mean_value.py ├── test_median_value.py ├── test_min_integer.py ├── test_multi_contraint.py ├── test_not_null.py ├── test_one_of.py ├── test_spark_env.py ├── test_string_length.py ├── test_string_matches.py ├── test_uniqueness.py └── test_validation_without_rules.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Actual behavior** 20 | A clear and concise description of what you see instead of the expected behavior. 21 | 22 | **Screenshots or console log** 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | **Environment (please complete the following information):** 26 | - OS: 27 | - Python version 28 | - Spark version 29 | - Python packages installed in the runtime environment 30 | - check-engine version 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Build 17 | uses: abatilo/actions-poetry@v1.5.0 18 | with: 19 | python_version: 3.8.0 20 | poetry_version: 1.0 21 | working_directory: ./check-engine-lib 22 | args: build 23 | - name: Publish distribution 📦 to PyPI 24 | uses: pypa/gh-action-pypi-publish@master 25 | with: 26 | password: ${{ secrets.pypi_token }} 27 | packages_dir: ./check-engine-lib/dist 28 | -------------------------------------------------------------------------------- /.github/workflows/run-tests-in-docker.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Run tests in Docker 13 | run: docker build -t check-engine-test check-engine-lib/. && docker run check-engine-test 14 | - name: Build 15 | uses: abatilo/actions-poetry@v1.5.0 16 | with: 17 | python_version: 3.8.0 18 | poetry_version: 1.0 19 | working_directory: ./check-engine-lib 20 | args: build 21 | - name: Publish distribution 📦 to Test PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.test_pypi_token }} 25 | repository_url: https://test.pypi.org/legacy/ 26 | packages_dir: ./check-engine-lib/dist 27 | skip_existing: true 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of conduct 2 | 3 | ## Our Standards 4 | 5 | Examples of behavior that contributes to creating a positive environment include: 6 | 7 | * Using welcoming and inclusive language 8 | * Being respectful of differing viewpoints and experiences 9 | * Gracefully accepting constructive criticism 10 | * Focusing on what is best for the community 11 | * Showing empathy towards other community members 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 16 | * Trolling, insulting/derogatory comments, and personal or political attacks 17 | * Public or private harassment 18 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 19 | * Other conduct which could reasonably be considered inappropriate in a professional setting 20 | 21 | ## Our Responsibilities 22 | 23 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 24 | 25 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
26 | 27 | ## Scope 28 | 29 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 30 | 31 | ## Enforcement 32 | 33 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project admin (https://twitter.com/mikulskibartosz). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 34 | 35 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 36 | 37 | ## Attribution 38 | 39 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 40 | 41 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq 42 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | 1. Before contributing please create GitHub issue (if it is not created already) and discuss your case and the solution with the authors. 4 | 5 | 2. Propose your changes via pull request. 6 | 7 | 3. Please write descriptive messages in your commits and pull requests so that we can understand all easily. 8 | 9 | 4. All changes must be covered by **passing** tests. 10 | 11 | 5. Remember to update README.md files if necessary. 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Bartosz Mikulski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | 3 | The goal of this project is to implement a data validation library for PySpark. The library should detect the incorrect structure of the data, unexpected values in columns, and anomalies in the data. 4 | 5 | ## How to install 6 | 7 | ``` 8 | pip install checkengine==0.2.0 9 | ``` 10 | 11 | ## How to use 12 | 13 | ``` 14 | from checkengine.validate_df import ValidateSparkDataFrame 15 | 16 | result = ValidateSparkDataFrame(spark_session, spark_data_frame) \ 17 | .is_not_null("column_name") \ 18 | .are_not_null(["column_name_2", "column_name_3"]) \ 19 | .is_min("numeric_column", 10) \ 20 | .is_max("numeric_column", 20) \ 21 | .is_unique("column_name") \ 22 | .are_unique(["column_name_2", "column_name_3"]) \ 23 | .is_between("numeric_column_2", 10, 15) \ 24 | .has_length_between("text_column", 0, 10) \ 25 | .mean_column_value("numeric_column", 10, 20) \ 26 | .median_column_value("numeric_column", 5, 15) \ 27 | .text_matches_regex("text_column", "^[a-z]{3,10}$") \ 28 | .one_of("text_column", ["value_a", "value_b"]) \ 29 | .one_of("numeric_column", [123, 456]) \ 30 | .execute() 31 | 32 | result.correct_data #rows that passed the validation 33 | result.erroneous_data #rows rejected during the validation 34 | result.errors #a summary of validation errors (three fields: column_name, constraint_name, number_of_errors) 35 | ``` 36 | 37 | ## How to build 38 | 39 | 1. Install the Poetry build tool. 40 | 41 | 2. Run the following commands: 42 | 43 | ``` 44 | cd check-engine-lib 45 | poetry build 46 | ``` 47 | 48 | ## How to test locally 49 | 50 | ### Run all tests 51 | 52 | ``` 53 | cd check-engine-lib 54 | poetry run pytest tests/ 55 | ``` 56 | 57 | ### Run a single test file 58 | 59 | ``` 60 | cd check-engine-lib 61 | poetry run pytest tests/test_between_integer.py 62 | ``` 63 | 64 | ### Run a single test method 65 | 66 | ``` 67 | cd check-engine-lib 68 | poetry run pytest tests/test_between_integer.py -k 'test_should_return_df_without_changes_if_all_are_between' 69 | ``` 70 | 71 | ## How to test in Docker 72 | 73 | ``` 74 | docker build -t check-engine-test check-engine-lib/. 
&& docker run check-engine-test 75 | ``` 76 | -------------------------------------------------------------------------------- /check-engine-lib/.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /dist/ 3 | /requirements.txt 4 | /venv/ -------------------------------------------------------------------------------- /check-engine-lib/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.1-slim 2 | 3 | 4 | ENV PYTHONUNBUFFERED=1 \ 5 | # prevents python creating .pyc files 6 | PYTHONDONTWRITEBYTECODE=1 \ 7 | \ 8 | # pip 9 | PIP_NO_CACHE_DIR=off \ 10 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 11 | PIP_DEFAULT_TIMEOUT=100 \ 12 | \ 13 | # poetry 14 | # https://python-poetry.org/docs/configuration/#using-environment-variables 15 | POETRY_VERSION=1.0.3 \ 16 | # make poetry install to this location 17 | POETRY_HOME="/opt/poetry" \ 18 | # make poetry create the virtual environment in the project's root 19 | # it gets named `.venv` 20 | POETRY_VIRTUALENVS_IN_PROJECT=true \ 21 | # do not ask any interactive question 22 | POETRY_NO_INTERACTION=1 \ 23 | \ 24 | # paths 25 | # this is where our requirements + virtual environment will live 26 | PYSETUP_PATH="/opt/pysetup" \ 27 | VENV_PATH="/opt/pysetup/.venv" 28 | 29 | 30 | # prepend poetry and venv to path 31 | ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" 32 | 33 | RUN apt-get update \ 34 | && apt-get install --no-install-recommends -y \ 35 | # deps for installing poetry 36 | curl \ 37 | wget \ 38 | # deps for building python deps 39 | build-essential 40 | 41 | RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python 42 | 43 | RUN cd /opt && wget -c --header "Cookie:oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz 44 | RUN tar -xzf /opt/jdk-8u131-linux-x64.tar.gz -C /opt && ln -s /opt/jdk1.8.0_131 /opt/jdk 45 | 46 | ENV JAVA_HOME /opt/jdk 47 | ENV PATH ${PATH}:${JAVA_HOME}/bin 48 | 49 | RUN mkdir -p /opt/spark \ 50 | && cd /opt/spark \ 51 | && curl https://mirrors.hostingromania.ro/apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz -o spark.tgz \ 52 | && tar xf spark.tgz 53 | 54 | WORKDIR /app 55 | COPY checkengine ./checkengine 56 | COPY ./tests ./tests 57 | COPY ./pyproject.toml poetry.lock ./ 58 | COPY ./conftest.py ./ 59 | COPY ./README.md ./ 60 | 61 | RUN poetry install 62 | 63 | ENV SPARK_HOME=/opt/spark/spark-3.0.3-bin-hadoop2.7 64 | 65 | CMD poetry run pytest -s --cov=checkengine --cov-branch --cov-fail-under=80 tests/ -------------------------------------------------------------------------------- /check-engine-lib/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Bartosz Mikulski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /check-engine-lib/README.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | 3 | The goal of this project is to implement a data validation library for PySpark. The library should detect the incorrect structure of the data, unexpected values in columns, and anomalies in the data. 4 | 5 | ## How to install 6 | 7 | THERE IS NO PACKAGE YET!!! 8 | 9 | ## How to use 10 | 11 | ``` 12 | from checkengine.validate_df import ValidateSparkDataFrame 13 | 14 | result = ValidateSparkDataFrame(spark_session, spark_data_frame) \ 15 | .is_not_null("column_name") \ 16 | .are_not_null(["column_name_2", "column_name_3"]) \ 17 | .is_min("numeric_column", 10) \ 18 | .is_max("numeric_column", 20) \ 19 | .is_unique("column_name") \ 20 | .are_unique(["column_name_2", "column_name_3"]) \ 21 | .is_between("numeric_column_2", 10, 15) \ 22 | .has_length_between("text_column", 0, 10) \ 23 | .mean_column_value("numeric_column", 10, 20) \ 24 | .median_column_value("numeric_column", 5, 15) \ 25 | .text_matches_regex("text_column", "^[a-z]{3,10}$") \ 26 | .one_of("text_column", ["value_a", "value_b"]) \ 27 | .one_of("numeric_column", [123, 456]) \ 28 | .execute() 29 | 30 | result.correct_data #rows that passed the validation 31 | result.erroneous_data #rows rejected during the validation 32 | result.errors #a summary of validation errors (three fields: column_name, constraint_name, number_of_errors) 33 | ``` 34 | 35 | ## How to build 36 | 37 | 1. Install the Poetry build tool. 38 | 39 | 2. Run the following commands: 40 | 41 | ``` 42 | cd check-engine-lib 43 | poetry build 44 | ``` 45 | 46 | ## How to test locally 47 | 48 | ### Run all tests 49 | 50 | ``` 51 | cd check-engine-lib 52 | poetry run pytest tests/ 53 | ``` 54 | 55 | ### Run a single test file 56 | 57 | ``` 58 | cd check-engine-lib 59 | poetry run pytest tests/test_between_integer.py 60 | ``` 61 | 62 | ### Run a single test method 63 | 64 | ``` 65 | cd check-engine-lib 66 | poetry run pytest tests/test_between_integer.py -k 'test_should_return_df_without_changes_if_all_are_between' 67 | ``` 68 | 69 | ## How to test in Docker 70 | 71 | ``` 72 | docker build -t check-engine-test check-engine-lib/. 
&& docker run check-engine-test 73 | ``` 74 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Constraint.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | from abc import ABC, abstractmethod 3 | import random 4 | import string 5 | 6 | from pyspark.sql import DataFrame 7 | 8 | 9 | def _generate_constraint_column_name(constraint_type, column_name): 10 | random_suffix = ''.join(random.choice(string.ascii_lowercase) for i in range(12)) 11 | return f"__checkengine__{column_name}_{constraint_type}_{random_suffix}" 12 | 13 | 14 | class _Constraint(ABC): 15 | def __init__(self, column_name: str): 16 | self.column_name = column_name 17 | self.constraint_column_name = _generate_constraint_column_name(self.constraint_name(), column_name) 18 | 19 | @abstractmethod 20 | def constraint_name(self): 21 | pass 22 | 23 | @abstractmethod 24 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 25 | return data_frame 26 | 27 | @abstractmethod 28 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 29 | return data_frame 30 | 31 | @abstractmethod 32 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 33 | return data_frame 34 | 35 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 36 | return self.column_name in df_columns, f"There is no '{self.column_name}' column" 37 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_NotNull.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _NotNull(_Constraint): 7 | def __init__(self, column_name: str): 8 | super().__init__(column_name) 9 | 10 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 11 | return data_frame 12 | 13 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame.filter(f"{self.column_name} IS NOT NULL") 15 | 16 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 17 | return data_frame.filter(f"{self.column_name} IS NULL") 18 | 19 | def constraint_name(self): 20 | return "not_null" 21 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Numbers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List, Tuple 3 | 4 | from pyspark.sql import DataFrame 5 | 6 | from checkengine._constraints._Constraint import _Constraint 7 | 8 | 9 | class _Number(_Constraint, ABC): 10 | def __init__(self, column_name: str): 11 | super().__init__(column_name) 12 | 13 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame 15 | 16 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 17 | parent_validation_result = super().validate_self(data_frame, df_columns) 18 | if not parent_validation_result[0]: 19 | return parent_validation_result 20 | else: 21 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 22 | return column_type in ["tinyint", "smallint", "int", "bigint"], f"Column {self.column_name} is not a number" 23 | 24 | 25 | class _Min(_Number): 26 | def __init__(self, column_name: str, value: 
int): 27 | super().__init__(column_name) 28 | self.value = value 29 | 30 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 31 | return data_frame.filter(f"{self.column_name} >= {self.value}") 32 | 33 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 34 | return data_frame.filter(f"{self.column_name} < {self.value}") 35 | 36 | def constraint_name(self): 37 | return "min" 38 | 39 | 40 | class _Max(_Number): 41 | def __init__(self, column_name: str, value: int): 42 | super().__init__(column_name) 43 | self.value = value 44 | 45 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 46 | return data_frame.filter(f"{self.column_name} <= {self.value}") 47 | 48 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 49 | return data_frame.filter(f"{self.column_name} > {self.value}") 50 | 51 | def constraint_name(self): 52 | return "max" 53 | 54 | 55 | class _Between(_Number): 56 | def __init__(self, column_name: str, lower_bound: int, upper_bound: int): 57 | super().__init__(column_name) 58 | self.lower_bound = lower_bound 59 | self.upper_bound = upper_bound 60 | 61 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 62 | return data_frame.filter(f"{self.column_name} >= {self.lower_bound} AND {self.column_name} <= {self.upper_bound}") 63 | 64 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 65 | return data_frame.filter(f"{self.column_name} < {self.lower_bound} OR {self.column_name} > {self.upper_bound}") 66 | 67 | def constraint_name(self): 68 | return "between" 69 | 70 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 71 | parent_validation_result = super().validate_self(data_frame, df_columns) 72 | if not parent_validation_result[0]: 73 | return parent_validation_result 74 | else: 75 | return self.lower_bound <= self.upper_bound, f"Upper bound ({self.upper_bound}) cannot be lower than lower bound ({self.lower_bound})." 76 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_OneOf.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _OneOf(_Constraint): 7 | def __init__(self, column_name: str, allowed_values: list): 8 | super().__init__(column_name) 9 | self.allowed_values = allowed_values 10 | 11 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 12 | return data_frame 13 | 14 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 15 | return data_frame.filter(data_frame[self.column_name].isin(*self.allowed_values)) 16 | 17 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 18 | return data_frame.filter(~data_frame[self.column_name].isin(*self.allowed_values)) 19 | 20 | def constraint_name(self): 21 | return "one_of" 22 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_StatColumn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Checks that require applying a statistical function to all values in a single column. 
3 | """ 4 | from abc import ABC 5 | 6 | from pyspark.sql import DataFrame 7 | import pyspark.sql.functions as F 8 | 9 | from checkengine._constraints._Numbers import _Number 10 | 11 | 12 | class _StatColumn(_Number, ABC): 13 | def __init__(self, column_name: str, lower_bound: float, upper_bound: float): 14 | super().__init__(column_name) 15 | self.lower_bound = lower_bound 16 | self.upper_bound = upper_bound 17 | 18 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 19 | return data_frame.filter(f"{self.constraint_column_name} >= {self.lower_bound} AND {self.constraint_column_name} <= {self.upper_bound}") 20 | 21 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 22 | return data_frame.filter(f"{self.constraint_column_name} < {self.lower_bound} OR {self.constraint_column_name} > {self.upper_bound}") 23 | 24 | 25 | class _MeanColumn(_StatColumn): 26 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 27 | average: DataFrame = data_frame \ 28 | .groupby() \ 29 | .avg(self.column_name) \ 30 | .withColumnRenamed(f"avg({self.column_name})", self.constraint_column_name) 31 | 32 | return data_frame.crossJoin(average) 33 | 34 | def constraint_name(self): 35 | return "mean_between" 36 | 37 | 38 | class _MedianColumn(_StatColumn): 39 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 40 | median = F.expr(f"percentile_approx({self.column_name}, 0.5)") 41 | 42 | average: DataFrame = data_frame \ 43 | .groupby() \ 44 | .agg(median.alias(self.constraint_column_name)) 45 | 46 | return data_frame.crossJoin(average) 47 | 48 | def constraint_name(self): 49 | return "median_between" 50 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_TextLength.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | 7 | 8 | class _TextLength(_Constraint): 9 | def __init__(self, column_name: str, lower_bound: int, upper_bound: int): 10 | super().__init__(column_name) 11 | self.lower_bound = lower_bound 12 | self.upper_bound = upper_bound 13 | 14 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 15 | return data_frame 16 | 17 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 18 | parent_validation_result = super().validate_self(data_frame, df_columns) 19 | if not parent_validation_result[0]: 20 | return parent_validation_result 21 | else: 22 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 23 | if column_type != 'string': 24 | return False, f"Column {self.column_name} is not a string." 25 | else: 26 | return self.lower_bound <= self.upper_bound, f"Upper bound ({self.upper_bound}) cannot be lower than lower bound ({self.lower_bound})." 
27 | 28 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 29 | return data_frame.filter(f"LENGTH({self.column_name}) >= {self.lower_bound} AND LENGTH({self.column_name}) <= {self.upper_bound}") 30 | 31 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 32 | return data_frame.filter(f"LENGTH({self.column_name}) < {self.lower_bound} OR LENGTH({self.column_name}) > {self.upper_bound}") 33 | 34 | def constraint_name(self): 35 | return "text_length" 36 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_TextRegex.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | 7 | 8 | class _TextRegex(_Constraint): 9 | def __init__(self, column_name: str, regex: str): 10 | super().__init__(column_name) 11 | self.regex = regex 12 | 13 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame.withColumn(self.constraint_column_name, data_frame[self.column_name].rlike(self.regex)) 15 | 16 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 17 | parent_validation_result = super().validate_self(data_frame, df_columns) 18 | if not parent_validation_result[0]: 19 | return parent_validation_result 20 | else: 21 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 22 | return column_type == 'string', f"Column {self.column_name} is not a string." 23 | 24 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 25 | return data_frame.filter(f"{self.constraint_column_name} = TRUE") 26 | 27 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 28 | return data_frame.filter(f"{self.constraint_column_name} = FALSE") 29 | 30 | def constraint_name(self): 31 | return "regex_match" 32 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Unique.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _Unique(_Constraint): 7 | def __init__(self, column_name: str): 8 | super().__init__(column_name) 9 | 10 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 11 | count_repetitions: DataFrame = data_frame \ 12 | .groupby(self.column_name) \ 13 | .count() \ 14 | .withColumnRenamed("count", self.constraint_column_name) 15 | 16 | return data_frame.join(count_repetitions, self.column_name, "left") 17 | 18 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 19 | return data_frame.filter(f"{self.constraint_column_name} == 1") 20 | 21 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 22 | return data_frame.filter(f"{self.constraint_column_name} > 1") 23 | 24 | def constraint_name(self): 25 | return "unique" 26 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/validate_df.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List 2 | 3 | from pyspark.sql import DataFrame, SparkSession 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | from checkengine._constraints._NotNull import _NotNull 7 | from 
._constraints._Numbers import _Min, _Max, _Between 8 | from checkengine._constraints._OneOf import _OneOf 9 | from checkengine._constraints._TextLength import _TextLength 10 | from checkengine._constraints._TextRegex import _TextRegex 11 | from checkengine._constraints._Unique import _Unique 12 | from checkengine._constraints._StatColumn import _MeanColumn, _MedianColumn 13 | 14 | 15 | class ValidationError(NamedTuple): 16 | """Describes a single validation error. 17 | 18 | This object contains the name of the violated constraint, the name of the column that contains incorrect data, and the number of rows that violate the rule. 19 | """ 20 | column_name: str 21 | constraint_name: str 22 | number_of_errors: int 23 | 24 | 25 | class ValidationResult(NamedTuple): 26 | """Contains both the correct data and the data that has not passed the validation. 27 | 28 | In addition to the erroneous data, it also contains a list of validation errors. 29 | """ 30 | correct_data: DataFrame 31 | erroneous_data: DataFrame 32 | errors: List[ValidationError] 33 | 34 | 35 | class ValidateSparkDataFrame: 36 | """Describes the validation rules of a Spark DataFrame and performs the validation. 37 | 38 | // TODO update the example when there is a new validation rule 39 | Usage example: 40 | ValidateSparkDataFrame(spark_session, data_frame) \ 41 | .is_not_null("column_name") \ 42 | .are_not_null(["column_name_2", "column_name_3"]) \ 43 | .is_min("numeric_column", 10) \ 44 | .is_max("numeric_column", 20) \ 45 | .is_unique("column_name") \ 46 | .are_unique(["column_name_2", "column_name_3"]) \ 47 | .execute() 48 | """ 49 | 50 | def __init__(self, spark: SparkSession, data_frame: DataFrame): 51 | """ValidateSparkDataFrame 52 | 53 | Args: 54 | spark: a Spark Session 55 | data_frame: the DataFrame to be validated 56 | """ 57 | self.spark: SparkSession = spark 58 | self.df: DataFrame = data_frame 59 | self.input_columns: List[str] = data_frame.columns 60 | self.constraints: List[_Constraint] = [] 61 | 62 | def is_unique(self, column_name: str): 63 | """Defines a constraint that checks whether the given column contains only unique values. 64 | 65 | Args: 66 | column_name: the name of the column 67 | 68 | Returns: 69 | self 70 | 71 | Raises: 72 | ValueError: if a unique constraint for a given column already exists. 73 | """ 74 | self._add_constraint(_Unique(column_name)) 75 | return self 76 | 77 | def are_unique(self, column_names: List[str]): 78 | """Defines constraints that check whether given columns contain only unique values. 79 | 80 | Args: 81 | column_names: a list of column names 82 | 83 | Returns: 84 | self 85 | """ 86 | for column_name in column_names: 87 | self.is_unique(column_name) 88 | return self 89 | 90 | def is_not_null(self, column_name: str): 91 | """Defines a constraint that does not allow null values in a given column. 92 | 93 | Args: 94 | column_name: the column name 95 | 96 | Returns: 97 | self 98 | """ 99 | self._add_constraint(_NotNull(column_name)) 100 | return self 101 | 102 | def are_not_null(self, column_names: List[str]): 103 | """Defines constraints that don't allow null values in any of the given columns. 104 | 105 | Args: 106 | column_names: a list of column names 107 | 108 | Returns: 109 | self 110 | """ 111 | for column_name in column_names: 112 | self.is_not_null(column_name) 113 | return self 114 | 115 | def is_min(self, column_name: str, value: int): 116 | """Defines a constraint that checks whether the given column contains values equal to or larger than a given integer. 
117 | 118 | Args: 119 | column_name: the column name 120 | value: the minimal value 121 | 122 | Returns: 123 | self 124 | """ 125 | self._add_constraint(_Min(column_name, value)) 126 | return self 127 | 128 | def is_max(self, column_name: str, value: int): 129 | """Defines a constraint that checks whether the given column contains values equal to or smaller than a given integer. 130 | 131 | Args: 132 | column_name: the column name 133 | value: the maximal value 134 | 135 | Returns: 136 | self 137 | """ 138 | self._add_constraint(_Max(column_name, value)) 139 | return self 140 | 141 | def is_between(self, column_name, lower_bound, upper_bound): 142 | """Defines a constraint that checks whether the given column contains a value equal to or between the lower and upper bound. 143 | 144 | Args: 145 | column_name: the column name 146 | lower_bound: the lower bound of the range 147 | upper_bound: the upper bound of the range 148 | 149 | Returns: 150 | self 151 | """ 152 | self._add_constraint(_Between(column_name, lower_bound, upper_bound)) 153 | return self 154 | 155 | def has_length_between(self, column_name, lower_bound, upper_bound): 156 | """Defines a constraint that checks whether the given column contains text whose length is equal to or between the lower and upper bound. 157 | 158 | Args: 159 | column_name: the column name 160 | lower_bound: the lower bound of the text length 161 | upper_bound: the upper bound of the text length 162 | 163 | Returns: 164 | self 165 | """ 166 | self._add_constraint(_TextLength(column_name, lower_bound, upper_bound)) 167 | return self 168 | 169 | def text_matches_regex(self, column_name, regex): 170 | """Defines a constraint that checks whether the content of a given column matches the given regex. 171 | 172 | Args: 173 | column_name: the column name 174 | regex: the regex 175 | 176 | Returns: 177 | self 178 | """ 179 | self._add_constraint(_TextRegex(column_name, regex)) 180 | return self 181 | 182 | def one_of(self, column_name, allowed_values: list): 183 | """Defines a constraint that checks whether the column value is equal to one of the given values. 184 | 185 | Args: 186 | column_name: the column name 187 | allowed_values: a list of allowed values, the type should match the column type 188 | 189 | Returns: 190 | self 191 | """ 192 | self._add_constraint(_OneOf(column_name, allowed_values)) 193 | return self 194 | 195 | def mean_column_value(self, column_name: str, min_mean: float, max_mean: float): 196 | """Defines a constraint that checks whether the average of all values in the column is between the given min and max value (inclusive). 197 | 198 | Args: 199 | column_name: the column name 200 | min_mean: the expected min value 201 | max_mean: the expected max value 202 | 203 | Returns: 204 | self 205 | """ 206 | self._add_constraint(_MeanColumn(column_name, min_mean, max_mean)) 207 | return self 208 | 209 | def median_column_value(self, column_name: str, min_median: float, max_median: float): 210 | """Defines a constraint that checks whether the median of all values in the column is between the given min and max value (inclusive). 
211 | 212 | Args: 213 | column_name: the column name 214 | min_median: the expected min value 215 | max_median: the expected max value 216 | 217 | Returns: 218 | self 219 | """ 220 | self._add_constraint(_MedianColumn(column_name, min_median, max_median)) 221 | return self 222 | 223 | def execute(self) -> ValidationResult: 224 | """Returns a named tuple containing the data that passed the validation, the data that was rejected (only unique rows), and a list of violated constraints. 225 | 226 | Note that the order of rows and constraints is not preserved. 227 | 228 | Returns: 229 | an instance of ValidationResult 230 | 231 | Raises: 232 | ValueError: if a constraint has been defined using a non-existing column. 233 | """ 234 | self._validate_constraints() 235 | 236 | if self.constraints: 237 | for constraint in self.constraints: 238 | self.df = constraint.prepare_df_for_check(self.df) 239 | 240 | correct_output = self.df 241 | errors = [] 242 | 243 | for constraint in self.constraints: 244 | correct_output = constraint.filter_success(correct_output) 245 | number_of_failures = constraint.filter_failure(self.df).count() 246 | 247 | if number_of_failures > 0: 248 | errors.append(ValidationError(constraint.column_name, constraint.constraint_name(), number_of_failures)) 249 | 250 | correct_output = correct_output.select(self.input_columns) 251 | incorrect_output = self.df.select(self.input_columns).subtract(correct_output) 252 | 253 | return ValidationResult(correct_output, incorrect_output, errors) 254 | else: 255 | return ValidationResult(self.df, self.spark.createDataFrame([], self.df.schema), []) 256 | 257 | def _add_constraint(self, constraint: _Constraint) -> None: 258 | existing = filter(lambda c: c.constraint_name() == constraint.constraint_name() and c.column_name == constraint.column_name, self.constraints) 259 | if list(existing): 260 | raise ValueError(f"A {constraint.constraint_name()} constraint for column {constraint.column_name} already exists.") 261 | 262 | self.constraints.append(constraint) 263 | 264 | def _validate_constraints(self) -> None: 265 | columns = self.df.columns 266 | 267 | errors = [] 268 | for constraint in self.constraints: 269 | is_correct, error_message = constraint.validate_self(self.df, columns) 270 | if not is_correct: 271 | errors.append(error_message) 272 | 273 | if errors: 274 | raise ValueError(", ".join(errors)) 275 | -------------------------------------------------------------------------------- /check-engine-lib/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | pytest_plugins = [ 3 | "tests.spark.spark_session", 4 | ] 5 | -------------------------------------------------------------------------------- /check-engine-lib/poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | category = "dev" 3 | description = "Atomic file writes." 
4 | marker = "sys_platform == \"win32\"" 5 | name = "atomicwrites" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | version = "1.4.0" 9 | 10 | [[package]] 11 | category = "dev" 12 | description = "Classes Without Boilerplate" 13 | name = "attrs" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | version = "21.2.0" 17 | 18 | [package.extras] 19 | dev = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] 20 | docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] 21 | tests = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] 22 | tests_no_zope = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] 23 | 24 | [[package]] 25 | category = "dev" 26 | description = "behave is behaviour-driven development, Python style" 27 | name = "behave" 28 | optional = false 29 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 30 | version = "1.2.6" 31 | 32 | [package.dependencies] 33 | parse = ">=1.8.2" 34 | parse-type = ">=0.4.2" 35 | six = ">=1.11" 36 | 37 | [package.extras] 38 | develop = ["coverage", "pytest (>=3.0)", "pytest-cov", "tox", "invoke (>=0.21.0)", "path.py (>=8.1.2)", "pycmd", "pathlib", "modernize (>=0.5)", "pylint"] 39 | docs = ["sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6)"] 40 | 41 | [[package]] 42 | category = "dev" 43 | description = "Cross-platform colored terminal text." 44 | marker = "sys_platform == \"win32\"" 45 | name = "colorama" 46 | optional = false 47 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 48 | version = "0.4.4" 49 | 50 | [[package]] 51 | category = "dev" 52 | description = "Code coverage measurement for Python" 53 | name = "coverage" 54 | optional = false 55 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 56 | version = "5.5" 57 | 58 | [package.extras] 59 | toml = ["toml"] 60 | 61 | [[package]] 62 | category = "dev" 63 | description = "More routines for operating on iterables, beyond itertools" 64 | name = "more-itertools" 65 | optional = false 66 | python-versions = ">=3.5" 67 | version = "8.8.0" 68 | 69 | [[package]] 70 | category = "dev" 71 | description = "NumPy is the fundamental package for array computing with Python." 
72 | name = "numpy" 73 | optional = false 74 | python-versions = ">=3.7" 75 | version = "1.21.0" 76 | 77 | [[package]] 78 | category = "dev" 79 | description = "Core utilities for Python packages" 80 | name = "packaging" 81 | optional = false 82 | python-versions = ">=3.6" 83 | version = "21.0" 84 | 85 | [package.dependencies] 86 | pyparsing = ">=2.0.2" 87 | 88 | [[package]] 89 | category = "dev" 90 | description = "Powerful data structures for data analysis, time series, and statistics" 91 | name = "pandas" 92 | optional = false 93 | python-versions = ">=3.7.1" 94 | version = "1.3.0" 95 | 96 | [package.dependencies] 97 | numpy = ">=1.17.3" 98 | python-dateutil = ">=2.7.3" 99 | pytz = ">=2017.3" 100 | 101 | [package.extras] 102 | test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"] 103 | 104 | [[package]] 105 | category = "dev" 106 | description = "parse() is the opposite of format()" 107 | name = "parse" 108 | optional = false 109 | python-versions = "*" 110 | version = "1.19.0" 111 | 112 | [[package]] 113 | category = "dev" 114 | description = "Simplifies to build parse types based on the parse module" 115 | name = "parse-type" 116 | optional = false 117 | python-versions = ">=2.6, !=3.0.*, !=3.1.*" 118 | version = "0.5.2" 119 | 120 | [package.dependencies] 121 | parse = ">=1.8.4" 122 | six = ">=1.11" 123 | 124 | [package.extras] 125 | develop = ["coverage (>=4.4)", "pytest (>=3.2)", "pytest-cov", "tox (>=2.8)"] 126 | docs = ["sphinx (>=1.2)"] 127 | 128 | [[package]] 129 | category = "dev" 130 | description = "plugin and hook calling mechanisms for python" 131 | name = "pluggy" 132 | optional = false 133 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 134 | version = "0.13.1" 135 | 136 | [package.extras] 137 | dev = ["pre-commit", "tox"] 138 | 139 | [[package]] 140 | category = "dev" 141 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 142 | name = "py" 143 | optional = false 144 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 145 | version = "1.10.0" 146 | 147 | [[package]] 148 | category = "main" 149 | description = "Enables Python programs to dynamically access arbitrary Java objects" 150 | name = "py4j" 151 | optional = false 152 | python-versions = "*" 153 | version = "0.10.9" 154 | 155 | [[package]] 156 | category = "dev" 157 | description = "Python parsing module" 158 | name = "pyparsing" 159 | optional = false 160 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 161 | version = "2.4.7" 162 | 163 | [[package]] 164 | category = "main" 165 | description = "Apache Spark Python API" 166 | name = "pyspark" 167 | optional = false 168 | python-versions = "*" 169 | version = "3.0.3" 170 | 171 | [package.dependencies] 172 | py4j = "0.10.9" 173 | 174 | [package.extras] 175 | ml = ["numpy (>=1.7)"] 176 | mllib = ["numpy (>=1.7)"] 177 | sql = ["pandas (>=0.23.2)", "pyarrow (>=0.15.1)"] 178 | 179 | [[package]] 180 | category = "dev" 181 | description = "pytest: simple powerful testing with Python" 182 | name = "pytest" 183 | optional = false 184 | python-versions = ">=3.5" 185 | version = "5.4.3" 186 | 187 | [package.dependencies] 188 | atomicwrites = ">=1.0" 189 | attrs = ">=17.4.0" 190 | colorama = "*" 191 | more-itertools = ">=4.0.0" 192 | packaging = "*" 193 | pluggy = ">=0.12,<1.0" 194 | py = ">=1.5.0" 195 | wcwidth = "*" 196 | 197 | [package.extras] 198 | checkqa-mypy = ["mypy (v0.761)"] 199 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 200 | 201 | 
[[package]] 202 | category = "dev" 203 | description = "Pytest plugin for measuring coverage." 204 | name = "pytest-cov" 205 | optional = false 206 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 207 | version = "2.12.1" 208 | 209 | [package.dependencies] 210 | coverage = ">=5.2.1" 211 | pytest = ">=4.6" 212 | toml = "*" 213 | 214 | [package.extras] 215 | testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] 216 | 217 | [[package]] 218 | category = "dev" 219 | description = "Extensions to the standard Python datetime module" 220 | name = "python-dateutil" 221 | optional = false 222 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 223 | version = "2.8.1" 224 | 225 | [package.dependencies] 226 | six = ">=1.5" 227 | 228 | [[package]] 229 | category = "dev" 230 | description = "World timezone definitions, modern and historical" 231 | name = "pytz" 232 | optional = false 233 | python-versions = "*" 234 | version = "2021.1" 235 | 236 | [[package]] 237 | category = "dev" 238 | description = "Python 2 and 3 compatibility utilities" 239 | name = "six" 240 | optional = false 241 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 242 | version = "1.16.0" 243 | 244 | [[package]] 245 | category = "dev" 246 | description = "Python Library for Tom's Obvious, Minimal Language" 247 | name = "toml" 248 | optional = false 249 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 250 | version = "0.10.2" 251 | 252 | [[package]] 253 | category = "dev" 254 | description = "Measures the displayed width of unicode strings in a terminal" 255 | name = "wcwidth" 256 | optional = false 257 | python-versions = "*" 258 | version = "0.2.5" 259 | 260 | [metadata] 261 | content-hash = "0501f68fcc1c687a561a82f33a1e6fe871ce8dad067ccba85a1f963a69d46138" 262 | python-versions = "^3.8" 263 | 264 | [metadata.files] 265 | atomicwrites = [ 266 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 267 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 268 | ] 269 | attrs = [ 270 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 271 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 272 | ] 273 | behave = [ 274 | {file = "behave-1.2.6-py2.py3-none-any.whl", hash = "sha256:ebda1a6c9e5bfe95c5f9f0a2794e01c7098b3dde86c10a95d8621c5907ff6f1c"}, 275 | {file = "behave-1.2.6.tar.gz", hash = "sha256:b9662327aa53294c1351b0a9c369093ccec1d21026f050c3bd9b3e5cccf81a86"}, 276 | ] 277 | colorama = [ 278 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 279 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 280 | ] 281 | coverage = [ 282 | {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, 283 | {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, 284 | {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, 285 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = 
"sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, 286 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, 287 | {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, 288 | {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, 289 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, 290 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, 291 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, 292 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, 293 | {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, 294 | {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, 295 | {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, 296 | {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, 297 | {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, 298 | {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, 299 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, 300 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, 301 | {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, 302 | {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, 303 | {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, 304 | {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, 305 | {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, 306 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, 307 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, 308 | {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, 309 | {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, 310 | {file = 
"coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, 311 | {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, 312 | {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, 313 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, 314 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, 315 | {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, 316 | {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, 317 | {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, 318 | {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, 319 | {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, 320 | {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, 321 | {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, 322 | {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, 323 | {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, 324 | {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, 325 | {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, 326 | {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, 327 | {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, 328 | {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, 329 | {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, 330 | {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, 331 | {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, 332 | {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, 333 | {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, 334 | ] 335 | more-itertools = [ 336 | {file = "more-itertools-8.8.0.tar.gz", hash = "sha256:83f0308e05477c68f56ea3a888172c78ed5d5b3c282addb67508e7ba6c8f813a"}, 337 | {file = 
"more_itertools-8.8.0-py3-none-any.whl", hash = "sha256:2cf89ec599962f2ddc4d568a05defc40e0a587fbc10d5989713638864c36be4d"}, 338 | ] 339 | numpy = [ 340 | {file = "numpy-1.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d5caa946a9f55511e76446e170bdad1d12d6b54e17a2afe7b189112ed4412bb8"}, 341 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ac4fd578322842dbda8d968e3962e9f22e862b6ec6e3378e7415625915e2da4d"}, 342 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:598fe100b2948465cf3ed64b1a326424b5e4be2670552066e17dfaa67246011d"}, 343 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c55407f739f0bfcec67d0df49103f9333edc870061358ac8a8c9e37ea02fcd2"}, 344 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:75579acbadbf74e3afd1153da6177f846212ea2a0cc77de53523ae02c9256513"}, 345 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cc367c86eb87e5b7c9592935620f22d13b090c609f1b27e49600cd033b529f54"}, 346 | {file = "numpy-1.21.0-cp37-cp37m-win32.whl", hash = "sha256:d89b0dc7f005090e32bb4f9bf796e1dcca6b52243caf1803fdd2b748d8561f63"}, 347 | {file = "numpy-1.21.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd"}, 348 | {file = "numpy-1.21.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e"}, 349 | {file = "numpy-1.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bba474a87496d96e61461f7306fba2ebba127bed7836212c360f144d1e72ac54"}, 350 | {file = "numpy-1.21.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9"}, 351 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e4d5a86a5257843a18fb1220c5f1c199532bc5d24e849ed4b0289fb59fbd4d8f"}, 352 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:620732f42259eb2c4642761bd324462a01cdd13dd111740ce3d344992dd8492f"}, 353 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9205711e5440954f861ceeea8f1b415d7dd15214add2e878b4d1cf2bcb1a914"}, 354 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ad09f55cc95ed8d80d8ab2052f78cc21cb231764de73e229140d81ff49d8145e"}, 355 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a1f2fb2da242568af0271455b89aee0f71e4e032086ee2b4c5098945d0e11cf6"}, 356 | {file = "numpy-1.21.0-cp38-cp38-win32.whl", hash = "sha256:e58ddb53a7b4959932f5582ac455ff90dcb05fac3f8dcc8079498d43afbbde6c"}, 357 | {file = "numpy-1.21.0-cp38-cp38-win_amd64.whl", hash = "sha256:d2910d0a075caed95de1a605df00ee03b599de5419d0b95d55342e9a33ad1fb3"}, 358 | {file = "numpy-1.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a290989cd671cd0605e9c91a70e6df660f73ae87484218e8285c6522d29f6e38"}, 359 | {file = "numpy-1.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3537b967b350ad17633b35c2f4b1a1bbd258c018910b518c30b48c8e41272717"}, 360 | {file = "numpy-1.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ccc6c650f8700ce1e3a77668bb7c43e45c20ac06ae00d22bdf6760b38958c883"}, 361 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:709884863def34d72b183d074d8ba5cfe042bc3ff8898f1ffad0209161caaa99"}, 362 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bebab3eaf0641bba26039fb0b2c5bf9b99407924b53b1ea86e03c32c64ef5aef"}, 363 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf680682ad0a3bef56dae200dbcbac2d57294a73e5b0f9864955e7dd7c2c2491"}, 364 | {file = "numpy-1.21.0-cp39-cp39-win32.whl", hash = "sha256:d95d16204cd51ff1a1c8d5f9958ce90ae190be81d348b514f9be39f878b8044a"}, 365 | {file = "numpy-1.21.0-cp39-cp39-win_amd64.whl", hash = "sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e"}, 366 | {file = "numpy-1.21.0-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3c40e6b860220ed862e8097b8f81c9af6d7405b723f4a7af24a267b46f90e461"}, 367 | {file = "numpy-1.21.0.zip", hash = "sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce"}, 368 | ] 369 | packaging = [ 370 | {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, 371 | {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, 372 | ] 373 | pandas = [ 374 | {file = "pandas-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c81b8d91e9ae861eb4406b4e0f8d4dabbc105b9c479b3d1e921fba1d35b5b62a"}, 375 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08eeff3da6a188e24db7f292b39a8ca9e073bf841fbbeadb946b3ad5c19d843e"}, 376 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88864c1e28353b958b1f30e4193818519624ad9a1776921622a6a2a016d5d807"}, 377 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:872aa91e0f9ca913046ab639d4181a899f5e592030d954d28c2529b88756a736"}, 378 | {file = "pandas-1.3.0-cp37-cp37m-win32.whl", hash = "sha256:92835113a67cbd34747c198d41f09f4b63f6fe11ca5643baebc7ab1e30e89e95"}, 379 | {file = "pandas-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7d3cd2c99faa94d717ca00ea489264a291ad7209453dffbf059bfb7971fd3a61"}, 380 | ] 381 | parse = [ 382 | {file = "parse-1.19.0.tar.gz", hash = "sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b"}, 383 | ] 384 | parse-type = [ 385 | {file = "parse_type-0.5.2-py2.py3-none-any.whl", hash = "sha256:089a471b06327103865dfec2dd844230c3c658a4a1b5b4c8b6c16c8f77577f9e"}, 386 | {file = "parse_type-0.5.2.tar.gz", hash = "sha256:7f690b18d35048c15438d6d0571f9045cffbec5907e0b1ccf006f889e3a38c0b"}, 387 | ] 388 | pluggy = [ 389 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 390 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 391 | ] 392 | py = [ 393 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 394 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 395 | ] 396 | py4j = [ 397 | {file = "py4j-0.10.9-py2.py3-none-any.whl", hash = "sha256:859ba728a7bb43e9c2bf058832759fb97a598bb28cc12f34f5fc4abdec08ede6"}, 398 | {file = "py4j-0.10.9.tar.gz", hash = "sha256:36ec57f43ff8ced260a18aa9a4e46c3500a730cac8860e259cbaa546c2b9db2f"}, 399 | ] 400 | pyparsing = [ 401 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", 
hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 402 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 403 | ] 404 | pyspark = [ 405 | {file = "pyspark-3.0.3.tar.gz", hash = "sha256:c4499903e3d7289cf2b4bf7755fb32cf17922598f3b85a1c230860dec020eec4"}, 406 | ] 407 | pytest = [ 408 | {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, 409 | {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, 410 | ] 411 | pytest-cov = [ 412 | {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, 413 | {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, 414 | ] 415 | python-dateutil = [ 416 | {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, 417 | {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, 418 | ] 419 | pytz = [ 420 | {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, 421 | {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, 422 | ] 423 | six = [ 424 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 425 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 426 | ] 427 | toml = [ 428 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 429 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 430 | ] 431 | wcwidth = [ 432 | {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, 433 | {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, 434 | ] 435 | -------------------------------------------------------------------------------- /check-engine-lib/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "checkengine" 3 | version = "0.2.0" 4 | description = "Data-quality checks for PySpark" 5 | authors = ["Bartosz Mikulski "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/mikulskibartosz/check-engine" 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.8" 12 | pyspark = "3.0.3" 13 | 14 | [tool.poetry.dev-dependencies] 15 | pytest = "^5.2" 16 | pytest-cov = "^2.10.0" 17 | pandas = "^1.0.5" 18 | numpy = "^1.19.0" 19 | behave = "^1.2.6" 20 | 21 | [build-system] 22 | requires = ["poetry_core>=1.0.0"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /check-engine-lib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikulskibartosz/check-engine/383cf4106605cc6f94e800bdc707789c0cedbe95/check-engine-lib/tests/__init__.py 
-------------------------------------------------------------------------------- /check-engine-lib/tests/spark/AssertResult.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine.validate_df import ValidationResult, ValidationError 4 | from tests.spark.assert_df import AssertDf 5 | 6 | 7 | class AssertValidationResult: 8 | def __init__(self, *, column_name: str, constraint_name: str): 9 | self.column_name = column_name 10 | self.constraint_name = constraint_name 11 | 12 | def check(self, *, actual: ValidationResult, expected_correct: DataFrame, expected_erroneous: DataFrame): 13 | if expected_correct.count() == 0: 14 | AssertDf(actual.correct_data) \ 15 | .is_empty() \ 16 | .has_columns(expected_correct.columns) 17 | else: 18 | AssertDf(actual.correct_data, order_by_column=self.column_name) \ 19 | .contains_exactly(expected_correct.toPandas()) \ 20 | .has_columns(expected_correct.columns) 21 | 22 | if expected_erroneous.count() == 0: 23 | AssertDf(actual.erroneous_data) \ 24 | .is_empty() \ 25 | .has_columns(expected_erroneous.columns) 26 | else: 27 | AssertDf(actual.erroneous_data, order_by_column=self.column_name) \ 28 | .contains_exactly(expected_erroneous.toPandas()) \ 29 | .has_columns(expected_erroneous.columns) 30 | 31 | if expected_erroneous.count() == 0: 32 | assert actual.errors == [] 33 | else: 34 | assert actual.errors == [ValidationError(self.column_name, self.constraint_name, expected_erroneous.count())] 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import StructType, StructField, StringType, IntegerType 3 | 4 | single_string_column_schema = StructType([StructField("col1", StringType())]) 5 | two_string_columns_schema = StructType([StructField("col1", StringType()), StructField("col2", StringType())]) 6 | 7 | single_integer_column_schema = StructType([StructField("col1", IntegerType())]) 8 | two_integer_columns_schema = StructType([StructField("col1", IntegerType()), StructField("col2", IntegerType())]) 9 | 10 | 11 | def empty_string_df(spark_session: SparkSession): 12 | return spark_session.createDataFrame([], schema=single_string_column_schema) 13 | 14 | 15 | def empty_integer_df(spark_session: SparkSession): 16 | return spark_session.createDataFrame([], schema=single_integer_column_schema) 17 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/assert_df.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | 3 | from pandas import DataFrame as Pandas_df 4 | from pyspark.sql import DataFrame as Spark_df 5 | from numpy.testing import assert_array_equal 6 | 7 | 8 | class AssertDf: 9 | def __init__(self, df: Spark_df, order_by_column: Optional[Union[str, List[str]]] = None): 10 | self.df: Pandas_df = df.toPandas() 11 | self.order_by_column = order_by_column 12 | 13 | def is_empty(self): 14 | assert self.df.empty 15 | return self 16 | 17 | def contains_exactly(self, other: Pandas_df): 18 | if self.order_by_column: 19 | sorted_df = self.df.sort_values(self.order_by_column) 20 | other_sorted = other.sort_values(self.order_by_column) 21 | assert_array_equal(sorted_df.values, other_sorted.values, verbose=True) 22 | else: 23 | 
assert self.df.equals(other) 24 | return self 25 | 26 | def has_columns(self, columns: list): 27 | existing_columns = sorted(list(self.df.columns)) 28 | expected_columns = sorted(columns) 29 | assert existing_columns == expected_columns, f"{existing_columns} != {expected_columns}" 30 | return self 31 | 32 | def has_n_rows(self, n): 33 | assert self.df.shape[0] == n 34 | return self 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/spark_session.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def spark_session(request): 7 | """ 8 | Fixture for creating a Spark Session used in the tests. 9 | :param request: pytest.FixtureRequest 10 | """ 11 | spark_session = SparkSession.builder \ 12 | .master("local[*]") \ 13 | .appName("correct-horse-test") \ 14 | .getOrCreate() 15 | 16 | request.addfinalizer(lambda: spark_session.sparkContext.stop()) 17 | 18 | return spark_session 19 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_between_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value that is equal to or between two values. 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, empty_string_df, single_integer_column_schema, two_integer_columns_schema 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_between_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_between("col1", 5, 10) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_are_between(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_between("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_not_between(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [20]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_between("col1", 11, 19) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[5], [10]], 
schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[15]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_between("col1", 0, 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="between") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_between_ignores_the_other_column(spark_session): 78 | df = spark_session.createDataFrame([[5, 8], [10, 20], [15, 8]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[5, 8], [10, 20]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[15, 8]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_between("col1", 5, 10) \ 84 | .execute() 85 | 86 | AssertDf(result.correct_data, order_by_column="col1") \ 87 | .contains_exactly(expected_correct.toPandas()) \ 88 | .has_columns(["col1", "col2"]) 89 | 90 | AssertDf(result.erroneous_data, order_by_column="col2") \ 91 | .contains_exactly(expected_errors.toPandas()) \ 92 | .has_columns(["col1", "col2"]) 93 | 94 | assert result.errors == [ValidationError("col1", "between", 1)] 95 | 96 | 97 | def test_between_should_check_all_given_columns_separately(spark_session): 98 | df = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 99 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 100 | expected_errors = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 101 | 102 | result = ValidateSparkDataFrame(spark_session, df) \ 103 | .is_between("col1", 0, 5) \ 104 | .is_between("col2", 20, 40) \ 105 | .execute() 106 | 107 | AssertDf(result.correct_data, order_by_column="col1") \ 108 | .contains_exactly(expected_correct.toPandas()) \ 109 | .has_columns(["col1", "col2"]) 110 | 111 | AssertDf(result.erroneous_data, order_by_column="col2") \ 112 | .contains_exactly(expected_errors.toPandas()) \ 113 | .has_columns(["col1", "col2"]) 114 | 115 | assert result.errors == [ValidationError("col1", "between", 3), ValidationError("col2", "between", 3)] 116 | 117 | 118 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 119 | with pytest.raises(ValueError): 120 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 121 | .is_between("col1", 5, 10) \ 122 | .execute() 123 | 124 | 125 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 126 | with pytest.raises(ValueError): 127 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 128 | .is_between("column_that_does_not_exist", 5, 5) \ 129 | .execute() 130 | 131 | 132 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 133 | with pytest.raises(ValueError): 134 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 135 | .is_between("col1", 5, 10) \ 136 | .is_between("col1", 5, 15) \ 137 | .execute() 138 | 139 | 140 | def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session): 141 | with pytest.raises(ValueError): 142 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 143 | .is_between("col1", 10, 5) \ 144 | .execute() 145 | -------------------------------------------------------------------------------- 
/check-engine-lib/tests/test_max_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value equal or smaller than a given integer. 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_max_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_max("col1", 5) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="max") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_rows_smaller_than_max(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_max("col1", 20) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="max") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_larger_than_max(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_max("col1", 1) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="max") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[5], [10]], schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[15]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_max("col1", 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="max") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_max_value_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([[5, 1], [10, 20], [15, 1]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[5, 1], [10, 20]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[15, 1]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_max("col1", 10) \ 84 | .execute() 85 | 86 | AssertValidationResult(column_name="col1", constraint_name="max") \ 87 | .check( 88 | actual=result, 89 | expected_correct=expected_correct, 90 | expected_erroneous=expected_errors 91 | ) 92 | 93 | 94 | def test_max_should_check_all_given_columns_separately(spark_session): 95 | df = 
spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 96 | 97 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 98 | expected_errors = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 99 | 100 | result = ValidateSparkDataFrame(spark_session, df) \ 101 | .is_max("col1", 20) \ 102 | .is_max("col2", 0) \ 103 | .execute() 104 | 105 | AssertDf(result.correct_data, order_by_column="col1") \ 106 | .contains_exactly(expected_correct.toPandas()) \ 107 | .has_columns(["col1", "col2"]) 108 | 109 | AssertDf(result.erroneous_data, order_by_column="col2") \ 110 | .contains_exactly(expected_errors.toPandas()) \ 111 | .has_columns(["col1", "col2"]) 112 | 113 | assert result.errors == [ValidationError("col1", "max", 3), ValidationError("col2", "max", 3)] 114 | 115 | 116 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 117 | with pytest.raises(ValueError): 118 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 119 | .is_max("col1", 5) \ 120 | .execute() 121 | 122 | 123 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 124 | with pytest.raises(ValueError): 125 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 126 | .is_max("column_that_does_not_exist", 5) \ 127 | .execute() 128 | 129 | 130 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 131 | with pytest.raises(ValueError): 132 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 133 | .is_max("col1", 5) \ 134 | .is_max("col1", 10) \ 135 | .execute() 136 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_mean_value.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the meanColumnValue constraint. 3 | 4 | The implementation should reject all rows in a column if the column mean value is not between the expected values. 
5 | """ 6 | import pytest 7 | 8 | from checkengine.validate_df import ValidateSparkDataFrame 9 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 10 | from tests.spark.AssertResult import AssertValidationResult 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_mean_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .mean_column_value("col1", 0, 1) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_the_mean_is_between_given_values(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .mean_column_value("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_mean_is_smaller_than_given_values(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .mean_column_value("col1", 12, 15) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_reject_all_rows_if_mean_is_larger_than_given_values(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .mean_column_value("col1", 5, 8) \ 65 | .execute() 66 | 67 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_integer_df(spark_session), 71 | expected_erroneous=df 72 | ) 73 | 74 | 75 | def test_mean_value_of_other_columns_is_ignored(spark_session): 76 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 77 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 78 | 79 | result = ValidateSparkDataFrame(spark_session, df) \ 80 | .mean_column_value("col1", 10, 10) \ 81 | .execute() 82 | 83 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 84 | .check( 85 | actual=result, 86 | expected_correct=df, 87 | expected_erroneous=expected_errors 88 | ) 89 | 90 | 91 | def test_mean_should_check_all_given_columns_separately(spark_session): 92 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 93 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 94 | 95 | result = ValidateSparkDataFrame(spark_session, df) \ 96 | .mean_column_value("col1", 10, 10) \ 97 | .mean_column_value("col2", 2, 2) \ 98 | .execute() 99 | 100 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 101 | .check( 102 | actual=result, 103 | expected_correct=df, 
104 | expected_erroneous=expected_errors 105 | ) 106 | 107 | 108 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 109 | with pytest.raises(ValueError): 110 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 111 | .mean_column_value("col1", 10, 10) \ 112 | .execute() 113 | 114 | 115 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 118 | .mean_column_value("column_that_does_not_exist", 5, 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .mean_column_value("col1", 10, 10) \ 126 | .mean_column_value("col1", 5, 5) \ 127 | .execute() 128 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_median_value.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the medianColumnValue constraint. 3 | 4 | The implementation should reject all rows in a column if the column median value is not between the expected values. 5 | """ 6 | import pytest 7 | 8 | from checkengine.validate_df import ValidateSparkDataFrame 9 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 10 | from tests.spark.AssertResult import AssertValidationResult 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_median_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .median_column_value("col1", 0, 1) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_the_median_is_between_given_values(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .median_column_value("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_median_is_smaller_than_given_values(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .median_column_value("col1", 12, 15) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_reject_all_rows_if_median_is_larger_than_given_values(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .median_column_value("col1", 5, 8) \ 65 | .execute() 66 | 67 | 
AssertValidationResult(column_name="col1", constraint_name="median_between") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_integer_df(spark_session), 71 | expected_erroneous=df 72 | ) 73 | 74 | 75 | def test_median_value_of_other_columns_is_ignored(spark_session): 76 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 77 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 78 | 79 | result = ValidateSparkDataFrame(spark_session, df) \ 80 | .median_column_value("col1", 10, 10) \ 81 | .execute() 82 | 83 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 84 | .check( 85 | actual=result, 86 | expected_correct=df, 87 | expected_erroneous=expected_errors 88 | ) 89 | 90 | 91 | def test_median_should_check_all_given_columns_separately(spark_session): 92 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 93 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 94 | 95 | result = ValidateSparkDataFrame(spark_session, df) \ 96 | .median_column_value("col1", 10, 10) \ 97 | .median_column_value("col2", 2, 2) \ 98 | .execute() 99 | 100 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 101 | .check( 102 | actual=result, 103 | expected_correct=df, 104 | expected_erroneous=expected_errors 105 | ) 106 | 107 | 108 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 109 | with pytest.raises(ValueError): 110 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 111 | .median_column_value("col1", 10, 10) \ 112 | .execute() 113 | 114 | 115 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 118 | .median_column_value("column_that_does_not_exist", 5, 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .median_column_value("col1", 10, 10) \ 126 | .median_column_value("col1", 5, 5) \ 127 | .execute() 128 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_min_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value equal or larger than a given integer. 
3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_min_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_min("col1", 5) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="min") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_rows_greater_than_min(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_min("col1", 5) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="min") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_smaller_than_min(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_min("col1", 20) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="min") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[10], [15]], schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[5]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_min("col1", 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="min") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_min_value_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[10, 2], [15, 3]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[5, 1]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_min("col1", 10) \ 84 | .execute() 85 | 86 | AssertValidationResult(column_name="col1", constraint_name="min") \ 87 | .check( 88 | actual=result, 89 | expected_correct=expected_correct, 90 | expected_erroneous=expected_errors 91 | ) 92 | 93 | 94 | def test_min_should_check_all_given_columns_separately(spark_session): 95 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 96 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 97 | expected_errors = spark_session.createDataFrame([[5, 1], [10, 
2], [15, 3]], schema=two_integer_columns_schema) 98 | 99 | result = ValidateSparkDataFrame(spark_session, df) \ 100 | .is_min("col1", 20) \ 101 | .is_min("col2", 5) \ 102 | .execute() 103 | 104 | AssertDf(result.correct_data, order_by_column="col1") \ 105 | .contains_exactly(expected_correct.toPandas()) \ 106 | .has_columns(["col1", "col2"]) 107 | 108 | AssertDf(result.erroneous_data, order_by_column="col2") \ 109 | .contains_exactly(expected_errors.toPandas()) \ 110 | .has_columns(["col1", "col2"]) 111 | 112 | assert result.errors == [ValidationError("col1", "min", 3), ValidationError("col2", "min", 3)] 113 | 114 | 115 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 118 | .is_min("col1", 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .is_min("column_that_does_not_exist", 5) \ 126 | .execute() 127 | 128 | 129 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 130 | with pytest.raises(ValueError): 131 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 132 | .is_min("col1", 5) \ 133 | .is_min("col1", 10) \ 134 | .execute() 135 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_multi_contraint.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the tests that verify whether the library works correctly when multiple constraints are defined at the same time. 
3 | """ 4 | 5 | import pytest 6 | 7 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 8 | from tests.spark import two_integer_columns_schema 9 | from tests.spark.assert_df import AssertDf 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_rows_that_pass_all_checks_and_reject_rows_that_violate_any_test(spark_session): 15 | not_between = [25, 1] 16 | max_exceeded = [3, 30] 17 | correct = [3, 15] 18 | less_than_min = [1, 15] 19 | both_wrong = [7, 30] 20 | 21 | df = spark_session.createDataFrame([not_between, max_exceeded, correct, less_than_min, both_wrong], schema=two_integer_columns_schema) 22 | expected_correct = spark_session.createDataFrame([correct], schema=two_integer_columns_schema) 23 | expected_errors = spark_session.createDataFrame([not_between, max_exceeded, less_than_min, both_wrong], schema=two_integer_columns_schema) 24 | 25 | result = ValidateSparkDataFrame(spark_session, df) \ 26 | .is_between("col1", 0, 5) \ 27 | .is_min("col1", 3) \ 28 | .is_max("col2", 20) \ 29 | .execute() 30 | 31 | AssertDf(result.correct_data, order_by_column="col1") \ 32 | .contains_exactly(expected_correct.toPandas()) \ 33 | .has_columns(["col1", "col2"]) 34 | 35 | AssertDf(result.erroneous_data, order_by_column="col2") \ 36 | .contains_exactly(expected_errors.toPandas()) \ 37 | .has_columns(["col1", "col2"]) 38 | 39 | assert result.errors == [ValidationError("col1", "between", 2), ValidationError("col1", "min", 1), ValidationError("col2", "max", 2)] 40 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_not_null.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the not null constraint. 
3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_pass_empty_df_with_not_null_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .is_not_null("col1") \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_rows_are_not_null(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .is_not_null("col1") \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_all_are_null(spark_session): 45 | df = spark_session.createDataFrame([[None], [None], [None]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_not_null("col1") \ 50 | .execute() 51 | 52 | AssertDf(result.correct_data) \ 53 | .is_empty() \ 54 | .has_columns(["col1"]) 55 | 56 | AssertDf(result.erroneous_data) \ 57 | .contains_exactly(expected_errors.toPandas()) \ 58 | .has_columns(["col1"]) 59 | 60 | assert result.errors == [ValidationError("col1", "not_null", 3)] 61 | 62 | 63 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 64 | df = spark_session.createDataFrame([["abc"], [None]], schema=single_string_column_schema) 65 | 66 | expected_correct = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 67 | expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema) 68 | 69 | result = ValidateSparkDataFrame(spark_session, df) \ 70 | .is_not_null("col1") \ 71 | .execute() 72 | 73 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 74 | .check( 75 | actual=result, 76 | expected_correct=expected_correct, 77 | expected_erroneous=expected_errors 78 | ) 79 | 80 | 81 | def test_nulls_in_other_columns_are_ignored(spark_session): 82 | df = spark_session.createDataFrame([["abc", "123"], [None, "456"], ["def", None]], schema=two_string_columns_schema) 83 | 84 | expected_correct = spark_session.createDataFrame([["abc", "123"], ["def", None]], schema=two_string_columns_schema) 85 | expected_errors = spark_session.createDataFrame([[None, "456"]], schema=two_string_columns_schema) 86 | 87 | result = ValidateSparkDataFrame(spark_session, df) \ 88 | .is_not_null("col1") \ 89 | .execute() 90 | 91 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 92 | .check( 93 | actual=result, 94 | expected_correct=expected_correct, 95 | expected_erroneous=expected_errors 96 | ) 97 | 98 | 99 | def test_not_null_should_check_all_given_columns_separately(spark_session): 100 | df = 
spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 101 | expected_errors = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 102 | 103 | result = ValidateSparkDataFrame(spark_session, df) \ 104 | .is_not_null("col1") \ 105 | .is_not_null("col2") \ 106 | .execute() 107 | 108 | AssertDf(result.correct_data) \ 109 | .is_empty() \ 110 | .has_columns(["col1", "col2"]) 111 | 112 | AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \ 113 | .contains_exactly(expected_errors.toPandas()) \ 114 | .has_columns(["col1", "col2"]) 115 | 116 | assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)] 117 | 118 | 119 | def test_not_null_should_check_all_given_columns_separately_even_if_all_of_them_are_defined_at_once(spark_session): 120 | df = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 121 | expected_errors = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 122 | 123 | result = ValidateSparkDataFrame(spark_session, df) \ 124 | .are_not_null(["col1", "col2"]) \ 125 | .execute() 126 | 127 | AssertDf(result.correct_data) \ 128 | .is_empty() \ 129 | .has_columns(["col1", "col2"]) 130 | 131 | AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \ 132 | .contains_exactly(expected_errors.toPandas()) \ 133 | .has_columns(["col1", "col2"]) 134 | 135 | assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)] 136 | 137 | 138 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 139 | with pytest.raises(ValueError): 140 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 141 | .is_not_null("column_that_does_not_exist") \ 142 | .execute() 143 | 144 | 145 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 146 | with pytest.raises(ValueError): 147 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 148 | .is_not_null("col1") \ 149 | .is_not_null("col1") \ 150 | .execute() 151 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_one_of.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the one_of constraint. 
3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_one_of_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .one_of("col1", []) \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_are_in_list(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .one_of("col1", ["abc", "def", "ghi"]) \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_none_of_them_is_in_the_list(spark_session): 45 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .one_of("col1", ["ab", "b"]) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_string_df(spark_session), 56 | expected_erroneous=expected_errors 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 62 | 63 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 64 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 65 | 66 | result = ValidateSparkDataFrame(spark_session, df) \ 67 | .one_of("col1", ["abc", "defg"]) \ 68 | .execute() 69 | 70 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 71 | .check( 72 | actual=result, 73 | expected_correct=expected_correct, 74 | expected_erroneous=expected_errors 75 | ) 76 | 77 | 78 | def test_should_return_both_correct_and_incorrect_rows_numeric_values(spark_session): 79 | df = spark_session.createDataFrame([[1], [2], [3], [4]], schema=single_string_column_schema) 80 | 81 | expected_correct = spark_session.createDataFrame([[1], [3]], schema=single_string_column_schema) 82 | expected_errors = spark_session.createDataFrame([[2], [4]], schema=single_string_column_schema) 83 | 84 | result = ValidateSparkDataFrame(spark_session, df) \ 85 | .one_of("col1", [1, 3, 5]) \ 86 | .execute() 87 | 88 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 89 | .check( 90 | actual=result, 91 | expected_correct=expected_correct, 92 | expected_erroneous=expected_errors 93 | ) 94 | 95 | 96 | def test_one_of_of_other_columns_is_ignored(spark_session): 97 | df = 
spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 98 | 99 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 100 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 101 | 102 | result = ValidateSparkDataFrame(spark_session, df) \ 103 | .one_of("col1", ["cd", "123", "45"]) \ 104 | .execute() 105 | 106 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 107 | .check( 108 | actual=result, 109 | expected_correct=expected_correct, 110 | expected_erroneous=expected_errors 111 | ) 112 | 113 | 114 | def test_should_check_all_given_columns_separately(spark_session): 115 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 116 | 117 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 118 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 119 | 120 | result = ValidateSparkDataFrame(spark_session, df) \ 121 | .one_of("col1", ["12", "56", "def"]) \ 122 | .one_of("col2", ["12", "56", "adcde"]) \ 123 | .execute() 124 | 125 | AssertDf(result.correct_data, order_by_column="col1") \ 126 | .contains_exactly(expected_correct.toPandas()) \ 127 | .has_columns(["col1", "col2"]) 128 | 129 | AssertDf(result.erroneous_data, order_by_column="col2") \ 130 | .contains_exactly(expected_errors.toPandas()) \ 131 | .has_columns(["col1", "col2"]) 132 | 133 | assert result.errors == [ValidationError("col1", "one_of", 2), ValidationError("col2", "one_of", 1)] 134 | 135 | 136 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 137 | with pytest.raises(ValueError): 138 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 139 | .one_of("column_that_does_not_exist", []) \ 140 | .execute() 141 | 142 | 143 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 144 | with pytest.raises(ValueError): 145 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 146 | .one_of("col1", ["a"]) \ 147 | .one_of("col1", ["b"]) \ 148 | .execute() 149 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_spark_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that verify whether the PySpark instance used for testing is configured properly. 
3 | """ 4 | import pytest 5 | 6 | import pandas as pd 7 | from pyspark.sql import DataFrame 8 | 9 | from pyspark.sql.types import * 10 | 11 | from tests.spark.assert_df import AssertDf 12 | 13 | pytestmark = pytest.mark.usefixtures("spark_session") 14 | 15 | 16 | def test_empty_dataframe(spark_session): 17 | df_schema = StructType([StructField("col1", StringType())]) 18 | 19 | df = spark_session.createDataFrame([], schema=df_schema) 20 | AssertDf(df).is_empty() 21 | 22 | 23 | def test_spark_sql_operation(spark_session): 24 | df_schema = StructType([StructField("col1", StringType()), StructField("col2", IntegerType())]) 25 | 26 | test_list = [["v1", 1], ["v1", 2], ["v2", 3]] 27 | 28 | df: DataFrame = spark_session.createDataFrame(test_list, schema=df_schema) 29 | aggregated = df.groupby("col1").sum("col2").orderBy('col1') 30 | 31 | AssertDf(aggregated) \ 32 | .contains_exactly(pd.DataFrame([['v1', 3], ['v2', 3]], columns=['col1', 'sum(col2)']).sort_values('col1')) \ 33 | .has_columns(["col1", "sum(col2)"]) \ 34 | .has_n_rows(2) 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_string_length.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the textLength constraint. 3 | """ 4 | import pytest 5 | 6 | from tests.spark import single_string_column_schema, two_string_columns_schema, empty_string_df, empty_integer_df 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_is_text_length_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .has_length_between("col1", 0, 20) \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_are_shorter_than_upper_bound(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .has_length_between("col1", 0, 20) \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_return_df_without_changes_if_all_are_longer_than_lower_bound(spark_session): 45 | df = spark_session.createDataFrame([["abcdef"], ["ghijkl"]], schema=single_string_column_schema) 46 | 47 | result = ValidateSparkDataFrame(spark_session, df) \ 48 | .has_length_between("col1", 5, 20) \ 49 | .execute() 50 | 51 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 52 | .check( 53 | actual=result, 54 | expected_correct=df, 55 | expected_erroneous=empty_string_df(spark_session) 56 | ) 57 | 58 | 59 | def test_should_reject_all_rows_if_all_are_too_short_or_too_long(spark_session): 60 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 61 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], 
schema=single_string_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .has_length_between("col1", 5, 8) \ 65 | .execute() 66 | 67 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_string_df(spark_session), 71 | expected_erroneous=expected_errors 72 | ) 73 | 74 | 75 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 76 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 77 | 78 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 79 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 80 | 81 | result = ValidateSparkDataFrame(spark_session, df) \ 82 | .has_length_between("col1", 3, 4) \ 83 | .execute() 84 | 85 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 86 | .check( 87 | actual=result, 88 | expected_correct=expected_correct, 89 | expected_erroneous=expected_errors 90 | ) 91 | 92 | 93 | def test_text_length_of_other_columns_is_ignored(spark_session): 94 | df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 95 | 96 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 97 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 98 | 99 | result = ValidateSparkDataFrame(spark_session, df) \ 100 | .has_length_between("col1", 2, 2) \ 101 | .execute() 102 | 103 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 104 | .check( 105 | actual=result, 106 | expected_correct=expected_correct, 107 | expected_erroneous=expected_errors 108 | ) 109 | 110 | 111 | def test_should_check_all_given_columns_separately(spark_session): 112 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 113 | 114 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 115 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 116 | 117 | result = ValidateSparkDataFrame(spark_session, df) \ 118 | .has_length_between("col1", 2, 4) \ 119 | .has_length_between("col2", 1, 2) \ 120 | .execute() 121 | 122 | AssertDf(result.correct_data, order_by_column="col1") \ 123 | .contains_exactly(expected_correct.toPandas()) \ 124 | .has_columns(["col1", "col2"]) 125 | 126 | AssertDf(result.erroneous_data, order_by_column="col2") \ 127 | .contains_exactly(expected_errors.toPandas()) \ 128 | .has_columns(["col1", "col2"]) 129 | 130 | assert result.errors == [ValidationError("col1", "text_length", 2), ValidationError("col2", "text_length", 1)] 131 | 132 | 133 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 134 | with pytest.raises(ValueError): 135 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 136 | .has_length_between("column_that_does_not_exist", 0, 1) \ 137 | .execute() 138 | 139 | 140 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 141 | with pytest.raises(ValueError): 142 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 143 | .has_length_between("col1", 0, 10) \ 144 | .has_length_between("col1", 0, 5) \ 145 | 
.execute() 146 | 147 | 148 | def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session): 149 | with pytest.raises(ValueError): 150 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 151 | .has_length_between("col1", 10, 5) \ 152 | .execute() 153 | 154 | 155 | def test_should_throw_error_if_constraint_is_not_a_text_column(spark_session): 156 | with pytest.raises(ValueError): 157 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 158 | .has_length_between("col1", 5, 10) \ 159 | .execute() 160 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_string_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the Regex matching constraint. 3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema, empty_integer_df 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_is_text_matches_regex_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .text_matches_regex("col1", ".*") \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_regex_matches_the_text(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .text_matches_regex("col1", ".*") \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_regex_match_fails(spark_session): 45 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .text_matches_regex("col1", "[0-9]+") \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_string_df(spark_session), 56 | expected_erroneous=expected_errors 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 62 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 63 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .text_matches_regex("col1", "^[a-z]{3,4}$") \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 70 | .check( 71 | 
actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_matching_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 79 | 80 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 81 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 82 | 83 | result = ValidateSparkDataFrame(spark_session, df) \ 84 | .text_matches_regex("col1", "^[cd]+$") \ 85 | .execute() 86 | 87 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 88 | .check( 89 | actual=result, 90 | expected_correct=expected_correct, 91 | expected_erroneous=expected_errors 92 | ) 93 | 94 | 95 | def test_should_check_all_given_columns_separately(spark_session): 96 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 97 | 98 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 99 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 100 | 101 | result = ValidateSparkDataFrame(spark_session, df) \ 102 | .text_matches_regex("col1", "[0-9]+") \ 103 | .text_matches_regex("col2", "[a-z]+") \ 104 | .execute() 105 | 106 | AssertDf(result.correct_data, order_by_column="col1") \ 107 | .contains_exactly(expected_correct.toPandas()) \ 108 | .has_columns(["col1", "col2"]) 109 | 110 | AssertDf(result.erroneous_data, order_by_column="col2") \ 111 | .contains_exactly(expected_errors.toPandas()) \ 112 | .has_columns(["col1", "col2"]) 113 | 114 | assert result.errors == [ValidationError("col1", "regex_match", 3), ValidationError("col2", "regex_match", 3)] 115 | 116 | 117 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 118 | with pytest.raises(ValueError): 119 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 120 | .text_matches_regex("column_that_does_not_exist", '.*') \ 121 | .execute() 122 | 123 | 124 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 125 | with pytest.raises(ValueError): 126 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 127 | .text_matches_regex("col1", '.*') \ 128 | .text_matches_regex("col1", '[a-z]*') \ 129 | .execute() 130 | 131 | 132 | def test_should_throw_error_if_constraint_is_not_a_text_column(spark_session): 133 | with pytest.raises(ValueError): 134 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 135 | .text_matches_regex("col1", '[a-z]*') \ 136 | .execute() 137 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_uniqueness.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the isUnique constraint. 3 | 4 | The implementation should reject a row if there is another row that contains the same value in the given column. 5 | In that case, the duplicated rows should be reported as erroneous, and each duplicated value should appear only once in the erroneous data. 
6 | """ 7 | import pytest 8 | 9 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 10 | from tests.spark.AssertResult import AssertValidationResult 11 | from tests.spark.assert_df import AssertDf 12 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 13 | 14 | pytestmark = pytest.mark.usefixtures("spark_session") 15 | 16 | 17 | def test_should_return_df_without_changes_if_empty_df_with_is_unique_constraint(spark_session): 18 | df = empty_string_df(spark_session) 19 | 20 | result = ValidateSparkDataFrame(spark_session, df) \ 21 | .is_unique("col1") \ 22 | .execute() 23 | 24 | AssertValidationResult(column_name="col1", constraint_name="unique") \ 25 | .check( 26 | actual=result, 27 | expected_correct=df, 28 | expected_erroneous=df 29 | ) 30 | 31 | 32 | def test_should_return_df_without_changes_if_all_rows_are_unique(spark_session): 33 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 34 | 35 | result = ValidateSparkDataFrame(spark_session, df) \ 36 | .is_unique("col1") \ 37 | .execute() 38 | 39 | AssertValidationResult(column_name="col1", constraint_name="unique") \ 40 | .check( 41 | actual=result, 42 | expected_correct=df, 43 | expected_erroneous=empty_string_df(spark_session) 44 | ) 45 | 46 | 47 | def test_should_reject_all_rows_if_all_are_the_same(spark_session): 48 | df = spark_session.createDataFrame([["abc"], ["abc"], ["abc"]], schema=single_string_column_schema) 49 | expected_errors = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 50 | 51 | result = ValidateSparkDataFrame(spark_session, df) \ 52 | .is_unique("col1") \ 53 | .execute() 54 | 55 | AssertDf(result.correct_data) \ 56 | .is_empty() \ 57 | .has_columns(["col1"]) 58 | 59 | AssertDf(result.erroneous_data, order_by_column="col1") \ 60 | .contains_exactly(expected_errors.toPandas()) \ 61 | .has_columns(["col1"]) 62 | 63 | assert result.errors == [ValidationError("col1", "unique", 3)] 64 | 65 | 66 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 67 | df = spark_session.createDataFrame([["abc"], ["abc"], ["def"]], schema=single_string_column_schema) 68 | expected_correct = spark_session.createDataFrame([["def"]], schema=single_string_column_schema) 69 | expected_errors = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 70 | 71 | result = ValidateSparkDataFrame(spark_session, df) \ 72 | .is_unique("col1") \ 73 | .execute() 74 | 75 | AssertDf(result.correct_data, order_by_column="col1") \ 76 | .contains_exactly(expected_correct.toPandas()) \ 77 | .has_columns(["col1"]) 78 | 79 | AssertDf(result.erroneous_data, order_by_column="col1") \ 80 | .contains_exactly(expected_errors.toPandas()) \ 81 | .has_columns(["col1"]) 82 | 83 | assert result.errors == [ValidationError("col1", "unique", 2)] 84 | 85 | 86 | def test_uniqueness_of_other_columns_is_ignored(spark_session): 87 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 88 | expected_correct = spark_session.createDataFrame([["def", "123"]], schema=two_string_columns_schema) 89 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"]], schema=two_string_columns_schema) 90 | 91 | result = ValidateSparkDataFrame(spark_session, df) \ 92 | .is_unique("col1") \ 93 | .execute() 94 | 95 | AssertDf(result.correct_data, order_by_column="col1") \ 96 | .contains_exactly(expected_correct.toPandas()) \ 
97 | .has_columns(["col1", "col2"]) 98 | 99 | AssertDf(result.erroneous_data, order_by_column="col2") \ 100 | .contains_exactly(expected_errors.toPandas()) \ 101 | .has_columns(["col1", "col2"]) 102 | 103 | assert result.errors == [ValidationError("col1", "unique", 2)] 104 | 105 | 106 | def test_uniqueness_should_check_all_given_columns_separately(spark_session): 107 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 108 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 109 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 110 | 111 | result = ValidateSparkDataFrame(spark_session, df) \ 112 | .is_unique("col1") \ 113 | .is_unique("col2") \ 114 | .execute() 115 | 116 | AssertDf(result.correct_data, order_by_column="col1") \ 117 | .contains_exactly(expected_correct.toPandas()) \ 118 | .has_columns(["col1", "col2"]) 119 | 120 | AssertDf(result.erroneous_data, order_by_column="col2") \ 121 | .contains_exactly(expected_errors.toPandas()) \ 122 | .has_columns(["col1", "col2"]) 123 | 124 | assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)] 125 | 126 | 127 | def test_uniqueness_should_check_all_given_columns_separately_when_defining_all_columns_at_once(spark_session): 128 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 129 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 130 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 131 | 132 | result = ValidateSparkDataFrame(spark_session, df) \ 133 | .are_unique(["col1", "col2"]) \ 134 | .execute() 135 | 136 | AssertDf(result.correct_data, order_by_column="col1") \ 137 | .contains_exactly(expected_correct.toPandas()) \ 138 | .has_columns(["col1", "col2"]) 139 | 140 | AssertDf(result.erroneous_data, order_by_column="col2") \ 141 | .contains_exactly(expected_errors.toPandas()) \ 142 | .has_columns(["col1", "col2"]) 143 | 144 | assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)] 145 | 146 | 147 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 148 | with pytest.raises(ValueError): 149 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 150 | .is_unique("column_that_does_not_exist") \ 151 | .execute() 152 | 153 | 154 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 155 | with pytest.raises(ValueError): 156 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 157 | .is_unique("col1") \ 158 | .is_unique("col1") \ 159 | .execute() 160 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_validation_without_rules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the behavior of ValidateSparkDataFrame when no constraint has been defined. 3 | In that case, the implementation should pass all of the given data as correct and return no errors. 
4 | """ 5 | import pytest 6 | 7 | from tests.spark import empty_string_df, single_string_column_schema 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from checkengine.validate_df import ValidateSparkDataFrame 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_pass_empty_df_if_there_are_no_rules(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df).execute() 18 | 19 | AssertValidationResult(column_name="col1", constraint_name="") \ 20 | .check( 21 | actual=result, 22 | expected_correct=df, 23 | expected_erroneous=df 24 | ) 25 | 26 | 27 | def test_should_pass_df_if_there_are_no_rules(spark_session): 28 | df = spark_session.createDataFrame([["abc"], ["def"]], schema=single_string_column_schema) 29 | 30 | result = ValidateSparkDataFrame(spark_session, df).execute() 31 | 32 | AssertValidationResult(column_name="col1", constraint_name="") \ 33 | .check( 34 | actual=result, 35 | expected_correct=df, 36 | expected_erroneous=empty_string_df(spark_session) 37 | ) 38 | --------------------------------------------------------------------------------