├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── python-publish.yml │ └── run-tests-in-docker.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── check-engine-lib ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── checkengine ├── _constraints │ ├── _Constraint.py │ ├── _NotNull.py │ ├── _Numbers.py │ ├── _OneOf.py │ ├── _StatColumn.py │ ├── _TextLength.py │ ├── _TextRegex.py │ └── _Unique.py └── validate_df.py ├── conftest.py ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── spark ├── AssertResult.py ├── __init__.py ├── assert_df.py └── spark_session.py ├── test_between_integer.py ├── test_max_integer.py ├── test_mean_value.py ├── test_median_value.py ├── test_min_integer.py ├── test_multi_contraint.py ├── test_not_null.py ├── test_one_of.py ├── test_spark_env.py ├── test_string_length.py ├── test_string_matches.py ├── test_uniqueness.py └── test_validation_without_rules.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Actual behavior** 20 | A clear and concise description of what you see instead of the expected behavior. 21 | 22 | **Screenshots or console log** 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | **Environment (please complete the following information):** 26 | - OS: 27 | - Python version 28 | - Spark version 29 | - Python packages installed in the runtime environment 30 | - check-engine version 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Build 17 | uses: abatilo/actions-poetry@v1.5.0 18 | with: 19 | python_version: 3.8.0 20 | poetry_version: 1.0 21 | working_directory: ./check-engine-lib 22 | args: build 23 | - name: Publish distribution 📦 to PyPI 24 | uses: pypa/gh-action-pypi-publish@master 25 | with: 26 | password: ${{ secrets.pypi_token }} 27 | packages_dir: ./check-engine-lib/dist 28 | -------------------------------------------------------------------------------- /.github/workflows/run-tests-in-docker.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Run tests in Docker 13 | run: docker build -t check-engine-test check-engine-lib/. && docker run check-engine-test 14 | - name: Build 15 | uses: abatilo/actions-poetry@v1.5.0 16 | with: 17 | python_version: 3.8.0 18 | poetry_version: 1.0 19 | working_directory: ./check-engine-lib 20 | args: build 21 | - name: Publish distribution 📦 to Test PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.test_pypi_token }} 25 | repository_url: https://test.pypi.org/legacy/ 26 | packages_dir: ./check-engine-lib/dist 27 | skip_existing: true 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of conduct 2 | 3 | ## Our Standards 4 | 5 | Examples of behavior that contributes to creating a positive environment include: 6 | 7 | * Using welcoming and inclusive language 8 | * Being respectful of differing viewpoints and experiences 9 | * Gracefully accepting constructive criticism 10 | * Focusing on what is best for the community 11 | * Showing empathy towards other community members 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 16 | * Trolling, insulting/derogatory comments, and personal or political attacks 17 | * Public or private harassment 18 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 19 | * Other conduct which could reasonably be considered inappropriate in a professional setting 20 | 21 | ## Our Responsibilities 22 | 23 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 24 | 25 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
26 | 27 | ## Scope 28 | 29 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 30 | 31 | ## Enforcement 32 | 33 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project admin (https://twitter.com/mikulskibartosz). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 34 | 35 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 36 | 37 | ## Attribution 38 | 39 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 40 | 41 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq 42 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | 1. Before contributing please create GitHub issue (if it is not created already) and discuss your case and the solution with the authors. 4 | 5 | 2. Propose your changes via pull request. 6 | 7 | 3. Please write descriptive messages in your commits and pull requests so that we can understand all easily. 8 | 9 | 4. All changes must be covered by **passing** tests. 10 | 11 | 5. Remember to update README.md files if necessary. 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Bartosz Mikulski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | 3 | The goal of this project is to implement a data validation library for PySpark. The library should detect the incorrect structure of the data, unexpected values in columns, and anomalies in the data. 4 | 5 | ## How to install 6 | 7 | ``` 8 | pip install checkengine==0.2.0 9 | ``` 10 | 11 | ## How to use 12 | 13 | ``` 14 | from checkengine.validate_df import ValidateSparkDataFrame 15 | 16 | result = ValidateSparkDataFrame(spark_session, spark_data_frame) \ 17 | .is_not_null("column_name") \ 18 | .are_not_null(["column_name_2", "column_name_3"]) \ 19 | .is_min("numeric_column", 10) \ 20 | .is_max("numeric_column", 20) \ 21 | .is_unique("column_name") \ 22 | .are_unique(["column_name_2", "column_name_3"]) \ 23 | .is_between("numeric_column_2", 10, 15) \ 24 | .has_length_between("text_column", 0, 10) \ 25 | .mean_column_value("numeric_column", 10, 20) \ 26 | .median_column_value("numeric_column", 5, 15) \ 27 | .text_matches_regex("text_column", "^[a-z]{3,10}$") \ 28 | .one_of("text_column", ["value_a", "value_b"]) \ 29 | .one_of("numeric_column", [123, 456]) \ 30 | .execute() 31 | 32 | result.correct_data #rows that passed the validation 33 | result.erroneous_data #rows rejected during the validation 34 | result.errors #a summary of validation errors (three fields: column_name, constraint_name, number_of_errors) 35 | ``` 36 | 37 | ## How to build 38 | 39 | 1. Install the Poetry build tool. 40 | 41 | 2. Run the following commands: 42 | 43 | ``` 44 | cd check-engine-lib 45 | poetry build 46 | ``` 47 | 48 | ## How to test locally 49 | 50 | ### Run all tests 51 | 52 | ``` 53 | cd check-engine-lib 54 | poetry run pytest tests/ 55 | ``` 56 | 57 | ### Run a single test file 58 | 59 | ``` 60 | cd check-engine-lib 61 | poetry run pytest tests/test_between_integer.py 62 | ``` 63 | 64 | ### Run a single test method 65 | 66 | ``` 67 | cd check-engine-lib 68 | poetry run pytest tests/test_between_integer.py -k 'test_should_return_df_without_changes_if_all_are_between' 69 | ``` 70 | 71 | ## How to test in Docker 72 | 73 | ``` 74 | docker build -t check-engine-test check-engine-lib/. 
&& docker run check-engine-test 75 | ``` 76 | -------------------------------------------------------------------------------- /check-engine-lib/.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /dist/ 3 | /requirements.txt 4 | /venv/ -------------------------------------------------------------------------------- /check-engine-lib/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.1-slim 2 | 3 | 4 | ENV PYTHONUNBUFFERED=1 \ 5 | # prevents python creating .pyc files 6 | PYTHONDONTWRITEBYTECODE=1 \ 7 | \ 8 | # pip 9 | PIP_NO_CACHE_DIR=off \ 10 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 11 | PIP_DEFAULT_TIMEOUT=100 \ 12 | \ 13 | # poetry 14 | # https://python-poetry.org/docs/configuration/#using-environment-variables 15 | POETRY_VERSION=1.0.3 \ 16 | # make poetry install to this location 17 | POETRY_HOME="/opt/poetry" \ 18 | # make poetry create the virtual environment in the project's root 19 | # it gets named `.venv` 20 | POETRY_VIRTUALENVS_IN_PROJECT=true \ 21 | # do not ask any interactive question 22 | POETRY_NO_INTERACTION=1 \ 23 | \ 24 | # paths 25 | # this is where our requirements + virtual environment will live 26 | PYSETUP_PATH="/opt/pysetup" \ 27 | VENV_PATH="/opt/pysetup/.venv" 28 | 29 | 30 | # prepend poetry and venv to path 31 | ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" 32 | 33 | RUN apt-get update \ 34 | && apt-get install --no-install-recommends -y \ 35 | # deps for installing poetry 36 | curl \ 37 | wget \ 38 | # deps for building python deps 39 | build-essential 40 | 41 | RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python 42 | 43 | RUN cd /opt && wget -c --header "Cookie:oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u131-b11/d54c1d3a095b4ff2b6607d096fa80163/jdk-8u131-linux-x64.tar.gz 44 | RUN tar -xzf /opt/jdk-8u131-linux-x64.tar.gz -C /opt && ln -s /opt/jdk1.8.0_131 /opt/jdk 45 | 46 | ENV JAVA_HOME /opt/jdk 47 | ENV PATH ${PATH}:${JAVA_HOME}/bin 48 | 49 | RUN mkdir -p /opt/spark \ 50 | && cd /opt/spark \ 51 | && curl https://mirrors.hostingromania.ro/apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz -o spark.tgz \ 52 | && tar xf spark.tgz 53 | 54 | WORKDIR /app 55 | COPY checkengine ./checkengine 56 | COPY ./tests ./tests 57 | COPY ./pyproject.toml poetry.lock ./ 58 | COPY ./conftest.py ./ 59 | COPY ./README.md ./ 60 | 61 | RUN poetry install 62 | 63 | ENV SPARK_HOME=/opt/spark/spark-3.0.3-bin-hadoop2.7 64 | 65 | CMD poetry run pytest -s --cov=checkengine --cov-branch --cov-fail-under=80 tests/ -------------------------------------------------------------------------------- /check-engine-lib/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Bartosz Mikulski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /check-engine-lib/README.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | 3 | The goal of this project is to implement a data validation library for PySpark. The library should detect the incorrect structure of the data, unexpected values in columns, and anomalies in the data. 4 | 5 | ## How to install 6 | 7 | THERE IS NO PACKAGE YET!!! 8 | 9 | ## How to use 10 | 11 | ``` 12 | from checkengine.validate_df import ValidateSparkDataFrame 13 | 14 | result = ValidateSparkDataFrame(spark_session, spark_data_frame) \ 15 | .is_not_null("column_name") \ 16 | .are_not_null(["column_name_2", "column_name_3"]) \ 17 | .is_min("numeric_column", 10) \ 18 | .is_max("numeric_column", 20) \ 19 | .is_unique("column_name") \ 20 | .are_unique(["column_name_2", "column_name_3"]) \ 21 | .is_between("numeric_column_2", 10, 15) \ 22 | .has_length_between("text_column", 0, 10) \ 23 | .mean_column_value("numeric_column", 10, 20) \ 24 | .median_column_value("numeric_column", 5, 15) \ 25 | .text_matches_regex("text_column", "^[a-z]{3,10}$") \ 26 | .one_of("text_column", ["value_a", "value_b"]) \ 27 | .one_of("numeric_column", [123, 456]) \ 28 | .execute() 29 | 30 | result.correct_data #rows that passed the validation 31 | result.erroneous_data #rows rejected during the validation 32 | result.errors #a summary of validation errors (three fields: column_name, constraint_name, number_of_errors) 33 | ``` 34 | 35 | ## How to build 36 | 37 | 1. Install the Poetry build tool. 38 | 39 | 2. Run the following commands: 40 | 41 | ``` 42 | cd check-engine-lib 43 | poetry build 44 | ``` 45 | 46 | ## How to test locally 47 | 48 | ### Run all tests 49 | 50 | ``` 51 | cd check-engine-lib 52 | poetry run pytest tests/ 53 | ``` 54 | 55 | ### Run a single test file 56 | 57 | ``` 58 | cd check-engine-lib 59 | poetry run pytest tests/test_between_integer.py 60 | ``` 61 | 62 | ### Run a single test method 63 | 64 | ``` 65 | cd check-engine-lib 66 | poetry run pytest tests/test_between_integer.py -k 'test_should_return_df_without_changes_if_all_are_between' 67 | ``` 68 | 69 | ## How to test in Docker 70 | 71 | ``` 72 | docker build -t check-engine-test check-engine-lib/. 
&& docker run check-engine-test 73 | ``` 74 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Constraint.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | from abc import ABC, abstractmethod 3 | import random 4 | import string 5 | 6 | from pyspark.sql import DataFrame 7 | 8 | 9 | def _generate_constraint_column_name(constraint_type, column_name): 10 | random_suffix = ''.join(random.choice(string.ascii_lowercase) for i in range(12)) 11 | return f"__checkengine__{column_name}_{constraint_type}_{random_suffix}" 12 | 13 | 14 | class _Constraint(ABC): 15 | def __init__(self, column_name: str): 16 | self.column_name = column_name 17 | self.constraint_column_name = _generate_constraint_column_name(self.constraint_name(), column_name) 18 | 19 | @abstractmethod 20 | def constraint_name(self): 21 | pass 22 | 23 | @abstractmethod 24 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 25 | return data_frame 26 | 27 | @abstractmethod 28 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 29 | return data_frame 30 | 31 | @abstractmethod 32 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 33 | return data_frame 34 | 35 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 36 | return self.column_name in df_columns, f"There is no '{self.column_name}' column" 37 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_NotNull.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _NotNull(_Constraint): 7 | def __init__(self, column_name: str): 8 | super().__init__(column_name) 9 | 10 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 11 | return data_frame 12 | 13 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame.filter(f"{self.column_name} IS NOT NULL") 15 | 16 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 17 | return data_frame.filter(f"{self.column_name} IS NULL") 18 | 19 | def constraint_name(self): 20 | return "not_null" 21 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Numbers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List, Tuple 3 | 4 | from pyspark.sql import DataFrame 5 | 6 | from checkengine._constraints._Constraint import _Constraint 7 | 8 | 9 | class _Number(_Constraint, ABC): 10 | def __init__(self, column_name: str): 11 | super().__init__(column_name) 12 | 13 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame 15 | 16 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 17 | parent_validation_result = super().validate_self(data_frame, df_columns) 18 | if not parent_validation_result[0]: 19 | return parent_validation_result 20 | else: 21 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 22 | return column_type in ["tinyint", "smallint", "int", "bigint"], f"Column {self.column_name} is not a number" 23 | 24 | 25 | class _Min(_Number): 26 | def __init__(self, column_name: str, value: 
int): 27 | super().__init__(column_name) 28 | self.value = value 29 | 30 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 31 | return data_frame.filter(f"{self.column_name} >= {self.value}") 32 | 33 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 34 | return data_frame.filter(f"{self.column_name} < {self.value}") 35 | 36 | def constraint_name(self): 37 | return "min" 38 | 39 | 40 | class _Max(_Number): 41 | def __init__(self, column_name: str, value: int): 42 | super().__init__(column_name) 43 | self.value = value 44 | 45 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 46 | return data_frame.filter(f"{self.column_name} <= {self.value}") 47 | 48 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 49 | return data_frame.filter(f"{self.column_name} > {self.value}") 50 | 51 | def constraint_name(self): 52 | return "max" 53 | 54 | 55 | class _Between(_Number): 56 | def __init__(self, column_name: str, lower_bound: int, upper_bound: int): 57 | super().__init__(column_name) 58 | self.lower_bound = lower_bound 59 | self.upper_bound = upper_bound 60 | 61 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 62 | return data_frame.filter(f"{self.column_name} >= {self.lower_bound} AND {self.column_name} <= {self.upper_bound}") 63 | 64 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 65 | return data_frame.filter(f"{self.column_name} < {self.lower_bound} OR {self.column_name} > {self.upper_bound}") 66 | 67 | def constraint_name(self): 68 | return "between" 69 | 70 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 71 | parent_validation_result = super().validate_self(data_frame, df_columns) 72 | if not parent_validation_result[0]: 73 | return parent_validation_result 74 | else: 75 | return self.lower_bound <= self.upper_bound, f"Upper bound ({self.upper_bound}) cannot be lower than lower bound ({self.lower_bound})." 76 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_OneOf.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _OneOf(_Constraint): 7 | def __init__(self, column_name: str, allowed_values: list): 8 | super().__init__(column_name) 9 | self.allowed_values = allowed_values 10 | 11 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 12 | return data_frame 13 | 14 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 15 | return data_frame.filter(data_frame[self.column_name].isin(*self.allowed_values)) 16 | 17 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 18 | return data_frame.filter(~data_frame[self.column_name].isin(*self.allowed_values)) 19 | 20 | def constraint_name(self): 21 | return "one_of" 22 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_StatColumn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Checks that require applying a statistical function to all values in a single column. 
3 | """ 4 | from abc import ABC 5 | 6 | from pyspark.sql import DataFrame 7 | import pyspark.sql.functions as F 8 | 9 | from checkengine._constraints._Numbers import _Number 10 | 11 | 12 | class _StatColumn(_Number, ABC): 13 | def __init__(self, column_name: str, lower_bound: float, upper_bound: float): 14 | super().__init__(column_name) 15 | self.lower_bound = lower_bound 16 | self.upper_bound = upper_bound 17 | 18 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 19 | return data_frame.filter(f"{self.constraint_column_name} >= {self.lower_bound} AND {self.constraint_column_name} <= {self.upper_bound}") 20 | 21 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 22 | return data_frame.filter(f"{self.constraint_column_name} < {self.lower_bound} OR {self.constraint_column_name} > {self.upper_bound}") 23 | 24 | 25 | class _MeanColumn(_StatColumn): 26 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 27 | average: DataFrame = data_frame \ 28 | .groupby() \ 29 | .avg(self.column_name) \ 30 | .withColumnRenamed(f"avg({self.column_name})", self.constraint_column_name) 31 | 32 | return data_frame.crossJoin(average) 33 | 34 | def constraint_name(self): 35 | return "mean_between" 36 | 37 | 38 | class _MedianColumn(_StatColumn): 39 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 40 | median = F.expr(f"percentile_approx({self.column_name}, 0.5)") 41 | 42 | average: DataFrame = data_frame \ 43 | .groupby() \ 44 | .agg(median.alias(self.constraint_column_name)) 45 | 46 | return data_frame.crossJoin(average) 47 | 48 | def constraint_name(self): 49 | return "median_between" 50 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_TextLength.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | 7 | 8 | class _TextLength(_Constraint): 9 | def __init__(self, column_name: str, lower_bound: int, upper_bound: int): 10 | super().__init__(column_name) 11 | self.lower_bound = lower_bound 12 | self.upper_bound = upper_bound 13 | 14 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 15 | return data_frame 16 | 17 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 18 | parent_validation_result = super().validate_self(data_frame, df_columns) 19 | if not parent_validation_result[0]: 20 | return parent_validation_result 21 | else: 22 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 23 | if column_type != 'string': 24 | return False, f"Column {self.column_name} is not a string." 25 | else: 26 | return self.lower_bound <= self.upper_bound, f"Upper bound ({self.upper_bound}) cannot be lower than lower bound ({self.lower_bound})." 
27 | 28 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 29 | return data_frame.filter(f"LENGTH({self.column_name}) >= {self.lower_bound} AND LENGTH({self.column_name}) <= {self.upper_bound}") 30 | 31 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 32 | return data_frame.filter(f"LENGTH({self.column_name}) < {self.lower_bound} OR LENGTH({self.column_name}) > {self.upper_bound}") 33 | 34 | def constraint_name(self): 35 | return "text_length" 36 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_TextRegex.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | 7 | 8 | class _TextRegex(_Constraint): 9 | def __init__(self, column_name: str, regex: str): 10 | super().__init__(column_name) 11 | self.regex = regex 12 | 13 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 14 | return data_frame.withColumn(self.constraint_column_name, data_frame[self.column_name].rlike(self.regex)) 15 | 16 | def validate_self(self, data_frame: DataFrame, df_columns: List[str]) -> Tuple[bool, str]: 17 | parent_validation_result = super().validate_self(data_frame, df_columns) 18 | if not parent_validation_result[0]: 19 | return parent_validation_result 20 | else: 21 | column_type = [dtype for name, dtype in data_frame.dtypes if name == self.column_name][0] 22 | return column_type == 'string', f"Column {self.column_name} is not a string." 23 | 24 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 25 | return data_frame.filter(f"{self.constraint_column_name} = TRUE") 26 | 27 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 28 | return data_frame.filter(f"{self.constraint_column_name} = FALSE") 29 | 30 | def constraint_name(self): 31 | return "regex_match" 32 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/_constraints/_Unique.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine._constraints._Constraint import _Constraint 4 | 5 | 6 | class _Unique(_Constraint): 7 | def __init__(self, column_name: str): 8 | super().__init__(column_name) 9 | 10 | def prepare_df_for_check(self, data_frame: DataFrame) -> DataFrame: 11 | count_repetitions: DataFrame = data_frame \ 12 | .groupby(self.column_name) \ 13 | .count() \ 14 | .withColumnRenamed("count", self.constraint_column_name) 15 | 16 | return data_frame.join(count_repetitions, self.column_name, "left") 17 | 18 | def filter_success(self, data_frame: DataFrame) -> DataFrame: 19 | return data_frame.filter(f"{self.constraint_column_name} == 1") 20 | 21 | def filter_failure(self, data_frame: DataFrame) -> DataFrame: 22 | return data_frame.filter(f"{self.constraint_column_name} > 1") 23 | 24 | def constraint_name(self): 25 | return "unique" 26 | -------------------------------------------------------------------------------- /check-engine-lib/checkengine/validate_df.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List 2 | 3 | from pyspark.sql import DataFrame, SparkSession 4 | 5 | from checkengine._constraints._Constraint import _Constraint 6 | from checkengine._constraints._NotNull import _NotNull 7 | from 
._constraints._Numbers import _Min, _Max, _Between 8 | from checkengine._constraints._OneOf import _OneOf 9 | from checkengine._constraints._TextLength import _TextLength 10 | from checkengine._constraints._TextRegex import _TextRegex 11 | from checkengine._constraints._Unique import _Unique 12 | from checkengine._constraints._StatColumn import _MeanColumn, _MedianColumn 13 | 14 | 15 | class ValidationError(NamedTuple): 16 | """Describes a single validation error. 17 | 18 | This object contains the name of the violated constraint, the name of the column that contains incorrect data, and the number of rows that violate the rule. 19 | """ 20 | column_name: str 21 | constraint_name: str 22 | number_of_errors: int 23 | 24 | 25 | class ValidationResult(NamedTuple): 26 | """Contains both the correct data and the data that has not passed the validation. 27 | 28 | In addition to the erroneous data, it also contains a list of validation errors. 29 | """ 30 | correct_data: DataFrame 31 | erroneous_data: DataFrame 32 | errors: List[ValidationError] 33 | 34 | 35 | class ValidateSparkDataFrame: 36 | """Describes the validation rules of a Spark DataFrame and performs the validation. 37 | 38 | // TODO update the example when there is a new validation rule 39 | Usage example: 40 | ValidateSparkDataFrame(spark_session, data_frame) \ 41 | .is_not_null("column_name") \ 42 | .are_not_null(["column_name_2", "column_name_3"]) \ 43 | .is_min("numeric_column", 10) \ 44 | .is_max("numeric_column", 20) \ 45 | .is_unique("column_name") \ 46 | .are_unique(["column_name_2", "column_name_3"]) \ 47 | .execute() 48 | """ 49 | 50 | def __init__(self, spark: SparkSession, data_frame: DataFrame): 51 | """ValidateSparkDataFrame 52 | 53 | Args: 54 | spark: a Spark Session 55 | data_frame: the DataFrame to be validated 56 | """ 57 | self.spark: SparkSession = spark 58 | self.df: DataFrame = data_frame 59 | self.input_columns: List[str] = data_frame.columns 60 | self.constraints: List[_Constraint] = [] 61 | 62 | def is_unique(self, column_name: str): 63 | """Defines a constraint that checks whether the given column contains only unique values. 64 | 65 | Args: 66 | column_name: the name of the column 67 | 68 | Returns: 69 | self 70 | 71 | Raises: 72 | ValueError: if a unique constraint for a given column already exists. 73 | """ 74 | self._add_constraint(_Unique(column_name)) 75 | return self 76 | 77 | def are_unique(self, column_names: List[str]): 78 | """Defines constraints that check whether given columns contain only unique values. 79 | 80 | Args: 81 | column_names: a list of column names 82 | 83 | Returns: 84 | self 85 | """ 86 | for column_name in column_names: 87 | self.is_unique(column_name) 88 | return self 89 | 90 | def is_not_null(self, column_name: str): 91 | """Defines a constraint that does not allow null values in a given column. 92 | 93 | Args: 94 | column_name: the column name 95 | 96 | Returns: 97 | self 98 | """ 99 | self._add_constraint(_NotNull(column_name)) 100 | return self 101 | 102 | def are_not_null(self, column_names: List[str]): 103 | """Defines constraints that don't allow null values in any of the given columns. 104 | 105 | Args: 106 | column_names: a list of column names 107 | 108 | Returns: 109 | self 110 | """ 111 | for column_name in column_names: 112 | self.is_not_null(column_name) 113 | return self 114 | 115 | def is_min(self, column_name: str, value: int): 116 | """Defines a constraint that checks whether the given column contains values equal to or larger than a given integer. 
117 | 118 | Args: 119 | column_name: the column name 120 | value: the minimal value 121 | 122 | Returns: 123 | self 124 | """ 125 | self._add_constraint(_Min(column_name, value)) 126 | return self 127 | 128 | def is_max(self, column_name: str, value: int): 129 | """Defines a constraint that checks whether the given column contains values equal to or smaller than a given integer. 130 | 131 | Args: 132 | column_name: the column name 133 | value: the maximal value 134 | 135 | Returns: 136 | self 137 | """ 138 | self._add_constraint(_Max(column_name, value)) 139 | return self 140 | 141 | def is_between(self, column_name, lower_bound, upper_bound): 142 | """Defines a constraint that checks whether the given column contains a value equal to or between the lower and upper bound. 143 | 144 | Args: 145 | column_name: the column name 146 | lower_bound: the lower bound of the range 147 | upper_bound: the upper bound of the range 148 | 149 | Returns: 150 | self 151 | """ 152 | self._add_constraint(_Between(column_name, lower_bound, upper_bound)) 153 | return self 154 | 155 | def has_length_between(self, column_name, lower_bound, upper_bound): 156 | """Defines a constraint that checks whether the given column contains text whose length is equal to or between the lower and upper bound. 157 | 158 | Args: 159 | column_name: the column name 160 | lower_bound: the lower bound of the text length 161 | upper_bound: the upper bound of the text length 162 | 163 | Returns: 164 | self 165 | """ 166 | self._add_constraint(_TextLength(column_name, lower_bound, upper_bound)) 167 | return self 168 | 169 | def text_matches_regex(self, column_name, regex): 170 | """Defines a constraint that checks whether the content of a given column matches the given regex. 171 | 172 | Args: 173 | column_name: the column name 174 | regex: the regex 175 | 176 | Returns: 177 | self 178 | """ 179 | self._add_constraint(_TextRegex(column_name, regex)) 180 | return self 181 | 182 | def one_of(self, column_name, allowed_values: list): 183 | """Defines a constraint that checks whether the column value is equal to one of the given values. 184 | 185 | Args: 186 | column_name: the column name 187 | allowed_values: a list of allowed values, the type should match the column type 188 | 189 | Returns: 190 | self 191 | """ 192 | self._add_constraint(_OneOf(column_name, allowed_values)) 193 | return self 194 | 195 | def mean_column_value(self, column_name: str, min_mean: float, max_mean: float): 196 | """Defines a constraint that checks whether the average of all values in the column is between the given min and max value (inclusive). 197 | 198 | Args: 199 | column_name: the column name 200 | min_mean: the expected min value 201 | max_mean: the expected max value 202 | 203 | Returns: 204 | self 205 | """ 206 | self._add_constraint(_MeanColumn(column_name, min_mean, max_mean)) 207 | return self 208 | 209 | def median_column_value(self, column_name: str, min_median: float, max_median: float): 210 | """Defines a constraint that checks whether the median of all values in the column is between the given min and max value (inclusive). 
211 | 212 | Args: 213 | column_name: the column name 214 | min_median: the expected min value 215 | max_median: the expected max value 216 | 217 | Returns: 218 | self 219 | """ 220 | self._add_constraint(_MedianColumn(column_name, min_median, max_median)) 221 | return self 222 | 223 | def execute(self) -> ValidationResult: 224 | """Returns a named tuple containing the data that passed the validation, the data that was rejected (only unique rows), and a list of violated constraints. 225 | 226 | Note that the order of rows and constraints is not preserved. 227 | 228 | Returns: 229 | an instance of ValidationResult 230 | 231 | Raises: 232 | ValueError: if a constraint has been defined using a non-existing column. 233 | """ 234 | self._validate_constraints() 235 | 236 | if self.constraints: 237 | for constraint in self.constraints: 238 | self.df = constraint.prepare_df_for_check(self.df) 239 | 240 | correct_output = self.df 241 | errors = [] 242 | 243 | for constraint in self.constraints: 244 | correct_output = constraint.filter_success(correct_output) 245 | number_of_failures = constraint.filter_failure(self.df).count() 246 | 247 | if number_of_failures > 0: 248 | errors.append(ValidationError(constraint.column_name, constraint.constraint_name(), number_of_failures)) 249 | 250 | correct_output = correct_output.select(self.input_columns) 251 | incorrect_output = self.df.select(self.input_columns).subtract(correct_output) 252 | 253 | return ValidationResult(correct_output, incorrect_output, errors) 254 | else: 255 | return ValidationResult(self.df, self.spark.createDataFrame([], self.df.schema), []) 256 | 257 | def _add_constraint(self, constraint: _Constraint) -> None: 258 | existing = filter(lambda c: c.constraint_name() == constraint.constraint_name() and c.column_name == constraint.column_name, self.constraints) 259 | if list(existing): 260 | raise ValueError(f"A {constraint.constraint_name()} constraint for column {constraint.column_name} already exists.") 261 | 262 | self.constraints.append(constraint) 263 | 264 | def _validate_constraints(self) -> None: 265 | columns = self.df.columns 266 | 267 | errors = [] 268 | for constraint in self.constraints: 269 | is_correct, error_message = constraint.validate_self(self.df, columns) 270 | if not is_correct: 271 | errors.append(error_message) 272 | 273 | if errors: 274 | raise ValueError(", ".join(errors)) 275 | -------------------------------------------------------------------------------- /check-engine-lib/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | pytest_plugins = [ 3 | "tests.spark.spark_session", 4 | ] 5 | -------------------------------------------------------------------------------- /check-engine-lib/poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | category = "dev" 3 | description = "Atomic file writes." 
4 | marker = "sys_platform == \"win32\"" 5 | name = "atomicwrites" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | version = "1.4.0" 9 | 10 | [[package]] 11 | category = "dev" 12 | description = "Classes Without Boilerplate" 13 | name = "attrs" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | version = "21.2.0" 17 | 18 | [package.extras] 19 | dev = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] 20 | docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] 21 | tests = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] 22 | tests_no_zope = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] 23 | 24 | [[package]] 25 | category = "dev" 26 | description = "behave is behaviour-driven development, Python style" 27 | name = "behave" 28 | optional = false 29 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 30 | version = "1.2.6" 31 | 32 | [package.dependencies] 33 | parse = ">=1.8.2" 34 | parse-type = ">=0.4.2" 35 | six = ">=1.11" 36 | 37 | [package.extras] 38 | develop = ["coverage", "pytest (>=3.0)", "pytest-cov", "tox", "invoke (>=0.21.0)", "path.py (>=8.1.2)", "pycmd", "pathlib", "modernize (>=0.5)", "pylint"] 39 | docs = ["sphinx (>=1.6)", "sphinx-bootstrap-theme (>=0.6)"] 40 | 41 | [[package]] 42 | category = "dev" 43 | description = "Cross-platform colored terminal text." 44 | marker = "sys_platform == \"win32\"" 45 | name = "colorama" 46 | optional = false 47 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 48 | version = "0.4.4" 49 | 50 | [[package]] 51 | category = "dev" 52 | description = "Code coverage measurement for Python" 53 | name = "coverage" 54 | optional = false 55 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 56 | version = "5.5" 57 | 58 | [package.extras] 59 | toml = ["toml"] 60 | 61 | [[package]] 62 | category = "dev" 63 | description = "More routines for operating on iterables, beyond itertools" 64 | name = "more-itertools" 65 | optional = false 66 | python-versions = ">=3.5" 67 | version = "8.8.0" 68 | 69 | [[package]] 70 | category = "dev" 71 | description = "NumPy is the fundamental package for array computing with Python." 
72 | name = "numpy" 73 | optional = false 74 | python-versions = ">=3.7" 75 | version = "1.21.0" 76 | 77 | [[package]] 78 | category = "dev" 79 | description = "Core utilities for Python packages" 80 | name = "packaging" 81 | optional = false 82 | python-versions = ">=3.6" 83 | version = "21.0" 84 | 85 | [package.dependencies] 86 | pyparsing = ">=2.0.2" 87 | 88 | [[package]] 89 | category = "dev" 90 | description = "Powerful data structures for data analysis, time series, and statistics" 91 | name = "pandas" 92 | optional = false 93 | python-versions = ">=3.7.1" 94 | version = "1.3.0" 95 | 96 | [package.dependencies] 97 | numpy = ">=1.17.3" 98 | python-dateutil = ">=2.7.3" 99 | pytz = ">=2017.3" 100 | 101 | [package.extras] 102 | test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"] 103 | 104 | [[package]] 105 | category = "dev" 106 | description = "parse() is the opposite of format()" 107 | name = "parse" 108 | optional = false 109 | python-versions = "*" 110 | version = "1.19.0" 111 | 112 | [[package]] 113 | category = "dev" 114 | description = "Simplifies to build parse types based on the parse module" 115 | name = "parse-type" 116 | optional = false 117 | python-versions = ">=2.6, !=3.0.*, !=3.1.*" 118 | version = "0.5.2" 119 | 120 | [package.dependencies] 121 | parse = ">=1.8.4" 122 | six = ">=1.11" 123 | 124 | [package.extras] 125 | develop = ["coverage (>=4.4)", "pytest (>=3.2)", "pytest-cov", "tox (>=2.8)"] 126 | docs = ["sphinx (>=1.2)"] 127 | 128 | [[package]] 129 | category = "dev" 130 | description = "plugin and hook calling mechanisms for python" 131 | name = "pluggy" 132 | optional = false 133 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 134 | version = "0.13.1" 135 | 136 | [package.extras] 137 | dev = ["pre-commit", "tox"] 138 | 139 | [[package]] 140 | category = "dev" 141 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 142 | name = "py" 143 | optional = false 144 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 145 | version = "1.10.0" 146 | 147 | [[package]] 148 | category = "main" 149 | description = "Enables Python programs to dynamically access arbitrary Java objects" 150 | name = "py4j" 151 | optional = false 152 | python-versions = "*" 153 | version = "0.10.9" 154 | 155 | [[package]] 156 | category = "dev" 157 | description = "Python parsing module" 158 | name = "pyparsing" 159 | optional = false 160 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 161 | version = "2.4.7" 162 | 163 | [[package]] 164 | category = "main" 165 | description = "Apache Spark Python API" 166 | name = "pyspark" 167 | optional = false 168 | python-versions = "*" 169 | version = "3.0.3" 170 | 171 | [package.dependencies] 172 | py4j = "0.10.9" 173 | 174 | [package.extras] 175 | ml = ["numpy (>=1.7)"] 176 | mllib = ["numpy (>=1.7)"] 177 | sql = ["pandas (>=0.23.2)", "pyarrow (>=0.15.1)"] 178 | 179 | [[package]] 180 | category = "dev" 181 | description = "pytest: simple powerful testing with Python" 182 | name = "pytest" 183 | optional = false 184 | python-versions = ">=3.5" 185 | version = "5.4.3" 186 | 187 | [package.dependencies] 188 | atomicwrites = ">=1.0" 189 | attrs = ">=17.4.0" 190 | colorama = "*" 191 | more-itertools = ">=4.0.0" 192 | packaging = "*" 193 | pluggy = ">=0.12,<1.0" 194 | py = ">=1.5.0" 195 | wcwidth = "*" 196 | 197 | [package.extras] 198 | checkqa-mypy = ["mypy (v0.761)"] 199 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 200 | 201 | 
[[package]] 202 | category = "dev" 203 | description = "Pytest plugin for measuring coverage." 204 | name = "pytest-cov" 205 | optional = false 206 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 207 | version = "2.12.1" 208 | 209 | [package.dependencies] 210 | coverage = ">=5.2.1" 211 | pytest = ">=4.6" 212 | toml = "*" 213 | 214 | [package.extras] 215 | testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] 216 | 217 | [[package]] 218 | category = "dev" 219 | description = "Extensions to the standard Python datetime module" 220 | name = "python-dateutil" 221 | optional = false 222 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" 223 | version = "2.8.1" 224 | 225 | [package.dependencies] 226 | six = ">=1.5" 227 | 228 | [[package]] 229 | category = "dev" 230 | description = "World timezone definitions, modern and historical" 231 | name = "pytz" 232 | optional = false 233 | python-versions = "*" 234 | version = "2021.1" 235 | 236 | [[package]] 237 | category = "dev" 238 | description = "Python 2 and 3 compatibility utilities" 239 | name = "six" 240 | optional = false 241 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 242 | version = "1.16.0" 243 | 244 | [[package]] 245 | category = "dev" 246 | description = "Python Library for Tom's Obvious, Minimal Language" 247 | name = "toml" 248 | optional = false 249 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 250 | version = "0.10.2" 251 | 252 | [[package]] 253 | category = "dev" 254 | description = "Measures the displayed width of unicode strings in a terminal" 255 | name = "wcwidth" 256 | optional = false 257 | python-versions = "*" 258 | version = "0.2.5" 259 | 260 | [metadata] 261 | content-hash = "0501f68fcc1c687a561a82f33a1e6fe871ce8dad067ccba85a1f963a69d46138" 262 | python-versions = "^3.8" 263 | 264 | [metadata.files] 265 | atomicwrites = [ 266 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 267 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 268 | ] 269 | attrs = [ 270 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 271 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 272 | ] 273 | behave = [ 274 | {file = "behave-1.2.6-py2.py3-none-any.whl", hash = "sha256:ebda1a6c9e5bfe95c5f9f0a2794e01c7098b3dde86c10a95d8621c5907ff6f1c"}, 275 | {file = "behave-1.2.6.tar.gz", hash = "sha256:b9662327aa53294c1351b0a9c369093ccec1d21026f050c3bd9b3e5cccf81a86"}, 276 | ] 277 | colorama = [ 278 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 279 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 280 | ] 281 | coverage = [ 282 | {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, 283 | {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, 284 | {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, 285 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = 
"sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, 286 | {file = "coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, 287 | {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, 288 | {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, 289 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, 290 | {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, 291 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, 292 | {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, 293 | {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, 294 | {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, 295 | {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, 296 | {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, 297 | {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, 298 | {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, 299 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, 300 | {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, 301 | {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, 302 | {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, 303 | {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, 304 | {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, 305 | {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, 306 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, 307 | {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, 308 | {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, 309 | {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, 310 | {file = 
"coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, 311 | {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, 312 | {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, 313 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, 314 | {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, 315 | {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, 316 | {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, 317 | {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, 318 | {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, 319 | {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, 320 | {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, 321 | {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, 322 | {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, 323 | {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, 324 | {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, 325 | {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, 326 | {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, 327 | {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, 328 | {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, 329 | {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, 330 | {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, 331 | {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, 332 | {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, 333 | {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, 334 | ] 335 | more-itertools = [ 336 | {file = "more-itertools-8.8.0.tar.gz", hash = "sha256:83f0308e05477c68f56ea3a888172c78ed5d5b3c282addb67508e7ba6c8f813a"}, 337 | {file = 
"more_itertools-8.8.0-py3-none-any.whl", hash = "sha256:2cf89ec599962f2ddc4d568a05defc40e0a587fbc10d5989713638864c36be4d"}, 338 | ] 339 | numpy = [ 340 | {file = "numpy-1.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d5caa946a9f55511e76446e170bdad1d12d6b54e17a2afe7b189112ed4412bb8"}, 341 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ac4fd578322842dbda8d968e3962e9f22e862b6ec6e3378e7415625915e2da4d"}, 342 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:598fe100b2948465cf3ed64b1a326424b5e4be2670552066e17dfaa67246011d"}, 343 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c55407f739f0bfcec67d0df49103f9333edc870061358ac8a8c9e37ea02fcd2"}, 344 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:75579acbadbf74e3afd1153da6177f846212ea2a0cc77de53523ae02c9256513"}, 345 | {file = "numpy-1.21.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cc367c86eb87e5b7c9592935620f22d13b090c609f1b27e49600cd033b529f54"}, 346 | {file = "numpy-1.21.0-cp37-cp37m-win32.whl", hash = "sha256:d89b0dc7f005090e32bb4f9bf796e1dcca6b52243caf1803fdd2b748d8561f63"}, 347 | {file = "numpy-1.21.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd"}, 348 | {file = "numpy-1.21.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e"}, 349 | {file = "numpy-1.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bba474a87496d96e61461f7306fba2ebba127bed7836212c360f144d1e72ac54"}, 350 | {file = "numpy-1.21.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9"}, 351 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e4d5a86a5257843a18fb1220c5f1c199532bc5d24e849ed4b0289fb59fbd4d8f"}, 352 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:620732f42259eb2c4642761bd324462a01cdd13dd111740ce3d344992dd8492f"}, 353 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9205711e5440954f861ceeea8f1b415d7dd15214add2e878b4d1cf2bcb1a914"}, 354 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ad09f55cc95ed8d80d8ab2052f78cc21cb231764de73e229140d81ff49d8145e"}, 355 | {file = "numpy-1.21.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a1f2fb2da242568af0271455b89aee0f71e4e032086ee2b4c5098945d0e11cf6"}, 356 | {file = "numpy-1.21.0-cp38-cp38-win32.whl", hash = "sha256:e58ddb53a7b4959932f5582ac455ff90dcb05fac3f8dcc8079498d43afbbde6c"}, 357 | {file = "numpy-1.21.0-cp38-cp38-win_amd64.whl", hash = "sha256:d2910d0a075caed95de1a605df00ee03b599de5419d0b95d55342e9a33ad1fb3"}, 358 | {file = "numpy-1.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a290989cd671cd0605e9c91a70e6df660f73ae87484218e8285c6522d29f6e38"}, 359 | {file = "numpy-1.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3537b967b350ad17633b35c2f4b1a1bbd258c018910b518c30b48c8e41272717"}, 360 | {file = "numpy-1.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ccc6c650f8700ce1e3a77668bb7c43e45c20ac06ae00d22bdf6760b38958c883"}, 361 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:709884863def34d72b183d074d8ba5cfe042bc3ff8898f1ffad0209161caaa99"}, 362 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bebab3eaf0641bba26039fb0b2c5bf9b99407924b53b1ea86e03c32c64ef5aef"}, 363 | {file = "numpy-1.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf680682ad0a3bef56dae200dbcbac2d57294a73e5b0f9864955e7dd7c2c2491"}, 364 | {file = "numpy-1.21.0-cp39-cp39-win32.whl", hash = "sha256:d95d16204cd51ff1a1c8d5f9958ce90ae190be81d348b514f9be39f878b8044a"}, 365 | {file = "numpy-1.21.0-cp39-cp39-win_amd64.whl", hash = "sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e"}, 366 | {file = "numpy-1.21.0-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3c40e6b860220ed862e8097b8f81c9af6d7405b723f4a7af24a267b46f90e461"}, 367 | {file = "numpy-1.21.0.zip", hash = "sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce"}, 368 | ] 369 | packaging = [ 370 | {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, 371 | {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, 372 | ] 373 | pandas = [ 374 | {file = "pandas-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c81b8d91e9ae861eb4406b4e0f8d4dabbc105b9c479b3d1e921fba1d35b5b62a"}, 375 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08eeff3da6a188e24db7f292b39a8ca9e073bf841fbbeadb946b3ad5c19d843e"}, 376 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88864c1e28353b958b1f30e4193818519624ad9a1776921622a6a2a016d5d807"}, 377 | {file = "pandas-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:872aa91e0f9ca913046ab639d4181a899f5e592030d954d28c2529b88756a736"}, 378 | {file = "pandas-1.3.0-cp37-cp37m-win32.whl", hash = "sha256:92835113a67cbd34747c198d41f09f4b63f6fe11ca5643baebc7ab1e30e89e95"}, 379 | {file = "pandas-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7d3cd2c99faa94d717ca00ea489264a291ad7209453dffbf059bfb7971fd3a61"}, 380 | ] 381 | parse = [ 382 | {file = "parse-1.19.0.tar.gz", hash = "sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b"}, 383 | ] 384 | parse-type = [ 385 | {file = "parse_type-0.5.2-py2.py3-none-any.whl", hash = "sha256:089a471b06327103865dfec2dd844230c3c658a4a1b5b4c8b6c16c8f77577f9e"}, 386 | {file = "parse_type-0.5.2.tar.gz", hash = "sha256:7f690b18d35048c15438d6d0571f9045cffbec5907e0b1ccf006f889e3a38c0b"}, 387 | ] 388 | pluggy = [ 389 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 390 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 391 | ] 392 | py = [ 393 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 394 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 395 | ] 396 | py4j = [ 397 | {file = "py4j-0.10.9-py2.py3-none-any.whl", hash = "sha256:859ba728a7bb43e9c2bf058832759fb97a598bb28cc12f34f5fc4abdec08ede6"}, 398 | {file = "py4j-0.10.9.tar.gz", hash = "sha256:36ec57f43ff8ced260a18aa9a4e46c3500a730cac8860e259cbaa546c2b9db2f"}, 399 | ] 400 | pyparsing = [ 401 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", 
hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 402 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 403 | ] 404 | pyspark = [ 405 | {file = "pyspark-3.0.3.tar.gz", hash = "sha256:c4499903e3d7289cf2b4bf7755fb32cf17922598f3b85a1c230860dec020eec4"}, 406 | ] 407 | pytest = [ 408 | {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"}, 409 | {file = "pytest-5.4.3.tar.gz", hash = "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8"}, 410 | ] 411 | pytest-cov = [ 412 | {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, 413 | {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, 414 | ] 415 | python-dateutil = [ 416 | {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, 417 | {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, 418 | ] 419 | pytz = [ 420 | {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, 421 | {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, 422 | ] 423 | six = [ 424 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 425 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 426 | ] 427 | toml = [ 428 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 429 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 430 | ] 431 | wcwidth = [ 432 | {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, 433 | {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, 434 | ] 435 | -------------------------------------------------------------------------------- /check-engine-lib/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "checkengine" 3 | version = "0.2.0" 4 | description = "Data-quality checks for PySpark" 5 | authors = ["Bartosz Mikulski "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/mikulskibartosz/check-engine" 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.8" 12 | pyspark = "3.0.3" 13 | 14 | [tool.poetry.dev-dependencies] 15 | pytest = "^5.2" 16 | pytest-cov = "^2.10.0" 17 | pandas = "^1.0.5" 18 | numpy = "^1.19.0" 19 | behave = "^1.2.6" 20 | 21 | [build-system] 22 | requires = ["poetry_core>=1.0.0"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /check-engine-lib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikulskibartosz/check-engine/383cf4106605cc6f94e800bdc707789c0cedbe95/check-engine-lib/tests/__init__.py 
-------------------------------------------------------------------------------- /check-engine-lib/tests/spark/AssertResult.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from checkengine.validate_df import ValidationResult, ValidationError 4 | from tests.spark.assert_df import AssertDf 5 | 6 | 7 | class AssertValidationResult: 8 | def __init__(self, *, column_name: str, constraint_name: str): 9 | self.column_name = column_name 10 | self.constraint_name = constraint_name 11 | 12 | def check(self, *, actual: ValidationResult, expected_correct: DataFrame, expected_erroneous: DataFrame): 13 | if expected_correct.count() == 0: 14 | AssertDf(actual.correct_data) \ 15 | .is_empty() \ 16 | .has_columns(expected_correct.columns) 17 | else: 18 | AssertDf(actual.correct_data, order_by_column=self.column_name) \ 19 | .contains_exactly(expected_correct.toPandas()) \ 20 | .has_columns(expected_correct.columns) 21 | 22 | if expected_erroneous.count() == 0: 23 | AssertDf(actual.erroneous_data) \ 24 | .is_empty() \ 25 | .has_columns(expected_erroneous.columns) 26 | else: 27 | AssertDf(actual.erroneous_data, order_by_column=self.column_name) \ 28 | .contains_exactly(expected_erroneous.toPandas()) \ 29 | .has_columns(expected_erroneous.columns) 30 | 31 | if expected_erroneous.count() == 0: 32 | assert actual.errors == [] 33 | else: 34 | assert actual.errors == [ValidationError(self.column_name, self.constraint_name, expected_erroneous.count())] 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import StructType, StructField, StringType, IntegerType 3 | 4 | single_string_column_schema = StructType([StructField("col1", StringType())]) 5 | two_string_columns_schema = StructType([StructField("col1", StringType()), StructField("col2", StringType())]) 6 | 7 | single_integer_column_schema = StructType([StructField("col1", IntegerType())]) 8 | two_integer_columns_schema = StructType([StructField("col1", IntegerType()), StructField("col2", IntegerType())]) 9 | 10 | 11 | def empty_string_df(spark_session: SparkSession): 12 | return spark_session.createDataFrame([], schema=single_string_column_schema) 13 | 14 | 15 | def empty_integer_df(spark_session: SparkSession): 16 | return spark_session.createDataFrame([], schema=single_integer_column_schema) 17 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/assert_df.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | 3 | from pandas import DataFrame as Pandas_df 4 | from pyspark.sql import DataFrame as Spark_df 5 | from numpy.testing import assert_array_equal 6 | 7 | 8 | class AssertDf: 9 | def __init__(self, df: Spark_df, order_by_column: Optional[Union[str, List[str]]] = None): 10 | self.df: Pandas_df = df.toPandas() 11 | self.order_by_column = order_by_column 12 | 13 | def is_empty(self): 14 | assert self.df.empty 15 | return self 16 | 17 | def contains_exactly(self, other: Pandas_df): 18 | if self.order_by_column: 19 | sorted_df = self.df.sort_values(self.order_by_column) 20 | other_sorted = other.sort_values(self.order_by_column) 21 | assert_array_equal(sorted_df.values, other_sorted.values, verbose=True) 22 | else: 23 | 
assert self.df.equals(other) 24 | return self 25 | 26 | def has_columns(self, columns: list): 27 | existing_columns = sorted(list(self.df.columns)) 28 | expected_columns = sorted(columns) 29 | assert existing_columns == expected_columns, f"{existing_columns} != {expected_columns}" 30 | return self 31 | 32 | def has_n_rows(self, n): 33 | assert self.df.shape[0] == n 34 | return self 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/spark/spark_session.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def spark_session(request): 7 | """ 8 | Fixture for creating a Spark Session used in the tests. 9 | :param request: pytest.FixtureRequest 10 | """ 11 | spark_session = SparkSession.builder \ 12 | .master("local[*]") \ 13 | .appName("correct-horse-test") \ 14 | .getOrCreate() 15 | 16 | request.addfinalizer(lambda: spark_session.sparkContext.stop()) 17 | 18 | return spark_session 19 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_between_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value that is equal to or between two values. 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, empty_string_df, single_integer_column_schema, two_integer_columns_schema 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_between_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_between("col1", 5, 10) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_are_between(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_between("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_not_between(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [20]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_between("col1", 11, 19) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[5], [10]], 
schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[15]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_between("col1", 0, 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="between") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_between_ignores_the_other_column(spark_session): 78 | df = spark_session.createDataFrame([[5, 8], [10, 20], [15, 8]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[5, 8], [10, 20]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[15, 8]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_between("col1", 5, 10) \ 84 | .execute() 85 | 86 | AssertDf(result.correct_data, order_by_column="col1") \ 87 | .contains_exactly(expected_correct.toPandas()) \ 88 | .has_columns(["col1", "col2"]) 89 | 90 | AssertDf(result.erroneous_data, order_by_column="col2") \ 91 | .contains_exactly(expected_errors.toPandas()) \ 92 | .has_columns(["col1", "col2"]) 93 | 94 | assert result.errors == [ValidationError("col1", "between", 1)] 95 | 96 | 97 | def test_between_should_check_all_given_columns_separately(spark_session): 98 | df = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 99 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 100 | expected_errors = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 101 | 102 | result = ValidateSparkDataFrame(spark_session, df) \ 103 | .is_between("col1", 0, 5) \ 104 | .is_between("col2", 20, 40) \ 105 | .execute() 106 | 107 | AssertDf(result.correct_data, order_by_column="col1") \ 108 | .contains_exactly(expected_correct.toPandas()) \ 109 | .has_columns(["col1", "col2"]) 110 | 111 | AssertDf(result.erroneous_data, order_by_column="col2") \ 112 | .contains_exactly(expected_errors.toPandas()) \ 113 | .has_columns(["col1", "col2"]) 114 | 115 | assert result.errors == [ValidationError("col1", "between", 3), ValidationError("col2", "between", 3)] 116 | 117 | 118 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 119 | with pytest.raises(ValueError): 120 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 121 | .is_between("col1", 5, 10) \ 122 | .execute() 123 | 124 | 125 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 126 | with pytest.raises(ValueError): 127 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 128 | .is_between("column_that_does_not_exist", 5, 5) \ 129 | .execute() 130 | 131 | 132 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 133 | with pytest.raises(ValueError): 134 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 135 | .is_between("col1", 5, 10) \ 136 | .is_between("col1", 5, 15) \ 137 | .execute() 138 | 139 | 140 | def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session): 141 | with pytest.raises(ValueError): 142 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 143 | .is_between("col1", 10, 5) \ 144 | .execute() 145 | -------------------------------------------------------------------------------- 
/check-engine-lib/tests/test_max_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value equal or smaller than a given integer. 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_max_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_max("col1", 5) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="max") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_rows_smaller_than_max(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_max("col1", 20) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="max") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_larger_than_max(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_max("col1", 1) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="max") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[5], [10]], schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[15]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_max("col1", 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="max") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_max_value_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([[5, 1], [10, 20], [15, 1]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[5, 1], [10, 20]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[15, 1]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_max("col1", 10) \ 84 | .execute() 85 | 86 | AssertValidationResult(column_name="col1", constraint_name="max") \ 87 | .check( 88 | actual=result, 89 | expected_correct=expected_correct, 90 | expected_erroneous=expected_errors 91 | ) 92 | 93 | 94 | def test_max_should_check_all_given_columns_separately(spark_session): 95 | df = 
spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 96 | 97 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 98 | expected_errors = spark_session.createDataFrame([[25, 1], [30, 2], [35, 3]], schema=two_integer_columns_schema) 99 | 100 | result = ValidateSparkDataFrame(spark_session, df) \ 101 | .is_max("col1", 20) \ 102 | .is_max("col2", 0) \ 103 | .execute() 104 | 105 | AssertDf(result.correct_data, order_by_column="col1") \ 106 | .contains_exactly(expected_correct.toPandas()) \ 107 | .has_columns(["col1", "col2"]) 108 | 109 | AssertDf(result.erroneous_data, order_by_column="col2") \ 110 | .contains_exactly(expected_errors.toPandas()) \ 111 | .has_columns(["col1", "col2"]) 112 | 113 | assert result.errors == [ValidationError("col1", "max", 3), ValidationError("col2", "max", 3)] 114 | 115 | 116 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 117 | with pytest.raises(ValueError): 118 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 119 | .is_max("col1", 5) \ 120 | .execute() 121 | 122 | 123 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 124 | with pytest.raises(ValueError): 125 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 126 | .is_max("column_that_does_not_exist", 5) \ 127 | .execute() 128 | 129 | 130 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 131 | with pytest.raises(ValueError): 132 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 133 | .is_max("col1", 5) \ 134 | .is_max("col1", 10) \ 135 | .execute() 136 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_mean_value.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the meanColumnValue constraint. 3 | 4 | The implementation should reject all rows in a column if the column mean value is not between the expected values. 
5 | """ 6 | import pytest 7 | 8 | from checkengine.validate_df import ValidateSparkDataFrame 9 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 10 | from tests.spark.AssertResult import AssertValidationResult 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_mean_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .mean_column_value("col1", 0, 1) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_the_mean_is_between_given_values(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .mean_column_value("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_mean_is_smaller_than_given_values(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .mean_column_value("col1", 12, 15) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_reject_all_rows_if_mean_is_larger_than_given_values(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .mean_column_value("col1", 5, 8) \ 65 | .execute() 66 | 67 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_integer_df(spark_session), 71 | expected_erroneous=df 72 | ) 73 | 74 | 75 | def test_mean_value_of_other_columns_is_ignored(spark_session): 76 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 77 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 78 | 79 | result = ValidateSparkDataFrame(spark_session, df) \ 80 | .mean_column_value("col1", 10, 10) \ 81 | .execute() 82 | 83 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 84 | .check( 85 | actual=result, 86 | expected_correct=df, 87 | expected_erroneous=expected_errors 88 | ) 89 | 90 | 91 | def test_mean_should_check_all_given_columns_separately(spark_session): 92 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 93 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 94 | 95 | result = ValidateSparkDataFrame(spark_session, df) \ 96 | .mean_column_value("col1", 10, 10) \ 97 | .mean_column_value("col2", 2, 2) \ 98 | .execute() 99 | 100 | AssertValidationResult(column_name="col1", constraint_name="mean_between") \ 101 | .check( 102 | actual=result, 103 | expected_correct=df, 
104 | expected_erroneous=expected_errors 105 | ) 106 | 107 | 108 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 109 | with pytest.raises(ValueError): 110 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 111 | .mean_column_value("col1", 10, 10) \ 112 | .execute() 113 | 114 | 115 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 118 | .mean_column_value("column_that_does_not_exist", 5, 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .mean_column_value("col1", 10, 10) \ 126 | .mean_column_value("col1", 5, 5) \ 127 | .execute() 128 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_median_value.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the medianColumnValue constraint. 3 | 4 | The implementation should reject all rows in a column if the column median value is not between the expected values. 5 | """ 6 | import pytest 7 | 8 | from checkengine.validate_df import ValidateSparkDataFrame 9 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 10 | from tests.spark.AssertResult import AssertValidationResult 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_median_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .median_column_value("col1", 0, 1) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_the_median_is_between_given_values(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .median_column_value("col1", 5, 15) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_median_is_smaller_than_given_values(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .median_column_value("col1", 12, 15) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_reject_all_rows_if_median_is_larger_than_given_values(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .median_column_value("col1", 5, 8) \ 65 | .execute() 66 | 67 | 
AssertValidationResult(column_name="col1", constraint_name="median_between") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_integer_df(spark_session), 71 | expected_erroneous=df 72 | ) 73 | 74 | 75 | def test_median_value_of_other_columns_is_ignored(spark_session): 76 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 77 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 78 | 79 | result = ValidateSparkDataFrame(spark_session, df) \ 80 | .median_column_value("col1", 10, 10) \ 81 | .execute() 82 | 83 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 84 | .check( 85 | actual=result, 86 | expected_correct=df, 87 | expected_erroneous=expected_errors 88 | ) 89 | 90 | 91 | def test_median_should_check_all_given_columns_separately(spark_session): 92 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 93 | expected_errors = spark_session.createDataFrame([], schema=two_integer_columns_schema) 94 | 95 | result = ValidateSparkDataFrame(spark_session, df) \ 96 | .median_column_value("col1", 10, 10) \ 97 | .median_column_value("col2", 2, 2) \ 98 | .execute() 99 | 100 | AssertValidationResult(column_name="col1", constraint_name="median_between") \ 101 | .check( 102 | actual=result, 103 | expected_correct=df, 104 | expected_erroneous=expected_errors 105 | ) 106 | 107 | 108 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 109 | with pytest.raises(ValueError): 110 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 111 | .median_column_value("col1", 10, 10) \ 112 | .execute() 113 | 114 | 115 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 118 | .median_column_value("column_that_does_not_exist", 5, 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .median_column_value("col1", 10, 10) \ 126 | .median_column_value("col1", 5, 5) \ 127 | .execute() 128 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_min_integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verifies the constraints that check whether the column contains value equal or larger than a given integer. 
3 | """ 4 | 5 | import pytest 6 | 7 | from tests.spark import empty_integer_df, single_integer_column_schema, two_integer_columns_schema, empty_string_df 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from tests.spark.assert_df import AssertDf 10 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 11 | 12 | pytestmark = pytest.mark.usefixtures("spark_session") 13 | 14 | 15 | def test_should_return_df_without_changes_if_empty_df_with_is_min_constraint(spark_session): 16 | df = empty_integer_df(spark_session) 17 | 18 | result = ValidateSparkDataFrame(spark_session, df) \ 19 | .is_min("col1", 5) \ 20 | .execute() 21 | 22 | AssertValidationResult(column_name="col1", constraint_name="min") \ 23 | .check( 24 | actual=result, 25 | expected_correct=df, 26 | expected_erroneous=df 27 | ) 28 | 29 | 30 | def test_should_return_df_without_changes_if_all_rows_greater_than_min(spark_session): 31 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 32 | 33 | result = ValidateSparkDataFrame(spark_session, df) \ 34 | .is_min("col1", 5) \ 35 | .execute() 36 | 37 | AssertValidationResult(column_name="col1", constraint_name="min") \ 38 | .check( 39 | actual=result, 40 | expected_correct=df, 41 | expected_erroneous=empty_integer_df(spark_session) 42 | ) 43 | 44 | 45 | def test_should_reject_all_rows_if_smaller_than_min(spark_session): 46 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_min("col1", 20) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="min") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_integer_df(spark_session), 56 | expected_erroneous=df 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([[5], [10], [15]], schema=single_integer_column_schema) 62 | expected_correct = spark_session.createDataFrame([[10], [15]], schema=single_integer_column_schema) 63 | expected_errors = spark_session.createDataFrame([[5]], schema=single_integer_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .is_min("col1", 10) \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="min") \ 70 | .check( 71 | actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_min_value_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 79 | expected_correct = spark_session.createDataFrame([[10, 2], [15, 3]], schema=two_integer_columns_schema) 80 | expected_errors = spark_session.createDataFrame([[5, 1]], schema=two_integer_columns_schema) 81 | 82 | result = ValidateSparkDataFrame(spark_session, df) \ 83 | .is_min("col1", 10) \ 84 | .execute() 85 | 86 | AssertValidationResult(column_name="col1", constraint_name="min") \ 87 | .check( 88 | actual=result, 89 | expected_correct=expected_correct, 90 | expected_erroneous=expected_errors 91 | ) 92 | 93 | 94 | def test_min_should_check_all_given_columns_separately(spark_session): 95 | df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema) 96 | expected_correct = spark_session.createDataFrame([], schema=two_integer_columns_schema) 97 | expected_errors = spark_session.createDataFrame([[5, 1], [10, 
2], [15, 3]], schema=two_integer_columns_schema) 98 | 99 | result = ValidateSparkDataFrame(spark_session, df) \ 100 | .is_min("col1", 20) \ 101 | .is_min("col2", 5) \ 102 | .execute() 103 | 104 | AssertDf(result.correct_data, order_by_column="col1") \ 105 | .contains_exactly(expected_correct.toPandas()) \ 106 | .has_columns(["col1", "col2"]) 107 | 108 | AssertDf(result.erroneous_data, order_by_column="col2") \ 109 | .contains_exactly(expected_errors.toPandas()) \ 110 | .has_columns(["col1", "col2"]) 111 | 112 | assert result.errors == [ValidationError("col1", "min", 3), ValidationError("col2", "min", 3)] 113 | 114 | 115 | def test_should_throw_error_if_constraint_is_not_a_numeric_column(spark_session): 116 | with pytest.raises(ValueError): 117 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 118 | .is_min("col1", 5) \ 119 | .execute() 120 | 121 | 122 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 123 | with pytest.raises(ValueError): 124 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 125 | .is_min("column_that_does_not_exist", 5) \ 126 | .execute() 127 | 128 | 129 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 130 | with pytest.raises(ValueError): 131 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 132 | .is_min("col1", 5) \ 133 | .is_min("col1", 10) \ 134 | .execute() 135 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_multi_contraint.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains the tests that verify whether the library works correctly when multiple constraints are defined at the same time. 
3 | """ 4 | 5 | import pytest 6 | 7 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 8 | from tests.spark import two_integer_columns_schema 9 | from tests.spark.assert_df import AssertDf 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_rows_that_pass_all_checks_and_reject_rows_that_violate_any_test(spark_session): 15 | not_between = [25, 1] 16 | max_exceeded = [3, 30] 17 | correct = [3, 15] 18 | less_than_min = [1, 15] 19 | both_wrong = [7, 30] 20 | 21 | df = spark_session.createDataFrame([not_between, max_exceeded, correct, less_than_min, both_wrong], schema=two_integer_columns_schema) 22 | expected_correct = spark_session.createDataFrame([correct], schema=two_integer_columns_schema) 23 | expected_errors = spark_session.createDataFrame([not_between, max_exceeded, less_than_min, both_wrong], schema=two_integer_columns_schema) 24 | 25 | result = ValidateSparkDataFrame(spark_session, df) \ 26 | .is_between("col1", 0, 5) \ 27 | .is_min("col1", 3) \ 28 | .is_max("col2", 20) \ 29 | .execute() 30 | 31 | AssertDf(result.correct_data, order_by_column="col1") \ 32 | .contains_exactly(expected_correct.toPandas()) \ 33 | .has_columns(["col1", "col2"]) 34 | 35 | AssertDf(result.erroneous_data, order_by_column="col2") \ 36 | .contains_exactly(expected_errors.toPandas()) \ 37 | .has_columns(["col1", "col2"]) 38 | 39 | assert result.errors == [ValidationError("col1", "between", 2), ValidationError("col1", "min", 1), ValidationError("col2", "max", 2)] 40 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_not_null.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the not null constraint. 
3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_pass_empty_df_with_not_null_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .is_not_null("col1") \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_rows_are_not_null(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .is_not_null("col1") \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_all_are_null(spark_session): 45 | df = spark_session.createDataFrame([[None], [None], [None]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .is_not_null("col1") \ 50 | .execute() 51 | 52 | AssertDf(result.correct_data) \ 53 | .is_empty() \ 54 | .has_columns(["col1"]) 55 | 56 | AssertDf(result.erroneous_data) \ 57 | .contains_exactly(expected_errors.toPandas()) \ 58 | .has_columns(["col1"]) 59 | 60 | assert result.errors == [ValidationError("col1", "not_null", 3)] 61 | 62 | 63 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 64 | df = spark_session.createDataFrame([["abc"], [None]], schema=single_string_column_schema) 65 | 66 | expected_correct = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 67 | expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema) 68 | 69 | result = ValidateSparkDataFrame(spark_session, df) \ 70 | .is_not_null("col1") \ 71 | .execute() 72 | 73 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 74 | .check( 75 | actual=result, 76 | expected_correct=expected_correct, 77 | expected_erroneous=expected_errors 78 | ) 79 | 80 | 81 | def test_nulls_in_other_columns_are_ignored(spark_session): 82 | df = spark_session.createDataFrame([["abc", "123"], [None, "456"], ["def", None]], schema=two_string_columns_schema) 83 | 84 | expected_correct = spark_session.createDataFrame([["abc", "123"], ["def", None]], schema=two_string_columns_schema) 85 | expected_errors = spark_session.createDataFrame([[None, "456"]], schema=two_string_columns_schema) 86 | 87 | result = ValidateSparkDataFrame(spark_session, df) \ 88 | .is_not_null("col1") \ 89 | .execute() 90 | 91 | AssertValidationResult(column_name="col1", constraint_name="not_null") \ 92 | .check( 93 | actual=result, 94 | expected_correct=expected_correct, 95 | expected_erroneous=expected_errors 96 | ) 97 | 98 | 99 | def test_not_null_should_check_all_given_columns_separately(spark_session): 100 | df = 
spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 101 | expected_errors = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 102 | 103 | result = ValidateSparkDataFrame(spark_session, df) \ 104 | .is_not_null("col1") \ 105 | .is_not_null("col2") \ 106 | .execute() 107 | 108 | AssertDf(result.correct_data) \ 109 | .is_empty() \ 110 | .has_columns(["col1", "col2"]) 111 | 112 | AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \ 113 | .contains_exactly(expected_errors.toPandas()) \ 114 | .has_columns(["col1", "col2"]) 115 | 116 | assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)] 117 | 118 | 119 | def test_not_null_should_check_all_given_columns_separately_even_if_all_of_them_are_defined_at_once(spark_session): 120 | df = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 121 | expected_errors = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema) 122 | 123 | result = ValidateSparkDataFrame(spark_session, df) \ 124 | .are_not_null(["col1", "col2"]) \ 125 | .execute() 126 | 127 | AssertDf(result.correct_data) \ 128 | .is_empty() \ 129 | .has_columns(["col1", "col2"]) 130 | 131 | AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \ 132 | .contains_exactly(expected_errors.toPandas()) \ 133 | .has_columns(["col1", "col2"]) 134 | 135 | assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)] 136 | 137 | 138 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 139 | with pytest.raises(ValueError): 140 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 141 | .is_not_null("column_that_does_not_exist") \ 142 | .execute() 143 | 144 | 145 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 146 | with pytest.raises(ValueError): 147 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 148 | .is_not_null("col1") \ 149 | .is_not_null("col1") \ 150 | .execute() 151 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_one_of.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the one_of constraint. 
3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_one_of_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .one_of("col1", []) \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_are_in_list(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .one_of("col1", ["abc", "def", "ghi"]) \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_none_of_them_is_in_the_list(spark_session): 45 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .one_of("col1", ["ab", "b"]) \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_string_df(spark_session), 56 | expected_erroneous=expected_errors 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 62 | 63 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 64 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 65 | 66 | result = ValidateSparkDataFrame(spark_session, df) \ 67 | .one_of("col1", ["abc", "defg"]) \ 68 | .execute() 69 | 70 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 71 | .check( 72 | actual=result, 73 | expected_correct=expected_correct, 74 | expected_erroneous=expected_errors 75 | ) 76 | 77 | 78 | def test_should_return_both_correct_and_incorrect_rows_numeric_values(spark_session): 79 | df = spark_session.createDataFrame([[1], [2], [3], [4]], schema=single_string_column_schema) 80 | 81 | expected_correct = spark_session.createDataFrame([[1], [3]], schema=single_string_column_schema) 82 | expected_errors = spark_session.createDataFrame([[2], [4]], schema=single_string_column_schema) 83 | 84 | result = ValidateSparkDataFrame(spark_session, df) \ 85 | .one_of("col1", [1, 3, 5]) \ 86 | .execute() 87 | 88 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 89 | .check( 90 | actual=result, 91 | expected_correct=expected_correct, 92 | expected_erroneous=expected_errors 93 | ) 94 | 95 | 96 | def test_one_of_of_other_columns_is_ignored(spark_session): 97 | df = 
spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 98 | 99 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 100 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 101 | 102 | result = ValidateSparkDataFrame(spark_session, df) \ 103 | .one_of("col1", ["cd", "123", "45"]) \ 104 | .execute() 105 | 106 | AssertValidationResult(column_name="col1", constraint_name="one_of") \ 107 | .check( 108 | actual=result, 109 | expected_correct=expected_correct, 110 | expected_erroneous=expected_errors 111 | ) 112 | 113 | 114 | def test_should_check_all_given_columns_separately(spark_session): 115 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 116 | 117 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 118 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 119 | 120 | result = ValidateSparkDataFrame(spark_session, df) \ 121 | .one_of("col1", ["12", "56", "def"]) \ 122 | .one_of("col2", ["12", "56", "adcde"]) \ 123 | .execute() 124 | 125 | AssertDf(result.correct_data, order_by_column="col1") \ 126 | .contains_exactly(expected_correct.toPandas()) \ 127 | .has_columns(["col1", "col2"]) 128 | 129 | AssertDf(result.erroneous_data, order_by_column="col2") \ 130 | .contains_exactly(expected_errors.toPandas()) \ 131 | .has_columns(["col1", "col2"]) 132 | 133 | assert result.errors == [ValidationError("col1", "one_of", 2), ValidationError("col2", "one_of", 1)] 134 | 135 | 136 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 137 | with pytest.raises(ValueError): 138 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 139 | .one_of("column_that_does_not_exist", []) \ 140 | .execute() 141 | 142 | 143 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 144 | with pytest.raises(ValueError): 145 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 146 | .one_of("col1", ["a"]) \ 147 | .one_of("col1", ["b"]) \ 148 | .execute() 149 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_spark_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that verify whether the PySpark instance used for testing is configured properly. 
3 | """ 4 | import pytest 5 | 6 | import pandas as pd 7 | from pyspark.sql import DataFrame 8 | 9 | from pyspark.sql.types import * 10 | 11 | from tests.spark.assert_df import AssertDf 12 | 13 | pytestmark = pytest.mark.usefixtures("spark_session") 14 | 15 | 16 | def test_empty_dataframe(spark_session): 17 | df_schema = StructType([StructField("col1", StringType())]) 18 | 19 | df = spark_session.createDataFrame([], schema=df_schema) 20 | AssertDf(df).is_empty() 21 | 22 | 23 | def test_spark_sql_operation(spark_session): 24 | df_schema = StructType([StructField("col1", StringType()), StructField("col2", IntegerType())]) 25 | 26 | test_list = [["v1", 1], ["v1", 2], ["v2", 3]] 27 | 28 | df: DataFrame = spark_session.createDataFrame(test_list, schema=df_schema) 29 | aggregated = df.groupby("col1").sum("col2").orderBy('col1') 30 | 31 | AssertDf(aggregated) \ 32 | .contains_exactly(pd.DataFrame([['v1', 3], ['v2', 3]], columns=['col1', 'sum(col2)']).sort_values('col1')) \ 33 | .has_columns(["col1", "sum(col2)"]) \ 34 | .has_n_rows(2) 35 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_string_length.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the textLength constraint. 3 | """ 4 | import pytest 5 | 6 | from tests.spark import single_string_column_schema, two_string_columns_schema, empty_string_df, empty_integer_df 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_is_text_length_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .has_length_between("col1", 0, 20) \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_all_are_shorter_than_upper_bound(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .has_length_between("col1", 0, 20) \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_return_df_without_changes_if_all_are_longer_than_lower_bound(spark_session): 45 | df = spark_session.createDataFrame([["abcdef"], ["ghijkl"]], schema=single_string_column_schema) 46 | 47 | result = ValidateSparkDataFrame(spark_session, df) \ 48 | .has_length_between("col1", 5, 20) \ 49 | .execute() 50 | 51 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 52 | .check( 53 | actual=result, 54 | expected_correct=df, 55 | expected_erroneous=empty_string_df(spark_session) 56 | ) 57 | 58 | 59 | def test_should_reject_all_rows_if_all_are_too_short_or_too_long(spark_session): 60 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 61 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], 
schema=single_string_column_schema) 62 | 63 | result = ValidateSparkDataFrame(spark_session, df) \ 64 | .has_length_between("col1", 5, 8) \ 65 | .execute() 66 | 67 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 68 | .check( 69 | actual=result, 70 | expected_correct=empty_string_df(spark_session), 71 | expected_erroneous=expected_errors 72 | ) 73 | 74 | 75 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 76 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 77 | 78 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 79 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 80 | 81 | result = ValidateSparkDataFrame(spark_session, df) \ 82 | .has_length_between("col1", 3, 4) \ 83 | .execute() 84 | 85 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 86 | .check( 87 | actual=result, 88 | expected_correct=expected_correct, 89 | expected_erroneous=expected_errors 90 | ) 91 | 92 | 93 | def test_text_length_of_other_columns_is_ignored(spark_session): 94 | df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 95 | 96 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 97 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 98 | 99 | result = ValidateSparkDataFrame(spark_session, df) \ 100 | .has_length_between("col1", 2, 2) \ 101 | .execute() 102 | 103 | AssertValidationResult(column_name="col1", constraint_name="text_length") \ 104 | .check( 105 | actual=result, 106 | expected_correct=expected_correct, 107 | expected_erroneous=expected_errors 108 | ) 109 | 110 | 111 | def test_should_check_all_given_columns_separately(spark_session): 112 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 113 | 114 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 115 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 116 | 117 | result = ValidateSparkDataFrame(spark_session, df) \ 118 | .has_length_between("col1", 2, 4) \ 119 | .has_length_between("col2", 1, 2) \ 120 | .execute() 121 | 122 | AssertDf(result.correct_data, order_by_column="col1") \ 123 | .contains_exactly(expected_correct.toPandas()) \ 124 | .has_columns(["col1", "col2"]) 125 | 126 | AssertDf(result.erroneous_data, order_by_column="col2") \ 127 | .contains_exactly(expected_errors.toPandas()) \ 128 | .has_columns(["col1", "col2"]) 129 | 130 | assert result.errors == [ValidationError("col1", "text_length", 2), ValidationError("col2", "text_length", 1)] 131 | 132 | 133 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 134 | with pytest.raises(ValueError): 135 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 136 | .has_length_between("column_that_does_not_exist", 0, 1) \ 137 | .execute() 138 | 139 | 140 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 141 | with pytest.raises(ValueError): 142 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 143 | .has_length_between("col1", 0, 10) \ 144 | .has_length_between("col1", 0, 5) \ 145 | 
.execute() 146 | 147 | 148 | def test_should_throw_error_if_lower_bound_is_greater_than_upper_bound(spark_session): 149 | with pytest.raises(ValueError): 150 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 151 | .has_length_between("col1", 10, 5) \ 152 | .execute() 153 | 154 | 155 | def test_should_throw_error_if_constraint_is_not_a_text_column(spark_session): 156 | with pytest.raises(ValueError): 157 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 158 | .has_length_between("col1", 5, 10) \ 159 | .execute() 160 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_string_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the Regex matching constraint. 3 | """ 4 | import pytest 5 | 6 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema, empty_integer_df 7 | from tests.spark.AssertResult import AssertValidationResult 8 | from tests.spark.assert_df import AssertDf 9 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_return_df_without_changes_if_empty_df_with_is_text_matches_regex_constraint(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df) \ 18 | .text_matches_regex("col1", ".*") \ 19 | .execute() 20 | 21 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 22 | .check( 23 | actual=result, 24 | expected_correct=df, 25 | expected_erroneous=df 26 | ) 27 | 28 | 29 | def test_should_return_df_without_changes_if_regex_matches_the_text(spark_session): 30 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 31 | 32 | result = ValidateSparkDataFrame(spark_session, df) \ 33 | .text_matches_regex("col1", ".*") \ 34 | .execute() 35 | 36 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 37 | .check( 38 | actual=result, 39 | expected_correct=df, 40 | expected_erroneous=empty_string_df(spark_session) 41 | ) 42 | 43 | 44 | def test_should_reject_all_rows_if_regex_match_fails(spark_session): 45 | df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 46 | expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema) 47 | 48 | result = ValidateSparkDataFrame(spark_session, df) \ 49 | .text_matches_regex("col1", "[0-9]+") \ 50 | .execute() 51 | 52 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 53 | .check( 54 | actual=result, 55 | expected_correct=empty_string_df(spark_session), 56 | expected_erroneous=expected_errors 57 | ) 58 | 59 | 60 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 61 | df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema) 62 | expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema) 63 | expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema) 64 | 65 | result = ValidateSparkDataFrame(spark_session, df) \ 66 | .text_matches_regex("col1", "^[a-z]{3,4}$") \ 67 | .execute() 68 | 69 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 70 | .check( 71 | 
actual=result, 72 | expected_correct=expected_correct, 73 | expected_erroneous=expected_errors 74 | ) 75 | 76 | 77 | def test_matching_of_other_columns_is_ignored(spark_session): 78 | df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema) 79 | 80 | expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema) 81 | expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema) 82 | 83 | result = ValidateSparkDataFrame(spark_session, df) \ 84 | .text_matches_regex("col1", "^[cd]+$") \ 85 | .execute() 86 | 87 | AssertValidationResult(column_name="col1", constraint_name="regex_match") \ 88 | .check( 89 | actual=result, 90 | expected_correct=expected_correct, 91 | expected_erroneous=expected_errors 92 | ) 93 | 94 | 95 | def test_should_check_all_given_columns_separately(spark_session): 96 | df = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 97 | 98 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 99 | expected_errors = spark_session.createDataFrame([["a", "12"], ["abcde", "56"], ["def", "123"]], schema=two_string_columns_schema) 100 | 101 | result = ValidateSparkDataFrame(spark_session, df) \ 102 | .text_matches_regex("col1", "[0-9]+") \ 103 | .text_matches_regex("col2", "[a-z]+") \ 104 | .execute() 105 | 106 | AssertDf(result.correct_data, order_by_column="col1") \ 107 | .contains_exactly(expected_correct.toPandas()) \ 108 | .has_columns(["col1", "col2"]) 109 | 110 | AssertDf(result.erroneous_data, order_by_column="col2") \ 111 | .contains_exactly(expected_errors.toPandas()) \ 112 | .has_columns(["col1", "col2"]) 113 | 114 | assert result.errors == [ValidationError("col1", "regex_match", 3), ValidationError("col2", "regex_match", 3)] 115 | 116 | 117 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 118 | with pytest.raises(ValueError): 119 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 120 | .text_matches_regex("column_that_does_not_exist", '.*') \ 121 | .execute() 122 | 123 | 124 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 125 | with pytest.raises(ValueError): 126 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 127 | .text_matches_regex("col1", '.*') \ 128 | .text_matches_regex("col1", '[a-z]*') \ 129 | .execute() 130 | 131 | 132 | def test_should_throw_error_if_constraint_is_not_a_text_column(spark_session): 133 | with pytest.raises(ValueError): 134 | ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \ 135 | .text_matches_regex("col1", '[a-z]*') \ 136 | .execute() 137 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_uniqueness.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the isUnique constraint. 3 | 4 | The implementation should reject a row if there is another row that contains the same value in the given column. 5 | In that case, the duplicated rows should be reported as erroneous, and each duplicated value should appear only once in the erroneous data. 
6 | """ 7 | import pytest 8 | 9 | from tests.spark import empty_string_df, single_string_column_schema, two_string_columns_schema 10 | from tests.spark.AssertResult import AssertValidationResult 11 | from tests.spark.assert_df import AssertDf 12 | from checkengine.validate_df import ValidateSparkDataFrame, ValidationError 13 | 14 | pytestmark = pytest.mark.usefixtures("spark_session") 15 | 16 | 17 | def test_should_return_df_without_changes_if_empty_df_with_is_unique_constraint(spark_session): 18 | df = empty_string_df(spark_session) 19 | 20 | result = ValidateSparkDataFrame(spark_session, df) \ 21 | .is_unique("col1") \ 22 | .execute() 23 | 24 | AssertValidationResult(column_name="col1", constraint_name="unique") \ 25 | .check( 26 | actual=result, 27 | expected_correct=df, 28 | expected_erroneous=df 29 | ) 30 | 31 | 32 | def test_should_return_df_without_changes_if_all_rows_are_unique(spark_session): 33 | df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema) 34 | 35 | result = ValidateSparkDataFrame(spark_session, df) \ 36 | .is_unique("col1") \ 37 | .execute() 38 | 39 | AssertValidationResult(column_name="col1", constraint_name="unique") \ 40 | .check( 41 | actual=result, 42 | expected_correct=df, 43 | expected_erroneous=empty_string_df(spark_session) 44 | ) 45 | 46 | 47 | def test_should_reject_all_rows_if_all_are_the_same(spark_session): 48 | df = spark_session.createDataFrame([["abc"], ["abc"], ["abc"]], schema=single_string_column_schema) 49 | expected_errors = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 50 | 51 | result = ValidateSparkDataFrame(spark_session, df) \ 52 | .is_unique("col1") \ 53 | .execute() 54 | 55 | AssertDf(result.correct_data) \ 56 | .is_empty() \ 57 | .has_columns(["col1"]) 58 | 59 | AssertDf(result.erroneous_data, order_by_column="col1") \ 60 | .contains_exactly(expected_errors.toPandas()) \ 61 | .has_columns(["col1"]) 62 | 63 | assert result.errors == [ValidationError("col1", "unique", 3)] 64 | 65 | 66 | def test_should_return_both_correct_and_incorrect_rows(spark_session): 67 | df = spark_session.createDataFrame([["abc"], ["abc"], ["def"]], schema=single_string_column_schema) 68 | expected_correct = spark_session.createDataFrame([["def"]], schema=single_string_column_schema) 69 | expected_errors = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema) 70 | 71 | result = ValidateSparkDataFrame(spark_session, df) \ 72 | .is_unique("col1") \ 73 | .execute() 74 | 75 | AssertDf(result.correct_data, order_by_column="col1") \ 76 | .contains_exactly(expected_correct.toPandas()) \ 77 | .has_columns(["col1"]) 78 | 79 | AssertDf(result.erroneous_data, order_by_column="col1") \ 80 | .contains_exactly(expected_errors.toPandas()) \ 81 | .has_columns(["col1"]) 82 | 83 | assert result.errors == [ValidationError("col1", "unique", 2)] 84 | 85 | 86 | def test_uniqueness_of_other_columns_is_ignored(spark_session): 87 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 88 | expected_correct = spark_session.createDataFrame([["def", "123"]], schema=two_string_columns_schema) 89 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"]], schema=two_string_columns_schema) 90 | 91 | result = ValidateSparkDataFrame(spark_session, df) \ 92 | .is_unique("col1") \ 93 | .execute() 94 | 95 | AssertDf(result.correct_data, order_by_column="col1") \ 96 | .contains_exactly(expected_correct.toPandas()) \ 
97 | .has_columns(["col1", "col2"]) 98 | 99 | AssertDf(result.erroneous_data, order_by_column="col2") \ 100 | .contains_exactly(expected_errors.toPandas()) \ 101 | .has_columns(["col1", "col2"]) 102 | 103 | assert result.errors == [ValidationError("col1", "unique", 2)] 104 | 105 | 106 | def test_uniqueness_should_check_all_given_columns_separately(spark_session): 107 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 108 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 109 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 110 | 111 | result = ValidateSparkDataFrame(spark_session, df) \ 112 | .is_unique("col1") \ 113 | .is_unique("col2") \ 114 | .execute() 115 | 116 | AssertDf(result.correct_data, order_by_column="col1") \ 117 | .contains_exactly(expected_correct.toPandas()) \ 118 | .has_columns(["col1", "col2"]) 119 | 120 | AssertDf(result.erroneous_data, order_by_column="col2") \ 121 | .contains_exactly(expected_errors.toPandas()) \ 122 | .has_columns(["col1", "col2"]) 123 | 124 | assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)] 125 | 126 | 127 | def test_uniqueness_should_check_all_given_columns_separately_when_defining_all_columns_at_once(spark_session): 128 | df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 129 | expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema) 130 | expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema) 131 | 132 | result = ValidateSparkDataFrame(spark_session, df) \ 133 | .are_unique(["col1", "col2"]) \ 134 | .execute() 135 | 136 | AssertDf(result.correct_data, order_by_column="col1") \ 137 | .contains_exactly(expected_correct.toPandas()) \ 138 | .has_columns(["col1", "col2"]) 139 | 140 | AssertDf(result.erroneous_data, order_by_column="col2") \ 141 | .contains_exactly(expected_errors.toPandas()) \ 142 | .has_columns(["col1", "col2"]) 143 | 144 | assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)] 145 | 146 | 147 | def test_should_throw_error_if_constraint_uses_non_existing_column(spark_session): 148 | with pytest.raises(ValueError): 149 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 150 | .is_unique("column_that_does_not_exist") \ 151 | .execute() 152 | 153 | 154 | def test_should_throw_error_if_there_are_duplicate_constraints(spark_session): 155 | with pytest.raises(ValueError): 156 | ValidateSparkDataFrame(spark_session, empty_string_df(spark_session)) \ 157 | .is_unique("col1") \ 158 | .is_unique("col1") \ 159 | .execute() 160 | -------------------------------------------------------------------------------- /check-engine-lib/tests/test_validation_without_rules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the behavior of ValidateSparkDataFrame when no constraint has been defined. 3 | In that case, the implementation should pass all of the given data as correct and return no errors. 
4 | """ 5 | import pytest 6 | 7 | from tests.spark import empty_string_df, single_string_column_schema 8 | from tests.spark.AssertResult import AssertValidationResult 9 | from checkengine.validate_df import ValidateSparkDataFrame 10 | 11 | pytestmark = pytest.mark.usefixtures("spark_session") 12 | 13 | 14 | def test_should_pass_empty_df_if_there_are_no_rules(spark_session): 15 | df = empty_string_df(spark_session) 16 | 17 | result = ValidateSparkDataFrame(spark_session, df).execute() 18 | 19 | AssertValidationResult(column_name="col1", constraint_name="") \ 20 | .check( 21 | actual=result, 22 | expected_correct=df, 23 | expected_erroneous=df 24 | ) 25 | 26 | 27 | def test_should_pass_df_if_there_are_no_rules(spark_session): 28 | df = spark_session.createDataFrame([["abc"], ["def"]], schema=single_string_column_schema) 29 | 30 | result = ValidateSparkDataFrame(spark_session, df).execute() 31 | 32 | AssertValidationResult(column_name="col1", constraint_name="") \ 33 | .check( 34 | actual=result, 35 | expected_correct=df, 36 | expected_erroneous=empty_string_df(spark_session) 37 | ) 38 | --------------------------------------------------------------------------------