├── lib ├── __init__.py ├── MANIFEST.in ├── test │ ├── __init__.py │ └── spark_validation_tests │ │ ├── __init__.py │ │ └── common │ │ ├── __init__.py │ │ ├── mock_data │ │ ├── data_sample_diff_2.csv │ │ ├── family_sample_diff.csv │ │ ├── family_sample.csv │ │ ├── data_sample_diff.csv │ │ ├── data_sample.csv │ │ ├── config_example.json │ │ ├── config_example_local.json │ │ ├── config_family_fs.json │ │ ├── config_example_fs.json │ │ ├── config_example.yaml │ │ └── config_familiy_fs.yaml │ │ ├── pyspark_test.py │ │ └── general_handler_test.py ├── src │ └── spark_validation │ │ ├── __init__.py │ │ ├── common │ │ ├── __init__.py │ │ ├── validation_results.py │ │ ├── constants.py │ │ ├── config.py │ │ └── general_validator.py │ │ ├── dataframe_validation │ │ ├── __init__.py │ │ ├── hive_validator.py │ │ ├── dataframe_validator.py │ │ └── file_system_validator.py │ │ ├── static │ │ ├── robots.txt │ │ ├── favicon.ico │ │ ├── logo192.png │ │ ├── logo512.png │ │ ├── static │ │ │ ├── media │ │ │ │ └── logo.50e8e5ec.png │ │ │ ├── css │ │ │ │ ├── main.8e896e56.chunk.css │ │ │ │ └── main.8e896e56.chunk.css.map │ │ │ └── js │ │ │ │ ├── 2.e9c9302b.chunk.js.LICENSE.txt │ │ │ │ ├── runtime-main.6d8ceafa.js │ │ │ │ ├── runtime-main.6d8ceafa.js.map │ │ │ │ ├── main.8e11e6a5.chunk.js │ │ │ │ └── main.8e11e6a5.chunk.js.map │ │ ├── manifest.json │ │ ├── precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js │ │ ├── asset-manifest.json │ │ ├── service-worker.js │ │ └── index.html │ │ ├── version.py │ │ └── app.py ├── setup.cfg ├── requirements.txt └── setup.py ├── .dockerignore ├── tox.ini ├── requirements.txt ├── docker-compose.yml ├── .pre-commit-config.yaml ├── Dockerfile ├── .pypirc ├── .github └── workflows │ └── pythonpublish.yml ├── .travis.yml ├── license.md ├── .gitignore ├── README.md └── .pylintrc /lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 
venv/ 2 | .idea 3 | .github 4 | .travis.yml 5 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 120 4 | per-file-ignores = 5 | */__init__.py: D104 -------------------------------------------------------------------------------- /lib/src/spark_validation/static/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /lib/src/spark_validation/version.py: -------------------------------------------------------------------------------- 1 | """Version file for the spark_validation package.""" 2 | 3 | __version__ = "0.4" 4 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/favicon.ico -------------------------------------------------------------------------------- /lib/src/spark_validation/static/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/logo192.png -------------------------------------------------------------------------------- /lib/src/spark_validation/static/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/logo512.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==2.4.5 2 | Flask==1.1.2 3 | requests==2.23.0 4 | dataclasses==0.6 5 | numpy==1.18.3 6 | pandas==1.0.1 7 | pivottablejs==0.9.0 8 | ipython==7.13.0 9 | -------------------------------------------------------------------------------- /lib/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = pytest 3 | 4 | [coverage:run] 5 | source = src 6 | command_line = -m pytest 7 | 8 | [easy-install] 9 | index-url = https://pypi.python.org/pypi -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/media/logo.50e8e5ec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/static/media/logo.50e8e5ec.png -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample_diff_2.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity 4 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/family_sample_diff.csv:
-------------------------------------------------------------------------------- 1 | ID,NAME,FAMILY_NAME,PARENT,ADDRESS 2 | 1,Cho,Cha,null,TEST 3 | 2,Pho,Cha,1,Joos 4 - 3 4 | 3,Dho,A,2,Joos 4 - 3 5 | 4,Jho,,3,Joos 4 - 3 6 | 5,null,A,4,Joos 4 - 3 7 | null,Tho,Cha,11,Joos 4 - 3 8 | 9 | 10 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | app: 4 | restart: always 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - $PWD/logs:/logs/ 12 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/family_sample.csv: -------------------------------------------------------------------------------- 1 | ID,NAME,FAMILY_NAME,PARENT,ADDRESS 2 | 1,Cho,Cha,null,TEST 3 | 2,Pho,Cha,1,Joos 4 - 3 4 | 3,Dho,A,2,Joos 4 - 3 5 | 4,Jho,,3,Joos 4 - 3 6 | 5,,A,4,Joos 4 - 3 7 | 6,,B,5,Joos 4 - 3 8 | null,Tho,Cha,11,Joos 4 - 3 9 | 7,Bho,Pha,8,Joos 4 - 3 10 | 7,Bho,Pha,8,Joos 4 - 31 11 | -------------------------------------------------------------------------------- /lib/requirements.txt: -------------------------------------------------------------------------------- 1 | # Development 2 | black==19.10b0 3 | flake8==3.7.9 4 | pytest==5.3.2 5 | coverage==5.0.3 6 | pyspark==2.4.5 7 | dataclasses==0.6 # this comes out of the box in python 3.7, we can remove this when we upgrade. 8 | Flask==1.1.2 9 | requests==2.23.0 10 | numpy==1.18.3 11 | pandas==1.0.1 12 | pivottablejs==0.9.0 13 | ipython==7.13.0 14 | 15 | 16 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample_diff.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME,ADDR_DESC 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity, 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity, 4 | 12000123,3,null,A,3,null,3,Dummy 1 Entity, 5 | 1,4,1,B,4,Dummy 1 Entity,4,Dummy 1 Entity, 6 | 12000123,5,1,A,5,null,5,Dummy 1 Entity, 7 | 12000123,null,,A,11,null,7,Dummy 1 Entity, -------------------------------------------------------------------------------- /lib/src/spark_validation/common/validation_results.py: -------------------------------------------------------------------------------- 1 | """Module to encapsulate validation results.""" 2 | from abc import ABC 3 | 4 | 5 | class ValidationResults(ABC): 6 | """Module to encapsulate validation results.""" 7 | 8 | def __init__(self, correctness_df, completeness_df, comparison_df): 9 | self.correctness_df = correctness_df 10 | self.completeness_df = completeness_df 11 | self.comparison_df = comparison_df 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.7.9 9 | hooks: 10 | - id: flake8 11 | args: ['--config=tox.ini'] 12 | additional_dependencies: [ 13 | 'flake8-deprecated', 14 | 'flake8-docstrings', 15 | 'flake8-tidy-imports' 16 | ] 17 | 
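A hedged usage sketch, not a file in this repository: docker-compose.yml above builds the Dockerfile and publishes the Flask service defined later in lib/src/spark_validation/app.py on port 8000. Assuming the stack is running locally (docker-compose up) and that the data paths referenced inside the config are reachable from the service's working directory, a validation run can be triggered by POSTing a config document to /api/validate, using the same structure as the mock configs under lib/test/spark_validation_tests/common/mock_data:

import json

import requests

# Payload structure follows config_example_local.json: a source_table block plus
# correctness, completeness and parent-children rules.
with open("lib/test/spark_validation_tests/common/mock_data/config_example_local.json") as config_file:
    payload = json.load(config_file)

# app.py writes the posted JSON to config.json, runs the file-system validator on
# it, and answers with {"validation": "yes"} and HTTP 200 once the run finishes.
# Note: the CSV paths inside the payload must exist inside the container.
response = requests.post("http://localhost:8000/api/validate", json=payload)
print(response.status_code, response.json())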
-------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME,ADDR_DESC 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity, 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity, 4 | 12000123,3,null,A,3,null,3,Dummy 1 Entity, 5 | 1,4,1,B,4,Dummy 1 Entity,4,Dummy 1 Entity, 6 | 12000123,5,1,A,5,null,5,Dummy 1 Entity, 7 | 3,6,null,B,6,Dummy 1 Entity,6,Dummy 1 Entity, 8 | 12000123,null,,A,11,null,7,Dummy 1 Entity, 9 | ,7,2,B,8,Dummy 1 Entity,8,Dummy 1 Entity, 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jdk-slim as deployment 2 | COPY --from=python:3.7 / / 3 | ENV PYTHONPATH="/app/lib/src:/app/lib/test:$PYTHONPATH" 4 | ENV JAVA_HOME="/usr/local/openjdk-8" 5 | 6 | WORKDIR "/app" 7 | COPY lib/ . 8 | 9 | COPY . /app 10 | 11 | # An explicit installation of GUnicorn is required for it to instantiate worker threads. 12 | RUN pip install -r /app/requirements.txt && \ 13 | pip install gunicorn==20.0.4 14 | 15 | EXPOSE 8000 16 | CMD ["gunicorn", "-b", "0.0.0.0:8000", "--workers", "3", "spark_validation.app", "--timeout", "3000"] 17 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | 5 | [pypi] 6 | repository:https://upload.pypi.org/legacy/ 7 | username: ronaldsmithangel 8 | password: LZnBL8dhXzc+ygWpPHWDboPKWJ/jW58fg5N8US7np9tRhZ9YQzwFtDjaHetVxmIBCM+h57DmA5kJNBaIx7saNu48wARBxsZTP7T3pnM8L5uMpJhwaqIYQdIbnh00FbzyintuQJ7LqkSNFhUMkkoAoW+1NXAr9lk0HdGbMeTxJmr9ZSh4131rQIfqury8pT8z27/kslSD61x3Gua+yqbhYns1ZwMwuR84t2uRC7ihScM2Bi/gmQusjTm5HXwChn1U+fh1GQPb63bREfPwfTlvAN5GYt8ZuV/A3lz4iTwm0eRMDpnE1cO4kJ92U0xxtjeixgE0Jz73KebAAYQp+4I1zv7ng3gPlyDSPRn98eLYy8e/zp1q15kO179dvb87l+7WzQ8gjfI1FXsNneAlv+Aza/EDOHmssrlFAhXQpG3rDLgIEBEQbAKQCYrFJt/1tQdFHGWtTl+pvW8l+nMca5pRttZsDkPHhfRft6H+P1YWcEL6ksngiFh979EjLnuVISZGjLJmAjF7M3pxDZT9qbBF8NHPxmMjUaKeyTg2kONyGLH2mtFOanRxdYJOurZeIOAZA1GPq8iqL0M/a1Eivr5MTLYHCU/WEvEBDOThVu88N/9Y24sTtnIdMckzBFSZ+SBqgmZOw1TlmY+MugrX0z009tW3zgEAsiNUR+eK5AfGrEc= 9 | -------------------------------------------------------------------------------- /lib/src/spark_validation/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from flask import Flask, jsonify, request, make_response 5 | 6 | from spark_validation.dataframe_validation import 
file_system_validator 7 | 8 | application = Flask(__name__, static_url_path="") 9 | 10 | 11 | @application.route("/") 12 | def index(): 13 | return application.send_static_file('index.html') 14 | 15 | @application.route("/api/validate", methods=["POST"]) 16 | def validate(): 17 | json_input = request.get_json(force=True) 18 | with open('config.json', 'w') as fp: 19 | json.dump(json_input, fp) 20 | sys.argv = ["example.py", "-c", 'config.json'] 21 | 22 | print("JSON: {}".format(json_input)) 23 | 24 | file_system_validator.init() 25 | response = {"validation": "yes"} 26 | 27 | return make_response(jsonify(response), 200) 28 | 29 | 30 | if __name__ == "__main__": 31 | application.run(port=8000, debug=True) 32 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js: -------------------------------------------------------------------------------- 1 | self.__precacheManifest = (self.__precacheManifest || []).concat([ 2 | { 3 | "revision": "a8dca60662ceb50b0796e7dd27c3ef12", 4 | "url": "/index.html" 5 | }, 6 | { 7 | "revision": "e463c5cd056b63b394e6", 8 | "url": "/static/css/main.8e896e56.chunk.css" 9 | }, 10 | { 11 | "revision": "aae5bb205dfb84ac1b5a", 12 | "url": "/static/js/2.e9c9302b.chunk.js" 13 | }, 14 | { 15 | "revision": "c64c486544348f10a6d6c716950bc223", 16 | "url": "/static/js/2.e9c9302b.chunk.js.LICENSE.txt" 17 | }, 18 | { 19 | "revision": "e463c5cd056b63b394e6", 20 | "url": "/static/js/main.8e11e6a5.chunk.js" 21 | }, 22 | { 23 | "revision": "3f809adccd7b5eb81ce7", 24 | "url": "/static/js/runtime-main.6d8ceafa.js" 25 | }, 26 | { 27 | "revision": "50e8e5ecb197cb27b1347de458f06521", 28 | "url": "/static/media/logo.50e8e5ec.png" 29 | } 30 | ]); -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/css/main.8e896e56.chunk.css: -------------------------------------------------------------------------------- 1 | body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI","Roboto","Oxygen","Ubuntu","Cantarell","Fira Sans","Droid Sans","Helvetica Neue",sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,"Courier New",monospace}.App{text-align:center;padding:40px;margin:0 auto;width:90vw}.App-logo{height:40vmin;pointer-events:none}@media (prefers-reduced-motion:no-preference){.App-logo{-webkit-animation:App-logo-spin 20s linear infinite;animation:App-logo-spin 20s linear infinite}}.App-header{background-color:#282c34;min-height:100vh;display:flex;flex-direction:column;align-items:center;justify-content:center;font-size:calc(10px + 2vmin);color:#fff}.App-link{color:#61dafb}@-webkit-keyframes App-logo-spin{0%{transform:rotate(0deg)}to{transform:rotate(1turn)}}@keyframes App-logo-spin{0%{transform:rotate(0deg)}to{transform:rotate(1turn)}} 2 | /*# sourceMappingURL=main.8e896e56.chunk.css.map */ -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | 
jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/constants.py: -------------------------------------------------------------------------------- 1 | """Specific general constants used across the validation pipeline.""" 2 | 3 | 4 | class Constants: 5 | """Class with Constants for input and output values used in the the validation pipeline.""" 6 | 7 | DATE_TIME_REPORT_COL = "dt" 8 | SUM_REPORT_SUFFIX = "_SUM" 9 | OVER_ALL_COUNT_COL = "OVER_ALL_COUNT" 10 | IS_ERROR_COL = "RULE_FOR_" 11 | UNIQUE_HASH = "UNIQUE_HASH" 12 | COUNT_HASH = "COUNT_HASH" 13 | ROW_ERROR_SUFFIX = "_ROW" 14 | RULES_REPORT_SUFFIX = "_rules_report" 15 | COMPARISON_REPORT_SUFFIX = "_comparison_report" 16 | REPORT_DF_COL = "df" 17 | MISSING_COLS_RIGHT_COL = "missing_cols_right" 18 | MISSING_VALS_RIGHT_COL = "missing_vals_right" 19 | MISSING_COLS_LEFT_COL = "missing_cols_left" 20 | MISSING_VALS_LEFT_COL = "missing_vals_left" 21 | OUTPUT_COMPARABLE_COLS = [ 22 | REPORT_DF_COL, 23 | MISSING_COLS_RIGHT_COL, 24 | MISSING_COLS_LEFT_COL, 25 | MISSING_VALS_RIGHT_COL, 26 | MISSING_VALS_LEFT_COL, 27 | ] 28 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/asset-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "main.css": "/static/css/main.8e896e56.chunk.css", 4 | "main.js": "/static/js/main.8e11e6a5.chunk.js", 5 | "main.js.map": "/static/js/main.8e11e6a5.chunk.js.map", 6 | "runtime-main.js": "/static/js/runtime-main.6d8ceafa.js", 7 | "runtime-main.js.map": "/static/js/runtime-main.6d8ceafa.js.map", 8 | "static/js/2.e9c9302b.chunk.js": "/static/js/2.e9c9302b.chunk.js", 9 | "static/js/2.e9c9302b.chunk.js.map": "/static/js/2.e9c9302b.chunk.js.map", 10 | "index.html": "/index.html", 11 | "precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js": "/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js", 12 | "service-worker.js": "/service-worker.js", 13 | "static/css/main.8e896e56.chunk.css.map": "/static/css/main.8e896e56.chunk.css.map", 14 | "static/js/2.e9c9302b.chunk.js.LICENSE.txt": "/static/js/2.e9c9302b.chunk.js.LICENSE.txt", 15 | "static/media/logo.png": "/static/media/logo.50e8e5ec.png" 16 | }, 17 | "entrypoints": [ 18 | "static/js/runtime-main.6d8ceafa.js", 19 | "static/js/2.e9c9302b.chunk.js", 20 | "static/css/main.8e896e56.chunk.css", 21 | "static/js/main.8e11e6a5.chunk.js" 22 | ] 23 | } -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | services: 3 | - docker 4 | before_script: 5 | - export VERSION=$(awk '{print $3}' lib/src/spark_validation/version.py | sed 's/"//g') 6 | before_install: 7 | - docker pull covertspartan/docker-airflow-spark 8 | install: 9 | - pip install -r requirements.txt 10 | script: 11 | - cd lib 12 | - pip install -r 
requirements.txt --quiet 13 | - export PYTHONPATH="$PWD/src:$PWD/test:$PYTHONPATH" 14 | deploy: 15 | skip_cleanup: true 16 | provider: pypi 17 | user: ronaldsmithangel 18 | password: 19 | secure: LZnBL8dhXzc+ygWpPHWDboPKWJ/jW58fg5N8US7np9tRhZ9YQzwFtDjaHetVxmIBCM+h57DmA5kJNBaIx7saNu48wARBxsZTP7T3pnM8L5uMpJhwaqIYQdIbnh00FbzyintuQJ7LqkSNFhUMkkoAoW+1NXAr9lk0HdGbMeTxJmr9ZSh4131rQIfqury8pT8z27/kslSD61x3Gua+yqbhYns1ZwMwuR84t2uRC7ihScM2Bi/gmQusjTm5HXwChn1U+fh1GQPb63bREfPwfTlvAN5GYt8ZuV/A3lz4iTwm0eRMDpnE1cO4kJ92U0xxtjeixgE0Jz73KebAAYQp+4I1zv7ng3gPlyDSPRn98eLYy8e/zp1q15kO179dvb87l+7WzQ8gjfI1FXsNneAlv+Aza/EDOHmssrlFAhXQpG3rDLgIEBEQbAKQCYrFJt/1tQdFHGWtTl+pvW8l+nMca5pRttZsDkPHhfRft6H+P1YWcEL6ksngiFh979EjLnuVISZGjLJmAjF7M3pxDZT9qbBF8NHPxmMjUaKeyTg2kONyGLH2mtFOanRxdYJOurZeIOAZA1GPq8iqL0M/a1Eivr5MTLYHCU/WEvEBDOThVu88N/9Y24sTtnIdMckzBFSZ+SBqgmZOw1TlmY+MugrX0z009tW3zgEAsiNUR+eK5AfGrEc= 20 | on: 21 | branch: master 22 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/2.e9c9302b.chunk.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /* 2 | object-assign 3 | (c) Sindre Sorhus 4 | @license MIT 5 | */ 6 | 7 | /** @license React v0.19.1 8 | * scheduler.production.min.js 9 | * 10 | * Copyright (c) Facebook, Inc. and its affiliates. 11 | * 12 | * This source code is licensed under the MIT license found in the 13 | * LICENSE file in the root directory of this source tree. 14 | */ 15 | 16 | /** @license React v16.13.1 17 | * react-dom.production.min.js 18 | * 19 | * Copyright (c) Facebook, Inc. and its affiliates. 20 | * 21 | * This source code is licensed under the MIT license found in the 22 | * LICENSE file in the root directory of this source tree. 23 | */ 24 | 25 | /** @license React v16.13.1 26 | * react-is.production.min.js 27 | * 28 | * Copyright (c) Facebook, Inc. and its affiliates. 29 | * 30 | * This source code is licensed under the MIT license found in the 31 | * LICENSE file in the root directory of this source tree. 32 | */ 33 | 34 | /** @license React v16.13.1 35 | * react.production.min.js 36 | * 37 | * Copyright (c) Facebook, Inc. and its affiliates. 38 | * 39 | * This source code is licensed under the MIT license found in the 40 | * LICENSE file in the root directory of this source tree. 41 | */ 42 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/service-worker.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Welcome to your Workbox-powered service worker! 3 | * 4 | * You'll need to register this file in your web app and you should 5 | * disable HTTP caching for this file too. 6 | * See https://goo.gl/nhQhGp 7 | * 8 | * The rest of the code is auto-generated. Please don't update this file 9 | * directly; instead, make changes to your Workbox build configuration 10 | * and re-run your build process. 11 | * See https://goo.gl/2aRDsh 12 | */ 13 | 14 | importScripts("https://storage.googleapis.com/workbox-cdn/releases/4.3.1/workbox-sw.js"); 15 | 16 | importScripts( 17 | "/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js" 18 | ); 19 | 20 | self.addEventListener('message', (event) => { 21 | if (event.data && event.data.type === 'SKIP_WAITING') { 22 | self.skipWaiting(); 23 | } 24 | }); 25 | 26 | workbox.core.clientsClaim(); 27 | 28 | /** 29 | * The workboxSW.precacheAndRoute() method efficiently caches and responds to 30 | * requests for URLs in the manifest. 
31 | * See https://goo.gl/S9QRab 32 | */ 33 | self.__precacheManifest = [].concat(self.__precacheManifest || []); 34 | workbox.precaching.precacheAndRoute(self.__precacheManifest, {}); 35 | 36 | workbox.routing.registerNavigationRoute(workbox.precaching.getCacheKeyForURL("/index.html"), { 37 | 38 | blacklist: [/^\/_/,/\/[^/?]+\.[^/]+$/], 39 | }); 40 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | All contributions by Ronald Angel: 4 | Copyright (c) 2020, Ronald Angel. 5 | All rights reserved. 6 | 7 | Each contributor holds copyright over their respective contributions. 8 | The project versioning (Git) records all such contribution source information. 9 | 10 | LICENSE 11 | 12 | The MIT License (MIT) 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a copy 15 | of this software and associated documentation files (the "Software"), to deal 16 | in the Software without restriction, including without limitation the rights 17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the Software is 19 | furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in all 22 | copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 
31 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "test.data_test", 4 | "id_column": "GENERAL_ID", 5 | "output_correctness_table": "test.data_test_correctness", 6 | "output_completeness_table": "test.data_test_completeness", 7 | "output_comparison_table": "test.data_test_comparison", 8 | "unique_column_group_values_per_table": [ 9 | "GENERAL_ID", 10 | "ULTIMATE_PARENT_ID" 11 | ], 12 | "fuzzy_deduplication_distance": 0 13 | }, 14 | "correctness_validations": [ 15 | { 16 | "column": "CODE", 17 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 18 | }, 19 | { 20 | "column": "NAME", 21 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 22 | }, 23 | { 24 | "column": "GENERAL_ID", 25 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 26 | } 27 | ], 28 | "completeness_validations": { 29 | "overall": { 30 | "column": "OVER_ALL_COUNT", 31 | "rule": "OVER_ALL_COUNT <= 7" 32 | } 33 | }, 34 | "parent_children_constraints": [ 35 | { 36 | "column": "GENERAL_ID", 37 | "parent": "ULTIMATE_PARENT_ID" 38 | }, 39 | { 40 | "column": "GENERAL_ID", 41 | "parent": "PARENT_ID" 42 | } 43 | ], 44 | "compare_related_tables_list": [ 45 | "test.diff_df", 46 | "test.diff_df_2" 47 | ] 48 | } -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_example_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "mock_data/data_sample.csv", 4 | "id_column": "GENERAL_ID", 5 | "output_correctness_table": "/tmp/mock_data/output/data_sample_test_correctness", 6 | "output_completeness_table": "/tmp/mock_data/output/data_sample_test_completeness", 7 | "output_comparison_table": "/tmp/mock_data/output/data_sample_test_comparison", 8 | "unique_column_group_values_per_table": ["GENERAL_ID", "ULTIMATE_PARENT_ID"], 9 | "fuzzy_deduplication_distance": 0 10 | }, 11 | "correctness_validations": [ 12 | { 13 | "column": "CODE", 14 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 15 | }, 16 | { 17 | "column": "NAME", 18 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 19 | }, 20 | { 21 | "column": "GENERAL_ID", 22 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 23 | } 24 | ], 25 | "completeness_validations": { 26 | "overall": { 27 | "column": "OVER_ALL_COUNT", 28 | "rule": "OVER_ALL_COUNT <= 7" 29 | } 30 | }, 31 | "parent_children_constraints": [ 32 | { 33 | "column": "GENERAL_ID", 34 | "parent": "ULTIMATE_PARENT_ID" 35 | }, 36 | { 37 | "column": "GENERAL_ID", 38 | "parent": "PARENT_ID" 39 | } 40 | ], 41 | "compare_related_tables_list": ["mock_data/data_sample_diff.csv"] 42 | } -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_family_fs.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "mock_data/family_sample.csv", 4 | "id_column": "ID", 5 | "output_correctness_table": "/tmp/mock_data/output/family_sample_test_correctness", 6 | "output_completeness_table": "/tmp/mock_data/output/family_sample_test_completeness", 7 | 
"output_comparison_table": "/tmp/mock_data/output/family_sample_test_comparison", 8 | "unique_column_group_values_per_table": ["ID", "NAME", "FAMILY_NAME", "PARENT"], 9 | "fuzzy_deduplication_distance": 0 10 | }, 11 | "correctness_validations": [ 12 | { 13 | "column": "ID", 14 | "rule": "ID is not null and ID != '' and ID != 'null'" 15 | }, 16 | { 17 | "column": "NAME", 18 | "rule": "NAME is not null and NAME != '' and NAME != 'null' and NAME like '%ho%'" 19 | }, 20 | { 21 | "column": "FAMILY_NAME", 22 | "rule": "NAME is not null and FAMILY_NAME in ('Cha', 'Pha')" 23 | }, 24 | { 25 | "column": "ADDRESS", 26 | "rule": "ADDRESS is not null and ADDRESS != '' and ADDRESS != 'null' and CHAR_LENGTH(ADDRESS) > 4" 27 | } 28 | ], 29 | "completeness_validations": { 30 | "overall": { 31 | "column": "OVER_ALL_COUNT", 32 | "rule": "OVER_ALL_COUNT <= 5" 33 | } 34 | }, 35 | "parent_children_constraints": [ 36 | { 37 | "column": "ID", 38 | "parent": "PARENT" 39 | } 40 | ], 41 | "compare_related_tables_list": ["mock_data/family_sample_diff.csv"] 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/css/main.8e896e56.chunk.css.map: -------------------------------------------------------------------------------- 1 | {"version":3,"sources":["index.css","App.css"],"names":[],"mappings":"AAAA,KACE,QAAS,CACT,mJAEY,CACZ,kCAAmC,CACnC,iCACF,CAEA,KACE,yEAEF,CCZA,KACE,iBAAkB,CAClB,YAAa,CACb,aAAc,CACd,UACF,CAEA,UACE,aAAc,CACd,mBACF,CAEA,8CACE,UACE,mDAA4C,CAA5C,2CACF,CACF,CAEA,YACE,wBAAyB,CACzB,gBAAiB,CACjB,YAAa,CACb,qBAAsB,CACtB,kBAAmB,CACnB,sBAAuB,CACvB,4BAA6B,CAC7B,UACF,CAEA,UACE,aACF,CAEA,iCACE,GACE,sBACF,CACA,GACE,uBACF,CACF,CAPA,yBACE,GACE,sBACF,CACA,GACE,uBACF,CACF","file":"main.8e896e56.chunk.css","sourcesContent":["body {\n margin: 0;\n font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',\n 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',\n sans-serif;\n -webkit-font-smoothing: antialiased;\n -moz-osx-font-smoothing: grayscale;\n}\n\ncode {\n font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',\n monospace;\n}\n",".App {\n text-align: center;\n padding: 40px;\n margin: 0 auto;\n width: 90vw;\n}\n\n.App-logo {\n height: 40vmin;\n pointer-events: none;\n}\n\n@media (prefers-reduced-motion: no-preference) {\n .App-logo {\n animation: App-logo-spin infinite 20s linear;\n }\n}\n\n.App-header {\n background-color: #282c34;\n min-height: 100vh;\n display: flex;\n flex-direction: column;\n align-items: center;\n justify-content: center;\n font-size: calc(10px + 2vmin);\n color: white;\n}\n\n.App-link {\n color: #61dafb;\n}\n\n@keyframes App-logo-spin {\n from {\n transform: rotate(0deg);\n }\n to {\n transform: rotate(360deg);\n }\n}\n"]} -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/runtime-main.6d8ceafa.js: -------------------------------------------------------------------------------- 1 | !function(e){function r(r){for(var n,p,l=r[0],a=r[1],f=r[2],c=0,s=[];c- 20 | GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and 21 | CHAR_LENGTH(GENERAL_ID) < 4 22 | #validations for parent children constraints. Example: parent should be a valid entity within the table. 23 | parent_children_constraints: 24 | - column: GENERAL_ID 25 | parent: ULTIMATE_PARENT_ID 26 | - column: GENERAL_ID 27 | parent: PARENT_ID 28 | #completeness rules. 
Use either an overall completeness count or a count compared with previous partitions. 29 | #for simple overall count keep the column name OVER_ALL_COUNT. 30 | completeness_validations: 31 | overall: 32 | column: OVER_ALL_COUNT 33 | rule: OVER_ALL_COUNT <= 7 34 | partitioned: 35 | previous_partition: test.data_test_diff 36 | max_grow_percentage: 10 #use negative for reduction. 37 | #checksum compare data with different tables. Example: compare against a goal set or the test vs production env. 38 | compare_related_tables_list: 39 | - test.diff_df 40 | - test.diff_df_2 41 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | """General project setup for the owl-sanitizer-data-quality package.""" 2 | import os 3 | import re 4 | 5 | from setuptools import setup, find_packages 6 | 7 | SETUP_REQUIREMENTS = [ 8 | "dataclasses==0.6", 9 | "pyspark==2.4.5", 10 | "Flask==1.1.2", 11 | "requests==2.23.0", 12 | "dataclasses==0.6", 13 | "numpy==1.18.3", 14 | "pandas==1.0.1", 15 | "pivottablejs==0.9.0", 16 | "ipython==7.13.0", 17 | ] 18 | 19 | from os import path 20 | 21 | readme_directory = path.abspath(path.dirname(__file__)).replace("/lib", "") 22 | with open(path.join(readme_directory, "README.md"), encoding="utf-8") as f: 23 | long_description = f.read() 24 | 25 | 26 | def _get_version(): 27 | """Read the __version__ value from src/spark_validation/version.py. 28 | 29 | We can't import the package because we're the installation script for the package, 30 | so we use regex and read the python file as a raw text file. 31 | """ 32 | version_regex = re.compile( 33 | r"""^__version__\s=\s['"](?P<version>.*?)['"] """, re.MULTILINE | re.VERBOSE 34 | ) 35 | version_file = os.path.join("src", "spark_validation", "version.py") 36 | with open(version_file) as handle: 37 | lines = handle.read() 38 | result = version_regex.search(lines) 39 | if result: 40 | return result.groupdict()["version"] 41 | raise ValueError("Unable to determine __version__") 42 | 43 | 44 | setup( 45 | name="owl-sanitizer-data-quality", 46 | version=_get_version(), 47 | description="Data Quality framework for Pyspark jobs", 48 | long_description=long_description, 49 | long_description_content_type="text/markdown", 50 | author="Ronald Angel", 51 | author_email="ronaldsmithangel@gmail.com", 52 | url="https://github.com/ronald-smith-angel/owl-data-sanitizer.git", 53 | license="MIT", 54 | packages=find_packages(where="src"), 55 | package_dir={"": "src"}, 56 | install_requires=SETUP_REQUIREMENTS, 57 | ) 58 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_familiy_fs.yaml: -------------------------------------------------------------------------------- 1 | #metadata for the source table. 2 | source_table: 3 | name: mock_data/family_sample.csv 4 | id_column: ID 5 | output_correctness_table: /tmp/mock_data/output/family_sample_test_correctness 6 | output_completeness_table: /tmp/mock_data/output/family_sample_test_completeness 7 | output_comparison_table: /tmp/mock_data/output/family_sample_test_comparison 8 | unique_column_group_values_per_table: #deduplication using 1 or more columns. 9 | - ID 10 | - NAME 11 | - FAMILY_NAME 12 | - PARENT 13 | fuzzy_deduplication_distance: 0 #apply fuzzy matching distance N to deduplication, 0 for disable. 14 | #correctness rules per column, use a SQL query in a negative way. Example: where col is not null and is != 'a'.
15 | correctness_validations: 16 | - column: ID 17 | rule: ID is not null and ID != '' and ID != 'null' 18 | - column: NAME 19 | rule: NAME is not null and NAME != '' and NAME != 'null' and NAME like '%ho%' 20 | - column: FAMILY_NAME 21 | rule: 'NAME is not null and FAMILY_NAME in (''Cha'', ''Pha'')' 22 | - column: ADDRESS 23 | rule: >- 24 | ADDRESS is not null and ADDRESS != '' and ADDRESS != 'null' and 25 | CHAR_LENGTH(ADDRESS) > 4 26 | #validations for parent children constraints. Example: parent should be a valid entity within the table. 27 | parent_children_constraints: 28 | - column: ID 29 | parent: PARENT 30 | #completeness rules. Use either an overall completeness count or a count compared with previous partitions. 31 | #for simple overall count keep the column name OVER_ALL_COUNT. 32 | completeness_validations: 33 | overall: 34 | column: OVER_ALL_COUNT 35 | rule: OVER_ALL_COUNT <= 5 36 | partitioned: 37 | previous_partition: mock_data/family_sample_previous.csv 38 | max_grow_percentage: 10 #use negative for reduction. 39 | #checksum compare data with different tables. Example: compare against a goal set or the test vs production env. 40 | compare_related_tables_list: 41 | - mock_data/family_sample_diff.csv -------------------------------------------------------------------------------- /lib/src/spark_validation/static/index.html: -------------------------------------------------------------------------------- 1 | React App
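A hedged usage sketch, not a file in this repository: running the file-system validator against the YAML config above (lib/test/spark_validation_tests/common/mock_data/config_familiy_fs.yaml). It mirrors how app.py drives the same entry point, and assumes pyspark is installed, lib/src is on PYTHONPATH, and the working directory is lib/test/spark_validation_tests/common so the relative mock_data/ paths in the config resolve:

import sys

from spark_validation.dataframe_validation import file_system_validator

# file_system_validator.init() reads "-c <config>" from sys.argv, loads the config
# through Config.parse (JSON or YAML, chosen from the file name), runs the
# correctness/completeness/comparison validations, and writes the JSON reports plus
# pivot-table HTML files to the output paths declared in the config.
sys.argv = ["run_validation.py", "-c", "mock_data/config_familiy_fs.yaml"]  # argv[0] is an arbitrary placeholder
file_system_validator.init()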
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PySpark / Hive 2 | metastore_db 3 | spark-warehouse 4 | env_dags 5 | venv/ 6 | 7 | # IDE 8 | .idea/ 9 | .vscode/ 10 | 11 | # Intellij stuff 12 | dags.iml 13 | 14 | # OS specific files 15 | .DS_Store 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | pip-wheel-metadata/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | dags/ 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 110 | __pypackages__/ 111 | 112 | # Celery stuff 113 | celerybeat-schedule 114 | celerybeat.pid 115 | 116 | # SageMath parsed files 117 | *.sage.py 118 | 119 | # Environments 120 | .env 121 | .venv 122 | env/ 123 | venv/ 124 | ENV/ 125 | env.bak/ 126 | venv.bak/ 127 | 128 | # Spyder project settings 129 | .spyderproject 130 | .spyproject 131 | 132 | # Rope project settings 133 | .ropeproject 134 | 135 | # mkdocs documentation 136 | /site 137 | 138 | # mypy 139 | .mypy_cache/ 140 | .dmypy.json 141 | dmypy.json 142 | 143 | # Pyre type checker 144 | .pyre/ 145 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/hive_validator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pyspark.sql import SparkSession 4 | 5 | from spark_validation.common.config import Config 6 | from spark_validation.common.constants import Constants 7 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 8 | 9 | 10 | class CreateHiveValidationDF: 11 | """Class to create validations tables.""" 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | @staticmethod 16 | def validate(ss, config): 17 | """Apply validation process using config input file.""" 18 | source_read_df = ss.table(config.source_df) 19 | comparable_dfs_list = [(t, ss.table(t)) for t in config.comparable_dfs_list] 20 | 21 | validator = DataframeValidator( 22 | spark=ss, 23 | source_df=source_read_df, 24 | id_col_name=config.id_col_name, 25 | correctness_rules_dict=config.correctness_rules_dict, 26 | parent_children_validation_pairs=config.parent_children_validation_pairs, 27 | completeness_rules_dic=config.completeness_rules_dic, 28 | comparable_dfs_list=comparable_dfs_list, 29 | unique_column_group_values_per_table=config.unique_column_group_values_per_table, 30 | ) 31 | 32 | processed_df = validator.process() 33 | completeness_df = processed_df.limit(1).select( 34 | Constants.OVER_ALL_COUNT_COL, 35 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 36 | Constants.DATE_TIME_REPORT_COL, 37 | ) 38 | 39 | correctness_df = processed_df.drop( 40 | Constants.OVER_ALL_COUNT_COL, 41 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 42 | ) 43 | comparison_df = validator.compare() 44 | 45 | correctness_df.write.mode("append").saveAsTable(config.output_correctness_table) 46 | 47 | completeness_df.write.mode("append").saveAsTable( 48 | config.output_completeness_table 49 | ) 50 | comparison_df.write.mode("append").saveAsTable(config.output_comparison_table) 51 | 52 | 53 | def main(args): 54 | """Run the main create table function using the sys arguments.""" 55 | spark_session = SparkSession.builder.enableHiveSupport().getOrCreate() 56 | spark_session.conf.set("spark.sql.debug.maxToStringFields", "1000") 57 | spark_session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") 58 | arg_conf = spark_session.sparkContext.wholeTextFiles(args.config).collect()[0][1] 59 | config = Config.parse_text(arg_conf) 60 | 61 | CreateHiveValidationDF.validate(spark_session, config) 62 | 63 | 64 | def create_parser(): 65 | """Parse sys arguments and return parser object.""" 66 | parser = argparse.ArgumentParser(description="Hive Validation") 67 | parser.add_argument( 68 | "-c", dest="config", action="store", help="config file", required=True, 69 | ) 70 | return parser 71 | 72 | 73 | def init(): 74 | """Wrap to make main call function testable 
by sending parsed arguments.""" 75 | parser = create_parser() 76 | args = parser.parse_args() 77 | main(args) 78 | 79 | 80 | if __name__ == "__main__": 81 | init() 82 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/dataframe_validator.py: -------------------------------------------------------------------------------- 1 | """This module exposes the handler class for the dataframes validation process.""" 2 | import datetime 3 | 4 | import pyspark.sql.functions as F 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from spark_validation.common.constants import Constants 8 | from spark_validation.common.general_validator import GeneralDFValidator 9 | 10 | 11 | class DataframeValidator(GeneralDFValidator): 12 | """Class to create a handler with the main functions for the dataframe validation process.""" 13 | 14 | def __init__( 15 | self, 16 | spark, 17 | source_df, 18 | id_col_name, 19 | correctness_rules_dict, 20 | parent_children_validation_pairs, 21 | completeness_rules_dic, 22 | comparable_dfs_list, 23 | unique_column_group_values_per_table=[], 24 | ): 25 | """Create a handler with the initial df for the specific date.""" 26 | self.spark = spark 27 | self.source_df = source_df 28 | self.id_col_name = id_col_name 29 | self.correctness_rules_dict = correctness_rules_dict 30 | self.parent_children_validation_pairs = parent_children_validation_pairs 31 | self.completeness_rules_dic = completeness_rules_dic 32 | self.comparable_dfs_list = comparable_dfs_list 33 | self.unique_column_group_values_per_table = unique_column_group_values_per_table 34 | 35 | def process(self): 36 | """Run the entire validation pipeline. 37 | 38 | 1. Run all the correctness rules. 39 | 2. Run all the completeness rules. 40 | 3. Return processed_df with all the computed values. 41 | """ 42 | processed_df = ( 43 | self.source_df.transform( 44 | lambda df: self.join_cols_with_all_parents( 45 | df, self.parent_children_validation_pairs 46 | ) 47 | ) 48 | .transform( 49 | lambda df: self.add_unique_error( 50 | df, self.id_col_name, self.unique_column_group_values_per_table 51 | ) 52 | ) 53 | .transform( 54 | lambda df: self.build_correctness_df( 55 | df, 56 | self.correctness_rules_dict, 57 | self.parent_children_validation_pairs, 58 | ) 59 | ) 60 | ) 61 | 62 | validation_result_cols = list( 63 | filter(lambda x: Constants.IS_ERROR_COL in x, processed_df.schema.names) 64 | ) 65 | processed_df = processed_df.select( 66 | *([self.id_col_name] + validation_result_cols) 67 | ) 68 | 69 | processed_df = processed_df.transform( 70 | lambda df: self.build_correctness_report_df(df, validation_result_cols) 71 | ).transform( 72 | lambda df: self.build_computed_rules_correctness_df( 73 | df, self.completeness_rules_dic 74 | ) 75 | ) 76 | return processed_df 77 | 78 | def compare(self): 79 | """Compare the source df with related dfs. 80 | 81 | Get comparison metrics like: 82 | 1. missing_cols_right. 83 | 2. missing_cols_left. 84 | 3. missing_vals_right. 85 | 4. missing_vals_left.
86 | """ 87 | return self.spark.createDataFrame( 88 | self.compared_with_related_dfs( 89 | self.source_df, self.id_col_name, self.comparable_dfs_list 90 | ), 91 | Constants.OUTPUT_COMPARABLE_COLS, 92 | ).withColumn(Constants.DATE_TIME_REPORT_COL, F.lit(datetime.datetime.now())) 93 | 94 | 95 | DataFrame.transform = DataframeValidator.transform 96 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/config.py: -------------------------------------------------------------------------------- 1 | """Module representing config data.""" 2 | import json 3 | import sys 4 | import yaml 5 | from abc import ABC 6 | 7 | from pyspark.sql.utils import AnalysisException 8 | 9 | 10 | class Config(ABC): 11 | """Class with config data.""" 12 | 13 | def __init__( 14 | self, 15 | source_df, 16 | id_col_name, 17 | correctness_rules_dict, 18 | parent_children_validation_pairs, 19 | completeness_rules_dic, 20 | comparable_dfs_list, 21 | output_correctness_table, 22 | output_completeness_table, 23 | output_comparison_table, 24 | unique_column_group_values_per_table=[], 25 | fuzzy_deduplication_distance=0, 26 | ): 27 | self.source_df = source_df 28 | self.id_col_name = id_col_name 29 | self.correctness_rules_dict = correctness_rules_dict 30 | self.parent_children_validation_pairs = parent_children_validation_pairs 31 | self.completeness_rules_dic = completeness_rules_dic 32 | self.comparable_dfs_list = comparable_dfs_list 33 | self.output_correctness_table = output_correctness_table 34 | self.output_completeness_table = output_completeness_table 35 | self.output_comparison_table = output_comparison_table 36 | self.unique_column_group_values_per_table = unique_column_group_values_per_table 37 | self.fuzzy_deduplication_distance = fuzzy_deduplication_distance 38 | 39 | @staticmethod 40 | def _create_config(config): 41 | try: 42 | correctness_validations = { 43 | rule["column"]: rule["rule"] 44 | for rule in config["correctness_validations"] 45 | } 46 | parent_children_validations = [ 47 | (rule["column"], rule["parent"]) 48 | for rule in config["parent_children_constraints"] 49 | ] 50 | 51 | completeness_overall_rule = config["completeness_validations"]["overall"] 52 | completeness_validations = { 53 | completeness_overall_rule["column"]: completeness_overall_rule["rule"] 54 | } 55 | return Config( 56 | source_df=config["source_table"]["name"], 57 | id_col_name=config["source_table"]["id_column"], 58 | correctness_rules_dict=correctness_validations, 59 | parent_children_validation_pairs=parent_children_validations, 60 | completeness_rules_dic=completeness_validations, 61 | comparable_dfs_list=config["compare_related_tables_list"], 62 | output_correctness_table=config["source_table"][ 63 | "output_correctness_table" 64 | ], 65 | output_completeness_table=config["source_table"][ 66 | "output_completeness_table" 67 | ], 68 | output_comparison_table=config["source_table"][ 69 | "output_comparison_table" 70 | ], 71 | unique_column_group_values_per_table=config["source_table"][ 72 | "unique_column_group_values_per_table" 73 | ] 74 | if ( 75 | "unique_column_group_values_per_table" 76 | in config["source_table"].keys() 77 | ) 78 | else [], 79 | fuzzy_deduplication_distance=config["source_table"][ 80 | "fuzzy_deduplication_distance" 81 | ] 82 | if ("fuzzy_deduplication_distance" in config["source_table"].keys()) 83 | else 0, 84 | ) 85 | except KeyError as e: 86 | print( 87 | "The config file has key error, check source_table, correctness_validations," 88 | " 
completeness_validations, parent_children_constraints, compare_related_tables_list as mandatory)" 89 | ' - reason "%s"' % str(e) 90 | ) 91 | 92 | @staticmethod 93 | def parse(file): 94 | """Parse a json or yaml config file into a Config object.""" 95 | try: 96 | config = json.load(file) if 'json' in file.name else yaml.safe_load(file) 97 | except OSError: 98 | print("Could not open/read file:", file) 99 | sys.exit() 100 | return Config._create_config(config) 101 | 102 | @staticmethod 103 | def parse_text(str_file): 104 | """Parse a json string into a Config object.""" 105 | try: 106 | config = json.loads(str_file) 107 | except (ValueError, AnalysisException): 108 | print("Could not parse config:", str_file) 109 | sys.exit() 110 | return Config._create_config(config) 111 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/file_system_validator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from pivottablejs import pivot_ui 6 | from pyspark.sql import SparkSession 7 | 8 | from spark_validation.common.config import Config 9 | from spark_validation.common.constants import Constants 10 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 11 | 12 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | 15 | class CreateFSValidationDF: 16 | """Class to create validations tables.""" 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | @staticmethod 21 | def validate(ss, config): 22 | """Apply validation process using config input file.""" 23 | source_read_df = ( 24 | ss.read.format("csv").option("header", "true").load(config.source_df) 25 | ) 26 | comparable_dfs_list = [ 27 | (t, ss.read.format("csv").option("header", "true").load(t)) 28 | for t in config.comparable_dfs_list 29 | ] 30 | 31 | validator = DataframeValidator( 32 | spark=ss, 33 | source_df=source_read_df, 34 | id_col_name=config.id_col_name, 35 | correctness_rules_dict=config.correctness_rules_dict, 36 | parent_children_validation_pairs=config.parent_children_validation_pairs, 37 | completeness_rules_dic=config.completeness_rules_dic, 38 | comparable_dfs_list=comparable_dfs_list, 39 | unique_column_group_values_per_table=config.unique_column_group_values_per_table, 40 | ) 41 | 42 | processed_df = validator.process() 43 | completeness_df = processed_df.limit(1).select( 44 | Constants.OVER_ALL_COUNT_COL, 45 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 46 | Constants.DATE_TIME_REPORT_COL, 47 | ) 48 | 49 | correctness_df = processed_df.drop( 50 | Constants.OVER_ALL_COUNT_COL, 51 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 52 | ) 53 | comparison_df = validator.compare() 54 | 55 | correctness_df.coalesce(1).write.mode("append").json( 56 | config.output_correctness_table 57 | ) 58 | completeness_df.coalesce(1).write.mode("append").json( 59 | config.output_completeness_table 60 | ) 61 | comparison_df.coalesce(1).write.mode("append").json( 62 | config.output_comparison_table 63 | ) 64 | 65 | pd_correctness_df = ss.read.json(config.output_correctness_table).toPandas() 66 | pd_completeness_df = ss.read.json(config.output_completeness_table).toPandas() 67 | comparison_df = ss.read.json(config.output_comparison_table).toPandas() 68 | 69 | pivot_ui( 70 | pd_correctness_df, 71 | outfile_path="{}.html".format(config.output_correctness_table), 72 | menuLimit=5000, 73 | overwrite=True, 74 | rows=[config.id_col_name] 75 | + list( 76 |
filter( 77 | lambda x: Constants.IS_ERROR_COL in x 78 | and Constants.SUM_REPORT_SUFFIX not in x 79 | and Constants.ROW_ERROR_SUFFIX not in x, 80 | pd_correctness_df.columns, 81 | ) 82 | ), 83 | cols=[Constants.DATE_TIME_REPORT_COL], 84 | vals=[Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX], 85 | aggregatorName="Sum", 86 | rendererName="Table Barchart", 87 | rowOrder="value_z_to_a", 88 | ) 89 | 90 | pivot_ui( 91 | pd_completeness_df, 92 | outfile_path="{}.html".format(config.output_completeness_table), 93 | menuLimit=5000, 94 | overwrite=True, 95 | rows=[Constants.OVER_ALL_COUNT_COL], 96 | cols=[Constants.DATE_TIME_REPORT_COL], 97 | vals=[Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL], 98 | aggregatorName="Sum", 99 | rendererName="Table Barchart", 100 | rowOrder="value_z_to_a", 101 | ) 102 | 103 | pivot_ui( 104 | comparison_df, 105 | outfile_path="{}.html".format(config.output_comparison_table), 106 | menuLimit=5000, 107 | overwrite=True, 108 | rows=list( 109 | filter( 110 | lambda x: Constants.DATE_TIME_REPORT_COL not in x, 111 | comparison_df.columns, 112 | ) 113 | ), 114 | cols=[Constants.DATE_TIME_REPORT_COL], 115 | rendererName="Table Barchart", 116 | rowOrder="value_z_to_a", 117 | ) 118 | 119 | 120 | def main(args): 121 | """Run the main create table function using the sys arguments.""" 122 | spark_session = SparkSession.builder.getOrCreate() 123 | spark_session.conf.set("spark.sql.debug.maxToStringFields", "1000") 124 | spark_session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") 125 | with open(args.config) as f: 126 | config = Config.parse(f) 127 | 128 | CreateFSValidationDF.validate(spark_session, config) 129 | 130 | 131 | def create_parser(): 132 | """Parse sys arguments and return parser object.""" 133 | parser = argparse.ArgumentParser(description="Hive Validation") 134 | parser.add_argument( 135 | "-c", dest="config", action="store", help="config file", required=True, 136 | ) 137 | return parser 138 | 139 | 140 | def init(): 141 | """Wrap to make main call function testable by sending parsed arguments.""" 142 | parser = create_parser() 143 | args = parser.parse_args() 144 | main(args) 145 | 146 | 147 | if __name__ == "__main__": 148 | init() 149 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/runtime-main.6d8ceafa.js.map: -------------------------------------------------------------------------------- 1 | 
{"version":3,"sources":["../webpack/bootstrap"],"names":["webpackJsonpCallback","data","moduleId","chunkId","chunkIds","moreModules","executeModules","i","resolves","length","Object","prototype","hasOwnProperty","call","installedChunks","push","modules","parentJsonpFunction","shift","deferredModules","apply","checkDeferredModules","result","deferredModule","fulfilled","j","depId","splice","__webpack_require__","s","installedModules","1","exports","module","l","m","c","d","name","getter","o","defineProperty","enumerable","get","r","Symbol","toStringTag","value","t","mode","__esModule","ns","create","key","bind","n","object","property","p","jsonpArray","this","oldJsonpFunction","slice"],"mappings":"aACE,SAASA,EAAqBC,GAQ7B,IAPA,IAMIC,EAAUC,EANVC,EAAWH,EAAK,GAChBI,EAAcJ,EAAK,GACnBK,EAAiBL,EAAK,GAIHM,EAAI,EAAGC,EAAW,GACpCD,EAAIH,EAASK,OAAQF,IACzBJ,EAAUC,EAASG,GAChBG,OAAOC,UAAUC,eAAeC,KAAKC,EAAiBX,IAAYW,EAAgBX,IACpFK,EAASO,KAAKD,EAAgBX,GAAS,IAExCW,EAAgBX,GAAW,EAE5B,IAAID,KAAYG,EACZK,OAAOC,UAAUC,eAAeC,KAAKR,EAAaH,KACpDc,EAAQd,GAAYG,EAAYH,IAKlC,IAFGe,GAAqBA,EAAoBhB,GAEtCO,EAASC,QACdD,EAASU,OAATV,GAOD,OAHAW,EAAgBJ,KAAKK,MAAMD,EAAiBb,GAAkB,IAGvDe,IAER,SAASA,IAER,IADA,IAAIC,EACIf,EAAI,EAAGA,EAAIY,EAAgBV,OAAQF,IAAK,CAG/C,IAFA,IAAIgB,EAAiBJ,EAAgBZ,GACjCiB,GAAY,EACRC,EAAI,EAAGA,EAAIF,EAAed,OAAQgB,IAAK,CAC9C,IAAIC,EAAQH,EAAeE,GACG,IAA3BX,EAAgBY,KAAcF,GAAY,GAE3CA,IACFL,EAAgBQ,OAAOpB,IAAK,GAC5Be,EAASM,EAAoBA,EAAoBC,EAAIN,EAAe,KAItE,OAAOD,EAIR,IAAIQ,EAAmB,GAKnBhB,EAAkB,CACrBiB,EAAG,GAGAZ,EAAkB,GAGtB,SAASS,EAAoB1B,GAG5B,GAAG4B,EAAiB5B,GACnB,OAAO4B,EAAiB5B,GAAU8B,QAGnC,IAAIC,EAASH,EAAiB5B,GAAY,CACzCK,EAAGL,EACHgC,GAAG,EACHF,QAAS,IAUV,OANAhB,EAAQd,GAAUW,KAAKoB,EAAOD,QAASC,EAAQA,EAAOD,QAASJ,GAG/DK,EAAOC,GAAI,EAGJD,EAAOD,QAKfJ,EAAoBO,EAAInB,EAGxBY,EAAoBQ,EAAIN,EAGxBF,EAAoBS,EAAI,SAASL,EAASM,EAAMC,GAC3CX,EAAoBY,EAAER,EAASM,IAClC5B,OAAO+B,eAAeT,EAASM,EAAM,CAAEI,YAAY,EAAMC,IAAKJ,KAKhEX,EAAoBgB,EAAI,SAASZ,GACX,qBAAXa,QAA0BA,OAAOC,aAC1CpC,OAAO+B,eAAeT,EAASa,OAAOC,YAAa,CAAEC,MAAO,WAE7DrC,OAAO+B,eAAeT,EAAS,aAAc,CAAEe,OAAO,KAQvDnB,EAAoBoB,EAAI,SAASD,EAAOE,GAEvC,GADU,EAAPA,IAAUF,EAAQnB,EAAoBmB,IAC/B,EAAPE,EAAU,OAAOF,EACpB,GAAW,EAAPE,GAA8B,kBAAVF,GAAsBA,GAASA,EAAMG,WAAY,OAAOH,EAChF,IAAII,EAAKzC,OAAO0C,OAAO,MAGvB,GAFAxB,EAAoBgB,EAAEO,GACtBzC,OAAO+B,eAAeU,EAAI,UAAW,CAAET,YAAY,EAAMK,MAAOA,IACtD,EAAPE,GAA4B,iBAATF,EAAmB,IAAI,IAAIM,KAAON,EAAOnB,EAAoBS,EAAEc,EAAIE,EAAK,SAASA,GAAO,OAAON,EAAMM,IAAQC,KAAK,KAAMD,IAC9I,OAAOF,GAIRvB,EAAoB2B,EAAI,SAAStB,GAChC,IAAIM,EAASN,GAAUA,EAAOiB,WAC7B,WAAwB,OAAOjB,EAAgB,SAC/C,WAA8B,OAAOA,GAEtC,OADAL,EAAoBS,EAAEE,EAAQ,IAAKA,GAC5BA,GAIRX,EAAoBY,EAAI,SAASgB,EAAQC,GAAY,OAAO/C,OAAOC,UAAUC,eAAeC,KAAK2C,EAAQC,IAGzG7B,EAAoB8B,EAAI,IAExB,IAAIC,EAAaC,KAAsB,gBAAIA,KAAsB,iBAAK,GAClEC,EAAmBF,EAAW5C,KAAKuC,KAAKK,GAC5CA,EAAW5C,KAAOf,EAClB2D,EAAaA,EAAWG,QACxB,IAAI,IAAIvD,EAAI,EAAGA,EAAIoD,EAAWlD,OAAQF,IAAKP,EAAqB2D,EAAWpD,IAC3E,IAAIU,EAAsB4C,EAI1BxC,I","file":"static/js/runtime-main.6d8ceafa.js","sourcesContent":[" \t// install a JSONP callback for chunk loading\n \tfunction webpackJsonpCallback(data) {\n \t\tvar chunkIds = data[0];\n \t\tvar moreModules = data[1];\n \t\tvar executeModules = data[2];\n\n \t\t// add \"moreModules\" to the modules object,\n \t\t// then flag all \"chunkIds\" as loaded and fire callback\n \t\tvar moduleId, chunkId, i = 0, resolves = [];\n \t\tfor(;i < chunkIds.length; i++) {\n \t\t\tchunkId = chunkIds[i];\n \t\t\tif(Object.prototype.hasOwnProperty.call(installedChunks, chunkId) && installedChunks[chunkId]) {\n \t\t\t\tresolves.push(installedChunks[chunkId][0]);\n \t\t\t}\n 
\t\t\tinstalledChunks[chunkId] = 0;\n \t\t}\n \t\tfor(moduleId in moreModules) {\n \t\t\tif(Object.prototype.hasOwnProperty.call(moreModules, moduleId)) {\n \t\t\t\tmodules[moduleId] = moreModules[moduleId];\n \t\t\t}\n \t\t}\n \t\tif(parentJsonpFunction) parentJsonpFunction(data);\n\n \t\twhile(resolves.length) {\n \t\t\tresolves.shift()();\n \t\t}\n\n \t\t// add entry modules from loaded chunk to deferred list\n \t\tdeferredModules.push.apply(deferredModules, executeModules || []);\n\n \t\t// run deferred modules when all chunks ready\n \t\treturn checkDeferredModules();\n \t};\n \tfunction checkDeferredModules() {\n \t\tvar result;\n \t\tfor(var i = 0; i < deferredModules.length; i++) {\n \t\t\tvar deferredModule = deferredModules[i];\n \t\t\tvar fulfilled = true;\n \t\t\tfor(var j = 1; j < deferredModule.length; j++) {\n \t\t\t\tvar depId = deferredModule[j];\n \t\t\t\tif(installedChunks[depId] !== 0) fulfilled = false;\n \t\t\t}\n \t\t\tif(fulfilled) {\n \t\t\t\tdeferredModules.splice(i--, 1);\n \t\t\t\tresult = __webpack_require__(__webpack_require__.s = deferredModule[0]);\n \t\t\t}\n \t\t}\n\n \t\treturn result;\n \t}\n\n \t// The module cache\n \tvar installedModules = {};\n\n \t// object to store loaded and loading chunks\n \t// undefined = chunk not loaded, null = chunk preloaded/prefetched\n \t// Promise = chunk loading, 0 = chunk loaded\n \tvar installedChunks = {\n \t\t1: 0\n \t};\n\n \tvar deferredModules = [];\n\n \t// The require function\n \tfunction __webpack_require__(moduleId) {\n\n \t\t// Check if module is in cache\n \t\tif(installedModules[moduleId]) {\n \t\t\treturn installedModules[moduleId].exports;\n \t\t}\n \t\t// Create a new module (and put it into the cache)\n \t\tvar module = installedModules[moduleId] = {\n \t\t\ti: moduleId,\n \t\t\tl: false,\n \t\t\texports: {}\n \t\t};\n\n \t\t// Execute the module function\n \t\tmodules[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n \t\t// Flag the module as loaded\n \t\tmodule.l = true;\n\n \t\t// Return the exports of the module\n \t\treturn module.exports;\n \t}\n\n\n \t// expose the modules object (__webpack_modules__)\n \t__webpack_require__.m = modules;\n\n \t// expose the module cache\n \t__webpack_require__.c = installedModules;\n\n \t// define getter function for harmony exports\n \t__webpack_require__.d = function(exports, name, getter) {\n \t\tif(!__webpack_require__.o(exports, name)) {\n \t\t\tObject.defineProperty(exports, name, { enumerable: true, get: getter });\n \t\t}\n \t};\n\n \t// define __esModule on exports\n \t__webpack_require__.r = function(exports) {\n \t\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n \t\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n \t\t}\n \t\tObject.defineProperty(exports, '__esModule', { value: true });\n \t};\n\n \t// create a fake namespace object\n \t// mode & 1: value is a module id, require it\n \t// mode & 2: merge all properties of value into the ns\n \t// mode & 4: return value when already ns object\n \t// mode & 8|1: behave like require\n \t__webpack_require__.t = function(value, mode) {\n \t\tif(mode & 1) value = __webpack_require__(value);\n \t\tif(mode & 8) return value;\n \t\tif((mode & 4) && typeof value === 'object' && value && value.__esModule) return value;\n \t\tvar ns = Object.create(null);\n \t\t__webpack_require__.r(ns);\n \t\tObject.defineProperty(ns, 'default', { enumerable: true, value: value });\n \t\tif(mode & 2 && typeof value != 'string') for(var key in value) 
__webpack_require__.d(ns, key, function(key) { return value[key]; }.bind(null, key));\n \t\treturn ns;\n \t};\n\n \t// getDefaultExport function for compatibility with non-harmony modules\n \t__webpack_require__.n = function(module) {\n \t\tvar getter = module && module.__esModule ?\n \t\t\tfunction getDefault() { return module['default']; } :\n \t\t\tfunction getModuleExports() { return module; };\n \t\t__webpack_require__.d(getter, 'a', getter);\n \t\treturn getter;\n \t};\n\n \t// Object.prototype.hasOwnProperty.call\n \t__webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };\n\n \t// __webpack_public_path__\n \t__webpack_require__.p = \"/\";\n\n \tvar jsonpArray = this[\"webpackJsonpapp\"] = this[\"webpackJsonpapp\"] || [];\n \tvar oldJsonpFunction = jsonpArray.push.bind(jsonpArray);\n \tjsonpArray.push = webpackJsonpCallback;\n \tjsonpArray = jsonpArray.slice();\n \tfor(var i = 0; i < jsonpArray.length; i++) webpackJsonpCallback(jsonpArray[i]);\n \tvar parentJsonpFunction = oldJsonpFunction;\n\n\n \t// run deferred modules from other chunks\n \tcheckDeferredModules();\n"],"sourceRoot":""} -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/main.8e11e6a5.chunk.js: -------------------------------------------------------------------------------- 1 | (this.webpackJsonpapp=this.webpackJsonpapp||[]).push([[0],{245:function(e,n,t){"use strict";t.r(n);var a=t(0),l=t.n(a),r=t(2),o=t.n(r),c=(t(44),t(7)),u=(t(45),t(4)),i=t(20),s=t(5),m=t(30),p=t.n(m),d=t(249),f=t(248),_=t(27),b=t(9),g=t(32);function v(){var e=Object(u.a)(["\n color: darkblue;\n background: rgb(248, 248, 248);\n border-radius: 0px 8px 8px 0px;\n text-align: left;\n padding: 0px 30px;\n font-size: 0.8em;\n overflow: auto;\n"]);return v=function(){return e},e}function E(){var e=Object(u.a)(["\n padding: 0px 30px 50px 30px;\n color: darkblue;\n text-align: left;\n overflow: auto;\n h4 {\n padding: 0px;\n margin: 0px;\n color: #505050;\n font-weight: normal;\n margin-bottom: 5px;\n }\n\n .completness {\n display: grid;\n grid-template-columns: 1fr 2fr;\n grid-gap: 8px;\n }\n\n .ruleGroup {\n padding: 0.5rem;\n border: 1px solid #1890ff;\n border-radius: 4px;\n background: rgba(180, 220, 255, 0.2);\n\n .rule,\n .ruleGroup {\n margin-top: 0.5rem;\n margin-left: 0.5rem;\n }\n\n .ruleGroup-combinators.betweenRules {\n margin-top: 0.5rem;\n }\n\n .ruleGroup-notToggle {\n margin-right: 0.5rem;\n }\n }\n"]);return E=function(){return e},e}function h(){var e=Object(u.a)(["\n height: 70vh;\n border: 1px solid lightgray;\n border-radius: 8px;\n display: grid;\n grid-template-columns: repeat(2, 1fr);\n"]);return h=function(){return e},e}function O(){var e=Object(u.a)(["\n height: 30px;\n background: #1890ff;\n border-radius: 4px;\n border: none;\n color: white;\n line-height: 1.5;\n font-weight: bold;\n cursor: pointer;\n width: 200px;\n margin: 30px;\n"]);return O=function(){return e},e}var y=s.a.button(O()),N=s.a.div(h()),A=s.a.div(E()),x=s.a.div(v()),L=function(e){var n=e.name,t=e.type;return{operators:[{name:"Is NULL",label:"Is NULL"},{name:"Is NOT NULL",label:"Is NOT NULL"},{name:"= ''",label:"Is EMPTY"},{name:"!= ''",label:"Is NOT 
EMPTY"},{name:"In",label:"In"},{name:"=",label:"="},{name:"!=",label:"!="},{name:"<",label:"<"},{name:">",label:">"},{name:"<=",label:"<="},{name:">=",label:">="}],fields:[{name:n,label:n},{name:"CHAR_LENGTH(".concat(n,")"),label:"CHAR_LENGTH(".concat(n,")")}],getControlElements:function(){return{valueEditor:function(e){var a=e.field,r=e.operator,o=e.handleOnChange,c=e.value,u=r.toLowerCase();return u.startsWith("is")||["= ''","!= ''"].includes(u)?"":"number"===t||a==="CHAR_LENGTH(".concat(n,")")?l.a.createElement("input",{value:c,onChange:o}):l.a.createElement("input",{onChange:o})}}}}},S=function(e,n,t,a){var l=function e(n,t){console.log(n);var a="",l={null:"IS NULL",notNull:"IS NOT NULL",contains:"LIKE"},r=n.rules,o=n.combinator;return r.forEach((function(n){var r=n.field,c=void 0===r?"":r,u=n.operator,i=void 0===u?"":u,s=n.value,m=n.rules,p=void 0===m?[]:m;console.log(c);var d=t&&""===a?"":o;a+=p.length?e(n):" ".concat(d," ").concat(c," ").concat(l[i]||i," ").concat(s," ")})),a}(n,!0).replace(/\s\s+/g," ").trim();a(t.map((function(n){return n.column===e?{column:n.column,rule:l}:n})))},k=function(e){var n=e.table,t=void 0===n?{columns:[]}:n,r=Object(a.useState)([]),o=Object(c.a)(r,2),u=o[0],s=o[1],m=Object(a.useState)([]),v=Object(c.a)(m,2),E=v[0],h=v[1],O=Object(b.useToasts)().addToast,k=Object(a.useState)(""),j=Object(c.a)(k,2),I=j[0],T=j[1],D=Object(a.useState)(""),M=Object(c.a)(D,2),w=M[0],C=M[1];Object(a.useEffect)((function(){s(t.columns.map((function(e){return{column:e.name,rule:""}})))}),[t]);t.columns.reduce((function(e,n){return e[n.name]=n.type,e}),{});var R,P=(R={correctness_validations:u,completeness_validations:[{column:"OVER_ALL_COUNT",rule:"OVER_ALL_COUNT "+I+" "+w}],parent_children_constraints:E},{source_table:{name:"mock_data/family_sample.csv",id_column:"ID",output_correctness_table:"/tmp/mock_data/output/family_sample_test_correctness",output_completeness_table:"/tmp/mock_data/output/family_sample_test_completeness",output_comparison_table:"/tmp/mock_data/output/family_sample_test_comparison",unique_column_group_values_per_table:["ID","NAME","FAMILY_NAME","PARENT"],fuzzy_deduplication_distance:0},correctness_validations:Object(i.a)(R.correctness_validations),completeness_validations:Object(i.a)(R.completeness_validations),parent_children_constraints:Object(i.a)(R.parent_children_constraints),compare_related_tables_list:["test.diff_df","test.diff_df_2"]});return l.a.createElement(a.Fragment,null,0==t.columns.length&&l.a.createElement("h3",null," Please select table to add rules"),0!=t.columns.length&&l.a.createElement(N,null,l.a.createElement(A,null,l.a.createElement("h1",null,"Editor"),l.a.createElement("h3",null,"Correctness validations"),t.columns.map((function(e){return l.a.createElement("div",{key:e.name},l.a.createElement("h4",null,e.name,": [",e.type,"]"),l.a.createElement(p.a,{fields:L(e).fields,onQueryChange:function(n){return S(e.name,n,u,s)}}),l.a.createElement("br",null))})),l.a.createElement("h3",null,"Completeness validations"),l.a.createElement("h4",null,"Number of rows"),l.a.createElement("div",{class:"completness"},l.a.createElement(_.a,{onChange:function(e){return T(e.value)},options:[{value:">",label:">"},{value:">=",label:">="},{value:"=",label:"="},{value:"<",label:"<"},{value:"<=",label:"<="}]}),l.a.createElement("input",{type:"number",onChange:function(e){return C(e.target.value)}})),l.a.createElement("h3",null,"Parent of constraints"),t.columns.map((function(e){return l.a.createElement("div",{key:e.name},l.a.createElement("h4",null,e.name,": 
[",e.type,"]"),l.a.createElement(_.a,{isMulti:!0,onChange:function(n){var t=(n||[]).map((function(n){return{column:e.name,parent:n.value}})),a=E.filter((function(n){return n.column!=e.name}));h(t.concat(a))},options:t.columns.filter((function(n){return n.name!=e.name})).map((function(e){return{value:e.name,label:e.name}}))}),l.a.createElement("br",null))}))),l.a.createElement(x,null,l.a.createElement("h1",null,"Output"),l.a.createElement(d.a,{language:"JSON",style:f.a},JSON.stringify(P,null,2)))),0!=t.columns.length&&l.a.createElement(y,{onClick:function(){fetch("api/validate",{body:JSON.stringify(g,null,2),method:"POST"}),O("Succesfully submited",{appearance:"success",autoDismiss:!0,autoDismissTimeout:3e3})}},"Submit"))},j=t(37),I=t(36),T=t.n(I);function D(){var e=Object(u.a)(["\n display: grid;\n grid-template-columns: 1fr 1fr;\n justify-items: center;\n align-items: center;\n font-weight: bold;\n color: darkblue;\n"]);return D=function(){return e},e}function M(){var e=Object(u.a)(["\n width: 60px;\n"]);return M=function(){return e},e}function w(){var e=Object(u.a)(["\n height: 30px;\n width: 100px;\n background: ",";\n color: ",";\n border-radius: 4px;\n border: 1px solid #1890ff;\n line-height: 1.5;\n font-weight: bold;\n cursor: pointer;\n"]);return w=function(){return e},e}function C(){var e=Object(u.a)(["\n height: 60px;\n border-radius: 8px;\n border: 1px solid lightgrey;\n display: grid;\n justify-items: center;\n align-items: center;\n grid-template-columns: repeat(3, 1fr);\n margin-bottom: 40px;\n"]);return C=function(){return e},e}var R=s.a.div(C()),P=s.a.button(w(),(function(e){return"secondary"===e.type?"white":"#1890ff"}),(function(e){return"secondary"===e.type?"#1890ff":"white"})),U={name:"family_tree.csv",columns:[{name:"ID",type:"number"},{name:"NAME",type:"string"},{name:"FAMILY_NAME",type:"string"},{name:"PARENT",type:"string"},{name:"ADDRESS",type:"string"}]},G=s.a.img(M()),H=s.a.div(D()),Y={columns:[]},F=function(e){var n=e.onTableSelected,t=Object(b.useToasts)().addToast,r=Object(a.useState)(Object(j.a)({},Y)),o=Object(c.a)(r,2),u=o[0],i=o[1];return l.a.createElement(R,null,l.a.createElement(P,{onClick:function(){n(U),i(U),t("Succesfully uploaded",{appearance:"success",autoDismiss:!0,autoDismissTimeout:3e3})}},"Upload"),l.a.createElement("span",null,l.a.createElement("b",null,u.name||"No table selected")),l.a.createElement(H,null,"ValiData ",l.a.createElement(G,{src:T.a})))};var J=function(){var e=Object(a.useState)({columns:[]}),n=Object(c.a)(e,2),t=n[0],r=n[1];return l.a.createElement(b.ToastProvider,null,l.a.createElement("div",{className:"App"},l.a.createElement(F,{onTableSelected:function(e){return r(e)}}),l.a.createElement(k,{table:t})))};Boolean("localhost"===window.location.hostname||"[::1]"===window.location.hostname||window.location.hostname.match(/^127(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}$/));o.a.render(l.a.createElement(l.a.StrictMode,null,l.a.createElement(J,null)),document.getElementById("root")),"serviceWorker"in 
navigator&&navigator.serviceWorker.ready.then((function(e){e.unregister()})).catch((function(e){console.error(e.message)}))},32:function(e){e.exports=JSON.parse('{"source_table":{"name":"lib/test/spark_validation_tests/common/mock_data/family_sample.csv","id_column":"ID","output_correctness_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_correctness","output_completeness_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_completeness","output_comparison_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_comparison","unique_column_group_values_per_table":["ID","NAME","FAMILY_NAME","PARENT"],"fuzzy_deduplication_distance":0},"correctness_validations":[{"column":"ID","rule":"ID is not null and ID != \'\' and ID != \'null\'"},{"column":"NAME","rule":"NAME is not null and NAME != \'\' and NAME != \'null\' and NAME like \'%ho%\'"},{"column":"FAMILY_NAME","rule":"FAMILY_NAME is not null and FAMILY_NAME in (\'Cha\', \'Pha\')"},{"column":"ADDRESS","rule":"ADDRESS is not null and ADDRESS != \'\' and ADDRESS != \'null\' and CHAR_LENGTH(ADDRESS) > 4"}],"completeness_validations":[{"column":"OVER_ALL_COUNT","rule":"OVER_ALL_COUNT <= 5"}],"parent_children_constraints":[{"column":"ID","parent":"PARENT"}],"compare_related_tables_list":["lib/test/spark_validation_tests/common/mock_data/family_sample_diff.csv"]}')},36:function(e,n,t){e.exports=t.p+"static/media/logo.50e8e5ec.png"},39:function(e,n,t){e.exports=t(245)},44:function(e,n,t){},45:function(e,n,t){}},[[39,1,2]]]); 2 | //# sourceMappingURL=main.8e11e6a5.chunk.js.map 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Owl Data Sanitizer: A light Spark data validation framework 2 | 3 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/ronald-smith-angel/owl-data-sanitizer/blob/develop/license.md) 4 | [![Build Status](https://travis-ci.org/ronald-smith-angel/owl-data-sanitizer.svg?branch=develop)](https://travis-ci.org/github/ronald-smith-angel/owl-data-sanitizer) 5 | 6 | This is a small framework for data quality validation. This first version works reading spark dataframes from local 7 | datasources like local system, s3 or hive and delivers hive tables with quality reports. 8 | 9 | Let's follow this example: 10 | 11 | Input data from a hive table: 12 | 13 | ``` 14 | +----------+--------------+--------+---------+------------------+---------+ 15 | |GENERAL_ID| NAME| CODE|ADDR_DESC|ULTIMATE_PARENT_ID|PARENT_ID| 16 | +----------+--------------+--------+---------+------------------+---------+ 17 | | 1|Dummy 1 Entity|12000123| null| null| null| 18 | | 2| null| null| null| 2| 2| 19 | | 3| null|12000123| null| 3| 3| 20 | | 4| 1| 1| null| 4| 4| 21 | | 5| 1|12000123| null| 5| 5| 22 | | 6| null| 3| null| 6| 6| 23 | | null| null|12000123| null| 11| 7| 24 | | 7| 2| null| null| 8| 8| 25 | +----------+--------------+--------+---------+------------------+---------+ 26 | ``` 27 | 28 | following this validation config with 4 sections: 29 | 30 | 1. `source_table` including the table metadata. 31 | 2. `correctness_validations` including correctness validations per column. 32 | the rule must be a valid spark SQL expression. 33 | 3. `parent_children_constraints` including children parent constrains. 34 | This means that any parent id should be valid id. 35 | 4. 
`compare_related_tables_list` including comparison with other tables or 36 | the same table in other environments. 37 | 38 | ``` 39 | { 40 | "source_table": { 41 | "name": "test.data_test", 42 | "id_column": "GENERAL_ID", 43 | "unique_column_group_values_per_table": ["GENERAL_ID", "ULTIMATE_PARENT_ID"], 44 | "fuzzy_deduplication_distance": 0, 45 | "output_correctness_table": "test.data_test_correctness", 46 | "output_completeness_table": "test.data_test_completeness", 47 | "output_comparison_table": "test.data_test_comparison" 48 | }, 49 | "correctness_validations": [ 50 | { 51 | "column": "CODE", 52 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 53 | }, 54 | { 55 | "column": "NAME", 56 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 57 | }, 58 | { 59 | "column": "GENERAL_ID", 60 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 61 | } 62 | ], 63 | "completeness_validations": [ 64 | { 65 | "column": "OVER_ALL_COUNT", 66 | "rule": "OVER_ALL_COUNT <= 7" 67 | } 68 | ], 69 | "parent_children_constraints": [ 70 | { 71 | "column": "GENERAL_ID", 72 | "parent": "ULTIMATE_PARENT_ID" 73 | }, 74 | { 75 | "column": "GENERAL_ID", 76 | "parent": "PARENT_ID" 77 | } 78 | ], 79 | "compare_related_tables_list": ["test.diff_df", "test.diff_df_2"] 80 | } 81 | ``` 82 | 83 | Therefore, these results are delivered in two output hive tables: 84 | 85 | a). Correctness Report. 86 | 87 | - You will see and output col per validation col showing either 1 when there is error or 0 when is clean. 88 | - Sum of error per columns. 89 | 90 | ``` 91 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 92 | |GENERAL_ID|IS_ERROR_CODE|IS_ERROR_NAME|IS_ERROR_GENERAL_ID|IS_ERROR_GENERAL_ID_ULTIMATE_PARENT_ID|IS_ERROR_GENERAL_ID_PARENT_ID|IS_ERROR__ROW|dt |IS_ERROR_CODE_SUM|IS_ERROR_NAME_SUM|IS_ERROR_GENERAL_ID_SUM|IS_ERROR_GENERAL_ID_ULTIMATE_PARENT_ID_SUM|IS_ERROR_GENERAL_ID_PARENT_ID_SUM|IS_ERROR__ROW_SUM| 93 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 94 | |null |0 |1 |1 |1 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 95 | |3 |0 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 96 | |7 |1 |0 |0 |1 |1 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 97 | |5 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 98 | |6 |0 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 99 | |4 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 100 | |2 |1 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 101 | |1 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 102 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 103 | ``` 104 | b) Completeness 
Report.
105 | - The overall count of the dataframe.
106 | - A column checking whether the overall count is complete, for example `IS_ERROR_OVER_ALL_COUNT`.
107 | ```
108 | +--------------+-----------------------+--------------------------+
109 | |OVER_ALL_COUNT|IS_ERROR_OVER_ALL_COUNT|dt                        |
110 | +--------------+-----------------------+--------------------------+
111 | |8             |1                      |2020-04-17 09:39:04.783505|
112 | +--------------+-----------------------+--------------------------+
113 | ```
114 | 
115 | c). Comparison of schema and values with related dataframes.
116 | 
117 | NOTE: for now the result includes only the ids that differ; a further
118 | join with the source data is needed to see the actual differences.
119 | 
120 | ```
121 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
122 | |df            |missing_cols_right                |missing_cols_left|missing_vals_right|missing_vals_left|dt                        |
123 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
124 | |test.diff_df_2|GENERAL_ID:string,ADDR_DESC:string|GENERAL_ID:int   |                  |                 |2020-04-17 09:39:07.572483|
125 | |test.diff_df  |                                  |                 |6,7               |                 |2020-04-17 09:39:07.572483|
126 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
127 | ```
128 | 
129 | ## Installation
130 | 
131 | Install Owl Data Sanitizer from PyPI:
132 | 
133 | ```pip install owl-sanitizer-data-quality```
134 | 
135 | Then you can call the library:
136 | 
137 | ```
138 | from pyspark.sql import SparkSession
139 | 
140 | from spark_validation.common.config import Config
141 | from spark_validation.dataframe_validation.dataframe_validator import CreateHiveValidationDF
142 | 
143 | spark_session = SparkSession.builder.enableHiveSupport().getOrCreate()
144 | with open(PATH_TO_CONFIG_FILE) as f:
145 |     config = Config.parse(f)
146 | CreateHiveValidationDF.validate(spark_session, config)
147 | ```
148 | 
149 | To use it in your spark-submit command or Airflow DAG:
150 | 
151 | - Add `py_files`: `[https://pypi.org/project/owl-sanitizer-data-quality/latest/]`.
152 | - `application`: `owl-sanitizer-data-quality/latest/src/spark_validation/dataframe_validation/hive_validator.py`
153 | - `application_package`: `https://pypi.org/project/owl-sanitizer-data-quality/latest/owl-sanitizer-data-quality-latest.tar.gz`
154 | - `application_params`: `URL_TO_YOUR_REMOTE_CONFIG_FILE`
155 | 
156 | Contact
157 | -------
158 | 
159 | Please ask questions about technical issues here on GitHub.
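## Example: validating local CSV files

The same config-driven flow also works without Hive, through the file-system validator shipped in this repository (`spark_validation/dataframe_validation/file_system_validator.py`): it reads the source and comparable tables as CSV, writes the correctness, completeness, and comparison reports as JSON, and renders a pivot-table HTML file next to each output path. The snippet below is a minimal sketch of that path; `PATH_TO_CONFIG_FILE` is a placeholder for your own config file, and the example assumes the package is installed as shown above.

```
from pyspark.sql import SparkSession

from spark_validation.common.config import Config
from spark_validation.dataframe_validation.file_system_validator import CreateFSValidationDF

# Placeholder: point this to your own JSON or YAML config (Config.parse accepts both),
# e.g. one modelled on lib/test/spark_validation_tests/common/mock_data/config_example.json.
PATH_TO_CONFIG_FILE = "config_example.json"

spark_session = SparkSession.builder.getOrCreate()
with open(PATH_TO_CONFIG_FILE) as f:
    config = Config.parse(f)

# Runs the whole validation and writes the three report tables plus HTML pivot views.
CreateFSValidationDF.validate(spark_session, config)
```

The module can also be run as a script with `-c PATH_TO_CONFIG_FILE` (see `create_parser` and `init` in `file_system_validator.py`), which is how the integration tests drive it.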
-------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/general_handler_test.py: -------------------------------------------------------------------------------- 1 | """Module with general function tests for the GeneralDFHandler.""" 2 | import os 3 | import sys 4 | import unittest 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from spark_validation.common.config import Config 9 | from spark_validation.common.constants import Constants 10 | from spark_validation.dataframe_validation import file_system_validator 11 | from spark_validation.dataframe_validation import hive_validator 12 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 13 | from spark_validation_tests.common.pyspark_test import PySparkTest 14 | 15 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | 18 | class GeneralHandlerTest(PySparkTest): 19 | """Class with general function tests for the GeneralDFHandler.""" 20 | 21 | TEST_DATABASE_NAME = "test" 22 | 23 | def setUp(self): 24 | """Init test db for grid.""" 25 | self.spark.sql( 26 | "CREATE DATABASE IF NOT EXISTS {}".format( 27 | GeneralHandlerTest.TEST_DATABASE_NAME 28 | ) 29 | ) 30 | 31 | @classmethod 32 | def setUpClass(cls): 33 | """Init the shared values for the tests.""" 34 | super(GeneralHandlerTest, cls).setUpClass() 35 | cls.spark.sql( 36 | "CREATE DATABASE IF NOT EXISTS {}".format( 37 | GeneralHandlerTest.TEST_DATABASE_NAME 38 | ) 39 | ) 40 | 41 | cls.source_df = cls._create_source_df( 42 | PACKAGE_DIR + "/mock_data/data_sample.csv" 43 | ) 44 | cls.grid_diff_df = cls._create_source_df( 45 | PACKAGE_DIR + "/mock_data/data_sample_diff.csv" 46 | ) 47 | cls.grid_diff_2_df = cls._create_source_df( 48 | PACKAGE_DIR + "/mock_data/data_sample_diff_2.csv" 49 | ) 50 | 51 | @classmethod 52 | def _create_source_df(cls, csv_file): 53 | return ( 54 | cls.spark.read.option("delimiter", ",") 55 | .option("header", True) 56 | .option("inferSchema", True) 57 | .option("mode", "PERMISSIVE") 58 | .csv(csv_file) 59 | ) 60 | 61 | def test_grid_validator_process(self): 62 | """Integration test for rule set defined in mock config file.""" 63 | test_rules = { 64 | "CODE": """CODE is not null and CODE != '' and CODE != 'null'""", 65 | "NAME": """NAME is not null and NAME != '' and NAME != 'null'""", 66 | "GENERAL_ID": ( 67 | "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and" 68 | " CHAR_LENGTH(GENERAL_ID) < 4" 69 | ), 70 | "ULTIMATE_PARENT_ID": """ULTIMATE_PARENT_ID is not null""", 71 | "PARENT_ID": """PARENT_ID is not null""", 72 | } 73 | 74 | parent_rules = [ 75 | ("GENERAL_ID", "ULTIMATE_PARENT_ID"), 76 | ("GENERAL_ID", "PARENT_ID"), 77 | ] 78 | 79 | completeness_rules = {"OVER_ALL_COUNT": """OVER_ALL_COUNT <= 7"""} 80 | 81 | validator = DataframeValidator( 82 | spark=self.spark, 83 | source_df=self.source_df, 84 | id_col_name="GENERAL_ID", 85 | correctness_rules_dict=test_rules, 86 | parent_children_validation_pairs=parent_rules, 87 | completeness_rules_dic=completeness_rules, 88 | comparable_dfs_list=[ 89 | ("diff_df", self.grid_diff_df), 90 | ("diff_df_2", self.grid_diff_2_df), 91 | ], 92 | ) 93 | 94 | processed_df = validator.process() 95 | 96 | comparable_df = validator.compare() 97 | 98 | self.assertEqual(processed_df.count(), 8) 99 | self.assertEqual(comparable_df.count(), 2) 100 | 101 | def test_integration_hive_validator(self): 102 | """Integration test for rule set defined in mock config file.""" 103 | with open(PACKAGE_DIR + 
"/mock_data/config_example.yaml") as f: 104 | config = Config.parse(f) 105 | 106 | self.source_df.write.saveAsTable(config.source_df) 107 | self.grid_diff_df.write.saveAsTable(config.comparable_dfs_list[0]) 108 | self.grid_diff_2_df.write.saveAsTable(config.comparable_dfs_list[1]) 109 | 110 | source_read_df = self.spark.table(config.source_df) 111 | comparable_dfs_list = [ 112 | (t, self.spark.table(t)) for t in config.comparable_dfs_list 113 | ] 114 | 115 | validator = DataframeValidator( 116 | spark=self.spark, 117 | source_df=source_read_df, 118 | id_col_name=config.id_col_name, 119 | correctness_rules_dict=config.correctness_rules_dict, 120 | parent_children_validation_pairs=config.parent_children_validation_pairs, 121 | completeness_rules_dic=config.completeness_rules_dic, 122 | comparable_dfs_list=comparable_dfs_list, 123 | ) 124 | 125 | processed_df = validator.process() 126 | comparable_df = validator.compare() 127 | 128 | self.assertEqual(processed_df.count(), 8) 129 | self.assertEqual(comparable_df.count(), 2) 130 | 131 | self.spark.sparkContext.addFile(PACKAGE_DIR + "/mock_data/config_example.json") 132 | sys.argv = ["example.py", "-c", PACKAGE_DIR + "/mock_data/config_example.json"] 133 | 134 | hive_validator.init() 135 | 136 | correctness_table = self.spark.table(config.output_correctness_table) 137 | completeness_table = self.spark.table(config.output_completeness_table) 138 | comparison_table = self.spark.table(config.output_comparison_table) 139 | 140 | # Correctness validations. 141 | _is_error_name = Constants.IS_ERROR_COL + "NAME" + Constants.SUM_REPORT_SUFFIX 142 | _sum_errors_col = ( 143 | Constants.IS_ERROR_COL 144 | + Constants.ROW_ERROR_SUFFIX 145 | + Constants.SUM_REPORT_SUFFIX 146 | ) 147 | self.assertEqual(correctness_table.count(), 8) 148 | 149 | self.assertEqual( 150 | correctness_table.select(_is_error_name).first()[_is_error_name], 4 151 | ) 152 | self.assertEqual( 153 | correctness_table.select(_sum_errors_col).first()[_sum_errors_col], 5 154 | ) 155 | 156 | # Completeness validations. 157 | _is_error_count_over_all = Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL 158 | self.assertEqual( 159 | completeness_table.select(_is_error_count_over_all).first()[ 160 | _is_error_count_over_all 161 | ], 162 | 1, 163 | ) 164 | 165 | # Comparison validations. 
166 | 167 | self.assertEqual( 168 | comparison_table.filter( 169 | F.col(Constants.REPORT_DF_COL) == config.comparable_dfs_list[0] 170 | ) 171 | .select(Constants.MISSING_VALS_RIGHT_COL) 172 | .first()[Constants.MISSING_VALS_RIGHT_COL], 173 | "6,7", 174 | ) 175 | 176 | self.assertEqual( 177 | comparison_table.filter( 178 | F.col(Constants.REPORT_DF_COL) == config.comparable_dfs_list[1] 179 | ) 180 | .select(Constants.MISSING_COLS_LEFT_COL) 181 | .first()[Constants.MISSING_COLS_LEFT_COL], 182 | "GENERAL_ID:int", 183 | ) 184 | 185 | def test_integration_fs_validator(self): 186 | """Integration test for rule set defined in mock config file.""" 187 | with open(PACKAGE_DIR + "/mock_data/config_example_local.json") as f: 188 | config = Config.parse(f) 189 | 190 | config.source_df = PACKAGE_DIR + config.source_df 191 | config.output_correctness_table = PACKAGE_DIR + config.output_completeness_table 192 | config.output_completeness_table = ( 193 | PACKAGE_DIR + config.output_completeness_table 194 | ) 195 | config.output_comparison_table = PACKAGE_DIR + config.output_comparison_table 196 | config.comparable_dfs_list = list( 197 | map(lambda x: PACKAGE_DIR + x, config.comparable_dfs_list) 198 | ) 199 | 200 | self.spark.sparkContext.addFile( 201 | PACKAGE_DIR + "/mock_data/config_example_local.json" 202 | ) 203 | sys.argv = [ 204 | "example.py", 205 | "-c", 206 | PACKAGE_DIR + "/mock_data/config_example_local.json", 207 | ] 208 | 209 | file_system_validator.init() 210 | 211 | correctness_table = self.spark.read.json( 212 | "/tmp/mock_data/output/data_sample_test_correctness" 213 | ) 214 | completeness_table = self.spark.read.json( 215 | "/tmp/mock_data/output/data_sample_test_completeness" 216 | ) 217 | comparison_table = self.spark.read.json( 218 | "/tmp/mock_data/output/data_sample_test_comparison" 219 | ) 220 | 221 | self.assertTrue(correctness_table.count() >= 8) 222 | self.assertTrue(completeness_table.count() >= 1) 223 | self.assertTrue(comparison_table.count() >= 1) 224 | 225 | def test_sample_case_integration_fs_validator(self): 226 | """Integration test for rule set defined in mock config file.""" 227 | with open(PACKAGE_DIR + "/mock_data/config_family_fs.json") as f: 228 | config = Config.parse(f) 229 | 230 | config.source_df = PACKAGE_DIR + config.source_df 231 | config.output_correctness_table = PACKAGE_DIR + config.output_completeness_table 232 | config.output_completeness_table = ( 233 | PACKAGE_DIR + config.output_completeness_table 234 | ) 235 | config.output_comparison_table = PACKAGE_DIR + config.output_comparison_table 236 | config.comparable_dfs_list = list( 237 | map(lambda x: PACKAGE_DIR + x, config.comparable_dfs_list) 238 | ) 239 | 240 | self.spark.sparkContext.addFile( 241 | PACKAGE_DIR + "/mock_data/config_family_fs.json" 242 | ) 243 | sys.argv = [ 244 | "example.py", 245 | "-c", 246 | PACKAGE_DIR + "/mock_data/config_family_fs.json", 247 | ] 248 | 249 | file_system_validator.init() 250 | 251 | correctness_table = self.spark.read.json( 252 | "/tmp/mock_data/output/family_sample_test_correctness" 253 | ) 254 | completeness_table = self.spark.read.json( 255 | "/tmp/mock_data/output/family_sample_test_completeness" 256 | ) 257 | comparison_table = self.spark.read.json( 258 | "/tmp/mock_data/output/family_sample_test_comparison" 259 | ) 260 | 261 | self.assertTrue(correctness_table.count() >= 6) 262 | self.assertTrue(completeness_table.count() >= 1) 263 | self.assertTrue(comparison_table.count() >= 1) 264 | 265 | @classmethod 266 | def tearDownClass(cls): 267 | """Remove spark 
tables for testing.""" 268 | cls.spark.sql( 269 | "drop database if exists {} cascade".format( 270 | GeneralHandlerTest.TEST_DATABASE_NAME 271 | ) 272 | ).collect() 273 | 274 | def tearDown(self): 275 | """Remove test databases and tables after every test.""" 276 | self.spark.sql( 277 | "drop database if exists {} cascade".format( 278 | GeneralHandlerTest.TEST_DATABASE_NAME 279 | ) 280 | ).collect() 281 | 282 | 283 | if __name__ == "__main__": 284 | unittest.main() 285 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/general_validator.py: -------------------------------------------------------------------------------- 1 | """This module exposes a general interface with common df functions across all the pipelines. 2 | 3 | This function could be extensible to create specific handlers. For instance: PandasDataHandler(GeneralDFHandler). 4 | 5 | """ 6 | import datetime 7 | from abc import ABC 8 | from functools import reduce 9 | 10 | from pyspark.sql import Window 11 | from pyspark.sql import functions as F 12 | from pyspark.sql.dataframe import DataFrame 13 | 14 | from spark_validation.common.constants import Constants 15 | 16 | 17 | class GeneralDFValidator(ABC): 18 | """Class with general handlers functions.""" 19 | 20 | def transform(self, f): 21 | """Wrap the transform spark function non available for python.""" 22 | return f(self) 23 | 24 | @staticmethod 25 | def rename_cols(df, transformation_map): 26 | """Rename a set of spark columns within a df using a transformation_map dictionary. 27 | 28 | Example: 29 | df: 30 | +--------+--------+ 31 | | col1 | col2 | 32 | |--------+--------+ 33 | | 15 | 76 | 34 | | 30 | 97 | 35 | +--------+--------+ 36 | transformation_map : 37 | {col1: id, col2: code} 38 | return: 39 | +--------+--------+ 40 | | id | code | 41 | |--------+--------+ 42 | | 15 | 76 | 43 | | 30 | 97 | 44 | +--------+--------+ 45 | """ 46 | return reduce( 47 | lambda internal_df, col_name: internal_df.withColumnRenamed( 48 | col_name, transformation_map[col_name] 49 | ), 50 | transformation_map.keys(), 51 | df, 52 | ) 53 | 54 | @staticmethod 55 | def combine_dataframes(sources): 56 | """Join multiple dataframes using the spark union function.""" 57 | return reduce(lambda x, y: x.union(y), sources) 58 | 59 | @staticmethod 60 | def join_cols_with_all_parents(df, parent_validations_pairs): 61 | """Join df with all parent ids to obtain incorrect ids.""" 62 | for col, parent in parent_validations_pairs: 63 | df = GeneralDFValidator.join_grid_with_parent(df, col, parent) 64 | return df 65 | 66 | @staticmethod 67 | def join_grid_with_parent(df, id_col, parent_id_col): 68 | """Join a df with a single parent id to obtain incorrect ids.""" 69 | # Renaming col to avoid spark duplicate cols issues. 70 | _ids_renamed_org_id = id_col + "_" + parent_id_col 71 | parent_ids_df = df.select(id_col).withColumnRenamed(id_col, _ids_renamed_org_id) 72 | 73 | self_joined_df = df.join( 74 | parent_ids_df, 75 | df[parent_id_col] == parent_ids_df[_ids_renamed_org_id], 76 | "left", 77 | ) 78 | 79 | return self_joined_df 80 | 81 | @staticmethod 82 | def _validate_parent_id(df, id_col, parent_id_col): 83 | # Generating proper parent id validation column obtained after join_grid_with_parent = prefix + col_name. 
84 | _ids_renamed_org_id = id_col + "_" + parent_id_col 85 | return df.withColumn( 86 | Constants.IS_ERROR_COL + _ids_renamed_org_id, 87 | F.when( 88 | ( 89 | (F.col(_ids_renamed_org_id).isNotNull()) 90 | | (F.col(parent_id_col).isNull()) 91 | ), 92 | 0, 93 | ).otherwise(1), 94 | ) 95 | 96 | @staticmethod 97 | def add_unique_error(df, id_col, unique_cols): 98 | """ Adding deduplication validation.""" 99 | _unique_cols_list = list(map(lambda c: F.col(c + "_str"), unique_cols)) 100 | _w = Window.partitionBy(Constants.UNIQUE_HASH).orderBy(F.col(id_col).asc()) 101 | 102 | return ( 103 | reduce( 104 | lambda internal_df, col_name: internal_df.withColumn( 105 | col_name + "_str", 106 | ( 107 | F.when( 108 | F.col(col_name).isNotNull(), 109 | F.lower(F.col(col_name).cast("string")), 110 | ).otherwise("") 111 | ), 112 | ), 113 | unique_cols, 114 | df, 115 | ) 116 | .withColumn(Constants.UNIQUE_HASH, F.concat(*_unique_cols_list)) 117 | .withColumn(Constants.COUNT_HASH, F.count(id_col).over(_w)) 118 | .withColumn( 119 | Constants.IS_ERROR_COL + Constants.UNIQUE_HASH, 120 | F.when(F.col(Constants.COUNT_HASH) > 1, 1).otherwise(0), 121 | ) 122 | .orderBy(F.col(id_col).asc()) 123 | ) 124 | 125 | @staticmethod 126 | def build_correctness_report_df(processed_df, validated_cols): 127 | """Build a report df computing column errors. 128 | 129 | 1. Sum of errors per column. 130 | 2. Add an over all row count. 131 | 3. Add a time stamp to this dataframe. 132 | """ 133 | windows_errors = Window.partitionBy(Constants.DATE_TIME_REPORT_COL) 134 | report_df = reduce( 135 | lambda internal_df, col_name: internal_df.transform( 136 | lambda df: df.withColumn( 137 | col_name + Constants.SUM_REPORT_SUFFIX, 138 | F.sum(col_name).over(windows_errors), 139 | ) 140 | ), 141 | validated_cols, 142 | processed_df.withColumn( 143 | Constants.DATE_TIME_REPORT_COL, F.lit(datetime.datetime.now()) 144 | ), 145 | ).withColumn( 146 | Constants.OVER_ALL_COUNT_COL, 147 | F.count(Constants.DATE_TIME_REPORT_COL).over(windows_errors), 148 | ) 149 | 150 | return report_df 151 | 152 | @staticmethod 153 | def build_computed_rules_correctness_df(processed_df, rules_map): 154 | """Build a dataframe with some rules computed. 155 | 156 | :param processed_df: input dataframe. 157 | :param rules_map: a map of rules with format {col_name = spark_sql_expr} 158 | :return: a dataframe with a new column IS_ERROR (1 - ERROR or 0 - NO ERROR) per column on the map. 159 | """ 160 | 161 | return reduce( 162 | lambda internal_df, col_name: internal_df.transform( 163 | lambda df: GeneralDFValidator._compute_col_val_correctness( 164 | df, col_name, rules_map[col_name] 165 | ) 166 | ), 167 | rules_map.keys(), 168 | processed_df, 169 | ) 170 | 171 | @staticmethod 172 | def build_correctness_df( 173 | processed_df, validation_rules_map, parent_validations_pairs 174 | ): 175 | """Build correctness df. 176 | 177 | 1. validate all the rules per column. 178 | 2. return df with error columns. This column will have the following schema: 179 | col_name = Constants.IS_ERROR_COL + col_name. 180 | value = 1 when error, 0 when column is clean. 181 | 3. Add a column with Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX representing error on any 182 | column of the row. 
183 | """ 184 | _list_correctness_cols = list(validation_rules_map.keys()) 185 | _list_cols_parent_validation = list( 186 | set([pair[0] + "_" + pair[1] for pair in parent_validations_pairs]) 187 | ) 188 | _list_cols_parent_cols = list( 189 | set([pair[0] for pair in parent_validations_pairs]) 190 | ) 191 | _error_cols_correctness = list( 192 | map(lambda c: Constants.IS_ERROR_COL + c, validation_rules_map.keys(),) 193 | ) 194 | _error_cols_parents_pairs = list( 195 | map(lambda c: Constants.IS_ERROR_COL + c, _list_cols_parent_validation,) 196 | ) 197 | _list_general_rows_errors = list( 198 | [ 199 | Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX, 200 | Constants.IS_ERROR_COL + Constants.UNIQUE_HASH, 201 | ] 202 | ) 203 | 204 | final_select_cols = ( 205 | _list_correctness_cols 206 | + _list_cols_parent_validation 207 | + _list_cols_parent_cols 208 | + _error_cols_correctness 209 | + _error_cols_parents_pairs 210 | + _list_general_rows_errors 211 | ) 212 | 213 | validate_expr = GeneralDFValidator._generate_validate_errors_expr( 214 | _list_correctness_cols + _list_cols_parent_validation 215 | ) 216 | validated_df = GeneralDFValidator.build_computed_rules_correctness_df( 217 | processed_df, validation_rules_map 218 | ) 219 | 220 | validated_df = ( 221 | reduce( 222 | lambda internal_df, pair_parent: internal_df.transform( 223 | lambda df: GeneralDFValidator._validate_parent_id( 224 | df, pair_parent[0], pair_parent[1] 225 | ) 226 | ), 227 | parent_validations_pairs, 228 | validated_df, 229 | ) 230 | .withColumn( 231 | Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX, 232 | F.when(F.expr(validate_expr), 1).otherwise(0), 233 | ) 234 | .select(final_select_cols) 235 | ) 236 | return validated_df 237 | 238 | @staticmethod 239 | def _compute_col_val_correctness(df, col_name, col_rule): 240 | # Error column name is generated with error_prefix + col_name. 241 | return df.withColumn( 242 | Constants.IS_ERROR_COL + col_name, F.when(F.expr(col_rule), 0).otherwise(1), 243 | ) 244 | 245 | @staticmethod 246 | def _generate_validate_errors_expr(list_validation_cols): 247 | """Generate SQL exp that validates that there a not error (col_val == 1) on any validation column.""" 248 | return """{}{} == 1 {}""".format( 249 | Constants.IS_ERROR_COL, 250 | list_validation_cols[0], 251 | "".join( 252 | list( 253 | map( 254 | lambda x: " or {}{} == 1".format(Constants.IS_ERROR_COL, x), 255 | list_validation_cols[1:], 256 | ) 257 | ) 258 | ), 259 | ) 260 | 261 | @staticmethod 262 | def compared_with_related_dfs(source_df, id_col_name, map_related_dfs): 263 | """Compare source df with related dfs. 264 | 265 | Obtaining a list per related dfs: 266 | 1. Columns present in source not in related. 267 | 2. Columns present in related in source. 268 | When both previous are empty: 269 | 3. Row values present in source not equal in related. 270 | 4. Row values present in related not equal in source. 
271 | """ 272 | comparison_results = [] 273 | for k, df in map_related_dfs: 274 | missing_cols_right = GeneralDFValidator._missing_values_between_schemas( 275 | source_df.schema, df.schema 276 | ) 277 | missing_cols_left = GeneralDFValidator._missing_values_between_schemas( 278 | df.schema, source_df.schema 279 | ) 280 | 281 | missing_vals_right = GeneralDFValidator._list_different_rows_ids_between_dfs( 282 | source_df, id_col_name, df, missing_cols_right 283 | ) 284 | 285 | missing_vals_left = GeneralDFValidator._list_different_rows_ids_between_dfs( 286 | df, id_col_name, source_df, missing_cols_left 287 | ) 288 | 289 | comparison_results.append( 290 | ( 291 | k, 292 | ",".join(missing_cols_right), 293 | ",".join(missing_cols_left), 294 | ",".join(missing_vals_right), 295 | ",".join(missing_vals_left), 296 | ) 297 | ) 298 | 299 | return comparison_results 300 | 301 | @staticmethod 302 | def _missing_values_between_schemas(schema1, schema2): 303 | return list( 304 | set(list(map(lambda c: c.name + ":" + c.dataType.simpleString(), schema1))) 305 | - set( 306 | list(map(lambda c: c.name + ":" + c.dataType.simpleString(), schema2)) 307 | ) 308 | ) 309 | 310 | @staticmethod 311 | def _list_different_rows_ids_between_dfs( 312 | source_df, id_col_name, related_df, schema_correct 313 | ): 314 | return ( 315 | list( 316 | map( 317 | lambda col: col.__getitem__(id_col_name), 318 | source_df.subtract(related_df).select(id_col_name).collect(), 319 | ) 320 | ) 321 | if not schema_correct 322 | else [] 323 | ) 324 | 325 | 326 | DataFrame.transform = GeneralDFValidator.transform 327 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | init-hook='import sys; sys.path.append("./lib/") 3 | 4 | # A comma-separated list of package or module names from where C extensions may 5 | # be loaded. Extensions are loading into the active Python interpreter and may 6 | # run arbitrary code. 7 | extension-pkg-whitelist= 8 | 9 | # Add files or directories to the blacklist. They should be base names, not 10 | # paths. 11 | ignore=CVS 12 | 13 | # Add files or directories matching the regex patterns to the blacklist. The 14 | # regex matches against base names, not paths. 15 | ignore-patterns= 16 | 17 | # Python code to execute, usually for sys.path manipulation such as 18 | # pygtk.require(). 19 | #init-hook= 20 | 21 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 22 | # number of processors available to use. 23 | jobs=1 24 | 25 | # Control the amount of potential inferred values when inferring a single 26 | # object. This can help the performance when dealing with large functions or 27 | # complex, nested conditions. 28 | limit-inference-results=100 29 | 30 | # List of plugins (as comma separated values of python module names) to load, 31 | # usually to register additional checkers. 32 | load-plugins= 33 | 34 | # Pickle collected data for later comparisons. 35 | persistent=yes 36 | 37 | # Specify a configuration file. 38 | #rcfile= 39 | 40 | # When enabled, pylint would attempt to guess common misconfiguration and emit 41 | # user-friendly hints instead of false-positive error messages. 42 | suggestion-mode=yes 43 | 44 | # Allow loading of arbitrary C extensions. Extensions are imported into the 45 | # active Python interpreter and may run arbitrary code. 
46 | unsafe-load-any-extension=no 47 | 48 | 49 | [MESSAGES CONTROL] 50 | 51 | # Only show warnings with the listed confidence levels. Leave empty to show 52 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 53 | confidence= 54 | 55 | # Disable the message, report, category or checker with the given id(s). You 56 | # can either give multiple identifiers separated by comma (,) or put this 57 | # option multiple times (only on the command line, not in the configuration 58 | # file where it should appear only once). You can also use "--disable=all" to 59 | # disable everything first and then reenable specific checks. For example, if 60 | # you want to run only the similarities checker, you can use "--disable=all 61 | # --enable=similarities". If you want to run only the classes checker, but have 62 | # no Warning level messages displayed, use "--disable=all --enable=classes 63 | # --disable=W". 64 | disable=print-statement, 65 | parameter-unpacking, 66 | unpacking-in-except, 67 | old-raise-syntax, 68 | backtick, 69 | long-suffix, 70 | old-ne-operator, 71 | old-octal-literal, 72 | import-star-module-level, 73 | non-ascii-bytes-literal, 74 | raw-checker-failed, 75 | bad-inline-option, 76 | locally-disabled, 77 | file-ignored, 78 | suppressed-message, 79 | useless-suppression, 80 | deprecated-pragma, 81 | use-symbolic-message-instead, 82 | apply-builtin, 83 | basestring-builtin, 84 | buffer-builtin, 85 | cmp-builtin, 86 | coerce-builtin, 87 | execfile-builtin, 88 | file-builtin, 89 | long-builtin, 90 | raw_input-builtin, 91 | reduce-builtin, 92 | standarderror-builtin, 93 | unicode-builtin, 94 | xrange-builtin, 95 | coerce-method, 96 | delslice-method, 97 | getslice-method, 98 | setslice-method, 99 | no-absolute-import, 100 | old-division, 101 | dict-iter-method, 102 | dict-view-method, 103 | next-method-called, 104 | metaclass-assignment, 105 | indexing-exception, 106 | raising-string, 107 | reload-builtin, 108 | oct-method, 109 | hex-method, 110 | nonzero-method, 111 | cmp-method, 112 | input-builtin, 113 | round-builtin, 114 | intern-builtin, 115 | unichr-builtin, 116 | map-builtin-not-iterating, 117 | zip-builtin-not-iterating, 118 | range-builtin-not-iterating, 119 | filter-builtin-not-iterating, 120 | using-cmp-argument, 121 | eq-without-hash, 122 | div-method, 123 | idiv-method, 124 | rdiv-method, 125 | exception-message-attribute, 126 | invalid-str-codec, 127 | sys-max-int, 128 | bad-python3-import, 129 | deprecated-string-function, 130 | deprecated-str-translate-call, 131 | deprecated-itertools-function, 132 | deprecated-types-field, 133 | next-method-defined, 134 | dict-items-not-iterating, 135 | dict-keys-not-iterating, 136 | dict-values-not-iterating, 137 | deprecated-operator-function, 138 | deprecated-urllib-function, 139 | xreadlines-attribute, 140 | deprecated-sys-function, 141 | exception-escape, 142 | comprehension-escape 143 | 144 | # Enable the message, report, category or checker with the given id(s). You can 145 | # either give multiple identifier separated by comma (,) or put this option 146 | # multiple time (only on the command line, not in the configuration file where 147 | # it should appear only once). See also the "--disable" option for examples. 148 | enable=c-extension-no-member 149 | 150 | 151 | [REPORTS] 152 | 153 | # Python expression which should return a score less than or equal to 10. 
You 154 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 155 | # which contain the number of messages in each category, as well as 'statement' 156 | # which is the total number of statements analyzed. This score is used by the 157 | # global evaluation report (RP0004). 158 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 159 | 160 | # Template used to display messages. This is a python new-style format string 161 | # used to format the message information. See doc for all details. 162 | #msg-template= 163 | 164 | # Set the output format. Available formats are text, parseable, colorized, json 165 | # and msvs (visual studio). You can also give a reporter class, e.g. 166 | # mypackage.mymodule.MyReporterClass. 167 | output-format=text 168 | 169 | # Tells whether to display a full report or only the messages. 170 | reports=no 171 | 172 | # Activate the evaluation score. 173 | score=yes 174 | 175 | 176 | [REFACTORING] 177 | 178 | # Maximum number of nested blocks for function / method body 179 | max-nested-blocks=5 180 | 181 | # Complete name of functions that never returns. When checking for 182 | # inconsistent-return-statements if a never returning function is called then 183 | # it will be considered as an explicit return statement and no message will be 184 | # printed. 185 | never-returning-functions=sys.exit 186 | 187 | 188 | [LOGGING] 189 | 190 | # Format style used to check logging format string. `old` means using % 191 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 192 | logging-format-style=old 193 | 194 | # Logging modules to check that the string format arguments are in logging 195 | # function parameter format. 196 | logging-modules=logging 197 | 198 | 199 | [SPELLING] 200 | 201 | # Limits count of emitted suggestions for spelling mistakes. 202 | max-spelling-suggestions=4 203 | 204 | # Spelling dictionary name. Available dictionaries: none. To make it work, 205 | # install the python-enchant package. 206 | spelling-dict= 207 | 208 | # List of comma separated words that should not be checked. 209 | spelling-ignore-words= 210 | 211 | # A path to a file that contains the private dictionary; one word per line. 212 | spelling-private-dict-file= 213 | 214 | # Tells whether to store unknown words to the private dictionary (see the 215 | # --spelling-private-dict-file option) instead of raising a message. 216 | spelling-store-unknown-words=no 217 | 218 | 219 | [MISCELLANEOUS] 220 | 221 | # List of note tags to take in consideration, separated by a comma. 222 | notes=FIXME, 223 | XXX, 224 | TODO 225 | 226 | 227 | [TYPECHECK] 228 | 229 | # List of decorators that produce context managers, such as 230 | # contextlib.contextmanager. Add to this list to register other decorators that 231 | # produce valid context managers. 232 | contextmanager-decorators=contextlib.contextmanager 233 | 234 | # List of members which are set dynamically and missed by pylint inference 235 | # system, and so shouldn't trigger E1101 when accessed. Python regular 236 | # expressions are accepted. 237 | generated-members= 238 | 239 | # Tells whether missing members accessed in mixin class should be ignored. A 240 | # mixin class is detected if its name ends with "mixin" (case insensitive). 241 | ignore-mixin-members=yes 242 | 243 | # Tells whether to warn about missing members when the owner of the attribute 244 | # is inferred to be None. 
245 | ignore-none=yes 246 | 247 | # This flag controls whether pylint should warn about no-member and similar 248 | # checks whenever an opaque object is returned when inferring. The inference 249 | # can return multiple potential results while evaluating a Python object, but 250 | # some branches might not be evaluated, which results in partial inference. In 251 | # that case, it might be useful to still emit no-member and other checks for 252 | # the rest of the inferred objects. 253 | ignore-on-opaque-inference=yes 254 | 255 | # List of class names for which member attributes should not be checked (useful 256 | # for classes with dynamically set attributes). This supports the use of 257 | # qualified names. 258 | ignored-classes=optparse.Values,thread._local,_thread._local 259 | 260 | # List of module names for which member attributes should not be checked 261 | # (useful for modules/projects where namespaces are manipulated during runtime 262 | # and thus existing member attributes cannot be deduced by static analysis). It 263 | # supports qualified module names, as well as Unix pattern matching. 264 | ignored-modules= 265 | 266 | # Show a hint with possible names when a member name was not found. The aspect 267 | # of finding the hint is based on edit distance. 268 | missing-member-hint=yes 269 | 270 | # The minimum edit distance a name should have in order to be considered a 271 | # similar match for a missing member name. 272 | missing-member-hint-distance=1 273 | 274 | # The total number of similar names that should be taken in consideration when 275 | # showing a hint for a missing member. 276 | missing-member-max-choices=1 277 | 278 | # List of decorators that change the signature of a decorated function. 279 | signature-mutators= 280 | 281 | 282 | [VARIABLES] 283 | 284 | # List of additional names supposed to be defined in builtins. Remember that 285 | # you should avoid defining new builtins when possible. 286 | additional-builtins= 287 | 288 | # Tells whether unused global variables should be treated as a violation. 289 | allow-global-unused-variables=yes 290 | 291 | # List of strings which can identify a callback function by name. A callback 292 | # name must start or end with one of those strings. 293 | callbacks=cb_, 294 | _cb 295 | 296 | # A regular expression matching the name of dummy variables (i.e. expected to 297 | # not be used). 298 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 299 | 300 | # Argument names that match this expression will be ignored. Default to name 301 | # with leading underscore. 302 | ignored-argument-names=_.*|^ignored_|^unused_ 303 | 304 | # Tells whether we should check for unused import in __init__ files. 305 | init-import=no 306 | 307 | # List of qualified module names which can have objects that can redefine 308 | # builtins. 309 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 310 | 311 | 312 | [FORMAT] 313 | 314 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 315 | expected-line-ending-format= 316 | 317 | # Regexp for a line that is allowed to be longer than the limit. 318 | ignore-long-lines=^\s*(# )??$ 319 | 320 | # Number of spaces of indent required inside a hanging or continued line. 321 | indent-after-paren=4 322 | 323 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 324 | # tab). 325 | indent-string=' ' 326 | 327 | # Maximum number of characters on a single line. 
328 | max-line-length=100 329 | 330 | # Maximum number of lines in a module. 331 | max-module-lines=1000 332 | 333 | # List of optional constructs for which whitespace checking is disabled. `dict- 334 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 335 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 336 | # `empty-line` allows space-only lines. 337 | no-space-check=trailing-comma, 338 | dict-separator 339 | 340 | # Allow the body of a class to be on the same line as the declaration if body 341 | # contains single statement. 342 | single-line-class-stmt=no 343 | 344 | # Allow the body of an if to be on the same line as the test if there is no 345 | # else. 346 | single-line-if-stmt=no 347 | 348 | 349 | [SIMILARITIES] 350 | 351 | # Ignore comments when computing similarities. 352 | ignore-comments=yes 353 | 354 | # Ignore docstrings when computing similarities. 355 | ignore-docstrings=yes 356 | 357 | # Ignore imports when computing similarities. 358 | ignore-imports=no 359 | 360 | # Minimum lines number of a similarity. 361 | min-similarity-lines=4 362 | 363 | 364 | [BASIC] 365 | 366 | # Naming style matching correct argument names. 367 | argument-naming-style=snake_case 368 | 369 | # Regular expression matching correct argument names. Overrides argument- 370 | # naming-style. 371 | #argument-rgx= 372 | 373 | # Naming style matching correct attribute names. 374 | attr-naming-style=snake_case 375 | 376 | # Regular expression matching correct attribute names. Overrides attr-naming- 377 | # style. 378 | #attr-rgx= 379 | 380 | # Bad variable names which should always be refused, separated by a comma. 381 | bad-names=foo, 382 | bar, 383 | baz, 384 | toto, 385 | tutu, 386 | tata 387 | 388 | # Naming style matching correct class attribute names. 389 | class-attribute-naming-style=any 390 | 391 | # Regular expression matching correct class attribute names. Overrides class- 392 | # attribute-naming-style. 393 | #class-attribute-rgx= 394 | 395 | # Naming style matching correct class names. 396 | class-naming-style=PascalCase 397 | 398 | # Regular expression matching correct class names. Overrides class-naming- 399 | # style. 400 | #class-rgx= 401 | 402 | # Naming style matching correct constant names. 403 | const-naming-style=UPPER_CASE 404 | 405 | # Regular expression matching correct constant names. Overrides const-naming- 406 | # style. 407 | #const-rgx= 408 | 409 | # Minimum line length for functions/classes that require docstrings, shorter 410 | # ones are exempt. 411 | docstring-min-length=-1 412 | 413 | # Naming style matching correct function names. 414 | function-naming-style=snake_case 415 | 416 | # Regular expression matching correct function names. Overrides function- 417 | # naming-style. 418 | #function-rgx= 419 | 420 | # Good variable names which should always be accepted, separated by a comma. 421 | good-names=i, 422 | j, 423 | k, 424 | ex, 425 | Run, 426 | _ 427 | 428 | # Include a hint for the correct naming format with invalid-name. 429 | include-naming-hint=no 430 | 431 | # Naming style matching correct inline iteration names. 432 | inlinevar-naming-style=any 433 | 434 | # Regular expression matching correct inline iteration names. Overrides 435 | # inlinevar-naming-style. 436 | #inlinevar-rgx= 437 | 438 | # Naming style matching correct method names. 439 | method-naming-style=snake_case 440 | 441 | # Regular expression matching correct method names. Overrides method-naming- 442 | # style. 
443 | #method-rgx= 444 | 445 | # Naming style matching correct module names. 446 | module-naming-style=snake_case 447 | 448 | # Regular expression matching correct module names. Overrides module-naming- 449 | # style. 450 | #module-rgx= 451 | 452 | # Colon-delimited sets of names that determine each other's naming style when 453 | # the name regexes allow several styles. 454 | name-group= 455 | 456 | # Regular expression which should only match function or class names that do 457 | # not require a docstring. 458 | no-docstring-rgx=^_ 459 | 460 | # List of decorators that produce properties, such as abc.abstractproperty. Add 461 | # to this list to register other decorators that produce valid properties. 462 | # These decorators are taken in consideration only for invalid-name. 463 | property-classes=abc.abstractproperty 464 | 465 | # Naming style matching correct variable names. 466 | variable-naming-style=snake_case 467 | 468 | # Regular expression matching correct variable names. Overrides variable- 469 | # naming-style. 470 | #variable-rgx= 471 | 472 | 473 | [STRING] 474 | 475 | # This flag controls whether the implicit-str-concat-in-sequence should 476 | # generate a warning on implicit string concatenation in sequences defined over 477 | # several lines. 478 | check-str-concat-over-line-jumps=no 479 | 480 | 481 | [IMPORTS] 482 | 483 | # List of modules that can be imported at any level, not just the top level 484 | # one. 485 | allow-any-import-level= 486 | 487 | # Allow wildcard imports from modules that define __all__. 488 | allow-wildcard-with-all=no 489 | 490 | # Analyse import fallback blocks. This can be used to support both Python 2 and 491 | # 3 compatible code, which means that the block might have code that exists 492 | # only in one or another interpreter, leading to false positives when analysed. 493 | analyse-fallback-blocks=no 494 | 495 | # Deprecated modules which should not be used, separated by a comma. 496 | deprecated-modules=optparse,tkinter.tix 497 | 498 | # Create a graph of external dependencies in the given file (report RP0402 must 499 | # not be disabled). 500 | ext-import-graph= 501 | 502 | # Create a graph of every (i.e. internal and external) dependencies in the 503 | # given file (report RP0402 must not be disabled). 504 | import-graph= 505 | 506 | # Create a graph of internal dependencies in the given file (report RP0402 must 507 | # not be disabled). 508 | int-import-graph= 509 | 510 | # Force import order to recognize a module as part of the standard 511 | # compatibility libraries. 512 | known-standard-library= 513 | 514 | # Force import order to recognize a module as part of a third party library. 515 | known-third-party=enchant 516 | 517 | # Couples of modules and preferred modules, separated by a comma. 518 | preferred-modules= 519 | 520 | 521 | [CLASSES] 522 | 523 | # List of method names used to declare (i.e. assign) instance attributes. 524 | defining-attr-methods=__init__, 525 | __new__, 526 | setUp, 527 | __post_init__ 528 | 529 | # List of member names, which should be excluded from the protected access 530 | # warning. 531 | exclude-protected=_asdict, 532 | _fields, 533 | _replace, 534 | _source, 535 | _make 536 | 537 | # List of valid names for the first argument in a class method. 538 | valid-classmethod-first-arg=cls 539 | 540 | # List of valid names for the first argument in a metaclass class method. 541 | valid-metaclass-classmethod-first-arg=cls 542 | 543 | 544 | [DESIGN] 545 | 546 | # Maximum number of arguments for function / method. 
547 | max-args=5 548 | 549 | # Maximum number of attributes for a class (see R0902). 550 | max-attributes=7 551 | 552 | # Maximum number of boolean expressions in an if statement (see R0916). 553 | max-bool-expr=5 554 | 555 | # Maximum number of branch for function / method body. 556 | max-branches=12 557 | 558 | # Maximum number of locals for function / method body. 559 | max-locals=15 560 | 561 | # Maximum number of parents for a class (see R0901). 562 | max-parents=7 563 | 564 | # Maximum number of public methods for a class (see R0904). 565 | max-public-methods=20 566 | 567 | # Maximum number of return / yield for function / method body. 568 | max-returns=6 569 | 570 | # Maximum number of statements in function / method body. 571 | max-statements=50 572 | 573 | # Minimum number of public methods for a class (see R0903). 574 | min-public-methods=2 575 | 576 | 577 | [EXCEPTIONS] 578 | 579 | # Exceptions that will emit a warning when being caught. Defaults to 580 | # "BaseException, Exception". 581 | overgeneral-exceptions=BaseException, 582 | Exception 583 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/main.8e11e6a5.chunk.js.map: -------------------------------------------------------------------------------- 1 | [generated webpack source map for main.8e11e6a5.chunk.js: minified mappings plus the embedded sources of the React validation UI (RoleEditor, ToolBar, App, serviceWorker, index.js), which build the validation config in the browser and POST it to api/validate; a sketch of that config follows below] 2 | --------------------------------------------------------------------------------
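For reference, here is a minimal sketch, in Python so it stays self-contained, of the validation config that the RoleEditor referenced above assembles and POSTs to api/validate. The source_table block and compare_related_tables_list are copied from the template in the embedded RoleEditor source; the three rule lists are illustrative placeholders for what a user would normally build in the query editor.

import json

# Hypothetical payload mirroring the RoleEditor template; the rule entries are
# invented for illustration, everything else is taken from the embedded source.
config = {
    "source_table": {
        "name": "mock_data/family_sample.csv",
        "id_column": "ID",
        "output_correctness_table": "/tmp/mock_data/output/family_sample_test_correctness",
        "output_completeness_table": "/tmp/mock_data/output/family_sample_test_completeness",
        "output_comparison_table": "/tmp/mock_data/output/family_sample_test_comparison",
        "unique_column_group_values_per_table": ["ID", "NAME", "FAMILY_NAME", "PARENT"],
        "fuzzy_deduplication_distance": 0,
    },
    "correctness_validations": [
        {"column": "NAME", "rule": "NAME IS NOT NULL"},  # illustrative rule
    ],
    "completeness_validations": [
        {"column": "OVER_ALL_COUNT", "rule": "OVER_ALL_COUNT > 1"},  # illustrative threshold
    ],
    "parent_children_constraints": [
        {"column": "ID", "parent": "PARENT"},  # illustrative constraint
    ],
    "compare_related_tables_list": ["test.diff_df", "test.diff_df_2"],
}

print(json.dumps(config, indent=2))  # JSON of this shape is what the UI sends to api/validate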
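Looking back at the .pylintrc above, the evaluation option is the expression behind the global score report (RP0004). A quick worked example of that arithmetic, with invented message counts:

# Worked example of the .pylintrc `evaluation` expression; the counts are invented.
error, warning, refactor, convention = 1, 4, 3, 2
statement = 200  # total number of statements analysed

score = 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
print(round(score, 2))  # (5*1 + 4 + 3 + 2) = 14; 14 / 200 * 10 = 0.7; 10.0 - 0.7 = 9.3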