├── lib ├── __init__.py ├── MANIFEST.in ├── test │ ├── __init__.py │ └── spark_validation_tests │ │ ├── __init__.py │ │ └── common │ │ ├── __init__.py │ │ ├── mock_data │ │ ├── data_sample_diff_2.csv │ │ ├── family_sample_diff.csv │ │ ├── family_sample.csv │ │ ├── data_sample_diff.csv │ │ ├── data_sample.csv │ │ ├── config_example.json │ │ ├── config_example_local.json │ │ ├── config_family_fs.json │ │ ├── config_example_fs.json │ │ ├── config_example.yaml │ │ └── config_familiy_fs.yaml │ │ ├── pyspark_test.py │ │ └── general_handler_test.py ├── src │ └── spark_validation │ │ ├── __init__.py │ │ ├── common │ │ ├── __init__.py │ │ ├── validation_results.py │ │ ├── constants.py │ │ ├── config.py │ │ └── general_validator.py │ │ ├── dataframe_validation │ │ ├── __init__.py │ │ ├── hive_validator.py │ │ ├── dataframe_validator.py │ │ └── file_system_validator.py │ │ ├── static │ │ ├── robots.txt │ │ ├── favicon.ico │ │ ├── logo192.png │ │ ├── logo512.png │ │ ├── static │ │ │ ├── media │ │ │ │ └── logo.50e8e5ec.png │ │ │ ├── css │ │ │ │ ├── main.8e896e56.chunk.css │ │ │ │ └── main.8e896e56.chunk.css.map │ │ │ └── js │ │ │ │ ├── 2.e9c9302b.chunk.js.LICENSE.txt │ │ │ │ ├── runtime-main.6d8ceafa.js │ │ │ │ ├── runtime-main.6d8ceafa.js.map │ │ │ │ ├── main.8e11e6a5.chunk.js │ │ │ │ └── main.8e11e6a5.chunk.js.map │ │ ├── manifest.json │ │ ├── precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js │ │ ├── asset-manifest.json │ │ ├── service-worker.js │ │ └── index.html │ │ ├── version.py │ │ └── app.py ├── setup.cfg ├── requirements.txt └── setup.py ├── .dockerignore ├── tox.ini ├── requirements.txt ├── docker-compose.yml ├── .pre-commit-config.yaml ├── Dockerfile ├── .pypirc ├── .github └── workflows │ └── pythonpublish.yml ├── .travis.yml ├── license.md ├── .gitignore ├── README.md └── .pylintrc /lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | 
venv/ 2 | .idea 3 | .github 4 | .travis.yml 5 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | max-line-length = 120 4 | per-file-ignores = 5 | */__init__.py: D104 -------------------------------------------------------------------------------- /lib/src/spark_validation/static/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /lib/src/spark_validation/version.py: -------------------------------------------------------------------------------- 1 | """Version file for the spark_validation package.""" 2 | 3 | __version__ = "0.4" 4 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/favicon.ico -------------------------------------------------------------------------------- /lib/src/spark_validation/static/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/logo192.png -------------------------------------------------------------------------------- /lib/src/spark_validation/static/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/logo512.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==2.4.5 2 | Flask==1.1.2 3 | requests==2.23.0 4 | dataclasses==0.6 5 | numpy==1.18.3 6 | pandas==1.0.1 7 | pivottablejs==0.9.0 8 | ipython==7.13.0 9 | -------------------------------------------------------------------------------- /lib/setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = pytest 3 | 4 | [coverage:run] 5 | source = src 6 | command_line = -m pytest 7 | 8 | [easy-install] 9 | index-url = https://pypi.python.org/pypi -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/media/logo.50e8e5ec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ronald-smith-angel/owl-data-sanitizer/HEAD/lib/src/spark_validation/static/static/media/logo.50e8e5ec.png -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample_diff_2.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity 4 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/family_sample_diff.csv:
-------------------------------------------------------------------------------- 1 | ID,NAME,FAMILY_NAME,PARENT,ADDRESS 2 | 1,Cho,Cha,null,TEST 3 | 2,Pho,Cha,1,Joos 4 - 3 4 | 3,Dho,A,2,Joos 4 - 3 5 | 4,Jho,,3,Joos 4 - 3 6 | 5,null,A,4,Joos 4 - 3 7 | null,Tho,Cha,11,Joos 4 - 3 8 | 9 | 10 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | app: 4 | restart: always 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - $PWD/logs:/logs/ 12 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/family_sample.csv: -------------------------------------------------------------------------------- 1 | ID,NAME,FAMILY_NAME,PARENT,ADDRESS 2 | 1,Cho,Cha,null,TEST 3 | 2,Pho,Cha,1,Joos 4 - 3 4 | 3,Dho,A,2,Joos 4 - 3 5 | 4,Jho,,3,Joos 4 - 3 6 | 5,,A,4,Joos 4 - 3 7 | 6,,B,5,Joos 4 - 3 8 | null,Tho,Cha,11,Joos 4 - 3 9 | 7,Bho,Pha,8,Joos 4 - 3 10 | 7,Bho,Pha,8,Joos 4 - 31 11 | -------------------------------------------------------------------------------- /lib/requirements.txt: -------------------------------------------------------------------------------- 1 | # Development 2 | black==19.10b0 3 | flake8==3.7.9 4 | pytest==5.3.2 5 | coverage==5.0.3 6 | pyspark==2.4.5 7 | dataclasses==0.6 # this comes out of the box in python 3.7, we can remove this when we upgrade. 8 | Flask==1.1.2 9 | requests==2.23.0 10 | numpy==1.18.3 11 | pandas==1.0.1 12 | pivottablejs==0.9.0 13 | ipython==7.13.0 14 | 15 | 16 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample_diff.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME,ADDR_DESC 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity, 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity, 4 | 12000123,3,null,A,3,null,3,Dummy 1 Entity, 5 | 1,4,1,B,4,Dummy 1 Entity,4,Dummy 1 Entity, 6 | 12000123,5,1,A,5,null,5,Dummy 1 Entity, 7 | 12000123,null,,A,11,null,7,Dummy 1 Entity, -------------------------------------------------------------------------------- /lib/src/spark_validation/common/validation_results.py: -------------------------------------------------------------------------------- 1 | """Module to encapsulate validation results.""" 2 | from abc import ABC 3 | 4 | 5 | class ValidationResults(ABC): 6 | """Module to encapsulate validation results.""" 7 | 8 | def __init__(self, correctness_df, completeness_df, comparison_df): 9 | self.correctness_df = correctness_df 10 | self.completeness_df = completeness_df 11 | self.comparison_df = comparison_df 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.7.9 9 | hooks: 10 | - id: flake8 11 | args: ['--config=tox.ini'] 12 | additional_dependencies: [ 13 | 'flake8-deprecated', 14 | 'flake8-docstrings', 15 | 'flake8-tidy-imports' 16 | ] 17 | 
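A hedged usage sketch, not a file in this repository: docker-compose.yml above builds the Dockerfile and publishes the Flask service defined later in lib/src/spark_validation/app.py on port 8000. Assuming the stack is running locally (docker-compose up) and that the data paths referenced inside the config are reachable from the service's working directory, a validation run can be triggered by POSTing a config document to /api/validate, using the same structure as the mock configs under lib/test/spark_validation_tests/common/mock_data:

import json

import requests

# Payload structure follows config_example_local.json: a source_table block plus
# correctness, completeness and parent-children rules.
with open("lib/test/spark_validation_tests/common/mock_data/config_example_local.json") as config_file:
    payload = json.load(config_file)

# app.py writes the posted JSON to config.json, runs the file-system validator on
# it, and answers with {"validation": "yes"} and HTTP 200 once the run finishes.
# Note: the CSV paths inside the payload must exist inside the container.
response = requests.post("http://localhost:8000/api/validate", json=payload)
print(response.status_code, response.json())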
-------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/data_sample.csv: -------------------------------------------------------------------------------- 1 | CODE,GENERAL_ID,NAME,CODE2,ULTIMATE_PARENT_ID,ULTIMATE_NAME,PARENT_ID,PARENT_NAME,ADDR_DESC 2 | 12000123,1,Dummy 1 Entity,A,null,null,null,Dummy 1 Entity, 3 | null,2,,B,2,Dummy 1 Entity,2,Dummy 1 Entity, 4 | 12000123,3,null,A,3,null,3,Dummy 1 Entity, 5 | 1,4,1,B,4,Dummy 1 Entity,4,Dummy 1 Entity, 6 | 12000123,5,1,A,5,null,5,Dummy 1 Entity, 7 | 3,6,null,B,6,Dummy 1 Entity,6,Dummy 1 Entity, 8 | 12000123,null,,A,11,null,7,Dummy 1 Entity, 9 | ,7,2,B,8,Dummy 1 Entity,8,Dummy 1 Entity, 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jdk-slim as deployment 2 | COPY --from=python:3.7 / / 3 | ENV PYTHONPATH="/app/lib/src:/app/lib/test:$PYTHONPATH" 4 | ENV JAVA_HOME="/usr/local/openjdk-8" 5 | 6 | WORKDIR "/app" 7 | COPY lib/ . 8 | 9 | COPY . /app 10 | 11 | # An explicit installation of GUnicorn is required for it to instantiate worker threads. 12 | RUN pip install -r /app/requirements.txt && \ 13 | pip install gunicorn==20.0.4 14 | 15 | EXPOSE 8000 16 | CMD ["gunicorn", "-b", "0.0.0.0:8000", "--workers", "3", "spark_validation.app", "--timeout", "3000"] 17 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | 5 | [pypi] 6 | repository:https://upload.pypi.org/legacy/ 7 | username: ronaldsmithangel 8 | password: LZnBL8dhXzc+ygWpPHWDboPKWJ/jW58fg5N8US7np9tRhZ9YQzwFtDjaHetVxmIBCM+h57DmA5kJNBaIx7saNu48wARBxsZTP7T3pnM8L5uMpJhwaqIYQdIbnh00FbzyintuQJ7LqkSNFhUMkkoAoW+1NXAr9lk0HdGbMeTxJmr9ZSh4131rQIfqury8pT8z27/kslSD61x3Gua+yqbhYns1ZwMwuR84t2uRC7ihScM2Bi/gmQusjTm5HXwChn1U+fh1GQPb63bREfPwfTlvAN5GYt8ZuV/A3lz4iTwm0eRMDpnE1cO4kJ92U0xxtjeixgE0Jz73KebAAYQp+4I1zv7ng3gPlyDSPRn98eLYy8e/zp1q15kO179dvb87l+7WzQ8gjfI1FXsNneAlv+Aza/EDOHmssrlFAhXQpG3rDLgIEBEQbAKQCYrFJt/1tQdFHGWtTl+pvW8l+nMca5pRttZsDkPHhfRft6H+P1YWcEL6ksngiFh979EjLnuVISZGjLJmAjF7M3pxDZT9qbBF8NHPxmMjUaKeyTg2kONyGLH2mtFOanRxdYJOurZeIOAZA1GPq8iqL0M/a1Eivr5MTLYHCU/WEvEBDOThVu88N/9Y24sTtnIdMckzBFSZ+SBqgmZOw1TlmY+MugrX0z009tW3zgEAsiNUR+eK5AfGrEc= 9 | -------------------------------------------------------------------------------- /lib/src/spark_validation/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from flask import Flask, jsonify, request, make_response 5 | 6 | from spark_validation.dataframe_validation import 
file_system_validator 7 | 8 | application = Flask(__name__, static_url_path="") 9 | 10 | 11 | @application.route("/") 12 | def index(): 13 | return application.send_static_file('index.html') 14 | 15 | @application.route("/api/validate", methods=["POST"]) 16 | def validate(): 17 | json_input = request.get_json(force=True) 18 | with open('config.json', 'w') as fp: 19 | json.dump(json_input, fp) 20 | sys.argv = ["example.py", "-c", 'config.json'] 21 | 22 | print("JSON: {}".format(json_input)) 23 | 24 | file_system_validator.init() 25 | response = {"validation": "yes"} 26 | 27 | return make_response(jsonify(response), 200) 28 | 29 | 30 | if __name__ == "__main__": 31 | application.run(port=8000, debug=True) 32 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js: -------------------------------------------------------------------------------- 1 | self.__precacheManifest = (self.__precacheManifest || []).concat([ 2 | { 3 | "revision": "a8dca60662ceb50b0796e7dd27c3ef12", 4 | "url": "/index.html" 5 | }, 6 | { 7 | "revision": "e463c5cd056b63b394e6", 8 | "url": "/static/css/main.8e896e56.chunk.css" 9 | }, 10 | { 11 | "revision": "aae5bb205dfb84ac1b5a", 12 | "url": "/static/js/2.e9c9302b.chunk.js" 13 | }, 14 | { 15 | "revision": "c64c486544348f10a6d6c716950bc223", 16 | "url": "/static/js/2.e9c9302b.chunk.js.LICENSE.txt" 17 | }, 18 | { 19 | "revision": "e463c5cd056b63b394e6", 20 | "url": "/static/js/main.8e11e6a5.chunk.js" 21 | }, 22 | { 23 | "revision": "3f809adccd7b5eb81ce7", 24 | "url": "/static/js/runtime-main.6d8ceafa.js" 25 | }, 26 | { 27 | "revision": "50e8e5ecb197cb27b1347de458f06521", 28 | "url": "/static/media/logo.50e8e5ec.png" 29 | } 30 | ]); -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/css/main.8e896e56.chunk.css: -------------------------------------------------------------------------------- 1 | body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI","Roboto","Oxygen","Ubuntu","Cantarell","Fira Sans","Droid Sans","Helvetica Neue",sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,"Courier New",monospace}.App{text-align:center;padding:40px;margin:0 auto;width:90vw}.App-logo{height:40vmin;pointer-events:none}@media (prefers-reduced-motion:no-preference){.App-logo{-webkit-animation:App-logo-spin 20s linear infinite;animation:App-logo-spin 20s linear infinite}}.App-header{background-color:#282c34;min-height:100vh;display:flex;flex-direction:column;align-items:center;justify-content:center;font-size:calc(10px + 2vmin);color:#fff}.App-link{color:#61dafb}@-webkit-keyframes App-logo-spin{0%{transform:rotate(0deg)}to{transform:rotate(1turn)}}@keyframes App-logo-spin{0%{transform:rotate(0deg)}to{transform:rotate(1turn)}} 2 | /*# sourceMappingURL=main.8e896e56.chunk.css.map */ -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | 
jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/constants.py: -------------------------------------------------------------------------------- 1 | """Specific general constants used across the validation pipeline.""" 2 | 3 | 4 | class Constants: 5 | """Class with Constants for input and output values used in the the validation pipeline.""" 6 | 7 | DATE_TIME_REPORT_COL = "dt" 8 | SUM_REPORT_SUFFIX = "_SUM" 9 | OVER_ALL_COUNT_COL = "OVER_ALL_COUNT" 10 | IS_ERROR_COL = "RULE_FOR_" 11 | UNIQUE_HASH = "UNIQUE_HASH" 12 | COUNT_HASH = "COUNT_HASH" 13 | ROW_ERROR_SUFFIX = "_ROW" 14 | RULES_REPORT_SUFFIX = "_rules_report" 15 | COMPARISON_REPORT_SUFFIX = "_comparison_report" 16 | REPORT_DF_COL = "df" 17 | MISSING_COLS_RIGHT_COL = "missing_cols_right" 18 | MISSING_VALS_RIGHT_COL = "missing_vals_right" 19 | MISSING_COLS_LEFT_COL = "missing_cols_left" 20 | MISSING_VALS_LEFT_COL = "missing_vals_left" 21 | OUTPUT_COMPARABLE_COLS = [ 22 | REPORT_DF_COL, 23 | MISSING_COLS_RIGHT_COL, 24 | MISSING_COLS_LEFT_COL, 25 | MISSING_VALS_RIGHT_COL, 26 | MISSING_VALS_LEFT_COL, 27 | ] 28 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/asset-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "main.css": "/static/css/main.8e896e56.chunk.css", 4 | "main.js": "/static/js/main.8e11e6a5.chunk.js", 5 | "main.js.map": "/static/js/main.8e11e6a5.chunk.js.map", 6 | "runtime-main.js": "/static/js/runtime-main.6d8ceafa.js", 7 | "runtime-main.js.map": "/static/js/runtime-main.6d8ceafa.js.map", 8 | "static/js/2.e9c9302b.chunk.js": "/static/js/2.e9c9302b.chunk.js", 9 | "static/js/2.e9c9302b.chunk.js.map": "/static/js/2.e9c9302b.chunk.js.map", 10 | "index.html": "/index.html", 11 | "precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js": "/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js", 12 | "service-worker.js": "/service-worker.js", 13 | "static/css/main.8e896e56.chunk.css.map": "/static/css/main.8e896e56.chunk.css.map", 14 | "static/js/2.e9c9302b.chunk.js.LICENSE.txt": "/static/js/2.e9c9302b.chunk.js.LICENSE.txt", 15 | "static/media/logo.png": "/static/media/logo.50e8e5ec.png" 16 | }, 17 | "entrypoints": [ 18 | "static/js/runtime-main.6d8ceafa.js", 19 | "static/js/2.e9c9302b.chunk.js", 20 | "static/css/main.8e896e56.chunk.css", 21 | "static/js/main.8e11e6a5.chunk.js" 22 | ] 23 | } -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | services: 3 | - docker 4 | before_script: 5 | - export VERSION=$(awk '{print $3}' lib/src/spark_validation/version.py | sed 's/"//g') 6 | before_install: 7 | - docker pull covertspartan/docker-airflow-spark 8 | install: 9 | - pip install -r requirements.txt 10 | script: 11 | - cd lib 12 | - pip install -r 
requirements.txt --quiet 13 | - export PYTHONPATH="$PWD/src:$PWD/test:$PYTHONPATH" 14 | deploy: 15 | skip_cleanup: true 16 | provider: pypi 17 | user: ronaldsmithangel 18 | password: 19 | secure: LZnBL8dhXzc+ygWpPHWDboPKWJ/jW58fg5N8US7np9tRhZ9YQzwFtDjaHetVxmIBCM+h57DmA5kJNBaIx7saNu48wARBxsZTP7T3pnM8L5uMpJhwaqIYQdIbnh00FbzyintuQJ7LqkSNFhUMkkoAoW+1NXAr9lk0HdGbMeTxJmr9ZSh4131rQIfqury8pT8z27/kslSD61x3Gua+yqbhYns1ZwMwuR84t2uRC7ihScM2Bi/gmQusjTm5HXwChn1U+fh1GQPb63bREfPwfTlvAN5GYt8ZuV/A3lz4iTwm0eRMDpnE1cO4kJ92U0xxtjeixgE0Jz73KebAAYQp+4I1zv7ng3gPlyDSPRn98eLYy8e/zp1q15kO179dvb87l+7WzQ8gjfI1FXsNneAlv+Aza/EDOHmssrlFAhXQpG3rDLgIEBEQbAKQCYrFJt/1tQdFHGWtTl+pvW8l+nMca5pRttZsDkPHhfRft6H+P1YWcEL6ksngiFh979EjLnuVISZGjLJmAjF7M3pxDZT9qbBF8NHPxmMjUaKeyTg2kONyGLH2mtFOanRxdYJOurZeIOAZA1GPq8iqL0M/a1Eivr5MTLYHCU/WEvEBDOThVu88N/9Y24sTtnIdMckzBFSZ+SBqgmZOw1TlmY+MugrX0z009tW3zgEAsiNUR+eK5AfGrEc= 20 | on: 21 | branch: master 22 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/2.e9c9302b.chunk.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /* 2 | object-assign 3 | (c) Sindre Sorhus 4 | @license MIT 5 | */ 6 | 7 | /** @license React v0.19.1 8 | * scheduler.production.min.js 9 | * 10 | * Copyright (c) Facebook, Inc. and its affiliates. 11 | * 12 | * This source code is licensed under the MIT license found in the 13 | * LICENSE file in the root directory of this source tree. 14 | */ 15 | 16 | /** @license React v16.13.1 17 | * react-dom.production.min.js 18 | * 19 | * Copyright (c) Facebook, Inc. and its affiliates. 20 | * 21 | * This source code is licensed under the MIT license found in the 22 | * LICENSE file in the root directory of this source tree. 23 | */ 24 | 25 | /** @license React v16.13.1 26 | * react-is.production.min.js 27 | * 28 | * Copyright (c) Facebook, Inc. and its affiliates. 29 | * 30 | * This source code is licensed under the MIT license found in the 31 | * LICENSE file in the root directory of this source tree. 32 | */ 33 | 34 | /** @license React v16.13.1 35 | * react.production.min.js 36 | * 37 | * Copyright (c) Facebook, Inc. and its affiliates. 38 | * 39 | * This source code is licensed under the MIT license found in the 40 | * LICENSE file in the root directory of this source tree. 41 | */ 42 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/service-worker.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Welcome to your Workbox-powered service worker! 3 | * 4 | * You'll need to register this file in your web app and you should 5 | * disable HTTP caching for this file too. 6 | * See https://goo.gl/nhQhGp 7 | * 8 | * The rest of the code is auto-generated. Please don't update this file 9 | * directly; instead, make changes to your Workbox build configuration 10 | * and re-run your build process. 11 | * See https://goo.gl/2aRDsh 12 | */ 13 | 14 | importScripts("https://storage.googleapis.com/workbox-cdn/releases/4.3.1/workbox-sw.js"); 15 | 16 | importScripts( 17 | "/precache-manifest.02e0942ee454e4f28891cd6100c3e7f6.js" 18 | ); 19 | 20 | self.addEventListener('message', (event) => { 21 | if (event.data && event.data.type === 'SKIP_WAITING') { 22 | self.skipWaiting(); 23 | } 24 | }); 25 | 26 | workbox.core.clientsClaim(); 27 | 28 | /** 29 | * The workboxSW.precacheAndRoute() method efficiently caches and responds to 30 | * requests for URLs in the manifest. 
31 | * See https://goo.gl/S9QRab 32 | */ 33 | self.__precacheManifest = [].concat(self.__precacheManifest || []); 34 | workbox.precaching.precacheAndRoute(self.__precacheManifest, {}); 35 | 36 | workbox.routing.registerNavigationRoute(workbox.precaching.getCacheKeyForURL("/index.html"), { 37 | 38 | blacklist: [/^\/_/,/\/[^/?]+\.[^/]+$/], 39 | }); 40 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | All contributions by Ronald Angel: 4 | Copyright (c) 2020, Ronald Angel. 5 | All rights reserved. 6 | 7 | Each contributor holds copyright over their respective contributions. 8 | The project versioning (Git) records all such contribution source information. 9 | 10 | LICENSE 11 | 12 | The MIT License (MIT) 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a copy 15 | of this software and associated documentation files (the "Software"), to deal 16 | in the Software without restriction, including without limitation the rights 17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the Software is 19 | furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in all 22 | copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 
31 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "test.data_test", 4 | "id_column": "GENERAL_ID", 5 | "output_correctness_table": "test.data_test_correctness", 6 | "output_completeness_table": "test.data_test_completeness", 7 | "output_comparison_table": "test.data_test_comparison", 8 | "unique_column_group_values_per_table": [ 9 | "GENERAL_ID", 10 | "ULTIMATE_PARENT_ID" 11 | ], 12 | "fuzzy_deduplication_distance": 0 13 | }, 14 | "correctness_validations": [ 15 | { 16 | "column": "CODE", 17 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 18 | }, 19 | { 20 | "column": "NAME", 21 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 22 | }, 23 | { 24 | "column": "GENERAL_ID", 25 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 26 | } 27 | ], 28 | "completeness_validations": { 29 | "overall": { 30 | "column": "OVER_ALL_COUNT", 31 | "rule": "OVER_ALL_COUNT <= 7" 32 | } 33 | }, 34 | "parent_children_constraints": [ 35 | { 36 | "column": "GENERAL_ID", 37 | "parent": "ULTIMATE_PARENT_ID" 38 | }, 39 | { 40 | "column": "GENERAL_ID", 41 | "parent": "PARENT_ID" 42 | } 43 | ], 44 | "compare_related_tables_list": [ 45 | "test.diff_df", 46 | "test.diff_df_2" 47 | ] 48 | } -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_example_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "mock_data/data_sample.csv", 4 | "id_column": "GENERAL_ID", 5 | "output_correctness_table": "/tmp/mock_data/output/data_sample_test_correctness", 6 | "output_completeness_table": "/tmp/mock_data/output/data_sample_test_completeness", 7 | "output_comparison_table": "/tmp/mock_data/output/data_sample_test_comparison", 8 | "unique_column_group_values_per_table": ["GENERAL_ID", "ULTIMATE_PARENT_ID"], 9 | "fuzzy_deduplication_distance": 0 10 | }, 11 | "correctness_validations": [ 12 | { 13 | "column": "CODE", 14 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 15 | }, 16 | { 17 | "column": "NAME", 18 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 19 | }, 20 | { 21 | "column": "GENERAL_ID", 22 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 23 | } 24 | ], 25 | "completeness_validations": { 26 | "overall": { 27 | "column": "OVER_ALL_COUNT", 28 | "rule": "OVER_ALL_COUNT <= 7" 29 | } 30 | }, 31 | "parent_children_constraints": [ 32 | { 33 | "column": "GENERAL_ID", 34 | "parent": "ULTIMATE_PARENT_ID" 35 | }, 36 | { 37 | "column": "GENERAL_ID", 38 | "parent": "PARENT_ID" 39 | } 40 | ], 41 | "compare_related_tables_list": ["mock_data/data_sample_diff.csv"] 42 | } -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_family_fs.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_table": { 3 | "name": "mock_data/family_sample.csv", 4 | "id_column": "ID", 5 | "output_correctness_table": "/tmp/mock_data/output/family_sample_test_correctness", 6 | "output_completeness_table": "/tmp/mock_data/output/family_sample_test_completeness", 7 | 
"output_comparison_table": "/tmp/mock_data/output/family_sample_test_comparison", 8 | "unique_column_group_values_per_table": ["ID", "NAME", "FAMILY_NAME", "PARENT"], 9 | "fuzzy_deduplication_distance": 0 10 | }, 11 | "correctness_validations": [ 12 | { 13 | "column": "ID", 14 | "rule": "ID is not null and ID != '' and ID != 'null'" 15 | }, 16 | { 17 | "column": "NAME", 18 | "rule": "NAME is not null and NAME != '' and NAME != 'null' and NAME like '%ho%'" 19 | }, 20 | { 21 | "column": "FAMILY_NAME", 22 | "rule": "NAME is not null and FAMILY_NAME in ('Cha', 'Pha')" 23 | }, 24 | { 25 | "column": "ADDRESS", 26 | "rule": "ADDRESS is not null and ADDRESS != '' and ADDRESS != 'null' and CHAR_LENGTH(ADDRESS) > 4" 27 | } 28 | ], 29 | "completeness_validations": { 30 | "overall": { 31 | "column": "OVER_ALL_COUNT", 32 | "rule": "OVER_ALL_COUNT <= 5" 33 | } 34 | }, 35 | "parent_children_constraints": [ 36 | { 37 | "column": "ID", 38 | "parent": "PARENT" 39 | } 40 | ], 41 | "compare_related_tables_list": ["mock_data/family_sample_diff.csv"] 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/css/main.8e896e56.chunk.css.map: -------------------------------------------------------------------------------- 1 | {"version":3,"sources":["index.css","App.css"],"names":[],"mappings":"AAAA,KACE,QAAS,CACT,mJAEY,CACZ,kCAAmC,CACnC,iCACF,CAEA,KACE,yEAEF,CCZA,KACE,iBAAkB,CAClB,YAAa,CACb,aAAc,CACd,UACF,CAEA,UACE,aAAc,CACd,mBACF,CAEA,8CACE,UACE,mDAA4C,CAA5C,2CACF,CACF,CAEA,YACE,wBAAyB,CACzB,gBAAiB,CACjB,YAAa,CACb,qBAAsB,CACtB,kBAAmB,CACnB,sBAAuB,CACvB,4BAA6B,CAC7B,UACF,CAEA,UACE,aACF,CAEA,iCACE,GACE,sBACF,CACA,GACE,uBACF,CACF,CAPA,yBACE,GACE,sBACF,CACA,GACE,uBACF,CACF","file":"main.8e896e56.chunk.css","sourcesContent":["body {\n margin: 0;\n font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',\n 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',\n sans-serif;\n -webkit-font-smoothing: antialiased;\n -moz-osx-font-smoothing: grayscale;\n}\n\ncode {\n font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',\n monospace;\n}\n",".App {\n text-align: center;\n padding: 40px;\n margin: 0 auto;\n width: 90vw;\n}\n\n.App-logo {\n height: 40vmin;\n pointer-events: none;\n}\n\n@media (prefers-reduced-motion: no-preference) {\n .App-logo {\n animation: App-logo-spin infinite 20s linear;\n }\n}\n\n.App-header {\n background-color: #282c34;\n min-height: 100vh;\n display: flex;\n flex-direction: column;\n align-items: center;\n justify-content: center;\n font-size: calc(10px + 2vmin);\n color: white;\n}\n\n.App-link {\n color: #61dafb;\n}\n\n@keyframes App-logo-spin {\n from {\n transform: rotate(0deg);\n }\n to {\n transform: rotate(360deg);\n }\n}\n"]} -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/runtime-main.6d8ceafa.js: -------------------------------------------------------------------------------- 1 | !function(e){function r(r){for(var n,p,l=r[0],a=r[1],f=r[2],c=0,s=[];c- 20 | GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and 21 | CHAR_LENGTH(GENERAL_ID) < 4 22 | #validations for parent children constraints. Example: parent should be a valid entity within the table. 23 | parent_children_constraints: 24 | - column: GENERAL_ID 25 | parent: ULTIMATE_PARENT_ID 26 | - column: GENERAL_ID 27 | parent: PARENT_ID 28 | #completeness rules. 
Use either an overall completeness count or a count compared with previous partitions. 29 | #for simple overall count keep the column name OVER_ALL_COUNT. 30 | completeness_validations: 31 | overall: 32 | column: OVER_ALL_COUNT 33 | rule: OVER_ALL_COUNT <= 7 34 | partitioned: 35 | previous_partition: test.data_test_diff 36 | max_grow_percentage: 10 #use negative for reduction. 37 | #checksum compare data with different tables. Example: compare against a goal set or the test vs production env. 38 | compare_related_tables_list: 39 | - test.diff_df 40 | - test.diff_df_2 41 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | """General project setup for the owl-sanitizer-data-quality package.""" 2 | import os 3 | import re 4 | 5 | from setuptools import setup, find_packages 6 | 7 | SETUP_REQUIREMENTS = [ 8 | "dataclasses==0.6", 9 | "pyspark==2.4.5", 10 | "Flask==1.1.2", 11 | "requests==2.23.0", 12 | "dataclasses==0.6", 13 | "numpy==1.18.3", 14 | "pandas==1.0.1", 15 | "pivottablejs==0.9.0", 16 | "ipython==7.13.0", 17 | ] 18 | 19 | from os import path 20 | 21 | readme_directory = path.abspath(path.dirname(__file__)).replace("/lib", "") 22 | with open(path.join(readme_directory, "README.md"), encoding="utf-8") as f: 23 | long_description = f.read() 24 | 25 | 26 | def _get_version(): 27 | """Read the __version__ value from src/spark_validation/version.py. 28 | 29 | We can't import the package because we're the installation script for the package, 30 | so we use regex and read the python file as a raw text file. 31 | """ 32 | version_regex = re.compile( 33 | r"""^__version__\s=\s['"](?P<version>.*?)['"] """, re.MULTILINE | re.VERBOSE 34 | ) 35 | version_file = os.path.join("src", "spark_validation", "version.py") 36 | with open(version_file) as handle: 37 | lines = handle.read() 38 | result = version_regex.search(lines) 39 | if result: 40 | return result.groupdict()["version"] 41 | raise ValueError("Unable to determine __version__") 42 | 43 | 44 | setup( 45 | name="owl-sanitizer-data-quality", 46 | version=_get_version(), 47 | description="Data Quality framework for Pyspark jobs", 48 | long_description=long_description, 49 | long_description_content_type="text/markdown", 50 | author="Ronald Angel", 51 | author_email="ronaldsmithangel@gmail.com", 52 | url="https://github.com/ronald-smith-angel/owl-data-sanitizer.git", 53 | license="MIT", 54 | packages=find_packages(where="src"), 55 | package_dir={"": "src"}, 56 | install_requires=SETUP_REQUIREMENTS, 57 | ) 58 | -------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/mock_data/config_familiy_fs.yaml: -------------------------------------------------------------------------------- 1 | #metadata for the source table. 2 | source_table: 3 | name: mock_data/family_sample.csv 4 | id_column: ID 5 | output_correctness_table: /tmp/mock_data/output/family_sample_test_correctness 6 | output_completeness_table: /tmp/mock_data/output/family_sample_test_completeness 7 | output_comparison_table: /tmp/mock_data/output/family_sample_test_comparison 8 | unique_column_group_values_per_table: #deduplication using 1 or more columns. 9 | - ID 10 | - NAME 11 | - FAMILY_NAME 12 | - PARENT 13 | fuzzy_deduplication_distance: 0 #apply fuzzy matching distance N to deduplication, 0 for disable. 14 | #correctness rules per column, use a SQL query in a negative way. Example: where col is not null and is != 'a'.
15 | correctness_validations: 16 | - column: ID 17 | rule: ID is not null and ID != '' and ID != 'null' 18 | - column: NAME 19 | rule: NAME is not null and NAME != '' and NAME != 'null' and NAME like '%ho%' 20 | - column: FAMILY_NAME 21 | rule: 'NAME is not null and FAMILY_NAME in (''Cha'', ''Pha'')' 22 | - column: ADDRESS 23 | rule: >- 24 | ADDRESS is not null and ADDRESS != '' and ADDRESS != 'null' and 25 | CHAR_LENGTH(ADDRESS) > 4 26 | #validations for parent children constraints. Example: parent should be a valid entity within the table. 27 | parent_children_constraints: 28 | - column: ID 29 | parent: PARENT 30 | #completeness rules. Use either an overall completeness count or a count compared with previous partitions. 31 | #for simple overall count keep the column name OVER_ALL_COUNT. 32 | completeness_validations: 33 | overall: 34 | column: OVER_ALL_COUNT 35 | rule: OVER_ALL_COUNT <= 5 36 | partitioned: 37 | previous_partition: mock_data/family_sample_previous.csv 38 | max_grow_percentage: 10 #use negative for reduction. 39 | #checksum compare data with different tables. Example: compare against a goal set or the test vs production env. 40 | compare_related_tables_list: 41 | - mock_data/family_sample_diff.csv -------------------------------------------------------------------------------- /lib/src/spark_validation/static/index.html: -------------------------------------------------------------------------------- 1 | React App
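A hedged usage sketch, not a file in this repository: running the file-system validator against the YAML config above (lib/test/spark_validation_tests/common/mock_data/config_familiy_fs.yaml). It mirrors how app.py drives the same entry point, and assumes pyspark is installed, lib/src is on PYTHONPATH, and the working directory is lib/test/spark_validation_tests/common so the relative mock_data/ paths in the config resolve:

import sys

from spark_validation.dataframe_validation import file_system_validator

# file_system_validator.init() reads "-c <config>" from sys.argv, loads the config
# through Config.parse (JSON or YAML, chosen from the file name), runs the
# correctness/completeness/comparison validations, and writes the JSON reports plus
# pivot-table HTML files to the output paths declared in the config.
sys.argv = ["run_validation.py", "-c", "mock_data/config_familiy_fs.yaml"]  # argv[0] is an arbitrary placeholder
file_system_validator.init()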
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PySpark / Hive 2 | metastore_db 3 | spark-warehouse 4 | env_dags 5 | venv/ 6 | 7 | # IDE 8 | .idea/ 9 | .vscode/ 10 | 11 | # Intellij stuff 12 | dags.iml 13 | 14 | # OS specific files 15 | .DS_Store 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | pip-wheel-metadata/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | dags/ 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 110 | __pypackages__/ 111 | 112 | # Celery stuff 113 | celerybeat-schedule 114 | celerybeat.pid 115 | 116 | # SageMath parsed files 117 | *.sage.py 118 | 119 | # Environments 120 | .env 121 | .venv 122 | env/ 123 | venv/ 124 | ENV/ 125 | env.bak/ 126 | venv.bak/ 127 | 128 | # Spyder project settings 129 | .spyderproject 130 | .spyproject 131 | 132 | # Rope project settings 133 | .ropeproject 134 | 135 | # mkdocs documentation 136 | /site 137 | 138 | # mypy 139 | .mypy_cache/ 140 | .dmypy.json 141 | dmypy.json 142 | 143 | # Pyre type checker 144 | .pyre/ 145 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/hive_validator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pyspark.sql import SparkSession 4 | 5 | from spark_validation.common.config import Config 6 | from spark_validation.common.constants import Constants 7 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 8 | 9 | 10 | class CreateHiveValidationDF: 11 | """Class to create validations tables.""" 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | @staticmethod 16 | def validate(ss, config): 17 | """Apply validation process using config input file.""" 18 | source_read_df = ss.table(config.source_df) 19 | comparable_dfs_list = [(t, ss.table(t)) for t in config.comparable_dfs_list] 20 | 21 | validator = DataframeValidator( 22 | spark=ss, 23 | source_df=source_read_df, 24 | id_col_name=config.id_col_name, 25 | correctness_rules_dict=config.correctness_rules_dict, 26 | parent_children_validation_pairs=config.parent_children_validation_pairs, 27 | completeness_rules_dic=config.completeness_rules_dic, 28 | comparable_dfs_list=comparable_dfs_list, 29 | unique_column_group_values_per_table=config.unique_column_group_values_per_table, 30 | ) 31 | 32 | processed_df = validator.process() 33 | completeness_df = processed_df.limit(1).select( 34 | Constants.OVER_ALL_COUNT_COL, 35 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 36 | Constants.DATE_TIME_REPORT_COL, 37 | ) 38 | 39 | correctness_df = processed_df.drop( 40 | Constants.OVER_ALL_COUNT_COL, 41 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 42 | ) 43 | comparison_df = validator.compare() 44 | 45 | correctness_df.write.mode("append").saveAsTable(config.output_correctness_table) 46 | 47 | completeness_df.write.mode("append").saveAsTable( 48 | config.output_completeness_table 49 | ) 50 | comparison_df.write.mode("append").saveAsTable(config.output_comparison_table) 51 | 52 | 53 | def main(args): 54 | """Run the main create table function using the sys arguments.""" 55 | spark_session = SparkSession.builder.enableHiveSupport().getOrCreate() 56 | spark_session.conf.set("spark.sql.debug.maxToStringFields", "1000") 57 | spark_session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") 58 | arg_conf = spark_session.sparkContext.wholeTextFiles(args.config).collect()[0][1] 59 | config = Config.parse_text(arg_conf) 60 | 61 | CreateHiveValidationDF.validate(spark_session, config) 62 | 63 | 64 | def create_parser(): 65 | """Parse sys arguments and return parser object.""" 66 | parser = argparse.ArgumentParser(description="Hive Validation") 67 | parser.add_argument( 68 | "-c", dest="config", action="store", help="config file", required=True, 69 | ) 70 | return parser 71 | 72 | 73 | def init(): 74 | """Wrap to make main call function testable 
by sending parsed arguments.""" 75 | parser = create_parser() 76 | args = parser.parse_args() 77 | main(args) 78 | 79 | 80 | if __name__ == "__main__": 81 | init() 82 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/dataframe_validator.py: -------------------------------------------------------------------------------- 1 | """This module exposes the handler class for the dataframes validation process.""" 2 | import datetime 3 | 4 | import pyspark.sql.functions as F 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from spark_validation.common.constants import Constants 8 | from spark_validation.common.general_validator import GeneralDFValidator 9 | 10 | 11 | class DataframeValidator(GeneralDFValidator): 12 | """Class to create a handler with the main functions for the dataframe validation process.""" 13 | 14 | def __init__( 15 | self, 16 | spark, 17 | source_df, 18 | id_col_name, 19 | correctness_rules_dict, 20 | parent_children_validation_pairs, 21 | completeness_rules_dic, 22 | comparable_dfs_list, 23 | unique_column_group_values_per_table=[], 24 | ): 25 | """Create a handler with the initial df for the specific date.""" 26 | self.spark = spark 27 | self.source_df = source_df 28 | self.id_col_name = id_col_name 29 | self.correctness_rules_dict = correctness_rules_dict 30 | self.parent_children_validation_pairs = parent_children_validation_pairs 31 | self.completeness_rules_dic = completeness_rules_dic 32 | self.comparable_dfs_list = comparable_dfs_list 33 | self.unique_column_group_values_per_table = unique_column_group_values_per_table 34 | 35 | def process(self): 36 | """Run the entire validation pipeline. 37 | 38 | 1. Run all the correctness rules. 39 | 2. Run all the completeness rules. 40 | 3. Return processed_df with all the computed values. 41 | """ 42 | processed_df = ( 43 | self.source_df.transform( 44 | lambda df: self.join_cols_with_all_parents( 45 | df, self.parent_children_validation_pairs 46 | ) 47 | ) 48 | .transform( 49 | lambda df: self.add_unique_error( 50 | df, self.id_col_name, self.unique_column_group_values_per_table 51 | ) 52 | ) 53 | .transform( 54 | lambda df: self.build_correctness_df( 55 | df, 56 | self.correctness_rules_dict, 57 | self.parent_children_validation_pairs, 58 | ) 59 | ) 60 | ) 61 | 62 | validation_result_cols = list( 63 | filter(lambda x: Constants.IS_ERROR_COL in x, processed_df.schema.names) 64 | ) 65 | processed_df = processed_df.select( 66 | *([self.id_col_name] + validation_result_cols) 67 | ) 68 | 69 | processed_df = processed_df.transform( 70 | lambda df: self.build_correctness_report_df(df, validation_result_cols) 71 | ).transform( 72 | lambda df: self.build_computed_rules_correctness_df( 73 | df, self.completeness_rules_dic 74 | ) 75 | ) 76 | return processed_df 77 | 78 | def compare(self): 79 | """Compare the source df with related dfs. 80 | 81 | Get comparison metrics like: 82 | 1. missing_cols_right. 83 | 2. missing_cols_left. 84 | 3. missing_vals_right. 85 | 4. missing_vals_left.
86 | """ 87 | return self.spark.createDataFrame( 88 | self.compared_with_related_dfs( 89 | self.source_df, self.id_col_name, self.comparable_dfs_list 90 | ), 91 | Constants.OUTPUT_COMPARABLE_COLS, 92 | ).withColumn(Constants.DATE_TIME_REPORT_COL, F.lit(datetime.datetime.now())) 93 | 94 | 95 | DataFrame.transform = DataframeValidator.transform 96 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/config.py: -------------------------------------------------------------------------------- 1 | """Module representing config data.""" 2 | import json 3 | import sys 4 | import yaml 5 | from abc import ABC 6 | 7 | from pyspark.sql.utils import AnalysisException 8 | 9 | 10 | class Config(ABC): 11 | """Class with config data.""" 12 | 13 | def __init__( 14 | self, 15 | source_df, 16 | id_col_name, 17 | correctness_rules_dict, 18 | parent_children_validation_pairs, 19 | completeness_rules_dic, 20 | comparable_dfs_list, 21 | output_correctness_table, 22 | output_completeness_table, 23 | output_comparison_table, 24 | unique_column_group_values_per_table=[], 25 | fuzzy_deduplication_distance=0, 26 | ): 27 | self.source_df = source_df 28 | self.id_col_name = id_col_name 29 | self.correctness_rules_dict = correctness_rules_dict 30 | self.parent_children_validation_pairs = parent_children_validation_pairs 31 | self.completeness_rules_dic = completeness_rules_dic 32 | self.comparable_dfs_list = comparable_dfs_list 33 | self.output_correctness_table = output_correctness_table 34 | self.output_completeness_table = output_completeness_table 35 | self.output_comparison_table = output_comparison_table 36 | self.unique_column_group_values_per_table = unique_column_group_values_per_table 37 | self.fuzzy_deduplication_distance = fuzzy_deduplication_distance 38 | 39 | @staticmethod 40 | def _create_config(config): 41 | try: 42 | correctness_validations = { 43 | rule["column"]: rule["rule"] 44 | for rule in config["correctness_validations"] 45 | } 46 | parent_children_validations = [ 47 | (rule["column"], rule["parent"]) 48 | for rule in config["parent_children_constraints"] 49 | ] 50 | 51 | completeness_overall_rule = config["completeness_validations"]["overall"] 52 | completeness_validations = { 53 | completeness_overall_rule["column"]: completeness_overall_rule["rule"] 54 | } 55 | return Config( 56 | source_df=config["source_table"]["name"], 57 | id_col_name=config["source_table"]["id_column"], 58 | correctness_rules_dict=correctness_validations, 59 | parent_children_validation_pairs=parent_children_validations, 60 | completeness_rules_dic=completeness_validations, 61 | comparable_dfs_list=config["compare_related_tables_list"], 62 | output_correctness_table=config["source_table"][ 63 | "output_correctness_table" 64 | ], 65 | output_completeness_table=config["source_table"][ 66 | "output_completeness_table" 67 | ], 68 | output_comparison_table=config["source_table"][ 69 | "output_comparison_table" 70 | ], 71 | unique_column_group_values_per_table=config["source_table"][ 72 | "unique_column_group_values_per_table" 73 | ] 74 | if ( 75 | "unique_column_group_values_per_table" 76 | in config["source_table"].keys() 77 | ) 78 | else [], 79 | fuzzy_deduplication_distance=config["source_table"][ 80 | "fuzzy_deduplication_distance" 81 | ] 82 | if ("fuzzy_deduplication_distance" in config["source_table"].keys()) 83 | else 0, 84 | ) 85 | except KeyError as e: 86 | print( 87 | "The config file has key error, check source_table, correctness_validations," 88 | " 
completeness_validations, parent_children_constraints, compare_related_tables_list as mandatory)" 89 | ' - reason "%s"' % str(e) 90 | ) 91 | 92 | @staticmethod 93 | def parse(file): 94 | """Parse a json or yaml config file into a Config object.""" 95 | try: 96 | config = json.load(file) if 'json' in file.name else yaml.safe_load(file) 97 | except OSError: 98 | print("Could not open/read file:", file) 99 | sys.exit() 100 | return Config._create_config(config) 101 | 102 | @staticmethod 103 | def parse_text(str_file): 104 | """Parse a json string into a Config object.""" 105 | try: 106 | config = json.loads(str_file) 107 | except (ValueError, AnalysisException): 108 | print("Could not parse config:", str_file) 109 | sys.exit() 110 | return Config._create_config(config) 111 | -------------------------------------------------------------------------------- /lib/src/spark_validation/dataframe_validation/file_system_validator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from pivottablejs import pivot_ui 6 | from pyspark.sql import SparkSession 7 | 8 | from spark_validation.common.config import Config 9 | from spark_validation.common.constants import Constants 10 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 11 | 12 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | 15 | class CreateFSValidationDF: 16 | """Class to create validations tables.""" 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | @staticmethod 21 | def validate(ss, config): 22 | """Apply validation process using config input file.""" 23 | source_read_df = ( 24 | ss.read.format("csv").option("header", "true").load(config.source_df) 25 | ) 26 | comparable_dfs_list = [ 27 | (t, ss.read.format("csv").option("header", "true").load(t)) 28 | for t in config.comparable_dfs_list 29 | ] 30 | 31 | validator = DataframeValidator( 32 | spark=ss, 33 | source_df=source_read_df, 34 | id_col_name=config.id_col_name, 35 | correctness_rules_dict=config.correctness_rules_dict, 36 | parent_children_validation_pairs=config.parent_children_validation_pairs, 37 | completeness_rules_dic=config.completeness_rules_dic, 38 | comparable_dfs_list=comparable_dfs_list, 39 | unique_column_group_values_per_table=config.unique_column_group_values_per_table, 40 | ) 41 | 42 | processed_df = validator.process() 43 | completeness_df = processed_df.limit(1).select( 44 | Constants.OVER_ALL_COUNT_COL, 45 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 46 | Constants.DATE_TIME_REPORT_COL, 47 | ) 48 | 49 | correctness_df = processed_df.drop( 50 | Constants.OVER_ALL_COUNT_COL, 51 | Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL, 52 | ) 53 | comparison_df = validator.compare() 54 | 55 | correctness_df.coalesce(1).write.mode("append").json( 56 | config.output_correctness_table 57 | ) 58 | completeness_df.coalesce(1).write.mode("append").json( 59 | config.output_completeness_table 60 | ) 61 | comparison_df.coalesce(1).write.mode("append").json( 62 | config.output_comparison_table 63 | ) 64 | 65 | pd_correctness_df = ss.read.json(config.output_correctness_table).toPandas() 66 | pd_completeness_df = ss.read.json(config.output_completeness_table).toPandas() 67 | comparison_df = ss.read.json(config.output_comparison_table).toPandas() 68 | 69 | pivot_ui( 70 | pd_correctness_df, 71 | outfile_path="{}.html".format(config.output_correctness_table), 72 | menuLimit=5000, 73 | overwrite=True, 74 | rows=[config.id_col_name] 75 | + list( 76 |
filter( 77 | lambda x: Constants.IS_ERROR_COL in x 78 | and Constants.SUM_REPORT_SUFFIX not in x 79 | and Constants.ROW_ERROR_SUFFIX not in x, 80 | pd_correctness_df.columns, 81 | ) 82 | ), 83 | cols=[Constants.DATE_TIME_REPORT_COL], 84 | vals=[Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX], 85 | aggregatorName="Sum", 86 | rendererName="Table Barchart", 87 | rowOrder="value_z_to_a", 88 | ) 89 | 90 | pivot_ui( 91 | pd_completeness_df, 92 | outfile_path="{}.html".format(config.output_completeness_table), 93 | menuLimit=5000, 94 | overwrite=True, 95 | rows=[Constants.OVER_ALL_COUNT_COL], 96 | cols=[Constants.DATE_TIME_REPORT_COL], 97 | vals=[Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL], 98 | aggregatorName="Sum", 99 | rendererName="Table Barchart", 100 | rowOrder="value_z_to_a", 101 | ) 102 | 103 | pivot_ui( 104 | comparison_df, 105 | outfile_path="{}.html".format(config.output_comparison_table), 106 | menuLimit=5000, 107 | overwrite=True, 108 | rows=list( 109 | filter( 110 | lambda x: Constants.DATE_TIME_REPORT_COL not in x, 111 | comparison_df.columns, 112 | ) 113 | ), 114 | cols=[Constants.DATE_TIME_REPORT_COL], 115 | rendererName="Table Barchart", 116 | rowOrder="value_z_to_a", 117 | ) 118 | 119 | 120 | def main(args): 121 | """Run the main create table function using the sys arguments.""" 122 | spark_session = SparkSession.builder.getOrCreate() 123 | spark_session.conf.set("spark.sql.debug.maxToStringFields", "1000") 124 | spark_session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1") 125 | with open(args.config) as f: 126 | config = Config.parse(f) 127 | 128 | CreateFSValidationDF.validate(spark_session, config) 129 | 130 | 131 | def create_parser(): 132 | """Parse sys arguments and return parser object.""" 133 | parser = argparse.ArgumentParser(description="Hive Validation") 134 | parser.add_argument( 135 | "-c", dest="config", action="store", help="config file", required=True, 136 | ) 137 | return parser 138 | 139 | 140 | def init(): 141 | """Wrap to make main call function testable by sending parsed arguments.""" 142 | parser = create_parser() 143 | args = parser.parse_args() 144 | main(args) 145 | 146 | 147 | if __name__ == "__main__": 148 | init() 149 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/runtime-main.6d8ceafa.js.map: -------------------------------------------------------------------------------- 1 | 
{"version":3,"sources":["../webpack/bootstrap"],"names":["webpackJsonpCallback","data","moduleId","chunkId","chunkIds","moreModules","executeModules","i","resolves","length","Object","prototype","hasOwnProperty","call","installedChunks","push","modules","parentJsonpFunction","shift","deferredModules","apply","checkDeferredModules","result","deferredModule","fulfilled","j","depId","splice","__webpack_require__","s","installedModules","1","exports","module","l","m","c","d","name","getter","o","defineProperty","enumerable","get","r","Symbol","toStringTag","value","t","mode","__esModule","ns","create","key","bind","n","object","property","p","jsonpArray","this","oldJsonpFunction","slice"],"mappings":"aACE,SAASA,EAAqBC,GAQ7B,IAPA,IAMIC,EAAUC,EANVC,EAAWH,EAAK,GAChBI,EAAcJ,EAAK,GACnBK,EAAiBL,EAAK,GAIHM,EAAI,EAAGC,EAAW,GACpCD,EAAIH,EAASK,OAAQF,IACzBJ,EAAUC,EAASG,GAChBG,OAAOC,UAAUC,eAAeC,KAAKC,EAAiBX,IAAYW,EAAgBX,IACpFK,EAASO,KAAKD,EAAgBX,GAAS,IAExCW,EAAgBX,GAAW,EAE5B,IAAID,KAAYG,EACZK,OAAOC,UAAUC,eAAeC,KAAKR,EAAaH,KACpDc,EAAQd,GAAYG,EAAYH,IAKlC,IAFGe,GAAqBA,EAAoBhB,GAEtCO,EAASC,QACdD,EAASU,OAATV,GAOD,OAHAW,EAAgBJ,KAAKK,MAAMD,EAAiBb,GAAkB,IAGvDe,IAER,SAASA,IAER,IADA,IAAIC,EACIf,EAAI,EAAGA,EAAIY,EAAgBV,OAAQF,IAAK,CAG/C,IAFA,IAAIgB,EAAiBJ,EAAgBZ,GACjCiB,GAAY,EACRC,EAAI,EAAGA,EAAIF,EAAed,OAAQgB,IAAK,CAC9C,IAAIC,EAAQH,EAAeE,GACG,IAA3BX,EAAgBY,KAAcF,GAAY,GAE3CA,IACFL,EAAgBQ,OAAOpB,IAAK,GAC5Be,EAASM,EAAoBA,EAAoBC,EAAIN,EAAe,KAItE,OAAOD,EAIR,IAAIQ,EAAmB,GAKnBhB,EAAkB,CACrBiB,EAAG,GAGAZ,EAAkB,GAGtB,SAASS,EAAoB1B,GAG5B,GAAG4B,EAAiB5B,GACnB,OAAO4B,EAAiB5B,GAAU8B,QAGnC,IAAIC,EAASH,EAAiB5B,GAAY,CACzCK,EAAGL,EACHgC,GAAG,EACHF,QAAS,IAUV,OANAhB,EAAQd,GAAUW,KAAKoB,EAAOD,QAASC,EAAQA,EAAOD,QAASJ,GAG/DK,EAAOC,GAAI,EAGJD,EAAOD,QAKfJ,EAAoBO,EAAInB,EAGxBY,EAAoBQ,EAAIN,EAGxBF,EAAoBS,EAAI,SAASL,EAASM,EAAMC,GAC3CX,EAAoBY,EAAER,EAASM,IAClC5B,OAAO+B,eAAeT,EAASM,EAAM,CAAEI,YAAY,EAAMC,IAAKJ,KAKhEX,EAAoBgB,EAAI,SAASZ,GACX,qBAAXa,QAA0BA,OAAOC,aAC1CpC,OAAO+B,eAAeT,EAASa,OAAOC,YAAa,CAAEC,MAAO,WAE7DrC,OAAO+B,eAAeT,EAAS,aAAc,CAAEe,OAAO,KAQvDnB,EAAoBoB,EAAI,SAASD,EAAOE,GAEvC,GADU,EAAPA,IAAUF,EAAQnB,EAAoBmB,IAC/B,EAAPE,EAAU,OAAOF,EACpB,GAAW,EAAPE,GAA8B,kBAAVF,GAAsBA,GAASA,EAAMG,WAAY,OAAOH,EAChF,IAAII,EAAKzC,OAAO0C,OAAO,MAGvB,GAFAxB,EAAoBgB,EAAEO,GACtBzC,OAAO+B,eAAeU,EAAI,UAAW,CAAET,YAAY,EAAMK,MAAOA,IACtD,EAAPE,GAA4B,iBAATF,EAAmB,IAAI,IAAIM,KAAON,EAAOnB,EAAoBS,EAAEc,EAAIE,EAAK,SAASA,GAAO,OAAON,EAAMM,IAAQC,KAAK,KAAMD,IAC9I,OAAOF,GAIRvB,EAAoB2B,EAAI,SAAStB,GAChC,IAAIM,EAASN,GAAUA,EAAOiB,WAC7B,WAAwB,OAAOjB,EAAgB,SAC/C,WAA8B,OAAOA,GAEtC,OADAL,EAAoBS,EAAEE,EAAQ,IAAKA,GAC5BA,GAIRX,EAAoBY,EAAI,SAASgB,EAAQC,GAAY,OAAO/C,OAAOC,UAAUC,eAAeC,KAAK2C,EAAQC,IAGzG7B,EAAoB8B,EAAI,IAExB,IAAIC,EAAaC,KAAsB,gBAAIA,KAAsB,iBAAK,GAClEC,EAAmBF,EAAW5C,KAAKuC,KAAKK,GAC5CA,EAAW5C,KAAOf,EAClB2D,EAAaA,EAAWG,QACxB,IAAI,IAAIvD,EAAI,EAAGA,EAAIoD,EAAWlD,OAAQF,IAAKP,EAAqB2D,EAAWpD,IAC3E,IAAIU,EAAsB4C,EAI1BxC,I","file":"static/js/runtime-main.6d8ceafa.js","sourcesContent":[" \t// install a JSONP callback for chunk loading\n \tfunction webpackJsonpCallback(data) {\n \t\tvar chunkIds = data[0];\n \t\tvar moreModules = data[1];\n \t\tvar executeModules = data[2];\n\n \t\t// add \"moreModules\" to the modules object,\n \t\t// then flag all \"chunkIds\" as loaded and fire callback\n \t\tvar moduleId, chunkId, i = 0, resolves = [];\n \t\tfor(;i < chunkIds.length; i++) {\n \t\t\tchunkId = chunkIds[i];\n \t\t\tif(Object.prototype.hasOwnProperty.call(installedChunks, chunkId) && installedChunks[chunkId]) {\n \t\t\t\tresolves.push(installedChunks[chunkId][0]);\n \t\t\t}\n 
\t\t\tinstalledChunks[chunkId] = 0;\n \t\t}\n \t\tfor(moduleId in moreModules) {\n \t\t\tif(Object.prototype.hasOwnProperty.call(moreModules, moduleId)) {\n \t\t\t\tmodules[moduleId] = moreModules[moduleId];\n \t\t\t}\n \t\t}\n \t\tif(parentJsonpFunction) parentJsonpFunction(data);\n\n \t\twhile(resolves.length) {\n \t\t\tresolves.shift()();\n \t\t}\n\n \t\t// add entry modules from loaded chunk to deferred list\n \t\tdeferredModules.push.apply(deferredModules, executeModules || []);\n\n \t\t// run deferred modules when all chunks ready\n \t\treturn checkDeferredModules();\n \t};\n \tfunction checkDeferredModules() {\n \t\tvar result;\n \t\tfor(var i = 0; i < deferredModules.length; i++) {\n \t\t\tvar deferredModule = deferredModules[i];\n \t\t\tvar fulfilled = true;\n \t\t\tfor(var j = 1; j < deferredModule.length; j++) {\n \t\t\t\tvar depId = deferredModule[j];\n \t\t\t\tif(installedChunks[depId] !== 0) fulfilled = false;\n \t\t\t}\n \t\t\tif(fulfilled) {\n \t\t\t\tdeferredModules.splice(i--, 1);\n \t\t\t\tresult = __webpack_require__(__webpack_require__.s = deferredModule[0]);\n \t\t\t}\n \t\t}\n\n \t\treturn result;\n \t}\n\n \t// The module cache\n \tvar installedModules = {};\n\n \t// object to store loaded and loading chunks\n \t// undefined = chunk not loaded, null = chunk preloaded/prefetched\n \t// Promise = chunk loading, 0 = chunk loaded\n \tvar installedChunks = {\n \t\t1: 0\n \t};\n\n \tvar deferredModules = [];\n\n \t// The require function\n \tfunction __webpack_require__(moduleId) {\n\n \t\t// Check if module is in cache\n \t\tif(installedModules[moduleId]) {\n \t\t\treturn installedModules[moduleId].exports;\n \t\t}\n \t\t// Create a new module (and put it into the cache)\n \t\tvar module = installedModules[moduleId] = {\n \t\t\ti: moduleId,\n \t\t\tl: false,\n \t\t\texports: {}\n \t\t};\n\n \t\t// Execute the module function\n \t\tmodules[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n \t\t// Flag the module as loaded\n \t\tmodule.l = true;\n\n \t\t// Return the exports of the module\n \t\treturn module.exports;\n \t}\n\n\n \t// expose the modules object (__webpack_modules__)\n \t__webpack_require__.m = modules;\n\n \t// expose the module cache\n \t__webpack_require__.c = installedModules;\n\n \t// define getter function for harmony exports\n \t__webpack_require__.d = function(exports, name, getter) {\n \t\tif(!__webpack_require__.o(exports, name)) {\n \t\t\tObject.defineProperty(exports, name, { enumerable: true, get: getter });\n \t\t}\n \t};\n\n \t// define __esModule on exports\n \t__webpack_require__.r = function(exports) {\n \t\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n \t\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n \t\t}\n \t\tObject.defineProperty(exports, '__esModule', { value: true });\n \t};\n\n \t// create a fake namespace object\n \t// mode & 1: value is a module id, require it\n \t// mode & 2: merge all properties of value into the ns\n \t// mode & 4: return value when already ns object\n \t// mode & 8|1: behave like require\n \t__webpack_require__.t = function(value, mode) {\n \t\tif(mode & 1) value = __webpack_require__(value);\n \t\tif(mode & 8) return value;\n \t\tif((mode & 4) && typeof value === 'object' && value && value.__esModule) return value;\n \t\tvar ns = Object.create(null);\n \t\t__webpack_require__.r(ns);\n \t\tObject.defineProperty(ns, 'default', { enumerable: true, value: value });\n \t\tif(mode & 2 && typeof value != 'string') for(var key in value) 
__webpack_require__.d(ns, key, function(key) { return value[key]; }.bind(null, key));\n \t\treturn ns;\n \t};\n\n \t// getDefaultExport function for compatibility with non-harmony modules\n \t__webpack_require__.n = function(module) {\n \t\tvar getter = module && module.__esModule ?\n \t\t\tfunction getDefault() { return module['default']; } :\n \t\t\tfunction getModuleExports() { return module; };\n \t\t__webpack_require__.d(getter, 'a', getter);\n \t\treturn getter;\n \t};\n\n \t// Object.prototype.hasOwnProperty.call\n \t__webpack_require__.o = function(object, property) { return Object.prototype.hasOwnProperty.call(object, property); };\n\n \t// __webpack_public_path__\n \t__webpack_require__.p = \"/\";\n\n \tvar jsonpArray = this[\"webpackJsonpapp\"] = this[\"webpackJsonpapp\"] || [];\n \tvar oldJsonpFunction = jsonpArray.push.bind(jsonpArray);\n \tjsonpArray.push = webpackJsonpCallback;\n \tjsonpArray = jsonpArray.slice();\n \tfor(var i = 0; i < jsonpArray.length; i++) webpackJsonpCallback(jsonpArray[i]);\n \tvar parentJsonpFunction = oldJsonpFunction;\n\n\n \t// run deferred modules from other chunks\n \tcheckDeferredModules();\n"],"sourceRoot":""} -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/main.8e11e6a5.chunk.js: -------------------------------------------------------------------------------- 1 | (this.webpackJsonpapp=this.webpackJsonpapp||[]).push([[0],{245:function(e,n,t){"use strict";t.r(n);var a=t(0),l=t.n(a),r=t(2),o=t.n(r),c=(t(44),t(7)),u=(t(45),t(4)),i=t(20),s=t(5),m=t(30),p=t.n(m),d=t(249),f=t(248),_=t(27),b=t(9),g=t(32);function v(){var e=Object(u.a)(["\n color: darkblue;\n background: rgb(248, 248, 248);\n border-radius: 0px 8px 8px 0px;\n text-align: left;\n padding: 0px 30px;\n font-size: 0.8em;\n overflow: auto;\n"]);return v=function(){return e},e}function E(){var e=Object(u.a)(["\n padding: 0px 30px 50px 30px;\n color: darkblue;\n text-align: left;\n overflow: auto;\n h4 {\n padding: 0px;\n margin: 0px;\n color: #505050;\n font-weight: normal;\n margin-bottom: 5px;\n }\n\n .completness {\n display: grid;\n grid-template-columns: 1fr 2fr;\n grid-gap: 8px;\n }\n\n .ruleGroup {\n padding: 0.5rem;\n border: 1px solid #1890ff;\n border-radius: 4px;\n background: rgba(180, 220, 255, 0.2);\n\n .rule,\n .ruleGroup {\n margin-top: 0.5rem;\n margin-left: 0.5rem;\n }\n\n .ruleGroup-combinators.betweenRules {\n margin-top: 0.5rem;\n }\n\n .ruleGroup-notToggle {\n margin-right: 0.5rem;\n }\n }\n"]);return E=function(){return e},e}function h(){var e=Object(u.a)(["\n height: 70vh;\n border: 1px solid lightgray;\n border-radius: 8px;\n display: grid;\n grid-template-columns: repeat(2, 1fr);\n"]);return h=function(){return e},e}function O(){var e=Object(u.a)(["\n height: 30px;\n background: #1890ff;\n border-radius: 4px;\n border: none;\n color: white;\n line-height: 1.5;\n font-weight: bold;\n cursor: pointer;\n width: 200px;\n margin: 30px;\n"]);return O=function(){return e},e}var y=s.a.button(O()),N=s.a.div(h()),A=s.a.div(E()),x=s.a.div(v()),L=function(e){var n=e.name,t=e.type;return{operators:[{name:"Is NULL",label:"Is NULL"},{name:"Is NOT NULL",label:"Is NOT NULL"},{name:"= ''",label:"Is EMPTY"},{name:"!= ''",label:"Is NOT 
EMPTY"},{name:"In",label:"In"},{name:"=",label:"="},{name:"!=",label:"!="},{name:"<",label:"<"},{name:">",label:">"},{name:"<=",label:"<="},{name:">=",label:">="}],fields:[{name:n,label:n},{name:"CHAR_LENGTH(".concat(n,")"),label:"CHAR_LENGTH(".concat(n,")")}],getControlElements:function(){return{valueEditor:function(e){var a=e.field,r=e.operator,o=e.handleOnChange,c=e.value,u=r.toLowerCase();return u.startsWith("is")||["= ''","!= ''"].includes(u)?"":"number"===t||a==="CHAR_LENGTH(".concat(n,")")?l.a.createElement("input",{value:c,onChange:o}):l.a.createElement("input",{onChange:o})}}}}},S=function(e,n,t,a){var l=function e(n,t){console.log(n);var a="",l={null:"IS NULL",notNull:"IS NOT NULL",contains:"LIKE"},r=n.rules,o=n.combinator;return r.forEach((function(n){var r=n.field,c=void 0===r?"":r,u=n.operator,i=void 0===u?"":u,s=n.value,m=n.rules,p=void 0===m?[]:m;console.log(c);var d=t&&""===a?"":o;a+=p.length?e(n):" ".concat(d," ").concat(c," ").concat(l[i]||i," ").concat(s," ")})),a}(n,!0).replace(/\s\s+/g," ").trim();a(t.map((function(n){return n.column===e?{column:n.column,rule:l}:n})))},k=function(e){var n=e.table,t=void 0===n?{columns:[]}:n,r=Object(a.useState)([]),o=Object(c.a)(r,2),u=o[0],s=o[1],m=Object(a.useState)([]),v=Object(c.a)(m,2),E=v[0],h=v[1],O=Object(b.useToasts)().addToast,k=Object(a.useState)(""),j=Object(c.a)(k,2),I=j[0],T=j[1],D=Object(a.useState)(""),M=Object(c.a)(D,2),w=M[0],C=M[1];Object(a.useEffect)((function(){s(t.columns.map((function(e){return{column:e.name,rule:""}})))}),[t]);t.columns.reduce((function(e,n){return e[n.name]=n.type,e}),{});var R,P=(R={correctness_validations:u,completeness_validations:[{column:"OVER_ALL_COUNT",rule:"OVER_ALL_COUNT "+I+" "+w}],parent_children_constraints:E},{source_table:{name:"mock_data/family_sample.csv",id_column:"ID",output_correctness_table:"/tmp/mock_data/output/family_sample_test_correctness",output_completeness_table:"/tmp/mock_data/output/family_sample_test_completeness",output_comparison_table:"/tmp/mock_data/output/family_sample_test_comparison",unique_column_group_values_per_table:["ID","NAME","FAMILY_NAME","PARENT"],fuzzy_deduplication_distance:0},correctness_validations:Object(i.a)(R.correctness_validations),completeness_validations:Object(i.a)(R.completeness_validations),parent_children_constraints:Object(i.a)(R.parent_children_constraints),compare_related_tables_list:["test.diff_df","test.diff_df_2"]});return l.a.createElement(a.Fragment,null,0==t.columns.length&&l.a.createElement("h3",null," Please select table to add rules"),0!=t.columns.length&&l.a.createElement(N,null,l.a.createElement(A,null,l.a.createElement("h1",null,"Editor"),l.a.createElement("h3",null,"Correctness validations"),t.columns.map((function(e){return l.a.createElement("div",{key:e.name},l.a.createElement("h4",null,e.name,": [",e.type,"]"),l.a.createElement(p.a,{fields:L(e).fields,onQueryChange:function(n){return S(e.name,n,u,s)}}),l.a.createElement("br",null))})),l.a.createElement("h3",null,"Completeness validations"),l.a.createElement("h4",null,"Number of rows"),l.a.createElement("div",{class:"completness"},l.a.createElement(_.a,{onChange:function(e){return T(e.value)},options:[{value:">",label:">"},{value:">=",label:">="},{value:"=",label:"="},{value:"<",label:"<"},{value:"<=",label:"<="}]}),l.a.createElement("input",{type:"number",onChange:function(e){return C(e.target.value)}})),l.a.createElement("h3",null,"Parent of constraints"),t.columns.map((function(e){return l.a.createElement("div",{key:e.name},l.a.createElement("h4",null,e.name,": 
[",e.type,"]"),l.a.createElement(_.a,{isMulti:!0,onChange:function(n){var t=(n||[]).map((function(n){return{column:e.name,parent:n.value}})),a=E.filter((function(n){return n.column!=e.name}));h(t.concat(a))},options:t.columns.filter((function(n){return n.name!=e.name})).map((function(e){return{value:e.name,label:e.name}}))}),l.a.createElement("br",null))}))),l.a.createElement(x,null,l.a.createElement("h1",null,"Output"),l.a.createElement(d.a,{language:"JSON",style:f.a},JSON.stringify(P,null,2)))),0!=t.columns.length&&l.a.createElement(y,{onClick:function(){fetch("api/validate",{body:JSON.stringify(g,null,2),method:"POST"}),O("Succesfully submited",{appearance:"success",autoDismiss:!0,autoDismissTimeout:3e3})}},"Submit"))},j=t(37),I=t(36),T=t.n(I);function D(){var e=Object(u.a)(["\n display: grid;\n grid-template-columns: 1fr 1fr;\n justify-items: center;\n align-items: center;\n font-weight: bold;\n color: darkblue;\n"]);return D=function(){return e},e}function M(){var e=Object(u.a)(["\n width: 60px;\n"]);return M=function(){return e},e}function w(){var e=Object(u.a)(["\n height: 30px;\n width: 100px;\n background: ",";\n color: ",";\n border-radius: 4px;\n border: 1px solid #1890ff;\n line-height: 1.5;\n font-weight: bold;\n cursor: pointer;\n"]);return w=function(){return e},e}function C(){var e=Object(u.a)(["\n height: 60px;\n border-radius: 8px;\n border: 1px solid lightgrey;\n display: grid;\n justify-items: center;\n align-items: center;\n grid-template-columns: repeat(3, 1fr);\n margin-bottom: 40px;\n"]);return C=function(){return e},e}var R=s.a.div(C()),P=s.a.button(w(),(function(e){return"secondary"===e.type?"white":"#1890ff"}),(function(e){return"secondary"===e.type?"#1890ff":"white"})),U={name:"family_tree.csv",columns:[{name:"ID",type:"number"},{name:"NAME",type:"string"},{name:"FAMILY_NAME",type:"string"},{name:"PARENT",type:"string"},{name:"ADDRESS",type:"string"}]},G=s.a.img(M()),H=s.a.div(D()),Y={columns:[]},F=function(e){var n=e.onTableSelected,t=Object(b.useToasts)().addToast,r=Object(a.useState)(Object(j.a)({},Y)),o=Object(c.a)(r,2),u=o[0],i=o[1];return l.a.createElement(R,null,l.a.createElement(P,{onClick:function(){n(U),i(U),t("Succesfully uploaded",{appearance:"success",autoDismiss:!0,autoDismissTimeout:3e3})}},"Upload"),l.a.createElement("span",null,l.a.createElement("b",null,u.name||"No table selected")),l.a.createElement(H,null,"ValiData ",l.a.createElement(G,{src:T.a})))};var J=function(){var e=Object(a.useState)({columns:[]}),n=Object(c.a)(e,2),t=n[0],r=n[1];return l.a.createElement(b.ToastProvider,null,l.a.createElement("div",{className:"App"},l.a.createElement(F,{onTableSelected:function(e){return r(e)}}),l.a.createElement(k,{table:t})))};Boolean("localhost"===window.location.hostname||"[::1]"===window.location.hostname||window.location.hostname.match(/^127(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}$/));o.a.render(l.a.createElement(l.a.StrictMode,null,l.a.createElement(J,null)),document.getElementById("root")),"serviceWorker"in 
navigator&&navigator.serviceWorker.ready.then((function(e){e.unregister()})).catch((function(e){console.error(e.message)}))},32:function(e){e.exports=JSON.parse('{"source_table":{"name":"lib/test/spark_validation_tests/common/mock_data/family_sample.csv","id_column":"ID","output_correctness_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_correctness","output_completeness_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_completeness","output_comparison_table":"lib/test/spark_validation_tests/common/mock_data/output/fs/family_sample_test_comparison","unique_column_group_values_per_table":["ID","NAME","FAMILY_NAME","PARENT"],"fuzzy_deduplication_distance":0},"correctness_validations":[{"column":"ID","rule":"ID is not null and ID != \'\' and ID != \'null\'"},{"column":"NAME","rule":"NAME is not null and NAME != \'\' and NAME != \'null\' and NAME like \'%ho%\'"},{"column":"FAMILY_NAME","rule":"FAMILY_NAME is not null and FAMILY_NAME in (\'Cha\', \'Pha\')"},{"column":"ADDRESS","rule":"ADDRESS is not null and ADDRESS != \'\' and ADDRESS != \'null\' and CHAR_LENGTH(ADDRESS) > 4"}],"completeness_validations":[{"column":"OVER_ALL_COUNT","rule":"OVER_ALL_COUNT <= 5"}],"parent_children_constraints":[{"column":"ID","parent":"PARENT"}],"compare_related_tables_list":["lib/test/spark_validation_tests/common/mock_data/family_sample_diff.csv"]}')},36:function(e,n,t){e.exports=t.p+"static/media/logo.50e8e5ec.png"},39:function(e,n,t){e.exports=t(245)},44:function(e,n,t){},45:function(e,n,t){}},[[39,1,2]]]); 2 | //# sourceMappingURL=main.8e11e6a5.chunk.js.map 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Owl Data Sanitizer: A light Spark data validation framework 2 | 3 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/ronald-smith-angel/owl-data-sanitizer/blob/develop/license.md) 4 | [![Build Status](https://travis-ci.org/ronald-smith-angel/owl-data-sanitizer.svg?branch=develop)](https://travis-ci.org/github/ronald-smith-angel/owl-data-sanitizer) 5 | 6 | This is a small framework for data quality validation. This first version works reading spark dataframes from local 7 | datasources like local system, s3 or hive and delivers hive tables with quality reports. 8 | 9 | Let's follow this example: 10 | 11 | Input data from a hive table: 12 | 13 | ``` 14 | +----------+--------------+--------+---------+------------------+---------+ 15 | |GENERAL_ID| NAME| CODE|ADDR_DESC|ULTIMATE_PARENT_ID|PARENT_ID| 16 | +----------+--------------+--------+---------+------------------+---------+ 17 | | 1|Dummy 1 Entity|12000123| null| null| null| 18 | | 2| null| null| null| 2| 2| 19 | | 3| null|12000123| null| 3| 3| 20 | | 4| 1| 1| null| 4| 4| 21 | | 5| 1|12000123| null| 5| 5| 22 | | 6| null| 3| null| 6| 6| 23 | | null| null|12000123| null| 11| 7| 24 | | 7| 2| null| null| 8| 8| 25 | +----------+--------------+--------+---------+------------------+---------+ 26 | ``` 27 | 28 | following this validation config with 4 sections: 29 | 30 | 1. `source_table` including the table metadata. 31 | 2. `correctness_validations` including correctness validations per column. 32 | the rule must be a valid spark SQL expression. 33 | 3. `parent_children_constraints` including children parent constrains. 34 | This means that any parent id should be valid id. 35 | 4. 
`compare_related_tables_list` including comparison with other tables or 36 | the same table in other environments. 37 | 38 | ``` 39 | { 40 | "source_table": { 41 | "name": "test.data_test", 42 | "id_column": "GENERAL_ID", 43 | "unique_column_group_values_per_table": ["GENERAL_ID", "ULTIMATE_PARENT_ID"], 44 | "fuzzy_deduplication_distance": 0, 45 | "output_correctness_table": "test.data_test_correctness", 46 | "output_completeness_table": "test.data_test_completeness", 47 | "output_comparison_table": "test.data_test_comparison" 48 | }, 49 | "correctness_validations": [ 50 | { 51 | "column": "CODE", 52 | "rule": "CODE is not null and CODE != '' and CODE != 'null'" 53 | }, 54 | { 55 | "column": "NAME", 56 | "rule": "NAME is not null and NAME != '' and NAME != 'null'" 57 | }, 58 | { 59 | "column": "GENERAL_ID", 60 | "rule": "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and CHAR_LENGTH(GENERAL_ID) < 4" 61 | } 62 | ], 63 | "completeness_validations": [ 64 | { 65 | "column": "OVER_ALL_COUNT", 66 | "rule": "OVER_ALL_COUNT <= 7" 67 | } 68 | ], 69 | "parent_children_constraints": [ 70 | { 71 | "column": "GENERAL_ID", 72 | "parent": "ULTIMATE_PARENT_ID" 73 | }, 74 | { 75 | "column": "GENERAL_ID", 76 | "parent": "PARENT_ID" 77 | } 78 | ], 79 | "compare_related_tables_list": ["test.diff_df", "test.diff_df_2"] 80 | } 81 | ``` 82 | 83 | Therefore, these results are delivered in two output hive tables: 84 | 85 | a). Correctness Report. 86 | 87 | - You will see and output col per validation col showing either 1 when there is error or 0 when is clean. 88 | - Sum of error per columns. 89 | 90 | ``` 91 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 92 | |GENERAL_ID|IS_ERROR_CODE|IS_ERROR_NAME|IS_ERROR_GENERAL_ID|IS_ERROR_GENERAL_ID_ULTIMATE_PARENT_ID|IS_ERROR_GENERAL_ID_PARENT_ID|IS_ERROR__ROW|dt |IS_ERROR_CODE_SUM|IS_ERROR_NAME_SUM|IS_ERROR_GENERAL_ID_SUM|IS_ERROR_GENERAL_ID_ULTIMATE_PARENT_ID_SUM|IS_ERROR_GENERAL_ID_PARENT_ID_SUM|IS_ERROR__ROW_SUM| 93 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 94 | |null |0 |1 |1 |1 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 95 | |3 |0 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 96 | |7 |1 |0 |0 |1 |1 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 97 | |5 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 98 | |6 |0 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 99 | |4 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 100 | |2 |1 |1 |0 |0 |0 |1 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 101 | |1 |0 |0 |0 |0 |0 |0 |2020-04-17 09:39:04.783505|2 |4 |1 |2 |1 |5 | 102 | +----------+-------------+-------------+-------------------+--------------------------------------+-----------------------------+-------------+--------------------------+-----------------+-----------------+-----------------------+------------------------------------------+---------------------------------+-----------------+ 103 | ``` 104 | b) Completeness 
Report.
105 | - The overall count of the dataframe.
106 | - A column checking whether the overall count is complete, for example `IS_ERROR_OVER_ALL_COUNT`.
107 | ```
108 | +--------------+-----------------------+--------------------------+
109 | |OVER_ALL_COUNT|IS_ERROR_OVER_ALL_COUNT|dt                        |
110 | +--------------+-----------------------+--------------------------+
111 | |8             |1                      |2020-04-17 09:39:04.783505|
112 | +--------------+-----------------------+--------------------------+
113 | ```
114 | 
115 | c). Comparison of schema and values with related dataframes.
116 | 
117 | NOTE: for now the result includes only the ids that differ; a further
118 | join with the source data is needed to see the actual differences.
119 | 
120 | ```
121 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
122 | |df            |missing_cols_right                |missing_cols_left|missing_vals_right|missing_vals_left|dt                        |
123 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
124 | |test.diff_df_2|GENERAL_ID:string,ADDR_DESC:string|GENERAL_ID:int   |                  |                 |2020-04-17 09:39:07.572483|
125 | |test.diff_df  |                                  |                 |6,7               |                 |2020-04-17 09:39:07.572483|
126 | +--------------+----------------------------------+-----------------+------------------+-----------------+--------------------------+
127 | ```
128 | 
129 | ## Installation
130 | 
131 | Install Owl Data Sanitizer from PyPI:
132 | 
133 | ```pip install owl-sanitizer-data-quality```
134 | 
135 | Then you can call the library:
136 | 
137 | ```
138 | from pyspark.sql import SparkSession
139 | 
140 | from spark_validation.common.config import Config
141 | from spark_validation.dataframe_validation.dataframe_validator import CreateHiveValidationDF
142 | 
143 | spark_session = SparkSession.builder.enableHiveSupport().getOrCreate()
144 | with open(PATH_TO_CONFIG_FILE) as f:
145 |     config = Config.parse(f)
146 | CreateHiveValidationDF.validate(spark_session, config)
147 | ```
148 | 
149 | To use it in your spark-submit command or Airflow DAG:
150 | 
151 | - Add `py_files`: `[https://pypi.org/project/owl-sanitizer-data-quality/latest/]`.
152 | - `application`: `owl-sanitizer-data-quality/latest/src/spark_validation/dataframe_validation/hive_validator.py`
153 | - `application_package`: `https://pypi.org/project/owl-sanitizer-data-quality/latest/owl-sanitizer-data-quality-latest.tar.gz`
154 | - `application_params`: `URL_TO_YOUR_REMOTE_CONFIG_FILE`
155 | 
156 | Contact
157 | -------
158 | 
159 | Please ask questions about technical issues here on GitHub.
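## Example: validating local CSV files

The same config-driven flow also works without Hive, through the file-system validator shipped in this repository (`spark_validation/dataframe_validation/file_system_validator.py`): it reads the source and comparable tables as CSV, writes the correctness, completeness, and comparison reports as JSON, and renders a pivot-table HTML file next to each output path. The snippet below is a minimal sketch of that path; `PATH_TO_CONFIG_FILE` is a placeholder for your own config file, and the example assumes the package is installed as shown above.

```
from pyspark.sql import SparkSession

from spark_validation.common.config import Config
from spark_validation.dataframe_validation.file_system_validator import CreateFSValidationDF

# Placeholder: point this to your own JSON or YAML config (Config.parse accepts both),
# e.g. one modelled on lib/test/spark_validation_tests/common/mock_data/config_example.json.
PATH_TO_CONFIG_FILE = "config_example.json"

spark_session = SparkSession.builder.getOrCreate()
with open(PATH_TO_CONFIG_FILE) as f:
    config = Config.parse(f)

# Runs the whole validation and writes the three report tables plus HTML pivot views.
CreateFSValidationDF.validate(spark_session, config)
```

The module can also be run as a script with `-c PATH_TO_CONFIG_FILE` (see `create_parser` and `init` in `file_system_validator.py`), which is how the integration tests drive it.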
-------------------------------------------------------------------------------- /lib/test/spark_validation_tests/common/general_handler_test.py: -------------------------------------------------------------------------------- 1 | """Module with general function tests for the GeneralDFHandler.""" 2 | import os 3 | import sys 4 | import unittest 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from spark_validation.common.config import Config 9 | from spark_validation.common.constants import Constants 10 | from spark_validation.dataframe_validation import file_system_validator 11 | from spark_validation.dataframe_validation import hive_validator 12 | from spark_validation.dataframe_validation.dataframe_validator import DataframeValidator 13 | from spark_validation_tests.common.pyspark_test import PySparkTest 14 | 15 | PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | 18 | class GeneralHandlerTest(PySparkTest): 19 | """Class with general function tests for the GeneralDFHandler.""" 20 | 21 | TEST_DATABASE_NAME = "test" 22 | 23 | def setUp(self): 24 | """Init test db for grid.""" 25 | self.spark.sql( 26 | "CREATE DATABASE IF NOT EXISTS {}".format( 27 | GeneralHandlerTest.TEST_DATABASE_NAME 28 | ) 29 | ) 30 | 31 | @classmethod 32 | def setUpClass(cls): 33 | """Init the shared values for the tests.""" 34 | super(GeneralHandlerTest, cls).setUpClass() 35 | cls.spark.sql( 36 | "CREATE DATABASE IF NOT EXISTS {}".format( 37 | GeneralHandlerTest.TEST_DATABASE_NAME 38 | ) 39 | ) 40 | 41 | cls.source_df = cls._create_source_df( 42 | PACKAGE_DIR + "/mock_data/data_sample.csv" 43 | ) 44 | cls.grid_diff_df = cls._create_source_df( 45 | PACKAGE_DIR + "/mock_data/data_sample_diff.csv" 46 | ) 47 | cls.grid_diff_2_df = cls._create_source_df( 48 | PACKAGE_DIR + "/mock_data/data_sample_diff_2.csv" 49 | ) 50 | 51 | @classmethod 52 | def _create_source_df(cls, csv_file): 53 | return ( 54 | cls.spark.read.option("delimiter", ",") 55 | .option("header", True) 56 | .option("inferSchema", True) 57 | .option("mode", "PERMISSIVE") 58 | .csv(csv_file) 59 | ) 60 | 61 | def test_grid_validator_process(self): 62 | """Integration test for rule set defined in mock config file.""" 63 | test_rules = { 64 | "CODE": """CODE is not null and CODE != '' and CODE != 'null'""", 65 | "NAME": """NAME is not null and NAME != '' and NAME != 'null'""", 66 | "GENERAL_ID": ( 67 | "GENERAL_ID is not null and GENERAL_ID != '' and GENERAL_ID != 'null' and" 68 | " CHAR_LENGTH(GENERAL_ID) < 4" 69 | ), 70 | "ULTIMATE_PARENT_ID": """ULTIMATE_PARENT_ID is not null""", 71 | "PARENT_ID": """PARENT_ID is not null""", 72 | } 73 | 74 | parent_rules = [ 75 | ("GENERAL_ID", "ULTIMATE_PARENT_ID"), 76 | ("GENERAL_ID", "PARENT_ID"), 77 | ] 78 | 79 | completeness_rules = {"OVER_ALL_COUNT": """OVER_ALL_COUNT <= 7"""} 80 | 81 | validator = DataframeValidator( 82 | spark=self.spark, 83 | source_df=self.source_df, 84 | id_col_name="GENERAL_ID", 85 | correctness_rules_dict=test_rules, 86 | parent_children_validation_pairs=parent_rules, 87 | completeness_rules_dic=completeness_rules, 88 | comparable_dfs_list=[ 89 | ("diff_df", self.grid_diff_df), 90 | ("diff_df_2", self.grid_diff_2_df), 91 | ], 92 | ) 93 | 94 | processed_df = validator.process() 95 | 96 | comparable_df = validator.compare() 97 | 98 | self.assertEqual(processed_df.count(), 8) 99 | self.assertEqual(comparable_df.count(), 2) 100 | 101 | def test_integration_hive_validator(self): 102 | """Integration test for rule set defined in mock config file.""" 103 | with open(PACKAGE_DIR + 
"/mock_data/config_example.yaml") as f: 104 | config = Config.parse(f) 105 | 106 | self.source_df.write.saveAsTable(config.source_df) 107 | self.grid_diff_df.write.saveAsTable(config.comparable_dfs_list[0]) 108 | self.grid_diff_2_df.write.saveAsTable(config.comparable_dfs_list[1]) 109 | 110 | source_read_df = self.spark.table(config.source_df) 111 | comparable_dfs_list = [ 112 | (t, self.spark.table(t)) for t in config.comparable_dfs_list 113 | ] 114 | 115 | validator = DataframeValidator( 116 | spark=self.spark, 117 | source_df=source_read_df, 118 | id_col_name=config.id_col_name, 119 | correctness_rules_dict=config.correctness_rules_dict, 120 | parent_children_validation_pairs=config.parent_children_validation_pairs, 121 | completeness_rules_dic=config.completeness_rules_dic, 122 | comparable_dfs_list=comparable_dfs_list, 123 | ) 124 | 125 | processed_df = validator.process() 126 | comparable_df = validator.compare() 127 | 128 | self.assertEqual(processed_df.count(), 8) 129 | self.assertEqual(comparable_df.count(), 2) 130 | 131 | self.spark.sparkContext.addFile(PACKAGE_DIR + "/mock_data/config_example.json") 132 | sys.argv = ["example.py", "-c", PACKAGE_DIR + "/mock_data/config_example.json"] 133 | 134 | hive_validator.init() 135 | 136 | correctness_table = self.spark.table(config.output_correctness_table) 137 | completeness_table = self.spark.table(config.output_completeness_table) 138 | comparison_table = self.spark.table(config.output_comparison_table) 139 | 140 | # Correctness validations. 141 | _is_error_name = Constants.IS_ERROR_COL + "NAME" + Constants.SUM_REPORT_SUFFIX 142 | _sum_errors_col = ( 143 | Constants.IS_ERROR_COL 144 | + Constants.ROW_ERROR_SUFFIX 145 | + Constants.SUM_REPORT_SUFFIX 146 | ) 147 | self.assertEqual(correctness_table.count(), 8) 148 | 149 | self.assertEqual( 150 | correctness_table.select(_is_error_name).first()[_is_error_name], 4 151 | ) 152 | self.assertEqual( 153 | correctness_table.select(_sum_errors_col).first()[_sum_errors_col], 5 154 | ) 155 | 156 | # Completeness validations. 157 | _is_error_count_over_all = Constants.IS_ERROR_COL + Constants.OVER_ALL_COUNT_COL 158 | self.assertEqual( 159 | completeness_table.select(_is_error_count_over_all).first()[ 160 | _is_error_count_over_all 161 | ], 162 | 1, 163 | ) 164 | 165 | # Comparison validations. 
166 | 167 | self.assertEqual( 168 | comparison_table.filter( 169 | F.col(Constants.REPORT_DF_COL) == config.comparable_dfs_list[0] 170 | ) 171 | .select(Constants.MISSING_VALS_RIGHT_COL) 172 | .first()[Constants.MISSING_VALS_RIGHT_COL], 173 | "6,7", 174 | ) 175 | 176 | self.assertEqual( 177 | comparison_table.filter( 178 | F.col(Constants.REPORT_DF_COL) == config.comparable_dfs_list[1] 179 | ) 180 | .select(Constants.MISSING_COLS_LEFT_COL) 181 | .first()[Constants.MISSING_COLS_LEFT_COL], 182 | "GENERAL_ID:int", 183 | ) 184 | 185 | def test_integration_fs_validator(self): 186 | """Integration test for rule set defined in mock config file.""" 187 | with open(PACKAGE_DIR + "/mock_data/config_example_local.json") as f: 188 | config = Config.parse(f) 189 | 190 | config.source_df = PACKAGE_DIR + config.source_df 191 | config.output_correctness_table = PACKAGE_DIR + config.output_completeness_table 192 | config.output_completeness_table = ( 193 | PACKAGE_DIR + config.output_completeness_table 194 | ) 195 | config.output_comparison_table = PACKAGE_DIR + config.output_comparison_table 196 | config.comparable_dfs_list = list( 197 | map(lambda x: PACKAGE_DIR + x, config.comparable_dfs_list) 198 | ) 199 | 200 | self.spark.sparkContext.addFile( 201 | PACKAGE_DIR + "/mock_data/config_example_local.json" 202 | ) 203 | sys.argv = [ 204 | "example.py", 205 | "-c", 206 | PACKAGE_DIR + "/mock_data/config_example_local.json", 207 | ] 208 | 209 | file_system_validator.init() 210 | 211 | correctness_table = self.spark.read.json( 212 | "/tmp/mock_data/output/data_sample_test_correctness" 213 | ) 214 | completeness_table = self.spark.read.json( 215 | "/tmp/mock_data/output/data_sample_test_completeness" 216 | ) 217 | comparison_table = self.spark.read.json( 218 | "/tmp/mock_data/output/data_sample_test_comparison" 219 | ) 220 | 221 | self.assertTrue(correctness_table.count() >= 8) 222 | self.assertTrue(completeness_table.count() >= 1) 223 | self.assertTrue(comparison_table.count() >= 1) 224 | 225 | def test_sample_case_integration_fs_validator(self): 226 | """Integration test for rule set defined in mock config file.""" 227 | with open(PACKAGE_DIR + "/mock_data/config_family_fs.json") as f: 228 | config = Config.parse(f) 229 | 230 | config.source_df = PACKAGE_DIR + config.source_df 231 | config.output_correctness_table = PACKAGE_DIR + config.output_completeness_table 232 | config.output_completeness_table = ( 233 | PACKAGE_DIR + config.output_completeness_table 234 | ) 235 | config.output_comparison_table = PACKAGE_DIR + config.output_comparison_table 236 | config.comparable_dfs_list = list( 237 | map(lambda x: PACKAGE_DIR + x, config.comparable_dfs_list) 238 | ) 239 | 240 | self.spark.sparkContext.addFile( 241 | PACKAGE_DIR + "/mock_data/config_family_fs.json" 242 | ) 243 | sys.argv = [ 244 | "example.py", 245 | "-c", 246 | PACKAGE_DIR + "/mock_data/config_family_fs.json", 247 | ] 248 | 249 | file_system_validator.init() 250 | 251 | correctness_table = self.spark.read.json( 252 | "/tmp/mock_data/output/family_sample_test_correctness" 253 | ) 254 | completeness_table = self.spark.read.json( 255 | "/tmp/mock_data/output/family_sample_test_completeness" 256 | ) 257 | comparison_table = self.spark.read.json( 258 | "/tmp/mock_data/output/family_sample_test_comparison" 259 | ) 260 | 261 | self.assertTrue(correctness_table.count() >= 6) 262 | self.assertTrue(completeness_table.count() >= 1) 263 | self.assertTrue(comparison_table.count() >= 1) 264 | 265 | @classmethod 266 | def tearDownClass(cls): 267 | """Remove spark 
tables for testing.""" 268 | cls.spark.sql( 269 | "drop database if exists {} cascade".format( 270 | GeneralHandlerTest.TEST_DATABASE_NAME 271 | ) 272 | ).collect() 273 | 274 | def tearDown(self): 275 | """Remove test databases and tables after every test.""" 276 | self.spark.sql( 277 | "drop database if exists {} cascade".format( 278 | GeneralHandlerTest.TEST_DATABASE_NAME 279 | ) 280 | ).collect() 281 | 282 | 283 | if __name__ == "__main__": 284 | unittest.main() 285 | -------------------------------------------------------------------------------- /lib/src/spark_validation/common/general_validator.py: -------------------------------------------------------------------------------- 1 | """This module exposes a general interface with common df functions across all the pipelines. 2 | 3 | This function could be extensible to create specific handlers. For instance: PandasDataHandler(GeneralDFHandler). 4 | 5 | """ 6 | import datetime 7 | from abc import ABC 8 | from functools import reduce 9 | 10 | from pyspark.sql import Window 11 | from pyspark.sql import functions as F 12 | from pyspark.sql.dataframe import DataFrame 13 | 14 | from spark_validation.common.constants import Constants 15 | 16 | 17 | class GeneralDFValidator(ABC): 18 | """Class with general handlers functions.""" 19 | 20 | def transform(self, f): 21 | """Wrap the transform spark function non available for python.""" 22 | return f(self) 23 | 24 | @staticmethod 25 | def rename_cols(df, transformation_map): 26 | """Rename a set of spark columns within a df using a transformation_map dictionary. 27 | 28 | Example: 29 | df: 30 | +--------+--------+ 31 | | col1 | col2 | 32 | |--------+--------+ 33 | | 15 | 76 | 34 | | 30 | 97 | 35 | +--------+--------+ 36 | transformation_map : 37 | {col1: id, col2: code} 38 | return: 39 | +--------+--------+ 40 | | id | code | 41 | |--------+--------+ 42 | | 15 | 76 | 43 | | 30 | 97 | 44 | +--------+--------+ 45 | """ 46 | return reduce( 47 | lambda internal_df, col_name: internal_df.withColumnRenamed( 48 | col_name, transformation_map[col_name] 49 | ), 50 | transformation_map.keys(), 51 | df, 52 | ) 53 | 54 | @staticmethod 55 | def combine_dataframes(sources): 56 | """Join multiple dataframes using the spark union function.""" 57 | return reduce(lambda x, y: x.union(y), sources) 58 | 59 | @staticmethod 60 | def join_cols_with_all_parents(df, parent_validations_pairs): 61 | """Join df with all parent ids to obtain incorrect ids.""" 62 | for col, parent in parent_validations_pairs: 63 | df = GeneralDFValidator.join_grid_with_parent(df, col, parent) 64 | return df 65 | 66 | @staticmethod 67 | def join_grid_with_parent(df, id_col, parent_id_col): 68 | """Join a df with a single parent id to obtain incorrect ids.""" 69 | # Renaming col to avoid spark duplicate cols issues. 70 | _ids_renamed_org_id = id_col + "_" + parent_id_col 71 | parent_ids_df = df.select(id_col).withColumnRenamed(id_col, _ids_renamed_org_id) 72 | 73 | self_joined_df = df.join( 74 | parent_ids_df, 75 | df[parent_id_col] == parent_ids_df[_ids_renamed_org_id], 76 | "left", 77 | ) 78 | 79 | return self_joined_df 80 | 81 | @staticmethod 82 | def _validate_parent_id(df, id_col, parent_id_col): 83 | # Generating proper parent id validation column obtained after join_grid_with_parent = prefix + col_name. 
84 | _ids_renamed_org_id = id_col + "_" + parent_id_col 85 | return df.withColumn( 86 | Constants.IS_ERROR_COL + _ids_renamed_org_id, 87 | F.when( 88 | ( 89 | (F.col(_ids_renamed_org_id).isNotNull()) 90 | | (F.col(parent_id_col).isNull()) 91 | ), 92 | 0, 93 | ).otherwise(1), 94 | ) 95 | 96 | @staticmethod 97 | def add_unique_error(df, id_col, unique_cols): 98 | """ Adding deduplication validation.""" 99 | _unique_cols_list = list(map(lambda c: F.col(c + "_str"), unique_cols)) 100 | _w = Window.partitionBy(Constants.UNIQUE_HASH).orderBy(F.col(id_col).asc()) 101 | 102 | return ( 103 | reduce( 104 | lambda internal_df, col_name: internal_df.withColumn( 105 | col_name + "_str", 106 | ( 107 | F.when( 108 | F.col(col_name).isNotNull(), 109 | F.lower(F.col(col_name).cast("string")), 110 | ).otherwise("") 111 | ), 112 | ), 113 | unique_cols, 114 | df, 115 | ) 116 | .withColumn(Constants.UNIQUE_HASH, F.concat(*_unique_cols_list)) 117 | .withColumn(Constants.COUNT_HASH, F.count(id_col).over(_w)) 118 | .withColumn( 119 | Constants.IS_ERROR_COL + Constants.UNIQUE_HASH, 120 | F.when(F.col(Constants.COUNT_HASH) > 1, 1).otherwise(0), 121 | ) 122 | .orderBy(F.col(id_col).asc()) 123 | ) 124 | 125 | @staticmethod 126 | def build_correctness_report_df(processed_df, validated_cols): 127 | """Build a report df computing column errors. 128 | 129 | 1. Sum of errors per column. 130 | 2. Add an over all row count. 131 | 3. Add a time stamp to this dataframe. 132 | """ 133 | windows_errors = Window.partitionBy(Constants.DATE_TIME_REPORT_COL) 134 | report_df = reduce( 135 | lambda internal_df, col_name: internal_df.transform( 136 | lambda df: df.withColumn( 137 | col_name + Constants.SUM_REPORT_SUFFIX, 138 | F.sum(col_name).over(windows_errors), 139 | ) 140 | ), 141 | validated_cols, 142 | processed_df.withColumn( 143 | Constants.DATE_TIME_REPORT_COL, F.lit(datetime.datetime.now()) 144 | ), 145 | ).withColumn( 146 | Constants.OVER_ALL_COUNT_COL, 147 | F.count(Constants.DATE_TIME_REPORT_COL).over(windows_errors), 148 | ) 149 | 150 | return report_df 151 | 152 | @staticmethod 153 | def build_computed_rules_correctness_df(processed_df, rules_map): 154 | """Build a dataframe with some rules computed. 155 | 156 | :param processed_df: input dataframe. 157 | :param rules_map: a map of rules with format {col_name = spark_sql_expr} 158 | :return: a dataframe with a new column IS_ERROR (1 - ERROR or 0 - NO ERROR) per column on the map. 159 | """ 160 | 161 | return reduce( 162 | lambda internal_df, col_name: internal_df.transform( 163 | lambda df: GeneralDFValidator._compute_col_val_correctness( 164 | df, col_name, rules_map[col_name] 165 | ) 166 | ), 167 | rules_map.keys(), 168 | processed_df, 169 | ) 170 | 171 | @staticmethod 172 | def build_correctness_df( 173 | processed_df, validation_rules_map, parent_validations_pairs 174 | ): 175 | """Build correctness df. 176 | 177 | 1. validate all the rules per column. 178 | 2. return df with error columns. This column will have the following schema: 179 | col_name = Constants.IS_ERROR_COL + col_name. 180 | value = 1 when error, 0 when column is clean. 181 | 3. Add a column with Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX representing error on any 182 | column of the row. 
183 | """ 184 | _list_correctness_cols = list(validation_rules_map.keys()) 185 | _list_cols_parent_validation = list( 186 | set([pair[0] + "_" + pair[1] for pair in parent_validations_pairs]) 187 | ) 188 | _list_cols_parent_cols = list( 189 | set([pair[0] for pair in parent_validations_pairs]) 190 | ) 191 | _error_cols_correctness = list( 192 | map(lambda c: Constants.IS_ERROR_COL + c, validation_rules_map.keys(),) 193 | ) 194 | _error_cols_parents_pairs = list( 195 | map(lambda c: Constants.IS_ERROR_COL + c, _list_cols_parent_validation,) 196 | ) 197 | _list_general_rows_errors = list( 198 | [ 199 | Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX, 200 | Constants.IS_ERROR_COL + Constants.UNIQUE_HASH, 201 | ] 202 | ) 203 | 204 | final_select_cols = ( 205 | _list_correctness_cols 206 | + _list_cols_parent_validation 207 | + _list_cols_parent_cols 208 | + _error_cols_correctness 209 | + _error_cols_parents_pairs 210 | + _list_general_rows_errors 211 | ) 212 | 213 | validate_expr = GeneralDFValidator._generate_validate_errors_expr( 214 | _list_correctness_cols + _list_cols_parent_validation 215 | ) 216 | validated_df = GeneralDFValidator.build_computed_rules_correctness_df( 217 | processed_df, validation_rules_map 218 | ) 219 | 220 | validated_df = ( 221 | reduce( 222 | lambda internal_df, pair_parent: internal_df.transform( 223 | lambda df: GeneralDFValidator._validate_parent_id( 224 | df, pair_parent[0], pair_parent[1] 225 | ) 226 | ), 227 | parent_validations_pairs, 228 | validated_df, 229 | ) 230 | .withColumn( 231 | Constants.IS_ERROR_COL + Constants.ROW_ERROR_SUFFIX, 232 | F.when(F.expr(validate_expr), 1).otherwise(0), 233 | ) 234 | .select(final_select_cols) 235 | ) 236 | return validated_df 237 | 238 | @staticmethod 239 | def _compute_col_val_correctness(df, col_name, col_rule): 240 | # Error column name is generated with error_prefix + col_name. 241 | return df.withColumn( 242 | Constants.IS_ERROR_COL + col_name, F.when(F.expr(col_rule), 0).otherwise(1), 243 | ) 244 | 245 | @staticmethod 246 | def _generate_validate_errors_expr(list_validation_cols): 247 | """Generate SQL exp that validates that there a not error (col_val == 1) on any validation column.""" 248 | return """{}{} == 1 {}""".format( 249 | Constants.IS_ERROR_COL, 250 | list_validation_cols[0], 251 | "".join( 252 | list( 253 | map( 254 | lambda x: " or {}{} == 1".format(Constants.IS_ERROR_COL, x), 255 | list_validation_cols[1:], 256 | ) 257 | ) 258 | ), 259 | ) 260 | 261 | @staticmethod 262 | def compared_with_related_dfs(source_df, id_col_name, map_related_dfs): 263 | """Compare source df with related dfs. 264 | 265 | Obtaining a list per related dfs: 266 | 1. Columns present in source not in related. 267 | 2. Columns present in related in source. 268 | When both previous are empty: 269 | 3. Row values present in source not equal in related. 270 | 4. Row values present in related not equal in source. 
271 | """ 272 | comparison_results = [] 273 | for k, df in map_related_dfs: 274 | missing_cols_right = GeneralDFValidator._missing_values_between_schemas( 275 | source_df.schema, df.schema 276 | ) 277 | missing_cols_left = GeneralDFValidator._missing_values_between_schemas( 278 | df.schema, source_df.schema 279 | ) 280 | 281 | missing_vals_right = GeneralDFValidator._list_different_rows_ids_between_dfs( 282 | source_df, id_col_name, df, missing_cols_right 283 | ) 284 | 285 | missing_vals_left = GeneralDFValidator._list_different_rows_ids_between_dfs( 286 | df, id_col_name, source_df, missing_cols_left 287 | ) 288 | 289 | comparison_results.append( 290 | ( 291 | k, 292 | ",".join(missing_cols_right), 293 | ",".join(missing_cols_left), 294 | ",".join(missing_vals_right), 295 | ",".join(missing_vals_left), 296 | ) 297 | ) 298 | 299 | return comparison_results 300 | 301 | @staticmethod 302 | def _missing_values_between_schemas(schema1, schema2): 303 | return list( 304 | set(list(map(lambda c: c.name + ":" + c.dataType.simpleString(), schema1))) 305 | - set( 306 | list(map(lambda c: c.name + ":" + c.dataType.simpleString(), schema2)) 307 | ) 308 | ) 309 | 310 | @staticmethod 311 | def _list_different_rows_ids_between_dfs( 312 | source_df, id_col_name, related_df, schema_correct 313 | ): 314 | return ( 315 | list( 316 | map( 317 | lambda col: col.__getitem__(id_col_name), 318 | source_df.subtract(related_df).select(id_col_name).collect(), 319 | ) 320 | ) 321 | if not schema_correct 322 | else [] 323 | ) 324 | 325 | 326 | DataFrame.transform = GeneralDFValidator.transform 327 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | init-hook='import sys; sys.path.append("./lib/") 3 | 4 | # A comma-separated list of package or module names from where C extensions may 5 | # be loaded. Extensions are loading into the active Python interpreter and may 6 | # run arbitrary code. 7 | extension-pkg-whitelist= 8 | 9 | # Add files or directories to the blacklist. They should be base names, not 10 | # paths. 11 | ignore=CVS 12 | 13 | # Add files or directories matching the regex patterns to the blacklist. The 14 | # regex matches against base names, not paths. 15 | ignore-patterns= 16 | 17 | # Python code to execute, usually for sys.path manipulation such as 18 | # pygtk.require(). 19 | #init-hook= 20 | 21 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 22 | # number of processors available to use. 23 | jobs=1 24 | 25 | # Control the amount of potential inferred values when inferring a single 26 | # object. This can help the performance when dealing with large functions or 27 | # complex, nested conditions. 28 | limit-inference-results=100 29 | 30 | # List of plugins (as comma separated values of python module names) to load, 31 | # usually to register additional checkers. 32 | load-plugins= 33 | 34 | # Pickle collected data for later comparisons. 35 | persistent=yes 36 | 37 | # Specify a configuration file. 38 | #rcfile= 39 | 40 | # When enabled, pylint would attempt to guess common misconfiguration and emit 41 | # user-friendly hints instead of false-positive error messages. 42 | suggestion-mode=yes 43 | 44 | # Allow loading of arbitrary C extensions. Extensions are imported into the 45 | # active Python interpreter and may run arbitrary code. 
46 | unsafe-load-any-extension=no 47 | 48 | 49 | [MESSAGES CONTROL] 50 | 51 | # Only show warnings with the listed confidence levels. Leave empty to show 52 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 53 | confidence= 54 | 55 | # Disable the message, report, category or checker with the given id(s). You 56 | # can either give multiple identifiers separated by comma (,) or put this 57 | # option multiple times (only on the command line, not in the configuration 58 | # file where it should appear only once). You can also use "--disable=all" to 59 | # disable everything first and then reenable specific checks. For example, if 60 | # you want to run only the similarities checker, you can use "--disable=all 61 | # --enable=similarities". If you want to run only the classes checker, but have 62 | # no Warning level messages displayed, use "--disable=all --enable=classes 63 | # --disable=W". 64 | disable=print-statement, 65 | parameter-unpacking, 66 | unpacking-in-except, 67 | old-raise-syntax, 68 | backtick, 69 | long-suffix, 70 | old-ne-operator, 71 | old-octal-literal, 72 | import-star-module-level, 73 | non-ascii-bytes-literal, 74 | raw-checker-failed, 75 | bad-inline-option, 76 | locally-disabled, 77 | file-ignored, 78 | suppressed-message, 79 | useless-suppression, 80 | deprecated-pragma, 81 | use-symbolic-message-instead, 82 | apply-builtin, 83 | basestring-builtin, 84 | buffer-builtin, 85 | cmp-builtin, 86 | coerce-builtin, 87 | execfile-builtin, 88 | file-builtin, 89 | long-builtin, 90 | raw_input-builtin, 91 | reduce-builtin, 92 | standarderror-builtin, 93 | unicode-builtin, 94 | xrange-builtin, 95 | coerce-method, 96 | delslice-method, 97 | getslice-method, 98 | setslice-method, 99 | no-absolute-import, 100 | old-division, 101 | dict-iter-method, 102 | dict-view-method, 103 | next-method-called, 104 | metaclass-assignment, 105 | indexing-exception, 106 | raising-string, 107 | reload-builtin, 108 | oct-method, 109 | hex-method, 110 | nonzero-method, 111 | cmp-method, 112 | input-builtin, 113 | round-builtin, 114 | intern-builtin, 115 | unichr-builtin, 116 | map-builtin-not-iterating, 117 | zip-builtin-not-iterating, 118 | range-builtin-not-iterating, 119 | filter-builtin-not-iterating, 120 | using-cmp-argument, 121 | eq-without-hash, 122 | div-method, 123 | idiv-method, 124 | rdiv-method, 125 | exception-message-attribute, 126 | invalid-str-codec, 127 | sys-max-int, 128 | bad-python3-import, 129 | deprecated-string-function, 130 | deprecated-str-translate-call, 131 | deprecated-itertools-function, 132 | deprecated-types-field, 133 | next-method-defined, 134 | dict-items-not-iterating, 135 | dict-keys-not-iterating, 136 | dict-values-not-iterating, 137 | deprecated-operator-function, 138 | deprecated-urllib-function, 139 | xreadlines-attribute, 140 | deprecated-sys-function, 141 | exception-escape, 142 | comprehension-escape 143 | 144 | # Enable the message, report, category or checker with the given id(s). You can 145 | # either give multiple identifier separated by comma (,) or put this option 146 | # multiple time (only on the command line, not in the configuration file where 147 | # it should appear only once). See also the "--disable" option for examples. 148 | enable=c-extension-no-member 149 | 150 | 151 | [REPORTS] 152 | 153 | # Python expression which should return a score less than or equal to 10. 
You 154 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 155 | # which contain the number of messages in each category, as well as 'statement' 156 | # which is the total number of statements analyzed. This score is used by the 157 | # global evaluation report (RP0004). 158 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 159 | 160 | # Template used to display messages. This is a python new-style format string 161 | # used to format the message information. See doc for all details. 162 | #msg-template= 163 | 164 | # Set the output format. Available formats are text, parseable, colorized, json 165 | # and msvs (visual studio). You can also give a reporter class, e.g. 166 | # mypackage.mymodule.MyReporterClass. 167 | output-format=text 168 | 169 | # Tells whether to display a full report or only the messages. 170 | reports=no 171 | 172 | # Activate the evaluation score. 173 | score=yes 174 | 175 | 176 | [REFACTORING] 177 | 178 | # Maximum number of nested blocks for function / method body 179 | max-nested-blocks=5 180 | 181 | # Complete name of functions that never returns. When checking for 182 | # inconsistent-return-statements if a never returning function is called then 183 | # it will be considered as an explicit return statement and no message will be 184 | # printed. 185 | never-returning-functions=sys.exit 186 | 187 | 188 | [LOGGING] 189 | 190 | # Format style used to check logging format string. `old` means using % 191 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 192 | logging-format-style=old 193 | 194 | # Logging modules to check that the string format arguments are in logging 195 | # function parameter format. 196 | logging-modules=logging 197 | 198 | 199 | [SPELLING] 200 | 201 | # Limits count of emitted suggestions for spelling mistakes. 202 | max-spelling-suggestions=4 203 | 204 | # Spelling dictionary name. Available dictionaries: none. To make it work, 205 | # install the python-enchant package. 206 | spelling-dict= 207 | 208 | # List of comma separated words that should not be checked. 209 | spelling-ignore-words= 210 | 211 | # A path to a file that contains the private dictionary; one word per line. 212 | spelling-private-dict-file= 213 | 214 | # Tells whether to store unknown words to the private dictionary (see the 215 | # --spelling-private-dict-file option) instead of raising a message. 216 | spelling-store-unknown-words=no 217 | 218 | 219 | [MISCELLANEOUS] 220 | 221 | # List of note tags to take in consideration, separated by a comma. 222 | notes=FIXME, 223 | XXX, 224 | TODO 225 | 226 | 227 | [TYPECHECK] 228 | 229 | # List of decorators that produce context managers, such as 230 | # contextlib.contextmanager. Add to this list to register other decorators that 231 | # produce valid context managers. 232 | contextmanager-decorators=contextlib.contextmanager 233 | 234 | # List of members which are set dynamically and missed by pylint inference 235 | # system, and so shouldn't trigger E1101 when accessed. Python regular 236 | # expressions are accepted. 237 | generated-members= 238 | 239 | # Tells whether missing members accessed in mixin class should be ignored. A 240 | # mixin class is detected if its name ends with "mixin" (case insensitive). 241 | ignore-mixin-members=yes 242 | 243 | # Tells whether to warn about missing members when the owner of the attribute 244 | # is inferred to be None. 
245 | ignore-none=yes 246 | 247 | # This flag controls whether pylint should warn about no-member and similar 248 | # checks whenever an opaque object is returned when inferring. The inference 249 | # can return multiple potential results while evaluating a Python object, but 250 | # some branches might not be evaluated, which results in partial inference. In 251 | # that case, it might be useful to still emit no-member and other checks for 252 | # the rest of the inferred objects. 253 | ignore-on-opaque-inference=yes 254 | 255 | # List of class names for which member attributes should not be checked (useful 256 | # for classes with dynamically set attributes). This supports the use of 257 | # qualified names. 258 | ignored-classes=optparse.Values,thread._local,_thread._local 259 | 260 | # List of module names for which member attributes should not be checked 261 | # (useful for modules/projects where namespaces are manipulated during runtime 262 | # and thus existing member attributes cannot be deduced by static analysis). It 263 | # supports qualified module names, as well as Unix pattern matching. 264 | ignored-modules= 265 | 266 | # Show a hint with possible names when a member name was not found. The aspect 267 | # of finding the hint is based on edit distance. 268 | missing-member-hint=yes 269 | 270 | # The minimum edit distance a name should have in order to be considered a 271 | # similar match for a missing member name. 272 | missing-member-hint-distance=1 273 | 274 | # The total number of similar names that should be taken in consideration when 275 | # showing a hint for a missing member. 276 | missing-member-max-choices=1 277 | 278 | # List of decorators that change the signature of a decorated function. 279 | signature-mutators= 280 | 281 | 282 | [VARIABLES] 283 | 284 | # List of additional names supposed to be defined in builtins. Remember that 285 | # you should avoid defining new builtins when possible. 286 | additional-builtins= 287 | 288 | # Tells whether unused global variables should be treated as a violation. 289 | allow-global-unused-variables=yes 290 | 291 | # List of strings which can identify a callback function by name. A callback 292 | # name must start or end with one of those strings. 293 | callbacks=cb_, 294 | _cb 295 | 296 | # A regular expression matching the name of dummy variables (i.e. expected to 297 | # not be used). 298 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 299 | 300 | # Argument names that match this expression will be ignored. Default to name 301 | # with leading underscore. 302 | ignored-argument-names=_.*|^ignored_|^unused_ 303 | 304 | # Tells whether we should check for unused import in __init__ files. 305 | init-import=no 306 | 307 | # List of qualified module names which can have objects that can redefine 308 | # builtins. 309 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 310 | 311 | 312 | [FORMAT] 313 | 314 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 315 | expected-line-ending-format= 316 | 317 | # Regexp for a line that is allowed to be longer than the limit. 318 | ignore-long-lines=^\s*(# )??$ 319 | 320 | # Number of spaces of indent required inside a hanging or continued line. 321 | indent-after-paren=4 322 | 323 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 324 | # tab). 325 | indent-string=' ' 326 | 327 | # Maximum number of characters on a single line. 
328 | max-line-length=100 329 | 330 | # Maximum number of lines in a module. 331 | max-module-lines=1000 332 | 333 | # List of optional constructs for which whitespace checking is disabled. `dict- 334 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 335 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 336 | # `empty-line` allows space-only lines. 337 | no-space-check=trailing-comma, 338 | dict-separator 339 | 340 | # Allow the body of a class to be on the same line as the declaration if body 341 | # contains single statement. 342 | single-line-class-stmt=no 343 | 344 | # Allow the body of an if to be on the same line as the test if there is no 345 | # else. 346 | single-line-if-stmt=no 347 | 348 | 349 | [SIMILARITIES] 350 | 351 | # Ignore comments when computing similarities. 352 | ignore-comments=yes 353 | 354 | # Ignore docstrings when computing similarities. 355 | ignore-docstrings=yes 356 | 357 | # Ignore imports when computing similarities. 358 | ignore-imports=no 359 | 360 | # Minimum lines number of a similarity. 361 | min-similarity-lines=4 362 | 363 | 364 | [BASIC] 365 | 366 | # Naming style matching correct argument names. 367 | argument-naming-style=snake_case 368 | 369 | # Regular expression matching correct argument names. Overrides argument- 370 | # naming-style. 371 | #argument-rgx= 372 | 373 | # Naming style matching correct attribute names. 374 | attr-naming-style=snake_case 375 | 376 | # Regular expression matching correct attribute names. Overrides attr-naming- 377 | # style. 378 | #attr-rgx= 379 | 380 | # Bad variable names which should always be refused, separated by a comma. 381 | bad-names=foo, 382 | bar, 383 | baz, 384 | toto, 385 | tutu, 386 | tata 387 | 388 | # Naming style matching correct class attribute names. 389 | class-attribute-naming-style=any 390 | 391 | # Regular expression matching correct class attribute names. Overrides class- 392 | # attribute-naming-style. 393 | #class-attribute-rgx= 394 | 395 | # Naming style matching correct class names. 396 | class-naming-style=PascalCase 397 | 398 | # Regular expression matching correct class names. Overrides class-naming- 399 | # style. 400 | #class-rgx= 401 | 402 | # Naming style matching correct constant names. 403 | const-naming-style=UPPER_CASE 404 | 405 | # Regular expression matching correct constant names. Overrides const-naming- 406 | # style. 407 | #const-rgx= 408 | 409 | # Minimum line length for functions/classes that require docstrings, shorter 410 | # ones are exempt. 411 | docstring-min-length=-1 412 | 413 | # Naming style matching correct function names. 414 | function-naming-style=snake_case 415 | 416 | # Regular expression matching correct function names. Overrides function- 417 | # naming-style. 418 | #function-rgx= 419 | 420 | # Good variable names which should always be accepted, separated by a comma. 421 | good-names=i, 422 | j, 423 | k, 424 | ex, 425 | Run, 426 | _ 427 | 428 | # Include a hint for the correct naming format with invalid-name. 429 | include-naming-hint=no 430 | 431 | # Naming style matching correct inline iteration names. 432 | inlinevar-naming-style=any 433 | 434 | # Regular expression matching correct inline iteration names. Overrides 435 | # inlinevar-naming-style. 436 | #inlinevar-rgx= 437 | 438 | # Naming style matching correct method names. 439 | method-naming-style=snake_case 440 | 441 | # Regular expression matching correct method names. Overrides method-naming- 442 | # style. 
443 | #method-rgx= 444 | 445 | # Naming style matching correct module names. 446 | module-naming-style=snake_case 447 | 448 | # Regular expression matching correct module names. Overrides module-naming- 449 | # style. 450 | #module-rgx= 451 | 452 | # Colon-delimited sets of names that determine each other's naming style when 453 | # the name regexes allow several styles. 454 | name-group= 455 | 456 | # Regular expression which should only match function or class names that do 457 | # not require a docstring. 458 | no-docstring-rgx=^_ 459 | 460 | # List of decorators that produce properties, such as abc.abstractproperty. Add 461 | # to this list to register other decorators that produce valid properties. 462 | # These decorators are taken in consideration only for invalid-name. 463 | property-classes=abc.abstractproperty 464 | 465 | # Naming style matching correct variable names. 466 | variable-naming-style=snake_case 467 | 468 | # Regular expression matching correct variable names. Overrides variable- 469 | # naming-style. 470 | #variable-rgx= 471 | 472 | 473 | [STRING] 474 | 475 | # This flag controls whether the implicit-str-concat-in-sequence should 476 | # generate a warning on implicit string concatenation in sequences defined over 477 | # several lines. 478 | check-str-concat-over-line-jumps=no 479 | 480 | 481 | [IMPORTS] 482 | 483 | # List of modules that can be imported at any level, not just the top level 484 | # one. 485 | allow-any-import-level= 486 | 487 | # Allow wildcard imports from modules that define __all__. 488 | allow-wildcard-with-all=no 489 | 490 | # Analyse import fallback blocks. This can be used to support both Python 2 and 491 | # 3 compatible code, which means that the block might have code that exists 492 | # only in one or another interpreter, leading to false positives when analysed. 493 | analyse-fallback-blocks=no 494 | 495 | # Deprecated modules which should not be used, separated by a comma. 496 | deprecated-modules=optparse,tkinter.tix 497 | 498 | # Create a graph of external dependencies in the given file (report RP0402 must 499 | # not be disabled). 500 | ext-import-graph= 501 | 502 | # Create a graph of every (i.e. internal and external) dependencies in the 503 | # given file (report RP0402 must not be disabled). 504 | import-graph= 505 | 506 | # Create a graph of internal dependencies in the given file (report RP0402 must 507 | # not be disabled). 508 | int-import-graph= 509 | 510 | # Force import order to recognize a module as part of the standard 511 | # compatibility libraries. 512 | known-standard-library= 513 | 514 | # Force import order to recognize a module as part of a third party library. 515 | known-third-party=enchant 516 | 517 | # Couples of modules and preferred modules, separated by a comma. 518 | preferred-modules= 519 | 520 | 521 | [CLASSES] 522 | 523 | # List of method names used to declare (i.e. assign) instance attributes. 524 | defining-attr-methods=__init__, 525 | __new__, 526 | setUp, 527 | __post_init__ 528 | 529 | # List of member names, which should be excluded from the protected access 530 | # warning. 531 | exclude-protected=_asdict, 532 | _fields, 533 | _replace, 534 | _source, 535 | _make 536 | 537 | # List of valid names for the first argument in a class method. 538 | valid-classmethod-first-arg=cls 539 | 540 | # List of valid names for the first argument in a metaclass class method. 541 | valid-metaclass-classmethod-first-arg=cls 542 | 543 | 544 | [DESIGN] 545 | 546 | # Maximum number of arguments for function / method. 
547 | max-args=5 548 | 549 | # Maximum number of attributes for a class (see R0902). 550 | max-attributes=7 551 | 552 | # Maximum number of boolean expressions in an if statement (see R0916). 553 | max-bool-expr=5 554 | 555 | # Maximum number of branch for function / method body. 556 | max-branches=12 557 | 558 | # Maximum number of locals for function / method body. 559 | max-locals=15 560 | 561 | # Maximum number of parents for a class (see R0901). 562 | max-parents=7 563 | 564 | # Maximum number of public methods for a class (see R0904). 565 | max-public-methods=20 566 | 567 | # Maximum number of return / yield for function / method body. 568 | max-returns=6 569 | 570 | # Maximum number of statements in function / method body. 571 | max-statements=50 572 | 573 | # Minimum number of public methods for a class (see R0903). 574 | min-public-methods=2 575 | 576 | 577 | [EXCEPTIONS] 578 | 579 | # Exceptions that will emit a warning when being caught. Defaults to 580 | # "BaseException, Exception". 581 | overgeneral-exceptions=BaseException, 582 | Exception 583 | -------------------------------------------------------------------------------- /lib/src/spark_validation/static/static/js/main.8e11e6a5.chunk.js.map: -------------------------------------------------------------------------------- 1 | [generated webpack source map for main.8e11e6a5.chunk.js: minified mappings plus the embedded sources of the React validation UI (RoleEditor, ToolBar, App, serviceWorker, index.js), which build the validation config in the browser and POST it to api/validate; a sketch of that config follows below] 2 | --------------------------------------------------------------------------------
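For reference, here is a minimal sketch, in Python so it stays self-contained, of the validation config that the RoleEditor referenced above assembles and POSTs to api/validate. The source_table block and compare_related_tables_list are copied from the template in the embedded RoleEditor source; the three rule lists are illustrative placeholders for what a user would normally build in the query editor.

import json

# Hypothetical payload mirroring the RoleEditor template; the rule entries are
# invented for illustration, everything else is taken from the embedded source.
config = {
    "source_table": {
        "name": "mock_data/family_sample.csv",
        "id_column": "ID",
        "output_correctness_table": "/tmp/mock_data/output/family_sample_test_correctness",
        "output_completeness_table": "/tmp/mock_data/output/family_sample_test_completeness",
        "output_comparison_table": "/tmp/mock_data/output/family_sample_test_comparison",
        "unique_column_group_values_per_table": ["ID", "NAME", "FAMILY_NAME", "PARENT"],
        "fuzzy_deduplication_distance": 0,
    },
    "correctness_validations": [
        {"column": "NAME", "rule": "NAME IS NOT NULL"},  # illustrative rule
    ],
    "completeness_validations": [
        {"column": "OVER_ALL_COUNT", "rule": "OVER_ALL_COUNT > 1"},  # illustrative threshold
    ],
    "parent_children_constraints": [
        {"column": "ID", "parent": "PARENT"},  # illustrative constraint
    ],
    "compare_related_tables_list": ["test.diff_df", "test.diff_df_2"],
}

print(json.dumps(config, indent=2))  # JSON of this shape is what the UI sends to api/validate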
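Looking back at the .pylintrc above, the evaluation option is the expression behind the global score report (RP0004). A quick worked example of that arithmetic, with invented message counts:

# Worked example of the .pylintrc `evaluation` expression; the counts are invented.
error, warning, refactor, convention = 1, 4, 3, 2
statement = 200  # total number of statements analysed

score = 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
print(round(score, 2))  # (5*1 + 4 + 3 + 2) = 14; 14 / 200 * 10 = 0.7; 10.0 - 0.7 = 9.3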