├── .coveragerc ├── .gitignore ├── LICENSE.txt ├── README.md ├── coverage.svg ├── docs ├── Makefile ├── _static │ └── .gitignore ├── authors.rst ├── changelog.rst ├── conf.py ├── index.rst └── license.rst ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── pydeequ │ ├── __init__.py │ ├── analyzers.py │ ├── base.py │ ├── checks.py │ ├── examples │ ├── __init__.py │ ├── analyzer_example.py │ ├── basic_usage.py │ ├── basic_usage2.py │ ├── metrics_repo.py │ ├── profiler_example.py │ └── suggestions_example.py │ ├── exceptions.py │ ├── jvm_conversions.py │ ├── metricsrepo.py │ ├── profiler.py │ └── suggestions.py ├── tests └── integration │ ├── test_analyzers.py │ ├── test_constraints.py │ └── test_runners.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = pydeequ 5 | omit = src/pydeequ/examples/* 6 | 7 | [paths] 8 | source = 9 | src/ 10 | */site-packages/ 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | metastore_db/ 17 | spark-warehouse/ 18 | .vscode 19 | 20 | # Project files 21 | .ropeproject 22 | .project 23 | .pydevproject 24 | .settings 25 | .idea 26 | tags 27 | 28 | # Package files 29 | *.egg 30 | *.eggs/ 31 | .installed.cfg 32 | *.egg-info 33 | 34 | # Unittest and coverage 35 | htmlcov/* 36 | .coverage 37 | .tox 38 | junit.xml 39 | coverage.xml 40 | .pytest_cache/ 41 | 42 | # Build and docs folder/files 43 | build/* 44 | dist/* 45 | sdist/* 46 | docs/api/* 47 | docs/_rst/* 48 | docs/_build/* 49 | cover/* 50 | MANIFEST 51 | 52 | # Per-project virtualenvs 53 | .venv*/ 54 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | __This repository will be merged with the official [AWS Lab/python-deequ](https://github.com/awslabs/python-deequ) project. Please fork and contribute to that project because many issues of this pydeequ version are solved there.__ 2 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 70% 19 | 70% 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = ../build/sphinx/ 9 | AUTODOCDIR = api 10 | AUTODOCBUILD = sphinx-apidoc 11 | PROJECT = pydeequ 12 | MODULEDIR = ../src/pydeequ 13 | 14 | # User-friendly check for sphinx-build 15 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1) 16 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 17 | endif 18 | 19 | # Internal variables. 20 | PAPEROPT_a4 = -D latex_paper_size=a4 21 | PAPEROPT_letter = -D latex_paper_size=letter 22 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 23 | # the i18n builder cannot share the environment and doctrees with the others 24 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
25 | 26 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext doc-requirements 27 | 28 | help: 29 | @echo "Please use \`make ' where is one of" 30 | @echo " html to make standalone HTML files" 31 | @echo " dirhtml to make HTML files named index.html in directories" 32 | @echo " singlehtml to make a single large HTML file" 33 | @echo " pickle to make pickle files" 34 | @echo " json to make JSON files" 35 | @echo " htmlhelp to make HTML files and a HTML help project" 36 | @echo " qthelp to make HTML files and a qthelp project" 37 | @echo " devhelp to make HTML files and a Devhelp project" 38 | @echo " epub to make an epub" 39 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 40 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 41 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 42 | @echo " text to make text files" 43 | @echo " man to make manual pages" 44 | @echo " texinfo to make Texinfo files" 45 | @echo " info to make Texinfo files and run them through makeinfo" 46 | @echo " gettext to make PO message catalogs" 47 | @echo " changes to make an overview of all changed/added/deprecated items" 48 | @echo " xml to make Docutils-native XML files" 49 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 50 | @echo " linkcheck to check all external links for integrity" 51 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 52 | 53 | clean: 54 | rm -rf $(BUILDDIR)/* $(AUTODOCDIR) 55 | 56 | $(AUTODOCDIR): $(MODULEDIR) 57 | mkdir -p $@ 58 | $(AUTODOCBUILD) -f -o $@ $^ 59 | 60 | doc-requirements: $(AUTODOCDIR) 61 | 62 | html: doc-requirements 63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 66 | 67 | dirhtml: doc-requirements 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | singlehtml: doc-requirements 73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 74 | @echo 75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 76 | 77 | pickle: doc-requirements 78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 79 | @echo 80 | @echo "Build finished; now you can process the pickle files." 81 | 82 | json: doc-requirements 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | htmlhelp: doc-requirements 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | qthelp: doc-requirements 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/$(PROJECT).qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/$(PROJECT).qhc" 101 | 102 | devhelp: doc-requirements 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 
106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $HOME/.local/share/devhelp/$(PROJECT)" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $HOME/.local/share/devhelp/$(PROJEC)" 109 | @echo "# devhelp" 110 | 111 | epub: doc-requirements 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | patch-latex: 117 | find _build/latex -iname "*.tex" | xargs -- \ 118 | sed -i'' 's~includegraphics{~includegraphics\[keepaspectratio,max size={\\textwidth}{\\textheight}\]{~g' 119 | 120 | latex: doc-requirements 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | $(MAKE) patch-latex 123 | @echo 124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 126 | "(use \`make latexpdf' here to do that automatically)." 127 | 128 | latexpdf: doc-requirements 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | $(MAKE) patch-latex 131 | @echo "Running LaTeX files through pdflatex..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | latexpdfja: doc-requirements 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through platex and dvipdfmx..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | text: doc-requirements 142 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 143 | @echo 144 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 145 | 146 | man: doc-requirements 147 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 148 | @echo 149 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 150 | 151 | texinfo: doc-requirements 152 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 153 | @echo 154 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 155 | @echo "Run \`make' in that directory to run these through makeinfo" \ 156 | "(use \`make info' here to do that automatically)." 157 | 158 | info: doc-requirements 159 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 160 | @echo "Running Texinfo files through makeinfo..." 161 | make -C $(BUILDDIR)/texinfo info 162 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 163 | 164 | gettext: doc-requirements 165 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 166 | @echo 167 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 168 | 169 | changes: doc-requirements 170 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 171 | @echo 172 | @echo "The overview file is in $(BUILDDIR)/changes." 173 | 174 | linkcheck: doc-requirements 175 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 176 | @echo 177 | @echo "Link check complete; look for any errors in the above output " \ 178 | "or in $(BUILDDIR)/linkcheck/output.txt." 179 | 180 | doctest: doc-requirements 181 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 182 | @echo "Testing of doctests in the sources finished, look at the " \ 183 | "results in $(BUILDDIR)/doctest/output.txt." 184 | 185 | xml: doc-requirements 186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 187 | @echo 188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 
189 | 190 | pseudoxml: doc-requirements 191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 192 | @echo 193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 194 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | # Empty directory 2 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. _authors: 2 | .. include:: ../AUTHORS.rst 3 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | .. include:: ../CHANGELOG.rst 3 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 10 | 11 | import os 12 | import sys 13 | import inspect 14 | import shutil 15 | 16 | __location__ = os.path.join(os.getcwd(), os.path.dirname( 17 | inspect.getfile(inspect.currentframe()))) 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.join(__location__, '../src')) 23 | 24 | # -- Run sphinx-apidoc ------------------------------------------------------ 25 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running 26 | # `sphinx-build -b html . _build/html`. See Issue: 27 | # https://github.com/rtfd/readthedocs.org/issues/1139 28 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using 29 | # setup.py install" in the RTD Advanced Settings. 30 | # Additionally it helps us to avoid running apidoc manually 31 | 32 | try: # for Sphinx >= 1.7 33 | from sphinx.ext import apidoc 34 | except ImportError: 35 | from sphinx import apidoc 36 | 37 | output_dir = os.path.join(__location__, "api") 38 | module_dir = os.path.join(__location__, "../src/pydeequ") 39 | try: 40 | shutil.rmtree(output_dir) 41 | except FileNotFoundError: 42 | pass 43 | 44 | try: 45 | import sphinx 46 | from pkg_resources import parse_version 47 | 48 | cmd_line_template = "sphinx-apidoc -f -o {outputdir} {moduledir}" 49 | cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir) 50 | 51 | args = cmd_line.split(" ") 52 | if parse_version(sphinx.__version__) >= parse_version('1.7'): 53 | args = args[1:] 54 | 55 | apidoc.main(args) 56 | except Exception as e: 57 | print("Running `sphinx-apidoc` failed!\n{}".format(e)) 58 | 59 | # -- General configuration ----------------------------------------------------- 60 | 61 | # If your documentation needs a minimal Sphinx version, state it here. 62 | # needs_sphinx = '1.0' 63 | 64 | # Add any Sphinx extension module names here, as strings. 
They can be extensions 65 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 66 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 67 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 68 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax', 69 | 'sphinx.ext.napoleon'] 70 | 71 | # Add any paths that contain templates here, relative to this directory. 72 | templates_path = ['_templates'] 73 | 74 | # The suffix of source filenames. 75 | source_suffix = '.rst' 76 | 77 | # The encoding of source files. 78 | # source_encoding = 'utf-8-sig' 79 | 80 | # The master toctree document. 81 | master_doc = 'index' 82 | 83 | # General information about the project. 84 | project = u'pydeequ' 85 | copyright = u'2020, margitai.i' 86 | 87 | # The version info for the project you're documenting, acts as replacement for 88 | # |version| and |release|, also used in various other places throughout the 89 | # built documents. 90 | # 91 | # The short X.Y version. 92 | version = '' # Is set by calling `setup.py docs` 93 | # The full version, including alpha/beta/rc tags. 94 | release = '' # Is set by calling `setup.py docs` 95 | 96 | # The language for content autogenerated by Sphinx. Refer to documentation 97 | # for a list of supported languages. 98 | # language = None 99 | 100 | # There are two options for replacing |today|: either, you set today to some 101 | # non-false value, then it is used: 102 | # today = '' 103 | # Else, today_fmt is used as the format for a strftime call. 104 | # today_fmt = '%B %d, %Y' 105 | 106 | # List of patterns, relative to source directory, that match files and 107 | # directories to ignore when looking for source files. 108 | exclude_patterns = ['_build'] 109 | 110 | # The reST default role (used for this markup: `text`) to use for all documents. 111 | # default_role = None 112 | 113 | # If true, '()' will be appended to :func: etc. cross-reference text. 114 | # add_function_parentheses = True 115 | 116 | # If true, the current module name will be prepended to all description 117 | # unit titles (such as .. function::). 118 | # add_module_names = True 119 | 120 | # If true, sectionauthor and moduleauthor directives will be shown in the 121 | # output. They are ignored by default. 122 | # show_authors = False 123 | 124 | # The name of the Pygments (syntax highlighting) style to use. 125 | pygments_style = 'sphinx' 126 | 127 | # A list of ignored prefixes for module index sorting. 128 | # modindex_common_prefix = [] 129 | 130 | # If true, keep warnings as "system message" paragraphs in the built documents. 131 | # keep_warnings = False 132 | 133 | 134 | # -- Options for HTML output --------------------------------------------------- 135 | 136 | # The theme to use for HTML and HTML Help pages. See the documentation for 137 | # a list of builtin themes. 138 | html_theme = 'alabaster' 139 | 140 | # Theme options are theme-specific and customize the look and feel of a theme 141 | # further. For a list of options available for each theme, see the 142 | # documentation. 143 | html_theme_options = { 144 | 'sidebar_width': '300px', 145 | 'page_width': '1200px' 146 | } 147 | 148 | # Add any paths that contain custom themes here, relative to this directory. 149 | # html_theme_path = [] 150 | 151 | # The name for this set of Sphinx documents. If None, it defaults to 152 | # " v documentation". 
153 | try: 154 | from pydeequ import __version__ as version 155 | except ImportError: 156 | pass 157 | else: 158 | release = version 159 | 160 | # A shorter title for the navigation bar. Default is the same as html_title. 161 | # html_short_title = None 162 | 163 | # The name of an image file (relative to this directory) to place at the top 164 | # of the sidebar. 165 | # html_logo = "" 166 | 167 | # The name of an image file (within the static path) to use as favicon of the 168 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 169 | # pixels large. 170 | # html_favicon = None 171 | 172 | # Add any paths that contain custom static files (such as style sheets) here, 173 | # relative to this directory. They are copied after the builtin static files, 174 | # so a file named "default.css" will overwrite the builtin "default.css". 175 | html_static_path = ['_static'] 176 | 177 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 178 | # using the given strftime format. 179 | # html_last_updated_fmt = '%b %d, %Y' 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # html_use_smartypants = True 184 | 185 | # Custom sidebar templates, maps document names to template names. 186 | # html_sidebars = {} 187 | 188 | # Additional templates that should be rendered to pages, maps page names to 189 | # template names. 190 | # html_additional_pages = {} 191 | 192 | # If false, no module index is generated. 193 | # html_domain_indices = True 194 | 195 | # If false, no index is generated. 196 | # html_use_index = True 197 | 198 | # If true, the index is split into individual pages for each letter. 199 | # html_split_index = False 200 | 201 | # If true, links to the reST sources are added to the pages. 202 | # html_show_sourcelink = True 203 | 204 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 205 | # html_show_sphinx = True 206 | 207 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 208 | # html_show_copyright = True 209 | 210 | # If true, an OpenSearch description file will be output, and all pages will 211 | # contain a tag referring to it. The value of this option must be the 212 | # base URL from which the finished HTML is served. 213 | # html_use_opensearch = '' 214 | 215 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 216 | # html_file_suffix = None 217 | 218 | # Output file base name for HTML help builder. 219 | htmlhelp_basename = 'pydeequ-doc' 220 | 221 | 222 | # -- Options for LaTeX output -------------------------------------------------- 223 | 224 | latex_elements = { 225 | # The paper size ('letterpaper' or 'a4paper'). 226 | # 'papersize': 'letterpaper', 227 | 228 | # The font size ('10pt', '11pt' or '12pt'). 229 | # 'pointsize': '10pt', 230 | 231 | # Additional stuff for the LaTeX preamble. 232 | # 'preamble': '', 233 | } 234 | 235 | # Grouping the document tree into LaTeX files. List of tuples 236 | # (source start file, target name, title, author, documentclass [howto/manual]). 237 | latex_documents = [ 238 | ('index', 'user_guide.tex', u'pydeequ Documentation', 239 | u'margitai.i', 'manual'), 240 | ] 241 | 242 | # The name of an image file (relative to this directory) to place at the top of 243 | # the title page. 244 | # latex_logo = "" 245 | 246 | # For "manual" documents, if this is true, then toplevel headings are parts, 247 | # not chapters. 
248 | # latex_use_parts = False 249 | 250 | # If true, show page references after internal links. 251 | # latex_show_pagerefs = False 252 | 253 | # If true, show URL addresses after external links. 254 | # latex_show_urls = False 255 | 256 | # Documents to append as an appendix to all manuals. 257 | # latex_appendices = [] 258 | 259 | # If false, no module index is generated. 260 | # latex_domain_indices = True 261 | 262 | # -- External mapping ------------------------------------------------------------ 263 | python_version = '.'.join(map(str, sys.version_info[0:2])) 264 | intersphinx_mapping = { 265 | 'sphinx': ('http://www.sphinx-doc.org/en/stable', None), 266 | 'python': ('https://docs.python.org/' + python_version, None), 267 | 'matplotlib': ('https://matplotlib.org', None), 268 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 269 | 'sklearn': ('http://scikit-learn.org/stable', None), 270 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 271 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 272 | } 273 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | pydeequ 3 | ======= 4 | 5 | This is the documentation of **pydeequ**. 6 | 7 | .. note:: 8 | 9 | This is the main page of your project's `Sphinx`_ documentation. 10 | It is formatted in `reStructuredText`_. Add additional pages 11 | by creating rst-files in ``docs`` and adding them to the `toctree`_ below. 12 | Use then `references`_ in order to link them from this page, e.g. 13 | :ref:`authors` and :ref:`changes`. 14 | 15 | It is also possible to refer to the documentation of other Python packages 16 | with the `Python domain syntax`_. By default you can reference the 17 | documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_, 18 | `Pandas`_, `Scikit-Learn`_. You can add more by extending the 19 | ``intersphinx_mapping`` in your Sphinx's ``conf.py``. 20 | 21 | The pretty useful extension `autodoc`_ is activated by default and lets 22 | you include documentation from docstrings. Docstrings can be written in 23 | `Google style`_ (recommended!), `NumPy style`_ and `classical style`_. 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | License 33 | Authors 34 | Changelog 35 | Module Reference 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | .. _toctree: http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html 46 | .. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html 47 | .. _references: http://www.sphinx-doc.org/en/stable/markup/inline.html 48 | .. _Python domain syntax: http://sphinx-doc.org/domains.html#the-python-domain 49 | .. _Sphinx: http://www.sphinx-doc.org/ 50 | .. _Python: http://docs.python.org/ 51 | .. _Numpy: http://docs.scipy.org/doc/numpy 52 | .. _SciPy: http://docs.scipy.org/doc/scipy/reference/ 53 | .. _matplotlib: https://matplotlib.org/contents.html# 54 | .. _Pandas: http://pandas.pydata.org/pandas-docs/stable 55 | .. _Scikit-Learn: http://scikit-learn.org/stable 56 | .. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html 57 | .. _Google style: https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings 58 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html 59 | .. 
_classical style: http://www.sphinx-doc.org/en/stable/domains.html#info-field-lists 60 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. _license: 2 | 3 | ======= 4 | License 5 | ======= 6 | 7 | .. include:: ../LICENSE.txt 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # DEPRECATION WARNING: 3 | # 4 | # The file `requirements.txt` does not influence the package dependencies and 5 | # will not be automatically created in the next version of PyScaffold (v4.x). 6 | # 7 | # Please have look at the docs for better alternatives 8 | # (`Dependency Management` section). 9 | # ============================================================================= 10 | # 11 | # Add your pinned requirements so that they can be easily installed with: 12 | # pip install -r requirements.txt 13 | # Remember to also add them in setup.cfg but unpinned. 14 | # Example: 15 | # numpy==1.13.3 16 | # scipy==1.0 17 | # 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = pydeequ 7 | description = Python API for Deequ 8 | author = Istvan Margitai 9 | author-email = margitai.i@gmail.com 10 | license = apache 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8 13 | url = https://github.com/margitaii/pydeequ 14 | #project-urls = 15 | # Documentation = https://pyscaffold.org/ 16 | # Change if running only on Windows, Mac or Linux (comma-separated) 17 | platforms = any 18 | # Add here all kinds of additional classifiers as defined under 19 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 20 | classifiers = 21 | Development Status :: 4 - Beta 22 | Programming Language :: Python 23 | 24 | [options] 25 | zip_safe = False 26 | packages = find: 27 | include_package_data = True 28 | package_dir = 29 | =src 30 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 31 | setup_requires = pyscaffold>=3.2a0,<3.3a0 32 | # Add here dependencies of your project (semicolon/line-separated), e.g. 33 | # install_requires = numpy; scipy 34 | # The usage of test_requires is discouraged, see `Dependency Management` docs 35 | # tests_require = pytest; pytest-cov 36 | # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 37 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* 38 | 39 | [options.packages.find] 40 | where = src 41 | exclude = 42 | tests 43 | 44 | [options.extras_require] 45 | # Add here additional requirements for extra features, to install with: 46 | # `pip install pydeequ[PDF]` like: 47 | # PDF = ReportLab; RXP 48 | # Add here test requirements (semicolon/line-separated) 49 | testing = 50 | pytest 51 | pytest-cov 52 | 53 | [options.entry_points] 54 | # Add here console scripts like: 55 | # console_scripts = 56 | # script_name = pydeequ.module:function 57 | # For example: 58 | # console_scripts = 59 | # fibonacci = pydeequ.skeleton:run 60 | # And any other entry points, for example: 61 | # pyscaffold.cli = 62 | # awesome = pyscaffoldext.awesome.extension:AwesomeExtension 63 | 64 | [test] 65 | # py.test options when running `python setup.py test` 66 | # addopts = --verbose 67 | extras = True 68 | 69 | [tool:pytest] 70 | # Options for py.test: 71 | # Specify command line options as you would do when invoking py.test directly. 72 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 73 | # in order to write a coverage file that can be read by Jenkins. 74 | addopts = 75 | --cov pydeequ --cov-report term-missing 76 | --verbose 77 | norecursedirs = 78 | dist 79 | build 80 | .tox 81 | testpaths = tests 82 | 83 | [aliases] 84 | dists = bdist_wheel 85 | 86 | [bdist_wheel] 87 | # Use this option if your package is pure-python 88 | universal = 1 89 | 90 | [build_sphinx] 91 | source_dir = docs 92 | build_dir = build/sphinx 93 | 94 | [devpi:upload] 95 | # Options for the devpi: PyPI server and packaging tool 96 | # VCS export must be deactivated since we are using setuptools-scm 97 | no-vcs = 1 98 | formats = bdist_wheel 99 | 100 | [flake8] 101 | # Some sane defaults for the code style checker flake8 102 | exclude = 103 | .tox 104 | build 105 | dist 106 | .eggs 107 | docs/conf.py 108 | 109 | [pyscaffold] 110 | # PyScaffold's parameters when the project was created. 111 | # This will be used when updating. Do not change! 112 | version = 3.2.3 113 | package = pydeequ 114 | extensions = 115 | tox 116 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Setup file for pydeequ. 4 | Use setup.cfg to configure your project. 5 | 6 | This file was generated with PyScaffold 3.2.3. 7 | PyScaffold helps you to put up the scaffold of your new Python project. 
8 | Learn more under: https://pyscaffold.org/ 9 | """ 10 | import sys 11 | import os 12 | 13 | from pkg_resources import VersionConflict, require 14 | from setuptools import setup 15 | 16 | try: 17 | require('setuptools>=38.3') 18 | except VersionConflict: 19 | print("Error: version of setuptools is too old (<38.3)!") 20 | sys.exit(1) 21 | 22 | def setup_package(): 23 | needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv) 24 | sphinx = ['sphinx'] if needs_sphinx else [] 25 | 26 | setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx, 27 | use_pyscaffold=True) 28 | 29 | if __name__ == "__main__": 30 | setup_package() 31 | -------------------------------------------------------------------------------- /src/pydeequ/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pkg_resources import get_distribution, DistributionNotFound 3 | 4 | try: 5 | # Change here if project is renamed and does not equal the package name 6 | dist_name = __name__ 7 | __version__ = get_distribution(dist_name).version 8 | except DistributionNotFound: 9 | __version__ = 'unknown' 10 | finally: 11 | del get_distribution, DistributionNotFound 12 | -------------------------------------------------------------------------------- /src/pydeequ/analyzers.py: -------------------------------------------------------------------------------- 1 | import py4j.java_gateway as jg 2 | 3 | from pydeequ.exceptions import JavaClassNotFoundException 4 | import pydeequ.jvm_conversions as jc 5 | 6 | class BaseAnalyzer(object): 7 | """ 8 | Analyzer baseclass 9 | """ 10 | def set_jvm(self, jvm): 11 | self._jvm = jvm 12 | return self 13 | 14 | @property 15 | def jvmdeequAnalyzers(self): 16 | if (self._jvm): 17 | return self._jvm.com.amazon.deequ.analyzers 18 | else: 19 | raise ValueError("Run set_jvm() method first.") 20 | 21 | class ApproxCountDistinct(BaseAnalyzer): 22 | """ 23 | Compute approximated count distinct with HyperLogLogPlusPlus. 24 | 25 | @param column Which column to compute this aggregation on. 26 | """ 27 | 28 | def __init__(self, column): 29 | self.column = column 30 | 31 | @property 32 | def jvmAnalyzer(self): 33 | return self.jvmdeequAnalyzers.ApproxCountDistinct( 34 | self.column, 35 | getattr(self.jvmdeequAnalyzers.ApproxCountDistinct, "apply$default$2")() 36 | ) 37 | 38 | 39 | class ApproxQuantile(BaseAnalyzer): 40 | """ 41 | Approximate quantile analyzer. The allowed relative error compared to the exact quantile can be 42 | configured with `relativeError` parameter. A `relativeError` = 0.0 would yield the exact 43 | quantile while increasing the computational load. 44 | 45 | @param column Column in DataFrame for which the approximate quantile is analyzed. 46 | @param quantile Computed Quantile. Must be in the interval [0, 1], where 0.5 would be the 47 | median. 48 | @param relativeError Relative target precision to achieve in the quantile computation. 49 | Must be in the interval [0, 1]. 50 | @param where Additional filter to apply before the analyzer is run. 
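    Example (sketch): assumes an active SparkSession ``spark`` started with a
    matching deequ jar on its classpath, and a DataFrame ``df`` with a numeric
    "sales" column; both names are placeholders.

        from pydeequ.base import AnalysisRunner
        from pydeequ.analyzers import ApproxQuantile

        # approximate median of "sales" with the default 1% relative error
        context = (
            AnalysisRunner(spark)
            .onData(df)
            .addAnalyzer(ApproxQuantile("sales", quantile=0.5))
            .run()
        )
        context.successMetricsAsDataFrame().show()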
51 | """ 52 | 53 | def __init__(self, column, quantile, relativeError = 0.01): 54 | self.column = column 55 | self.quantile = quantile 56 | self.relativeError = relativeError 57 | 58 | @property 59 | def jvmAnalyzer(self): 60 | return self.jvmdeequAnalyzers.ApproxQuantile( 61 | self.column, 62 | self.quantile, 63 | self.relativeError, 64 | getattr(self.jvmdeequAnalyzers.ApproxQuantile, "apply$default$4")() 65 | ) 66 | 67 | class Completeness(BaseAnalyzer): 68 | """ 69 | Fraction of non-null values in a column. 70 | 71 | Args: 72 | column Column in DataFrame 73 | """ 74 | 75 | def __init__(self, column): 76 | self.column = column 77 | 78 | @property 79 | def jvmAnalyzer(self): 80 | return self.jvmdeequAnalyzers.Completeness( 81 | self.column, 82 | getattr(self.jvmdeequAnalyzers.Completeness, "apply$default$2")() 83 | ) 84 | 85 | class Compliance(BaseAnalyzer): 86 | """ 87 | Compliance is a measure of the fraction of rows that complies with the given column constraint. 88 | E.g if the constraint is "att1>3" and data frame has 5 rows with att1 column value greater than 89 | 3 and 10 rows under 3; a DoubleMetric would be returned with 0.33 value 90 | @param instance Unlike other column analyzers (e.g completeness) this analyzer can not 91 | infer to the metric instance name from column name. 92 | Also the constraint given here can be referring to multiple columns, 93 | so metric instance name should be provided, 94 | describing what the analysis being done for. 95 | @param predicate SQL-predicate to apply per row 96 | @param where Additional filter to apply before the analyzer is run. 97 | """ 98 | def __init__(self, instance, predicate): 99 | self.instance = instance 100 | self.predicate = predicate 101 | 102 | @property 103 | def jvmAnalyzer(self): 104 | return self.jvmdeequAnalyzers.Compliance( 105 | self.instance, 106 | self.predicate, 107 | getattr(self.jvmdeequAnalyzers.Compliance, "apply$default$3")() 108 | ) 109 | 110 | class Correlation(BaseAnalyzer): 111 | """ 112 | Computes the pearson correlation coefficient between the two given columns 113 | @param firstColumn First input column for computation 114 | @param secondColumn Second input column for computation 115 | """ 116 | def __init__(self, firstColumn, secondColumn): 117 | self.firstColumn = firstColumn 118 | self.secondColumn = secondColumn 119 | 120 | @property 121 | def jvmAnalyzer(self): 122 | return self.jvmdeequAnalyzers.Correlation( 123 | self.firstColumn, 124 | self.secondColumn, 125 | getattr(self.jvmdeequAnalyzers.Correlation, "apply$default$3")() 126 | ) 127 | 128 | class CountDistinct(BaseAnalyzer): 129 | """ 130 | Number of distinct values 131 | """ 132 | def __init__(self, column): 133 | if isinstance(column, str): 134 | self.column = [column] 135 | elif isinstance(column, list): 136 | self.column = column 137 | else: 138 | raise ValueError("'column' must be string or list of strings.") 139 | 140 | @property 141 | def jvmAnalyzer(self): 142 | return self.jvmdeequAnalyzers.CountDistinct( 143 | jc.iterable_to_scala_seq(self._jvm, self.column) 144 | ) 145 | 146 | class DataType(BaseAnalyzer): 147 | """ 148 | Distribution of data types such as Boolean, Fractional, Integral, and String. 
149 | """ 150 | def __init__(self, column): 151 | self.column = column 152 | 153 | @property 154 | def jvmAnalyzer(self): 155 | return self.jvmdeequAnalyzers.DataType( 156 | self.column, 157 | getattr(self.jvmdeequAnalyzers.DataType, "apply$default$2")() 158 | ) 159 | 160 | class Distinctness(BaseAnalyzer): 161 | """ 162 | Distinctness is the fraction of distinct values of a column(s). 163 | @param columns the column(s) for which to compute distinctness 164 | """ 165 | def __init__(self, columns): 166 | if isinstance(columns, str): 167 | self.columns = [columns] 168 | elif isinstance(columns, list): 169 | self.columns = columns 170 | else: 171 | raise ValueError("'columns' must be string or list of strings.") 172 | 173 | @property 174 | def jvmAnalyzer(self): 175 | return self.jvmdeequAnalyzers.Distinctness( 176 | jc.iterable_to_scala_seq(self._jvm, self.columns), 177 | getattr(self.jvmdeequAnalyzers.DataType, "apply$default$2")() 178 | ) 179 | 180 | class Entropy(BaseAnalyzer): 181 | """ 182 | Entropy is a measure of the level of information contained in a message. Given the probability 183 | distribution over values in a column, it describes how many bits are required to identify a 184 | value. 185 | """ 186 | def __init__(self, column): 187 | self.column = column 188 | 189 | @property 190 | def jvmAnalyzer(self): 191 | return self.jvmdeequAnalyzers.Entropy( 192 | self.column, 193 | getattr(self.jvmdeequAnalyzers.Entropy, "apply$default$2")() 194 | ) 195 | 196 | class Histogram(BaseAnalyzer): 197 | """ 198 | Histogram is the summary of values in a column of a DataFrame. Groups the given column's values, 199 | and calculates the number of rows with that specific value and the fraction of this value. 200 | 201 | @param column Column to do histogram analysis on 202 | """ 203 | def __init__(self, column): 204 | self.column = column 205 | 206 | @property 207 | def jvmAnalyzer(self): 208 | return self.jvmdeequAnalyzers.Histogram( 209 | self.column, 210 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$2")(), 211 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$3")(), 212 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$4")() 213 | ) 214 | 215 | class Maximum(BaseAnalyzer): 216 | """ 217 | Maximum value. 218 | """ 219 | def __init__(self, column): 220 | self.column = column 221 | 222 | @property 223 | def jvmAnalyzer(self): 224 | return self.jvmdeequAnalyzers.Maximum( 225 | self.column, 226 | getattr(self.jvmdeequAnalyzers.Maximum, "apply$default$2")() 227 | ) 228 | 229 | class MaxLength(BaseAnalyzer): 230 | """ 231 | """ 232 | def __init__(self, column): 233 | self.column = column 234 | 235 | @property 236 | def jvmAnalyzer(self): 237 | return self.jvmdeequAnalyzers.MaxLength( 238 | self.column, 239 | getattr(self.jvmdeequAnalyzers.MaxLength, "apply$default$2")() 240 | ) 241 | 242 | class Mean(BaseAnalyzer): 243 | """ 244 | Mean value, null values are excluded. 245 | """ 246 | def __init__(self, column): 247 | self.column = column 248 | 249 | @property 250 | def jvmAnalyzer(self): 251 | return self.jvmdeequAnalyzers.Mean( 252 | self.column, 253 | getattr(self.jvmdeequAnalyzers.Mean, "apply$default$2")() 254 | ) 255 | 256 | class Minimum(BaseAnalyzer): 257 | """ 258 | Minimum value. 
259 | """ 260 | def __init__(self, column): 261 | self.column = column 262 | 263 | @property 264 | def jvmAnalyzer(self): 265 | return self.jvmdeequAnalyzers.Minimum( 266 | self.column, 267 | getattr(self.jvmdeequAnalyzers.Minimum, "apply$default$2")() 268 | ) 269 | 270 | class MinLength(BaseAnalyzer): 271 | """ 272 | """ 273 | def __init__(self, column): 274 | self.column = column 275 | 276 | @property 277 | def jvmAnalyzer(self): 278 | return self.jvmdeequAnalyzers.MinLength( 279 | self.column, 280 | getattr(self.jvmdeequAnalyzers.MinLength, "apply$default$2")() 281 | ) 282 | 283 | class MutualInformation(BaseAnalyzer): 284 | """ 285 | Mutual Information describes how much information about one column can be inferred from another 286 | column. 287 | 288 | If two columns are independent of each other, then nothing can be inferred from one column about 289 | the other, and mutual information is zero. If there is a functional dependency of one column to 290 | another and vice versa, then all information of the two columns are shared, and mutual 291 | information is the entropy of each column. 292 | """ 293 | def __init__(self, columns): 294 | if not isinstance(columns, list): 295 | raise ValueError("'columns' mus be a list of strings.") 296 | self.columns = columns 297 | 298 | @property 299 | def jvmAnalyzer(self): 300 | return self.jvmdeequAnalyzers.MutualInformation( 301 | jc.iterable_to_scala_seq(self._jvm, self.columns), 302 | getattr(self.jvmdeequAnalyzers.MutualInformation, "apply$default$2")() 303 | ) 304 | 305 | #class PattenMatch 306 | 307 | class Size(BaseAnalyzer): 308 | """ 309 | Size is the number of rows in a DataFrame. 310 | """ 311 | @property 312 | def jvmAnalyzer(self): 313 | return self.jvmdeequAnalyzers.Size( 314 | getattr(self.jvmdeequAnalyzers.Size, "apply$default$1")() 315 | ) 316 | 317 | class StandardDeviation(BaseAnalyzer): 318 | """ 319 | Standard deviation implementation. 320 | """ 321 | def __init__(self, column): 322 | self.column = column 323 | 324 | @property 325 | def jvmAnalyzer(self): 326 | return self.jvmdeequAnalyzers.StandardDeviation( 327 | self.column, 328 | getattr(self.jvmdeequAnalyzers.StandardDeviation, "apply$default$2")() 329 | ) 330 | 331 | class Sum(BaseAnalyzer): 332 | """ 333 | """ 334 | def __init__(self, column): 335 | self.column = column 336 | 337 | @property 338 | def jvmAnalyzer(self): 339 | return self.jvmdeequAnalyzers.Sum( 340 | self.column, 341 | getattr(self.jvmdeequAnalyzers.Sum, "apply$default$2")() 342 | ) 343 | 344 | class Uniqueness(BaseAnalyzer): 345 | """ 346 | Fraction of unique values over the number of all values of 347 | a column. Unique values occur exactly once. 348 | Example: [a, a, b] contains one unique value b, 349 | so uniqueness is 1/3. 350 | """ 351 | def __init__(self, columns): 352 | if not isinstance(columns, list): 353 | raise ValueError("'columns' mus be a list of strings.") 354 | self.columns = columns 355 | 356 | @property 357 | def jvmAnalyzer(self): 358 | return self.jvmdeequAnalyzers.Uniqueness( 359 | jc.iterable_to_scala_seq(self._jvm, self.columns), 360 | getattr(self.jvmdeequAnalyzers.Uniqueness, "apply$default$2")() 361 | ) 362 | 363 | class UniqueValueRatio(BaseAnalyzer): 364 | """ 365 | Fraction of unique values over the number of all distinct 366 | values of a column. Unique values occur exactly once. 367 | Distinct values occur at least once. 368 | Example: [a, a, b] contains one unique value b, 369 | and two distinct values a and b, so the unique value 370 | ratio is 1/2. 
371 | """ 372 | def __init__(self, columns): 373 | if not isinstance(columns, list): 374 | raise ValueError("'columns' mus be a list of strings.") 375 | self.columns = columns 376 | 377 | @property 378 | def jvmAnalyzer(self): 379 | return self.jvmdeequAnalyzers.UniqueValueRatio( 380 | jc.iterable_to_scala_seq(self._jvm, self.columns), 381 | getattr(self.jvmdeequAnalyzers.UniqueValueRatio, "apply$default$2")() 382 | ) 383 | 384 | -------------------------------------------------------------------------------- /src/pydeequ/base.py: -------------------------------------------------------------------------------- 1 | import py4j.java_gateway as jg 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from pydeequ.exceptions import JavaClassNotFoundException 6 | import pydeequ.jvm_conversions as jc 7 | 8 | class BaseWrapper(object): 9 | def __init__(self, SparkSession): 10 | self.spark = SparkSession 11 | 12 | @property 13 | def _jsparkSession(self): 14 | return self.spark._jsparkSession 15 | 16 | @property 17 | def _jvm(self): 18 | return self.spark.sparkContext._jvm 19 | 20 | @property 21 | def _gateway(self): 22 | return self.spark.sparkContext._gateway 23 | 24 | class BaseBuilder(BaseWrapper): 25 | def __init__(self, SparkSession, dataFrame): 26 | super().__init__(SparkSession) 27 | self._dataFrame = dataFrame 28 | 29 | @property 30 | def dataFrame(self): 31 | return self._dataFrame 32 | 33 | class VerificationRunBuilder(BaseBuilder): 34 | """ 35 | A class to build a VerificationRun using a fluent API. 36 | """ 37 | def __init__(self, SparkSession, dataFrame): 38 | """ 39 | Args: 40 | SparkSession (pyspark.sql.SparkSession) 41 | dataFrame (pyspark.sql.dataframe.DataFrame) 42 | """ 43 | super().__init__(SparkSession, dataFrame) 44 | run_builder = self._jvm.com.amazon.deequ.VerificationRunBuilder 45 | self.jvmVerificationRunBuilder = run_builder( 46 | self.dataFrame._jdf 47 | ) 48 | 49 | 50 | def addCheck(self, check): 51 | """ 52 | Add a single check to the run. 53 | 54 | Args: 55 | check (pydeequ.check.Check): 56 | A check object to be executed during the run 57 | """ 58 | jvmCheck = check.jvmCheck 59 | self.jvmVerificationRunBuilder.addCheck(jvmCheck) 60 | return self 61 | 62 | def run(self): 63 | result = self.jvmVerificationRunBuilder.run() 64 | 65 | jvmVerificationResult = self._jvm.com.amazon.deequ \ 66 | .VerificationResult 67 | try: 68 | df = jvmVerificationResult.checkResultsAsDataFrame( 69 | self._jsparkSession, 70 | result, 71 | getattr(jvmVerificationResult, 72 | "checkResultsAsDataFrame$default$3")() 73 | ) 74 | return df 75 | except Exception: 76 | self.spark.sparkContext._gateway.close() 77 | self.spark.stop() 78 | raise AttributeError 79 | 80 | def useRepository(self, metricsRepo): 81 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \ 82 | .useRepository( 83 | metricsRepo.jvmMetricsRepo 84 | ) 85 | return self 86 | 87 | def saveOrAppendResult(self, resultKey): 88 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \ 89 | .saveOrAppendResult( 90 | resultKey.jvmResultKey 91 | ) 92 | return self 93 | 94 | class VerificationSuite(BaseWrapper): 95 | """ 96 | Responsible for running checks and required analysis and return the 97 | results. 
98 | """ 99 | def __init__(self, SparkSession): 100 | """ 101 | Args: 102 | SparkSession (): 103 | """ 104 | super().__init__(SparkSession) 105 | self._start_callback_server() 106 | 107 | def _start_callback_server(self): 108 | callback = self._gateway.get_callback_server() 109 | if callback is None: 110 | self._gateway.start_callback_server() 111 | elif callback.is_shutdown: 112 | callback.close() 113 | self._gateway.restart_callback_server() 114 | 115 | def onData(self, dataFrame): 116 | """ 117 | Starting point to construct a VerificationRun. 118 | 119 | Args: 120 | dataFrame (pyspark.sql.dataframe.DataFrame): 121 | spark dataFrame on which the checks will be verified. 122 | """ 123 | return VerificationRunBuilder(self.spark, dataFrame) 124 | 125 | class _AnalyzerContext(BaseWrapper): 126 | """ 127 | """ 128 | def __init__(self, SparkSession, jvmAnalyzerContext): 129 | """ Initializes the AnalyzerContext python object with a JVM object. 130 | 131 | Args: 132 | SparkSession (): 133 | jvmAnalyzerContext (JavaObject): 134 | """ 135 | super().__init__(SparkSession) 136 | self.jvmAnalyzerContext = jvmAnalyzerContext 137 | 138 | def successMetricsAsDataFrame(self): 139 | try: 140 | df = self.jvmAnalyzerContext.successMetricsAsDataFrame( 141 | self._jsparkSession, 142 | self.jvmAnalyzerContext, 143 | getattr(self.jvmAnalyzerContext, 144 | "successMetricsAsDataFrame$default$3")() 145 | ) 146 | out = DataFrame(df, self.spark) 147 | return out 148 | except Exception: 149 | self.spark.sparkContext._gateway.close() 150 | self.spark.stop() 151 | raise AttributeError 152 | 153 | def successMetricsAsJson(self): 154 | try: 155 | jf = self.jvmAnalyzerContext.successMetricsAsJson( 156 | self.jvmAnalyzerContext, 157 | getattr(self.jvmAnalyzerContext, 158 | "successMetricsAsJson$default$2")() 159 | ) 160 | 161 | return jf 162 | except Exception: 163 | self.spark.sparkContext._gateway.close() 164 | self.spark.stop() 165 | raise AttributeError 166 | 167 | class AnalysisRunBuilder(BaseBuilder): 168 | """ 169 | A class to build an AnalysisRun using a fluent API. 170 | """ 171 | def __init__(self, SparkSession, dataFrame): 172 | """ 173 | Args: 174 | SparkSession (pyspark.sql.SparkSession) 175 | dataFrame (pyspark.sql.dataframe.DataFrame) 176 | """ 177 | super().__init__(SparkSession, dataFrame) 178 | run_builder = self._jvm.com.amazon.deequ.analyzers.runners.AnalysisRunBuilder 179 | self.jvmAnalysisRunBuilder = run_builder( 180 | self.dataFrame._jdf 181 | ) 182 | 183 | def addAnalyzer(self, analyzer): 184 | """ 185 | Add a single analyzer to the run. 186 | 187 | Args: 188 | analyzer (pydeequ.analyzer.Analyzer): 189 | An analyzer object to be executed during the run 190 | """ 191 | analyzer.set_jvm(self._jvm) 192 | jvmAnalyzer = analyzer.jvmAnalyzer 193 | self.jvmAnalysisRunBuilder.addAnalyzer(jvmAnalyzer) 194 | return self 195 | 196 | def run(self): 197 | """ Returns an AnalyzerContext python object 198 | """ 199 | jvmContext = self.jvmAnalysisRunBuilder.run() 200 | return_context = _AnalyzerContext( 201 | self.spark, 202 | jvmContext) 203 | return return_context 204 | 205 | class AnalysisRunner(BaseWrapper): 206 | """ 207 | Responsible for running metrics calculations. 208 | """ 209 | def onData(self, dataFrame): 210 | """ 211 | Starting point to construct an Analysisrun. 212 | 213 | Args: 214 | dataFrame (pyspark.sql.dataframe.DataFrame): 215 | spark dataFrame on which the checks will be verified. 
216 | """ 217 | return AnalysisRunBuilder(self.spark, dataFrame) 218 | 219 | 220 | class ConstraintSuggestionRunBuilder(BaseBuilder): 221 | """ 222 | A class to build a ConstraintSuggestionRun using a fluent API. 223 | """ 224 | def __init__(self, SparkSession, dataFrame): 225 | """ 226 | Args: 227 | SparkSession (pyspark.sql.SparkSession) 228 | dataFrame (pyspark.sql.dataframe.DataFrame) 229 | """ 230 | super().__init__(SparkSession, dataFrame) 231 | run_builder = self._jvm.com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder 232 | self.jvmConstraintSuggestionRunBuilder = run_builder( 233 | self.dataFrame._jdf 234 | ) 235 | 236 | def addConstraintRule(self, constraint): 237 | """ 238 | Add a single rule for suggesting constraints based on ColumnProfiles to the run. 239 | 240 | Args: 241 | constraintRule 242 | """ 243 | jvmRule = constraint._jvmRule 244 | self.jvmConstraintSuggestionRunBuilder.addConstraintRule(jvmRule()) 245 | return self 246 | 247 | def run(self): 248 | result = self.jvmConstraintSuggestionRunBuilder.run() 249 | 250 | jvmSuggestionResult = self._jvm.com.amazon.deequ \ 251 | .suggestions.ConstraintSuggestionResult 252 | try: 253 | df = jvmSuggestionResult.getConstraintSuggestionsAsJson( 254 | result 255 | ) 256 | return df 257 | except: 258 | self.spark.sparkContext._gateway.close() 259 | self.spark.stop() 260 | raise AttributeError 261 | 262 | class ConstraintSuggestionRunner(BaseWrapper): 263 | """ 264 | """ 265 | def onData(self, dataFrame): 266 | """ 267 | Starting point to construct a run on constraint suggestions. 268 | 269 | Args: 270 | dataFrame (pyspark.sql.dataframe.DataFrame): 271 | spark dataFrame on which the checks will be verified. 272 | """ 273 | return ConstraintSuggestionRunBuilder(self.spark, dataFrame) 274 | -------------------------------------------------------------------------------- /src/pydeequ/checks.py: -------------------------------------------------------------------------------- 1 | 2 | import py4j.java_gateway as jg 3 | 4 | from pydeequ.exceptions import JavaClassNotFoundException 5 | import pydeequ.jvm_conversions as jc 6 | import pdb 7 | 8 | def is_one(x): 9 | """ Helper function for default asseritons. 10 | """ 11 | return x == 1 12 | 13 | class Check(object): 14 | """ 15 | A class representing a list of constraints that can be applied to a given 16 | [[org.apache.spark.sql.DataFrame]]. In order to run the checks, use the 17 | VerificationSuite.run to run your checks along with other Checks and 18 | Analysis objects. When run with VerificationSuite, Analyzers required by 19 | multiple checks/analysis blocks is optimized to run once. 20 | """ 21 | def __init__(self, SparkSession, level='error', description=None, 22 | jvmCheck=None): 23 | """ 24 | Args: 25 | sparkContext (pyspark.context.SparkContext): active SparkContext 26 | level (str): 'error' (default), 'warning' 27 | Assertion level of the check group. If any of the constraints 28 | fail this level is used for the status of the check 29 | description (str): The name describes the check block. 
Generally 30 | will be used to show in the logs 31 | """ 32 | self.spark = SparkSession 33 | self._level = level 34 | self._description = description 35 | if jvmCheck: 36 | self.jvmCheck = jvmCheck 37 | else: 38 | deequ_check = self._jvm.com.amazon.deequ.checks.Check 39 | if not isinstance(deequ_check, jg.JavaClass): 40 | raise JavaClassNotFoundException("com.amazon.deequ.checks.Check") 41 | self.jvmCheck = deequ_check( 42 | self._jvm_level, 43 | self._description, 44 | getattr(deequ_check, "apply$default$3")() 45 | ) 46 | 47 | @property 48 | def _jvm(self): 49 | return self.spark.sparkContext._jvm 50 | 51 | @property 52 | def level(self): 53 | return self._level 54 | 55 | @property 56 | def description(self): 57 | return self._description 58 | 59 | @property 60 | def _jvm_level(self): 61 | if self._level == 'error': 62 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Error() 63 | elif self._level == 'warning': 64 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Warning() 65 | else: 66 | raise ValueError("Invalid 'level'") 67 | 68 | def hasSize(self, assertion): 69 | """ 70 | Creates a constraint that calculates the data frame size and runs the 71 | assertion on it. 72 | Args: 73 | assertion (function): 74 | Returns: 75 | checks.Check object including this constraint 76 | """ 77 | function = jc.scala_function1(self.spark.sparkContext._gateway, 78 | assertion) 79 | jvmConstraint = self.jvmCheck.hasSize( 80 | function, 81 | getattr(self.jvmCheck, "hasSize$default$2")() 82 | ) 83 | return Check( 84 | self.spark, 85 | self.level, 86 | self.description, 87 | jvmConstraint 88 | ) 89 | 90 | def isUnique(self, column): 91 | """ 92 | Creates a constraint that asserts on a column uniqueness. 93 | Args: 94 | column (str): Column to run the assertion on 95 | Returns: 96 | checks.Check object including this constraint 97 | """ 98 | jvmConstraint = self.jvmCheck.isUnique( 99 | column, 100 | getattr(self.jvmCheck, "isUnique$default$2")() 101 | ) 102 | return Check( 103 | self.spark, 104 | self.level, 105 | self.description, 106 | jvmConstraint 107 | ) 108 | 109 | def hasCompleteness(self, column, assertion): 110 | """ 111 | Creates a constraint that asserts on a column completion. 112 | Uses the given history selection strategy to retrieve historical completeness values on this 113 | column from the history provider. 114 | 115 | @param column Column to run the assertion on 116 | @param assertion Function that receives a double input parameter and returns a boolean 117 | @param hint A hint to provide additional context why a constraint could have failed 118 | """ 119 | function = jc.scala_function1(self.spark.sparkContext._gateway, 120 | assertion) 121 | jvmConstraint = self.jvmCheck.hasCompleteness( 122 | column, 123 | function, 124 | getattr(self.jvmCheck, "hasCompleteness$default$3")() 125 | ) 126 | return Check( 127 | self.spark, 128 | self.level, 129 | self.description, 130 | jvmConstraint 131 | ) 132 | 133 | def hasUniqueness(self, columns, assertion): 134 | """ 135 | Creates a constraint that asserts on uniqueness in a single or combined set of key columns. 136 | 137 | @param columns Key columns 138 | @param assertion Function that receives a double input parameter and returns a boolean. 
139 | Refers to the fraction of unique values 140 | @param hint A hint to provide additional context why a constraint could have failed 141 | """ 142 | if (not isinstance(columns, list)): 143 | # Single column is provided 144 | columns = [columns] 145 | function = jc.scala_function1(self.spark.sparkContext._gateway, 146 | assertion) 147 | jvmConstraint = self.jvmCheck.hasUniqueness( 148 | jc.iterable_to_scala_seq(self._jvm, columns), 149 | function 150 | ) 151 | return Check( 152 | self.spark, 153 | self.level, 154 | self.description, 155 | jvmConstraint 156 | ) 157 | 158 | def hasDistinctness(self, columns, assertion): 159 | """ 160 | Creates a constraint on the distinctness in a single or combined set of key columns. 161 | 162 | @param columns columns 163 | @param assertion Function that receives a double input parameter and returns a boolean. 164 | Refers to the fraction of distinct values. 165 | @param hint A hint to provide additional context why a constraint could have failed 166 | """ 167 | if (not isinstance(columns, list)): 168 | # Single column is provided 169 | columns = [columns] 170 | function = jc.scala_function1(self.spark.sparkContext._gateway, 171 | assertion) 172 | jvmConstraint = self.jvmCheck.hasDistinctness( 173 | jc.iterable_to_scala_seq(self._jvm, columns), 174 | function, 175 | getattr(self.jvmCheck, "hasDistinctness$default$3")() 176 | ) 177 | return Check( 178 | self.spark, 179 | self.level, 180 | self.description, 181 | jvmConstraint 182 | ) 183 | 184 | def hasUniqueValueRatio(self, columns, assertion): 185 | """ 186 | Creates a constraint on the unique value ratio in a single or combined set of key columns. 187 | 188 | @param columns columns 189 | @param assertion Function that receives a double input parameter and returns a boolean. 190 | Refers to the fraction of distinct values. 191 | @param hint A hint to provide additional context why a constraint could have failed 192 | """ 193 | if (not isinstance(columns, list)): 194 | # Single column is provided 195 | columns = [columns] 196 | function = jc.scala_function1(self.spark.sparkContext._gateway, 197 | assertion) 198 | jvmConstraint = self.jvmCheck.hasUniqueValueRatio( 199 | jc.iterable_to_scala_seq(self._jvm, columns), 200 | function, 201 | getattr(self.jvmCheck, "hasUniqueValueRatio$default$3")() 202 | ) 203 | return Check( 204 | self.spark, 205 | self.level, 206 | self.description, 207 | jvmConstraint 208 | ) 209 | 210 | def hasNumberOfDistinctValues(self, column, assertion, 211 | binningUdf = None, maxBins = None): 212 | """ 213 | Creates a constraint that asserts on the number of distinct values a column has. 214 | 215 | @param column Column to run the assertion on 216 | @param assertion Function that receives a long input parameter and returns a boolean 217 | @param binningUdf An optional binning function 218 | @param maxBins Histogram details is only provided for N column values with top counts. 
219 | maxBins sets the N 220 | @param hint A hint to provide additional context why a constraint could have failed 221 | """ 222 | function = jc.scala_function1(self.spark.sparkContext._gateway, 223 | assertion) 224 | jvmConstraint = self.jvmCheck.hasNumberOfDistinctValues( 225 | column, 226 | function, 227 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$3")(), 228 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$4")(), 229 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$5")() 230 | ) 231 | return Check( 232 | self.spark, 233 | self.level, 234 | self.description, 235 | jvmConstraint 236 | ) 237 | 238 | def hasHistogramValues(self, column, assertion, 239 | binningUdf = None, maxBins = None): 240 | """ 241 | Creates a constraint that asserts on column's value distribution. 242 | 243 | @param column Column to run the assertion on 244 | @param assertion Function that receives a Distribution input parameter and returns a boolean. 245 | E.g 246 | .hasHistogramValues("att2", _.absolutes("f") == 3) 247 | .hasHistogramValues("att2", 248 | _.ratios(Histogram.NullFieldReplacement) == 2/6.0) 249 | @param binningUdf An optional binning function 250 | @param maxBins Histogram details is only provided for N column values with top counts. 251 | maxBins sets the N 252 | @param hint A hint to provide additional context why a constraint could have failed 253 | """ 254 | function = jc.scala_function1(self.spark.sparkContext._gateway, 255 | assertion) 256 | jvmConstraint = self.jvmCheck.hasHistogramValues( 257 | column, 258 | function, 259 | getattr(self.jvmCheck, "hasHistogramValues$default$3")(), 260 | getattr(self.jvmCheck, "hasHistogramValues$default$4")(), 261 | getattr(self.jvmCheck, "hasHistogramValues$default$5")() 262 | ) 263 | return Check( 264 | self.spark, 265 | self.level, 266 | self.description, 267 | jvmConstraint 268 | ) 269 | 270 | def hasEntropy(self, column, assertion): 271 | """ 272 | Creates a constraint that asserts on a column entropy. 273 | 274 | @param column Column to run the assertion on 275 | @param assertion Function that receives a double input parameter and returns a boolean 276 | @param hint A hint to provide additional context why a constraint could have failed 277 | """ 278 | function = jc.scala_function1(self.spark.sparkContext._gateway, 279 | assertion) 280 | jvmConstraint = self.jvmCheck.hasEntropy( 281 | column, 282 | function, 283 | getattr(self.jvmCheck, "hasEntropy$default$3")() 284 | ) 285 | return Check( 286 | self.spark, 287 | self.level, 288 | self.description, 289 | jvmConstraint 290 | ) 291 | 292 | def hasMutualInformation(self, columnA, columnB, assertion): 293 | """ 294 | Creates a constraint that asserts on a mutual information between two columns. 
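        Example (illustrative sketch; assumes an active SparkSession `spark`, and the
        level, description and threshold are placeholders):

            check = (Check(spark, 'warning', 'dependency check')
                     .hasMutualInformation('_2', '_3', lambda mi: mi > 0.5))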
295 | 296 | @param columnA First column for mutual information calculation 297 | @param columnB Second column for mutual information calculation 298 | @param assertion Function that receives a double input parameter and returns a boolean 299 | @param hint A hint to provide additional context why a constraint could have failed 300 | """ 301 | function = jc.scala_function1(self.spark.sparkContext._gateway, 302 | assertion) 303 | jvmConstraint = self.jvmCheck.hasMutualInformation( 304 | columnA, 305 | columnB, 306 | function, 307 | getattr(self.jvmCheck, "hasMutualInformation$default$4")() 308 | ) 309 | return Check( 310 | self.spark, 311 | self.level, 312 | self.description, 313 | jvmConstraint 314 | ) 315 | 316 | def hasApproxQuantile(self, column, quantile, assertion): 317 | """ 318 | Creates a constraint that asserts on an approximated quantile 319 | 320 | @param column Column to run the assertion on 321 | @param quantile Which quantile to assert on 322 | @param assertion Function that receives a double input parameter (the computed quantile) 323 | and returns a boolean 324 | @param hint A hint to provide additional context why a constraint could have failed 325 | """ 326 | function = jc.scala_function1(self.spark.sparkContext._gateway, 327 | assertion) 328 | jvmConstraint = self.jvmCheck.hasApproxQuantile( 329 | column, 330 | quantile, 331 | function, 332 | getattr(self.jvmCheck, "hasApproxQuantile$default$4")() 333 | ) 334 | return Check( 335 | self.spark, 336 | self.level, 337 | self.description, 338 | jvmConstraint 339 | ) 340 | 341 | def hasMinLength(self, column, assertion): 342 | """ 343 | Creates a constraint that asserts on the minimum length of the column 344 | 345 | @param column Column to run the assertion on 346 | @param assertion Function that receives a double input parameter and returns a boolean 347 | @param hint A hint to provide additional context why a constraint could have failed 348 | """ 349 | function = jc.scala_function1(self.spark.sparkContext._gateway, 350 | assertion) 351 | jvmConstraint = self.jvmCheck.hasMinLength( 352 | column, 353 | function, 354 | getattr(self.jvmCheck, "hasMinLength$default$3")() 355 | ) 356 | return Check( 357 | self.spark, 358 | self.level, 359 | self.description, 360 | jvmConstraint 361 | ) 362 | 363 | 364 | def hasMaxLength(self, column, assertion): 365 | """ 366 | Creates a constraint that asserts on the maximum length of the column 367 | 368 | @param column Column to run the assertion on 369 | @param assertion Function that receives a double input parameter and returns a boolean 370 | @param hint A hint to provide additional context why a constraint could have failed 371 | """ 372 | function = jc.scala_function1(self.spark.sparkContext._gateway, 373 | assertion) 374 | jvmConstraint = self.jvmCheck.hasMaxLength( 375 | column, 376 | function, 377 | getattr(self.jvmCheck, "hasMaxLength$default$3")() 378 | ) 379 | return Check( 380 | self.spark, 381 | self.level, 382 | self.description, 383 | jvmConstraint 384 | ) 385 | 386 | def hasMin(self, column, assertion): 387 | """ 388 | Creates a constraint that asserts on the minimum of the column 389 | 390 | @param column Column to run the assertion on 391 | @param assertion Function that receives a double input parameter and returns a boolean 392 | @param hint A hint to provide additional context why a constraint could have failed 393 | """ 394 | function = jc.scala_function1(self.spark.sparkContext._gateway, 395 | assertion) 396 | jvmConstraint = self.jvmCheck.hasMin( 397 | column, 398 | 
function, 399 | getattr(self.jvmCheck, "hasMin$default$3")() 400 | ) 401 | return Check( 402 | self.spark, 403 | self.level, 404 | self.description, 405 | jvmConstraint 406 | ) 407 | 408 | def hasMax(self, column, assertion): 409 | """ 410 | Creates a constraint that asserts on the maximum of the column 411 | 412 | @param column Column to run the assertion on 413 | @param assertion Function that receives a double input parameter and returns a boolean 414 | @param hint A hint to provide additional context why a constraint could have failed 415 | """ 416 | function = jc.scala_function1(self.spark.sparkContext._gateway, 417 | assertion) 418 | jvmConstraint = self.jvmCheck.hasMax( 419 | column, 420 | function, 421 | getattr(self.jvmCheck, "hasMax$default$3")() 422 | ) 423 | return Check( 424 | self.spark, 425 | self.level, 426 | self.description, 427 | jvmConstraint 428 | ) 429 | 430 | def hasMean(self, column, assertion): 431 | """ 432 | Creates a constraint that asserts on the mean of the column 433 | 434 | @param column Column to run the assertion on 435 | @param assertion Function that receives a double input parameter and returns a boolean 436 | @param hint A hint to provide additional context why a constraint could have failed 437 | """ 438 | function = jc.scala_function1(self.spark.sparkContext._gateway, 439 | assertion) 440 | jvmConstraint = self.jvmCheck.hasMean( 441 | column, 442 | function, 443 | getattr(self.jvmCheck, "hasMean$default$3")() 444 | ) 445 | return Check( 446 | self.spark, 447 | self.level, 448 | self.description, 449 | jvmConstraint 450 | ) 451 | 452 | def hasSum(self, column, assertion): 453 | """ 454 | Creates a constraint that asserts on the sum of the column 455 | 456 | @param column Column to run the assertion on 457 | @param assertion Function that receives a double input parameter and returns a boolean 458 | @param hint A hint to provide additional context why a constraint could have failed 459 | """ 460 | function = jc.scala_function1(self.spark.sparkContext._gateway, 461 | assertion) 462 | jvmConstraint = self.jvmCheck.hasSum( 463 | column, 464 | function, 465 | getattr(self.jvmCheck, "hasSum$default$3")() 466 | ) 467 | return Check( 468 | self.spark, 469 | self.level, 470 | self.description, 471 | jvmConstraint 472 | ) 473 | def hasStandardDeviation(self, column, assertion): 474 | """ 475 | Creates a constraint that asserts on the standard deviation of the column 476 | 477 | @param column Column to run the assertion on 478 | @param assertion Function that receives a double input parameter and returns a boolean 479 | @param hint A hint to provide additional context why a constraint could have failed 480 | """ 481 | function = jc.scala_function1(self.spark.sparkContext._gateway, 482 | assertion) 483 | jvmConstraint = self.jvmCheck.hasStandardDeviation( 484 | column, 485 | function, 486 | getattr(self.jvmCheck, "hasStandardDeviation$default$3")() 487 | ) 488 | return Check( 489 | self.spark, 490 | self.level, 491 | self.description, 492 | jvmConstraint 493 | ) 494 | def hasApproxCountDistinct(self, column, assertion): 495 | """ 496 | Creates a constraint that asserts on the approximate count distinct of the given column 497 | 498 | @param column Column to run the assertion on 499 | @param assertion Function that receives a double input parameter and returns a boolean 500 | @param hint A hint to provide additional context why a constraint could have failed 501 | """ 502 | function = jc.scala_function1(self.spark.sparkContext._gateway, 503 | assertion) 504 | 
jvmConstraint = self.jvmCheck.hasApproxCountDistinct( 505 | column, 506 | function, 507 | getattr(self.jvmCheck, "hasApproxCountDistinct$default$3")() 508 | ) 509 | return Check( 510 | self.spark, 511 | self.level, 512 | self.description, 513 | jvmConstraint 514 | ) 515 | 516 | def hasCorrelation(self, columnA, columnB, assertion): 517 | """ 518 | Creates a constraint that asserts on the pearson correlation between two columns. 519 | 520 | @param columnA First column for correlation calculation 521 | @param columnB Second column for correlation calculation 522 | @param assertion Function that receives a double input parameter and returns a boolean 523 | @param hint A hint to provide additional context why a constraint could have failed 524 | """ 525 | function = jc.scala_function1(self.spark.sparkContext._gateway, 526 | assertion) 527 | jvmConstraint = self.jvmCheck.hasCorrelation( 528 | columnA, 529 | columnB, 530 | function, 531 | getattr(self.jvmCheck, "hasCorrelation$default$4")() 532 | ) 533 | return Check( 534 | self.spark, 535 | self.level, 536 | self.description, 537 | jvmConstraint 538 | ) 539 | 540 | def satisfies(self, columnCondition, constraintName, assertion): 541 | """ 542 | Creates a constraint that runs the given condition on the data frame. 543 | 544 | @param columnCondition Data frame column which is a combination of expression and the column 545 | name. It has to comply with Spark SQL syntax. 546 | Can be written in an exact same way with conditions inside the 547 | `WHERE` clause. 548 | @param constraintName A name that summarizes the check being made. This name is being used to 549 | name the metrics for the analysis being done. 550 | @param assertion Function that receives a double input parameter and returns a boolean 551 | @param hint A hint to provide additional context why a constraint could have failed 552 | """ 553 | function = jc.scala_function1(self.spark.sparkContext._gateway, 554 | assertion) 555 | jvmConstraint = self.jvmCheck.satisfies( 556 | columnCondition, 557 | constraintName, 558 | function, 559 | getattr(self.jvmCheck, "satisfies$default$4")() 560 | ) 561 | return Check( 562 | self.spark, 563 | self.level, 564 | self.description, 565 | jvmConstraint 566 | ) 567 | 568 | def hasPattern(self, column, pattern, assertion = is_one): 569 | """ 570 | Checks for pattern compliance. Given a column name and a regular expression, defines a 571 | Check on the average compliance of the column's values to the regular expression. 572 | 573 | @param column Name of the column that should be checked. 574 | @param pattern The columns values will be checked for a match against this pattern. 575 | @param assertion Function that receives a double input parameter and returns a boolean 576 | @param hint A hint to provide additional context why a constraint could have failed 577 | """ 578 | # function = jc.scala_function1(self.spark.sparkContext._gateway, 579 | # assertion) 580 | # pattern = jc.scala_regex(self.spark.sparkContext._gateway, pattern) 581 | # jvmConstraint = self.jvmCheck.hasPattern( 582 | # column, 583 | # pattern, 584 | # function, 585 | # getattr(self.jvmCheck, "hasPattern$default$4")(), 586 | # getattr(self.jvmCheck, "hasPattern$default$5")() 587 | # ) 588 | # return Check( 589 | # self.spark, 590 | # self.level, 591 | # self.description, 592 | # jvmConstraint 593 | # ) 594 | pass 595 | 596 | def hasDataType(self, column, dataType, assertion): 597 | """ 598 | Check to run against the fraction of rows that conform to the given data type. 
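        Supported `dataType` strings are 'null', 'boolean', 'string', 'numeric',
        'fractional' and 'integer'. Example (illustrative sketch; assumes an active
        SparkSession `spark`, and the level, description and assertion are
        placeholders):

            check = (Check(spark, 'error', 'type check')
                     .hasDataType('_1', 'string', lambda fraction: fraction == 1.0))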
599 | 600 | @param column Name of the columns that should be checked. 601 | @param dataType Data type that the columns should be compared against. 602 | @param assertion Function that receives a double input parameter and returns a boolean 603 | @param hint A hint to provide additional context why a constraint could have failed 604 | """ 605 | _jconstDataTypes = self._jvm.com.amazon.deequ.constraints.ConstrainableDataTypes 606 | dataTypes = { 607 | 'null': _jconstDataTypes.Null(), 608 | 'boolean': _jconstDataTypes.Boolean(), 609 | 'string': _jconstDataTypes.String(), 610 | 'numeric': _jconstDataTypes.Numeric(), 611 | 'fractional': _jconstDataTypes.Fractional(), 612 | 'integer': _jconstDataTypes.Integral() 613 | } 614 | function = jc.scala_function1(self.spark.sparkContext._gateway, 615 | assertion) 616 | jvmConstraint = self.jvmCheck.hasDataType( 617 | column, 618 | dataTypes[dataType], 619 | function, 620 | getattr(self.jvmCheck, "hasDataType$default$4")() 621 | ) 622 | return Check( 623 | self.spark, 624 | self.level, 625 | self.description, 626 | jvmConstraint 627 | ) 628 | 629 | def isPositive(self, column, assertion = is_one): 630 | """ 631 | Creates a constraint that asserts that a column contains positive values 632 | 633 | @param column Column to run the assertion on 634 | @param assertion Function that receives a double input parameter and returns a boolean 635 | @param hint A hint to provide additional context why a constraint could have failed 636 | """ 637 | function = jc.scala_function1(self.spark.sparkContext._gateway, 638 | assertion) 639 | jvmConstraint = self.jvmCheck.isPositive( 640 | column, 641 | function, 642 | getattr(self.jvmCheck, "isPositive$default$3")() 643 | ) 644 | return Check( 645 | self.spark, 646 | self.level, 647 | self.description, 648 | jvmConstraint 649 | ) 650 | 651 | 652 | def isNonNegative(self, column, assertion = is_one): 653 | """ 654 | Creates a constraint that asserts that a column contains no negative values 655 | 656 | @param column Column to run the assertion on 657 | @param assertion Function that receives a double input parameter and returns a boolean 658 | @param hint A hint to provide additional context why a constraint could have failed 659 | """ 660 | function = jc.scala_function1(self.spark.sparkContext._gateway, 661 | assertion) 662 | jvmConstraint = self.jvmCheck.isNonNegative( 663 | column, 664 | function, 665 | getattr(self.jvmCheck, "isNonNegative$default$3")() 666 | ) 667 | return Check( 668 | self.spark, 669 | self.level, 670 | self.description, 671 | jvmConstraint 672 | ) 673 | 674 | def isLessThan(self, columnA, columnB, assertion = is_one): 675 | """ 676 | Asserts that, in each row, the value of columnA is less than the value of columnB 677 | 678 | @param columnA Column to run the assertion on 679 | @param columnB Column to run the assertion on 680 | @param assertion Function that receives a double input parameter and returns a boolean 681 | @param hint A hint to provide additional context why a constraint could have failed 682 | """ 683 | function = jc.scala_function1(self.spark.sparkContext._gateway, 684 | assertion) 685 | jvmConstraint = self.jvmCheck.isLessThan( 686 | columnA, 687 | columnB, 688 | function, 689 | getattr(self.jvmCheck, "isLessThan$default$4")() 690 | ) 691 | return Check( 692 | self.spark, 693 | self.level, 694 | self.description, 695 | jvmConstraint 696 | ) 697 | 698 | def isLessThanOrEqualTo(self, columnA, columnB, assertion = is_one): 699 | """ 700 | Asserts that, in each row, the value of columnA is 
less than or equal to the value of columnB 701 | 702 | @param columnA Column to run the assertion on 703 | @param columnB Column to run the assertion on 704 | @param assertion Function that receives a double input parameter and returns a boolean 705 | @param hint A hint to provide additional context why a constraint could have failed 706 | """ 707 | function = jc.scala_function1(self.spark.sparkContext._gateway, 708 | assertion) 709 | jvmConstraint = self.jvmCheck.isLessThanOrEqualTo( 710 | columnA, 711 | columnB, 712 | function, 713 | getattr(self.jvmCheck, "isLessThanOrEqualTo$default$4")() 714 | ) 715 | return Check( 716 | self.spark, 717 | self.level, 718 | self.description, 719 | jvmConstraint 720 | ) 721 | 722 | def isGreaterThan(self, columnA, columnB, assertion = is_one): 723 | """ 724 | Asserts that, in each row, the value of columnA is greater than the value of columnB 725 | 726 | @param columnA Column to run the assertion on 727 | @param columnB Column to run the assertion on 728 | @param assertion Function that receives a double input parameter and returns a boolean 729 | @param hint A hint to provide additional context why a constraint could have failed 730 | """ 731 | function = jc.scala_function1(self.spark.sparkContext._gateway, 732 | assertion) 733 | jvmConstraint = self.jvmCheck.isGreaterThan( 734 | columnA, 735 | columnB, 736 | function, 737 | getattr(self.jvmCheck, "isGreaterThan$default$4")() 738 | ) 739 | return Check( 740 | self.spark, 741 | self.level, 742 | self.description, 743 | jvmConstraint 744 | ) 745 | 746 | def isGreaterThanOrEqualTo(self, columnA, columnB, assertion = is_one): 747 | """ 748 | Asserts that, in each row, the value of columnA is greather than or equal to the value of 749 | columnB 750 | 751 | @param columnA Column to run the assertion on 752 | @param columnB Column to run the assertion on 753 | @param assertion Function that receives a double input parameter and returns a boolean 754 | @param hint A hint to provide additional context why a constraint could have failed 755 | """ 756 | function = jc.scala_function1(self.spark.sparkContext._gateway, 757 | assertion) 758 | jvmConstraint = self.jvmCheck.isGreaterThanOrEqualTo( 759 | columnA, 760 | columnB, 761 | function, 762 | getattr(self.jvmCheck, "isGreaterThanOrEqualTo$default$4")() 763 | ) 764 | return Check( 765 | self.spark, 766 | self.level, 767 | self.description, 768 | jvmConstraint 769 | ) 770 | 771 | def isContainedIn(self, column, allowedValues, assertion = is_one): 772 | """ 773 | Asserts that every non-null value in a column is contained in a set of predefined values 774 | 775 | @param column Column to run the assertion on 776 | @param allowedValues Allowed values for the column 777 | @param assertion Function that receives a double input parameter and returns a boolean 778 | @param hint A hint to provide additional context why a constraint could have failed 779 | """ 780 | if (isinstance(allowedValues, list) == False): 781 | raise ValueError("'allowedValues' must be a list of strings.") 782 | function = jc.scala_function1(self.spark.sparkContext._gateway, 783 | assertion) 784 | scalaArray = jc.iterable_to_scala_array(self._jvm, allowedValues) 785 | jvmConstraint = self.jvmCheck.isContainedIn( 786 | column, 787 | scalaArray, 788 | function, 789 | getattr(self.jvmCheck, "isContainedIn$default$6")() 790 | ) 791 | return Check( 792 | self.spark, 793 | self.level, 794 | self.description, 795 | jvmConstraint 796 | ) 797 | 798 | def isInInterval(self, 799 | column, 800 | lowerBound, 801 | 
upperBound, 802 | includeLowerBound = True, 803 | includeUpperBound = True): 804 | """ 805 | Asserts that the non-null values in a numeric column fall into the predefined interval 806 | 807 | @param column column to run the assertion 808 | @param lowerBound lower bound of the interval 809 | @param upperBound upper bound of the interval 810 | @param includeLowerBound is a value equal to the lower bound allows? 811 | @param includeUpperBound is a value equal to the upper bound allowed? 812 | @param hint A hint to provide additional context why a constraint could have failed 813 | """ 814 | jvmConstraint = self.jvmCheck.isContainedIn( 815 | column, 816 | lowerBound, 817 | upperBound, 818 | includeLowerBound, 819 | includeUpperBound, 820 | getattr(self.jvmCheck, "isContainedIn$default$6")() 821 | ) 822 | return Check( 823 | self.spark, 824 | self.level, 825 | self.description, 826 | jvmConstraint 827 | ) 828 | -------------------------------------------------------------------------------- /src/pydeequ/examples/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | test_data = [("thingA", 13.0, "IN_TRANSIT", "true", 5.0), 3 | ("thingA", 5.0, "DELAYED", "false", 20.0), 4 | ("thingB", None, "DELAYED", None, 12.0), 5 | ("thingC", None, "IN_TRANSIT", "false", 2.0), 6 | ("thingD", 1.0, "DELAYED", "true", None), 7 | ("thingC", 7.0, "UNKNOWN", None, None), 8 | ("thingC", 20.0, "UNKNOWN", None, 3.5), 9 | ("thingE", 20.0, "DELAYED", "false", 8.2)] 10 | 11 | -------------------------------------------------------------------------------- /src/pydeequ/examples/analyzer_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | from pydeequ.base import AnalysisRunner 4 | import pydeequ.analyzers as analyzers 5 | from pydeequ.examples import test_data 6 | 7 | def main(): 8 | # SparkSession startup 9 | spark = (SparkSession 10 | .builder 11 | .master('local[*]') 12 | .config('spark.jars.packages', 13 | 'com.amazon.deequ:deequ:1.0.5') 14 | .appName('profiler-example') 15 | .getOrCreate()) 16 | df = spark.createDataFrame(test_data) 17 | 18 | r = AnalysisRunner(spark) \ 19 | .onData(df) \ 20 | .addAnalyzer(analyzers.Size()) \ 21 | .addAnalyzer(analyzers.Completeness('_3')) \ 22 | .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \ 23 | .addAnalyzer(analyzers.Mean('_2')) \ 24 | .addAnalyzer(analyzers.Compliance('top values', '_2 > 15')) \ 25 | .addAnalyzer(analyzers.Correlation('_2', '_5')) \ 26 | .run() 27 | 28 | df = DataFrame(r, spark) 29 | df.show(df.count(), False) 30 | 31 | # SparkSession and Java Gateway teardown 32 | spark.sparkContext._gateway.close() 33 | spark.stop() 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /src/pydeequ/examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pyspark.sql import SparkSession, DataFrame 4 | 5 | from pydeequ.base import VerificationSuite 6 | from pydeequ.checks import Check 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('constrain-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | 20 | # Constrain verification 21 | r = 
(VerificationSuite(spark) 22 | .onData(df) 23 | .addCheck(Check(spark, 'error', 'examples') 24 | .hasSize(lambda x: x == 8) 25 | .isUnique('_2') 26 | .hasCompleteness('_2', lambda x: x >= 0.75) 27 | .hasUniqueness('_1', lambda x: x == 3/8) 28 | .hasDistinctness('_1', lambda x: x == 5/8) 29 | .hasUniqueValueRatio('_2', lambda x: x == 0.8) 30 | .hasNumberOfDistinctValues('_2', lambda x: x == 6) 31 | #.hasHistogram 32 | .hasEntropy('_3', lambda x: x > 1) 33 | #.hasMutualInformation('_2', '_3', lambda x: x > 0.5) 34 | .hasApproxQuantile('_2', 0.5, lambda x: x == 7) 35 | .hasMinLength('_1', lambda x: x == 6) 36 | .hasMaxLength('_3', lambda x: x == 10) 37 | .hasMin('_2', lambda x: x == 1) 38 | .hasMax('_2', lambda x: x == 20) 39 | .hasMean('_2', lambda x: x > 10) 40 | .hasSum('_2', lambda x: x > 50) 41 | .hasStandardDeviation('_2', lambda x: x > 5) 42 | .hasApproxCountDistinct('_2', lambda x: x == 5) 43 | .hasCorrelation('_2', '_5', lambda x: x == 1) 44 | .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25) 45 | #.hasPattern("_1", "thing([A-Z])", lambda x: x == 1) 46 | #.hasDataType("_1", "string", lambda x: x == 1) 47 | .isPositive('_2') 48 | .isNonNegative('_2') 49 | .isLessThan('_5', '_2', lambda x: x == 0.375) 50 | .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375) 51 | .isGreaterThan('_5', '_2', lambda x: x == 0.125) 52 | .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125) 53 | #.isContainedIn('_3', ['DELAYED', 'INTRANSIT']) 54 | .isInInterval('_5', 1.0, 50.0) 55 | ) 56 | .run() 57 | ) 58 | df = DataFrame(r, spark) 59 | df.show(df.count(), False) 60 | 61 | # SparkSession and Java Gateway teardown 62 | spark.sparkContext._gateway.close() 63 | spark.stop() 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /src/pydeequ/examples/basic_usage2.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pyspark.sql import SparkSession, DataFrame 4 | 5 | from pydeequ.base import VerificationSuite 6 | from pydeequ.checks import Check 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('constrain-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | df.show() 20 | print(df._jdf.__doc__) 21 | 22 | #spark.stop() 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /src/pydeequ/examples/metrics_repo.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pydeequ.examples import test_data 4 | from pydeequ import AnalysisRunner, VerificationSuite 5 | import pydeequ.analyzers as analyzers 6 | from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository 7 | from pydeequ.checks import Check 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('suggestions-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | # Analysis run 20 | a = (AnalysisRunner(spark) 21 | .onData(df) 22 | .addAnalyzer(analyzers.Size())) \ 23 | .run() 24 | key = ResultKey(spark, 100000, {'key1': 'value1'}) 25 | myrepo = 
FileSystemMetricsRepository(spark, '../test.json') 26 | myrepo.save(key, a) 27 | 28 | # Verification run 29 | key2 = repo.ResultKey(spark, 100000, {'key1': 'value2', 'key2':'value3'}) 30 | 31 | 32 | v = (base.VerificationSuite(spark) 33 | .onData(df) 34 | .addCheck(Check(spark, 'error', 'examples') 35 | .hasSize(lambda x: x == 8) 36 | .isUnique('_2')) 37 | .useRepository(myrepo) 38 | .saveOrAppendResult(key2) 39 | .run() 40 | ) 41 | 42 | myrepo.load().withTagValues({'key1': 'value1'}).after(99000) \ 43 | .getMetricsAsDF().show() 44 | 45 | # SparkSession and Java Gateway teardown 46 | spark.sparkContext._gateway.close() 47 | spark.stop() 48 | 49 | if __name__ == "__main__": 50 | main() -------------------------------------------------------------------------------- /src/pydeequ/examples/profiler_example.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | import json 4 | from pyspark.sql import SparkSession, DataFrame 5 | 6 | from pydeequ.profiler import ColumnProfilerRunner 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('profiler-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | 20 | # Constrain verification 21 | r = (ColumnProfilerRunner() 22 | .onData(df) 23 | .run()) 24 | 25 | parsed = json.loads(r) 26 | print(json.dumps(parsed, indent = 4)) 27 | 28 | # SparkSession and Java Gateway teardown 29 | spark.sparkContext._gateway.close() 30 | spark.stop() 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /src/pydeequ/examples/suggestions_example.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | import json 4 | from pyspark.sql import SparkSession, DataFrame 5 | 6 | from pydeequ.base import ConstraintSuggestionRunner 7 | from pydeequ.suggestions import Rules 8 | from pydeequ.examples import test_data 9 | 10 | def main(): 11 | # SparkSession startup 12 | spark = (SparkSession 13 | .builder 14 | .master('local[*]') 15 | .config('spark.jars.packages', 16 | 'com.amazon.deequ:deequ:1.0.5') 17 | .appName('suggestions-example') 18 | .getOrCreate()) 19 | df = spark.createDataFrame(test_data) 20 | 21 | # Constrain verification 22 | r = (ConstraintSuggestionRunner(spark) 23 | .onData(df) 24 | .addConstraintRule(Rules.CategoricalRangeRule(spark)) 25 | .run()) 26 | 27 | parsed = json.loads(r) 28 | print(json.dumps(parsed, indent = 4)) 29 | 30 | # SparkSession and Java Gateway teardown 31 | spark.sparkContext._gateway.close() 32 | spark.stop() 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /src/pydeequ/exceptions.py: -------------------------------------------------------------------------------- 1 | class JavaClassNotFoundException(Exception): 2 | """ 3 | Raise if required Java class is not found by py4j 4 | """ 5 | 6 | def __init__(self, java_class): 7 | Exception.__init__(self) 8 | self.java_class = java_class 9 | 10 | def __str__(self): 11 | return "%s. Did you forget to add the jar to the class path?" 
% ( 12 | self.java_class 13 | ) 14 | 15 | def __repr__(self): 16 | return "%s: %s" % (self.__class__.__name__, self.java_class) 17 | -------------------------------------------------------------------------------- /src/pydeequ/jvm_conversions.py: -------------------------------------------------------------------------------- 1 | def iterable_to_scala_list(jvm, iterable): 2 | return jvm.scala.collection.JavaConversions.\ 3 | iterableAsScalaIterable(iterable).\ 4 | toList() 5 | 6 | def iterable_to_scala_set(jvm, iterable): 7 | return jvm.scala.collection.JavaConversions.\ 8 | iterableAsScalaIterable(iterable).\ 9 | toSet() 10 | 11 | def iterable_to_scala_seq(jvm, iterable): 12 | return jvm.scala.collection.JavaConversions.\ 13 | iterableAsScalaIterable(iterable).\ 14 | toSeq() 15 | 16 | def simple_date_format(jvm, s): 17 | return jvm.java.text.SimpleDateFormat(s) 18 | 19 | def tuple2(jvm, t): 20 | return jvm.scala.Tuple2(*t) 21 | 22 | def option(jvm, java_obj): 23 | return jvm.scala.Option.apply(java_obj) 24 | 25 | def scala_none(jvm): 26 | return getattr(getattr(jvm.scala, "None$"), "MODULE$") 27 | 28 | def dict_to_scala_map(jvm, keyvaluepairs): 29 | return jvm.scala.collection.JavaConverters.\ 30 | mapAsScalaMapConverter(keyvaluepairs).\ 31 | asScala().toMap(jvm.scala.Predef.conforms()) 32 | 33 | class scala_function1: 34 | def __init__(self, gateway, lambda_function): 35 | self.gateway = gateway 36 | self.lambda_function = lambda_function 37 | 38 | def apply(self, arg): 39 | return self.lambda_function(arg) 40 | 41 | class Java: 42 | implements = ["scala.Function1"] 43 | 44 | -------------------------------------------------------------------------------- /src/pydeequ/metricsrepo.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from pydeequ.base import BaseWrapper 4 | import pydeequ.jvm_conversions as jc 5 | 6 | class ResultKey(BaseWrapper): 7 | """ Unique identifier of Analysis result. 8 | """ 9 | def __init__(self, SparkSession, dataSetDate, tags): 10 | """ 11 | :param double dataSetDate: A date related to the Analysis result 12 | :param dict tags: Key-value store of tags 13 | """ 14 | super().__init__(SparkSession) 15 | self.dataSetDate = dataSetDate 16 | self.tags = tags 17 | result_key = self._jvm.com.amazon.deequ.repository.ResultKey 18 | self.jvmResultKey = result_key( 19 | self.dataSetDate, 20 | jc.dict_to_scala_map(self._jvm, self.tags) 21 | ) 22 | 23 | class FileSystemMetricsRepository(BaseWrapper): 24 | """ FS based repository class 25 | """ 26 | def __init__(self, SparkSession, path): 27 | super().__init__(SparkSession) 28 | self.path = path 29 | fs_repo = self._jvm.com.amazon.deequ.repository.fs.\ 30 | FileSystemMetricsRepository 31 | self.jvmMetricsRepo = fs_repo( 32 | self._jsparkSession, 33 | self.path 34 | ) 35 | 36 | def save(self, resultKey, analyserContext): 37 | """ Save Analysis results (metrics). 
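        Example (illustrative sketch, following src/pydeequ/examples/metrics_repo.py;
        assumes an active SparkSession `spark` with the deequ jar on the classpath
        and a DataFrame `df`; the repository path, timestamp and tags are
        placeholders):

            from pydeequ.base import AnalysisRunner
            import pydeequ.analyzers as analyzers
            from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository

            context = (AnalysisRunner(spark)
                       .onData(df)
                       .addAnalyzer(analyzers.Size())
                       .run())
            repo = FileSystemMetricsRepository(spark, '/tmp/metrics.json')
            key = ResultKey(spark, 100000, {'dataset': 'example'})
            repo.save(key, context)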
38 | 39 | :param ResultKey resultKey: unique identifier of Analysis results 40 | :param AnalyzerContext analyserContext: 41 | """ 42 | return self.jvmMetricsRepo.save( 43 | resultKey.jvmResultKey, 44 | analyserContext.jvmAnalyzerContext 45 | ) 46 | 47 | def load(self): 48 | """ Get a builder class to construct a loading query to get 49 | analysis results 50 | """ 51 | return FSRepoResultsLoader(self.spark, self.path) 52 | 53 | class FSRepoResultsLoader(BaseWrapper): 54 | def __init__(self, SparkSession, path): 55 | super().__init__(SparkSession) 56 | self.path = path 57 | fs_repo_loader = self._jvm.com.amazon.deequ.repository.fs.\ 58 | FileSystemMetricsRepositoryMultipleResultsLoader 59 | self.jvmFSMetricsRepoLoader = fs_repo_loader( 60 | self._jsparkSession, 61 | self.path 62 | ) 63 | 64 | def withTagValues(self, tagValues): 65 | self.tagValues = tagValues 66 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 67 | .withTagValues( 68 | jc.dict_to_scala_map(self._jvm, tagValues) 69 | ) 70 | return self 71 | 72 | def before(self, dateTime): 73 | self.before = dateTime 74 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 75 | .before( 76 | dateTime 77 | ) 78 | return self 79 | 80 | def after(self, dateTime): 81 | self.after = dateTime 82 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 83 | .after( 84 | dateTime 85 | ) 86 | return self 87 | 88 | def getMetricsAsDF(self): 89 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsDataFrame 90 | df = jvmGetter( 91 | self._jsparkSession, 92 | getattr(self.jvmFSMetricsRepoLoader, 93 | "getSuccessMetricsAsDataFrame$default$2")() 94 | ) 95 | return DataFrame(df, self.spark) 96 | 97 | def getMetricsAsJson(self): 98 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsJson 99 | jf = jvmGetter( 100 | getattr(self.jvmFSMetricsRepoLoader, 101 | "getSuccessMetricsAsJson$default$1")() 102 | ) 103 | return jf 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/pydeequ/profiler.py: -------------------------------------------------------------------------------- 1 | from pydeequ.exceptions import JavaClassNotFoundException 2 | import pydeequ.jvm_conversions as jc 3 | import pdb 4 | 5 | class ColumnProfilerRunBuilder: 6 | """ 7 | Builds profiling runner. 8 | """ 9 | def __init__(self, dataFrame): 10 | """ 11 | Args: 12 | dataFrame (pyspark.sql.dataframe.DataFrame): 13 | """ 14 | self._sc = dataFrame._sc 15 | self._dataFrame = dataFrame 16 | run_builder = self._jvm.com.amazon.deequ \ 17 | .profiles.ColumnProfilerRunBuilder 18 | self.jvmColumnProfilerRunBuilder = run_builder( 19 | self._dataFrame._jdf 20 | ) 21 | 22 | @property 23 | def _jvm(self): 24 | return self._sc._jvm 25 | 26 | @property 27 | def dataFrame(self): 28 | return self._dataFrame 29 | 30 | def run(self): 31 | result = self.jvmColumnProfilerRunBuilder.run() 32 | 33 | seqColumnProfiles = result.profiles().values().toSeq() 34 | jf = result.toJson( 35 | seqColumnProfiles 36 | ) 37 | 38 | return jf 39 | 40 | class ColumnProfilerRunner(): 41 | """ 42 | Responsible for running data profiling. 43 | """ 44 | def onData(self, dataFrame): 45 | """ 46 | Starting point to construct a profiling runner. 
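        Example (illustrative sketch, mirroring src/pydeequ/examples/profiler_example.py;
        assumes a DataFrame `df` created from a SparkSession with the deequ jar on
        the classpath):

            import json
            from pydeequ.profiler import ColumnProfilerRunner

            profile_json = ColumnProfilerRunner().onData(df).run()
            print(json.dumps(json.loads(profile_json), indent=4))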
47 | 
48 |         Args:
49 |             dataFrame (pyspark.sql.dataframe.DataFrame): the DataFrame to profile.
50 |         """
51 |         return ColumnProfilerRunBuilder(dataFrame)
52 | 
53 | 
--------------------------------------------------------------------------------
/src/pydeequ/suggestions.py:
--------------------------------------------------------------------------------
1 | import py4j.java_gateway as jg
2 | 
3 | from pydeequ.exceptions import JavaClassNotFoundException
4 | import pydeequ.jvm_conversions as jc
5 | 
6 | 
7 | class Rules:
8 |     """
9 |     Constraint suggestion rules, wrapping com.amazon.deequ.suggestions.rules.
10 |     """
11 | 
12 |     def __init__(self, spark, _jvmRule):
13 |         self.spark = spark
14 |         self._jvmRule = _jvmRule
15 | 
16 |     @property
17 |     def _jvm(self):
18 |         return self.spark.sparkContext._jvm
19 | 
20 |     @classmethod
21 |     def CompleteIfCompleteRule(cls, spark):
22 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CompleteIfCompleteRule
23 |         return cls(spark, _jvmRule)
24 | 
25 |     @classmethod
26 |     def RetainCompletenessRule(cls, spark):
27 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainCompletenessRule
28 |         return cls(spark, _jvmRule)
29 | 
30 |     @classmethod
31 |     def RetainTypeRule(cls, spark):
32 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainTypeRule
33 |         return cls(spark, _jvmRule)
34 | 
35 |     @classmethod
36 |     def CategoricalRangeRule(cls, spark):
37 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CategoricalRangeRule
38 |         return cls(spark, _jvmRule)
39 | 
40 |     @classmethod
41 |     def FractionalCategoricalRangeRule(cls, spark):
42 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule
43 |         return cls(spark, _jvmRule)
44 | 
45 |     @classmethod
46 |     def NonNegativeNumbersRule(cls, spark):
47 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.NonNegativeNumbersRule
48 |         return cls(spark, _jvmRule)
--------------------------------------------------------------------------------
/tests/integration/test_analyzers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql import DataFrame, Row
5 | 
6 | from pydeequ.base import AnalysisRunner
7 | from pydeequ.examples import test_data
8 | from pydeequ import analyzers
9 | 
10 | class AnalysisRunnerTest(unittest.TestCase):
11 | 
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         cls.spark = (SparkSession
15 |                      .builder
16 |                      .master('local[*]')
17 |                      .config('spark.jars.packages',
18 |                              'com.amazon.deequ:deequ:1.0.5')
19 |                      .appName('pytest-pyspark-local-testing')
20 |                      .getOrCreate())
21 |         cls.df = cls.spark.createDataFrame(test_data)
22 |         cls.runner = AnalysisRunner(cls.spark)
23 | 
24 |     @classmethod
25 |     def tearDownClass(cls):
26 |         cls.spark.sparkContext._gateway.close()
27 |         cls.spark.stop()
28 | 
29 |     def test_ApproxCountDistinct(self):
30 |         out = self.runner.onData(self.df) \
31 |             .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \
32 |             .run().successMetricsAsDataFrame()
33 |         out = out.select('value').collect()
34 |         self.assertEqual(out, [Row(value=5.0)])
35 | 
36 |     def test_ApproxQuantile(self):
37 |         out = self.runner.onData(self.df) \
38 |             .addAnalyzer(analyzers.ApproxQuantile('_2', 0.75)) \
39 |             .run().successMetricsAsDataFrame()
40 |         out = out.select('value').collect()
41 |         self.assertEqual(out, [Row(value=20)])
42 | 
43 |     def test_Completeness(self):
44 |         out = self.runner.onData(self.df) \
45 |             .addAnalyzer(analyzers.Completeness('_2')) \
46 |             .run().successMetricsAsDataFrame()
47 |         out = out.select('value').collect()
48 |         self.assertEqual(out, [Row(value=0.75)])
49 | 
50 |     def test_Compliance(self):
51 |         out = self.runner.onData(self.df) \
52 |             .addAnalyzer(analyzers.Compliance('top _2', '_2 > 15')) \
53 |             .run().successMetricsAsDataFrame()
54 |         out = out.select('value').collect()
55 |         self.assertEqual(out, [Row(value=0.25)])
56 | 
57 |     def test_Correlation(self):
58 |         out = self.runner.onData(self.df) \
59 |             .addAnalyzer(analyzers.Correlation('_2', '_5')) \
60 |             .run().successMetricsAsDataFrame()
61 |         out = out.select('value').collect()
62 |         self.assertLess(out, [Row(value=-0.8)])
63 | 
64 |     def test_CountDistinct(self):
65 |         out = self.runner.onData(self.df) \
66 |             .addAnalyzer(analyzers.CountDistinct('_3')) \
67 |             .run().successMetricsAsDataFrame()
68 |         out = out.select('value').collect()
69 |         self.assertEqual(out, [Row(value=3)])
70 | 
71 |     def test_DataType(self):
72 |         out = self.runner.onData(self.df) \
73 |             .addAnalyzer(analyzers.DataType('_3')) \
74 |             .run().successMetricsAsDataFrame()
75 |         out = out.select('value').collect()
76 |         self.assertEqual(out, [Row(value=5.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=8.0), Row(value=1.0)])
77 | 
78 |     def test_Distinctness(self):
79 |         out = self.runner.onData(self.df) \
80 |             .addAnalyzer(analyzers.Distinctness('_3')) \
81 |             .run().successMetricsAsDataFrame()
82 |         out = out.select('value').collect()
83 |         self.assertEqual(out, [Row(value=0.375)])
84 | 
85 |     def test_Entropy(self):
86 |         out = self.runner.onData(self.df) \
87 |             .addAnalyzer(analyzers.Entropy('_3')) \
88 |             .run().successMetricsAsDataFrame()
89 |         out = out.select('value').collect()
90 |         self.assertGreater(out, [Row(value=1)])
91 | 
92 |     def test_Histogram(self):
93 |         out = self.runner.onData(self.df) \
94 |             .addAnalyzer(analyzers.Histogram('_3')) \
95 |             .run().successMetricsAsDataFrame()
96 |         out = out.select('value').collect()
97 |         self.assertEqual(out, [Row(value=3.0), Row(value=4.0), Row(value=0.5), Row(value=2.0), Row(value=0.25), Row(value=2.0), Row(value=0.25)])
98 | 
99 |     def test_Maximum(self):
100 |         out = self.runner.onData(self.df) \
101 |             .addAnalyzer(analyzers.Maximum('_2')) \
102 |             .run().successMetricsAsDataFrame()
103 |         out = out.select('value').collect()
104 |         self.assertEqual(out, [Row(value=20)])
105 | 
106 |     def test_MaxLength(self):
107 |         out = self.runner.onData(self.df) \
108 |             .addAnalyzer(analyzers.MaxLength('_1')) \
109 |             .run().successMetricsAsDataFrame()
110 |         out = out.select('value').collect()
111 |         self.assertEqual(out, [Row(value=6)])
112 | 
113 |     def test_Mean(self):
114 |         out = self.runner.onData(self.df) \
115 |             .addAnalyzer(analyzers.Mean('_2')) \
116 |             .run().successMetricsAsDataFrame()
117 |         out = out.select('value').collect()
118 |         self.assertEqual(out, [Row(value=11)])
119 | 
120 |     def test_Minimum(self):
121 |         out = self.runner.onData(self.df) \
122 |             .addAnalyzer(analyzers.Minimum('_2')) \
123 |             .run().successMetricsAsDataFrame()
124 |         out = out.select('value').collect()
125 |         self.assertEqual(out, [Row(value=1)])
126 | 
127 |     def test_MinLength(self):
128 |         out = self.runner.onData(self.df) \
129 |             .addAnalyzer(analyzers.MinLength('_1')) \
130 |             .run().successMetricsAsDataFrame()
131 |         out = out.select('value').collect()
132 |         self.assertEqual(out, [Row(value=6)])
133 | 
134 |     def test_MutualInformation(self):
135 |         out = self.runner.onData(self.df) \
136 |             .addAnalyzer(analyzers.MutualInformation(['_1', '_3'])) \
137 |             .run().successMetricsAsDataFrame()
138 |         out = out.select('value').collect()
139 |         self.assertGreater(out, [Row(value=0.5)])
140 | 
141 |     def test_Size(self):
142 |         out = self.runner.onData(self.df) \
143 |             .addAnalyzer(analyzers.Size()) \
144 |             .run().successMetricsAsDataFrame()
145 |         out = out.select('value').collect()
146 |         self.assertEqual(out, [Row(value=8)])
147 | 
148 |     def test_StandardDeviation(self):
149 |         out = self.runner.onData(self.df) \
150 |             .addAnalyzer(analyzers.StandardDeviation('_2')) \
151 |             .run().successMetricsAsDataFrame()
152 |         out = out.select('value').collect()
153 |         self.assertGreater(out, [Row(value=7)])
154 | 
155 |     def test_Sum(self):
156 |         out = self.runner.onData(self.df) \
157 |             .addAnalyzer(analyzers.Sum('_2')) \
158 |             .run().successMetricsAsDataFrame()
159 |         out = out.select('value').collect()
160 |         self.assertGreater(out, [Row(value=10)])
161 | 
162 |     def test_Uniqueness(self):
163 |         out = self.runner.onData(self.df) \
164 |             .addAnalyzer(analyzers.Uniqueness(['_1'])) \
165 |             .run().successMetricsAsDataFrame()
166 |         out = out.select('value').collect()
167 |         self.assertEqual(out, [Row(value=0.375)])
168 | 
169 |     def test_UniqueValueRatio(self):
170 |         out = self.runner.onData(self.df) \
171 |             .addAnalyzer(analyzers.UniqueValueRatio(['_1'])) \
172 |             .run().successMetricsAsDataFrame()
173 |         out = out.select('value').collect()
174 |         self.assertEqual(out, [Row(value=0.6)])
175 | 
176 | if __name__ == '__main__':
177 |     unittest.main()
178 | 
--------------------------------------------------------------------------------
/tests/integration/test_constraints.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession, DataFrame, Row
4 | 
5 | from pydeequ.base import VerificationSuite
6 | from pydeequ.checks import Check
7 | from pydeequ.examples import test_data
8 | 
9 | class ConstraintTest(unittest.TestCase):
10 | 
11 |     @classmethod
12 |     def setUpClass(cls):
13 |         cls.spark = (SparkSession
14 |                      .builder
15 |                      .master('local[*]')
16 |                      .config('spark.jars.packages',
17 |                              'com.amazon.deequ:deequ:1.0.5')
18 |                      .appName('pytest-pyspark-local-testing')
19 |                      .getOrCreate())
20 |         cls.df = cls.spark.createDataFrame(test_data)
21 |         cls.suite = VerificationSuite(cls.spark)
22 |         cls.success = Row(constraint_status='Success')
23 |         cls.failure = Row(constraint_status='Failure')
24 | 
25 |     @classmethod
26 |     def tearDownClass(cls):
27 |         cls.spark.sparkContext._gateway.close()
28 |         cls.spark.stop()
29 | 
30 |     def test_hasSize(self):
31 |         chk = Check(self.spark) \
32 |             .hasSize(lambda x: x == 8)
33 |         out = self.suite.onData(self.df).addCheck(chk).run()
34 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
35 |         self.assertEqual(out, [self.success])
36 | 
37 |     def test_isUnique(self):
38 |         chk = Check(self.spark) \
39 |             .isUnique('_1')
40 |         out = self.suite.onData(self.df).addCheck(chk).run()
41 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
42 |         self.assertEqual(out, [self.failure])
43 | 
44 |     def test_hasCompleteness(self):
45 |         chk = Check(self.spark) \
46 |             .hasCompleteness('_2', lambda x: x >= 0.75)
47 |         out = self.suite.onData(self.df).addCheck(chk).run()
48 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
49 |         self.assertEqual(out, [self.success])
50 | 
51 |     def test_hasUniqueness(self):
52 |         chk = Check(self.spark) \
53 |             .hasUniqueness('_1', lambda x: x == 3/8)
54 |         out = self.suite.onData(self.df).addCheck(chk).run()
55 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
56 |         self.assertEqual(out, [self.success])
57 | 
58 |     def test_hasDistinctness(self):
59 |         chk = Check(self.spark) \
60 |             .hasDistinctness('_1', lambda x: x == 5/8)
61 |         out = self.suite.onData(self.df).addCheck(chk).run()
62 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
63 |         self.assertEqual(out, [self.success])
64 | 
65 |     def test_hasUniqueValueRatio(self):
66 |         chk = Check(self.spark) \
67 |             .hasUniqueValueRatio('_2', lambda x: x == 0.8)
68 |         out = self.suite.onData(self.df).addCheck(chk).run()
69 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
70 |         self.assertEqual(out, [self.success])
71 | 
72 |     def test_hasNumberOfDistinctValues(self):
73 |         chk = Check(self.spark) \
74 |             .hasNumberOfDistinctValues('_2', lambda x: x == 6)
75 |         out = self.suite.onData(self.df).addCheck(chk).run()
76 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
77 |         self.assertEqual(out, [self.success])
78 | 
79 |     # .hasHistogram
80 | 
81 |     def test_hasEntropy(self):
82 |         chk = Check(self.spark) \
83 |             .hasEntropy('_3', lambda x: x > 1)
84 |         out = self.suite.onData(self.df).addCheck(chk).run()
85 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
86 |         self.assertEqual(out, [self.success])
87 | 
88 |     # .hasMutualInformation
89 | 
90 |     def test_hasApproxQuantile(self):
91 |         chk = Check(self.spark) \
92 |             .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
93 |         out = self.suite.onData(self.df).addCheck(chk).run()
94 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
95 |         self.assertEqual(out, [self.success])
96 | 
97 |     def test_hasMinLength(self):
98 |         chk = Check(self.spark) \
99 |             .hasMinLength('_1', lambda x: x == 6)
100 |         out = self.suite.onData(self.df).addCheck(chk).run()
101 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
102 |         self.assertEqual(out, [self.success])
103 | 
104 |     def test_hasMaxLength(self):
105 |         chk = Check(self.spark) \
106 |             .hasMaxLength('_3', lambda x: x == 10)
107 |         out = self.suite.onData(self.df).addCheck(chk).run()
108 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
109 |         self.assertEqual(out, [self.success])
110 | 
111 |     def test_hasMin(self):
112 |         chk = Check(self.spark) \
113 |             .hasMin('_2', lambda x: x == 1)
114 |         out = self.suite.onData(self.df).addCheck(chk).run()
115 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
116 |         self.assertEqual(out, [self.success])
117 | 
118 |     def test_hasMax(self):
119 |         chk = Check(self.spark) \
120 |             .hasMax('_2', lambda x: x == 20)
121 |         out = self.suite.onData(self.df).addCheck(chk).run()
122 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
123 |         self.assertEqual(out, [self.success])
124 | 
125 |     def test_hasMean(self):
126 |         chk = Check(self.spark) \
127 |             .hasMean('_2', lambda x: x > 10)
128 |         out = self.suite.onData(self.df).addCheck(chk).run()
129 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
130 |         self.assertEqual(out, [self.success])
131 | 
132 |     def test_hasSum(self):
133 |         chk = Check(self.spark) \
134 |             .hasSum('_2', lambda x: x > 50)
135 |         out = self.suite.onData(self.df).addCheck(chk).run()
136 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
137 |         self.assertEqual(out, [self.success])
138 | 
139 |     def test_hasStandardDeviation(self):
140 |         chk = Check(self.spark) \
141 |             .hasStandardDeviation('_2', lambda x: x > 5)
142 |         out = self.suite.onData(self.df).addCheck(chk).run()
143 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
144 |         self.assertEqual(out, [self.success])
145 | 
146 |     def test_hasApproxCountDistinct(self):
147 |         chk = Check(self.spark) \
148 |             .hasApproxCountDistinct('_2', lambda x: x == 5)
149 |         out = self.suite.onData(self.df).addCheck(chk).run()
150 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
151 |         self.assertEqual(out, [self.success])
152 | 
153 |     def test_hasCorrelation(self):
154 |         chk = Check(self.spark) \
155 |             .hasCorrelation('_2', '_2', lambda x: x == 1)
156 |         out = self.suite.onData(self.df).addCheck(chk).run()
157 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
158 |         self.assertEqual(out, [self.success])
159 | 
160 |     def test_satisfies(self):
161 |         chk = Check(self.spark) \
162 |             .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
163 |         out = self.suite.onData(self.df).addCheck(chk).run()
164 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
165 |         self.assertEqual(out, [self.success])
166 | 
167 | 
168 |     # .hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
169 |     # .hasDataType("_1", "string", lambda x: x == 1)
170 | 
171 |     def test_isPositive(self):
172 |         chk = Check(self.spark) \
173 |             .isPositive('_2')
174 |         out = self.suite.onData(self.df).addCheck(chk).run()
175 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
176 |         self.assertEqual(out, [self.success])
177 | 
178 |     def test_isNonNegative(self):
179 |         chk = Check(self.spark) \
180 |             .isNonNegative('_2')
181 |         out = self.suite.onData(self.df).addCheck(chk).run()
182 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
183 |         self.assertEqual(out, [self.success])
184 | 
185 |     def test_isLessThan(self):
186 |         chk = Check(self.spark) \
187 |             .isLessThan('_5', '_2', lambda x: x == 0.375)
188 |         out = self.suite.onData(self.df).addCheck(chk).run()
189 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
190 |         self.assertEqual(out, [self.success])
191 | 
192 |     def test_isLessThanOrEqualTo(self):
193 |         chk = Check(self.spark) \
194 |             .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
195 |         out = self.suite.onData(self.df).addCheck(chk).run()
196 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
197 |         self.assertEqual(out, [self.success])
198 | 
199 |     def test_isGreaterThan(self):
200 |         chk = Check(self.spark) \
201 |             .isGreaterThan('_5', '_2', lambda x: x == 0.125)
202 |         out = self.suite.onData(self.df).addCheck(chk).run()
203 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
204 |         self.assertEqual(out, [self.success])
205 | 
206 |     def test_isGreaterThanOrEqualTo(self):
207 |         chk = Check(self.spark) \
208 |             .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
209 |         out = self.suite.onData(self.df).addCheck(chk).run()
210 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
211 |         self.assertEqual(out, [self.success])
212 | 
213 |     # .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
214 | 
215 |     def test_isInInterval(self):
216 |         chk = Check(self.spark) \
217 |             .isInInterval('_5', 1.0, 50.0)
218 |         out = self.suite.onData(self.df).addCheck(chk).run()
219 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
220 |         self.assertEqual(out, [self.success])
221 | 
222 | if __name__ == '__main__':
223 |     unittest.main()
224 | 
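The constraint tests above all follow one pattern: build a Check with a single constraint and an assertion lambda, attach it to a VerificationSuite run, and wrap the returned JVM result in a PySpark DataFrame to read constraint_status. The standalone sketch below restates that pattern outside the unittest harness; the toy rows, column names, and threshold are illustrative stand-ins (the tests themselves use pydeequ.examples.test_data), not part of the repository.

# Minimal usage sketch of the Check / VerificationSuite API exercised above.
# Assumes only what the tests assume: pydeequ importable and the deequ 1.0.5
# jar resolvable through spark.jars.packages.
from pyspark.sql import SparkSession, DataFrame, Row

from pydeequ.base import VerificationSuite
from pydeequ.checks import Check

spark = (SparkSession
         .builder
         .master('local[*]')
         .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
         .appName('pydeequ-check-sketch')
         .getOrCreate())

# Hypothetical toy data standing in for pydeequ.examples.test_data.
df = spark.createDataFrame([
    Row(_1='thingA', _2=13.0, _3='IN_TRANSIT'),
    Row(_1='thingB', _2=5.0, _3='DELAYED'),
])

# One constraint per Check, exactly as in the tests.
chk = Check(spark).hasCompleteness('_2', lambda x: x >= 0.75)

# run() hands back a JVM object; wrapping it in a PySpark DataFrame exposes
# the constraint_status column the tests assert on.
out = VerificationSuite(spark).onData(df).addCheck(chk).run()
DataFrame(out, spark).select('constraint_status').show()

spark.stop()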
--------------------------------------------------------------------------------
/tests/integration/test_runners.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession
4 | 
5 | from pydeequ.base import VerificationSuite, AnalysisRunner, ConstraintSuggestionRunner
6 | from pydeequ.profiler import ColumnProfilerRunner
7 | from pydeequ.examples import test_data
8 | 
9 | class VerificationTest(unittest.TestCase):
10 | 
11 |     @classmethod
12 |     def setUpClass(cls):
13 |         cls.spark = (SparkSession
14 |                      .builder
15 |                      .master('local[*]')
16 |                      .config('spark.jars.packages',
17 |                              'com.amazon.deequ:deequ:1.0.5')
18 |                      .appName('pytest-pyspark-local-testing')
19 |                      .getOrCreate())
20 |         cls.df = cls.spark.createDataFrame(test_data)
21 | 
22 |     @classmethod
23 |     def tearDownClass(cls):
24 |         cls.spark.sparkContext._gateway.close()
25 |         cls.spark.stop()
26 | 
27 |     def test_VerificationSuiteArgs(self):
28 |         suiterunner = VerificationSuite(self.spark).onData(self.df)
29 |         # check dataframe prop
30 |         self.assertEqual(suiterunner.dataFrame.columns,
31 |                          ['_1', '_2', '_3', '_4', '_5']
32 |                          )
33 |         # check _jsparkSession prop
34 |         self.assertEqual(suiterunner._jsparkSession.getClass().toString(),
35 |                          'class org.apache.spark.sql.SparkSession'
36 |                          )
37 |         # check _jvm prop
38 |         self.assertEqual(suiterunner._jvm,
39 |                          self.spark.sparkContext._jvm
40 |                          )
41 |         # check jvmVerificationRunBuilder
42 |         self.assertEqual(suiterunner.jvmVerificationRunBuilder.getClass().toString(),
43 |                          "class com.amazon.deequ.VerificationRunBuilder"
44 |                          )
45 | 
46 |     def test_AnalyzerRunnerArgs(self):
47 |         runner = AnalysisRunner(self.spark).onData(self.df)
48 |         # check dataframe prop
49 |         self.assertEqual(runner.dataFrame.columns,
50 |                          ['_1', '_2', '_3', '_4', '_5']
51 |                          )
52 |         # check _jsparkSession prop
53 |         self.assertEqual(runner._jsparkSession.getClass().toString(),
54 |                          'class org.apache.spark.sql.SparkSession'
55 |                          )
56 |         # check _jvm prop
57 |         self.assertEqual(runner._jvm,
58 |                          self.spark.sparkContext._jvm
59 |                          )
60 |         # check jvmAnalysisRunBuilder
61 |         self.assertEqual(runner.jvmAnalysisRunBuilder.getClass().toString(),
62 |                          "class com.amazon.deequ.analyzers.runners.AnalysisRunBuilder"
63 |                          )
64 | 
65 |     def test_ProfilerRunnerArgs(self):
66 |         profilerrunner = ColumnProfilerRunner().onData(self.df)
67 |         # check dataframe prop
68 |         self.assertEqual(profilerrunner.dataFrame.columns,
69 |                          ['_1', '_2', '_3', '_4', '_5']
70 |                          )
71 |         # check _jvm prop
72 |         self.assertEqual(profilerrunner._jvm,
73 |                          self.spark.sparkContext._jvm
74 |                          )
75 |         # check jvmColumnProfilerRunBuilder
76 |         self.assertEqual(profilerrunner.jvmColumnProfilerRunBuilder.getClass().toString(),
77 |                          "class com.amazon.deequ.profiles.ColumnProfilerRunBuilder"
78 |                          )
79 | 
80 |     def test_SuggestionRunnerArgs(self):
81 |         suggestionrunner = ConstraintSuggestionRunner(self.spark).onData(self.df)
82 |         # check dataframe prop
83 |         self.assertEqual(suggestionrunner.dataFrame.columns,
84 |                          ['_1', '_2', '_3', '_4', '_5']
85 |                          )
86 |         # check _jvm prop
87 |         self.assertEqual(suggestionrunner._jvm,
88 |                          self.spark.sparkContext._jvm
89 |                          )
90 |         # check jvmConstraintSuggestionRunBuilder
91 |         self.assertEqual(suggestionrunner.jvmConstraintSuggestionRunBuilder.getClass().toString(),
92 |                          "class com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder"
93 |                          )
94 | 
95 | if __name__ == '__main__':
96 |     unittest.main()
97 | 
98 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox configuration file
2 | # Read more under https://tox.readthedocs.org/
3 | # THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
4 | 
5 | [tox]
6 | minversion = 2.4
7 | envlist = default
8 | 
9 | [testenv]
10 | setenv = TOXINIDIR = {toxinidir}
11 | passenv =
12 |     HOME
13 | commands =
14 |     py.test {posargs}
15 | extras =
16 |     all
17 |     testing
18 | 
--------------------------------------------------------------------------------
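On the metrics side, the analyzer tests reduce to the same fluent shape: AnalysisRunner(spark).onData(df), one or more addAnalyzer calls, then run().successMetricsAsDataFrame(). The sketch below mirrors that; the toy rows are illustrative, and chaining two addAnalyzer calls assumes the builder returns itself (the tests above only ever chain one analyzer before run(), so treat that as an assumption rather than documented behaviour).

# Analyzer usage sketch; same Spark and deequ setup assumptions as the tests.
from pyspark.sql import SparkSession, Row

from pydeequ.base import AnalysisRunner
from pydeequ import analyzers

spark = (SparkSession
         .builder
         .master('local[*]')
         .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
         .appName('pydeequ-analyzer-sketch')
         .getOrCreate())

# Hypothetical toy data standing in for pydeequ.examples.test_data.
df = spark.createDataFrame([
    Row(_1='thingA', _2=13.0),
    Row(_1='thingB', _2=None),
    Row(_1='thingC', _2=20.0),
])

metrics = (AnalysisRunner(spark)
           .onData(df)
           .addAnalyzer(analyzers.Size())               # row count
           .addAnalyzer(analyzers.Completeness('_2'))   # fraction of non-null _2 values
           .run()
           .successMetricsAsDataFrame())

metrics.show(truncate=False)
spark.stop()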