├── .coveragerc
├── .gitignore
├── LICENSE.txt
├── README.md
├── coverage.svg
├── docs
│   ├── Makefile
│   ├── _static
│   │   └── .gitignore
│   ├── authors.rst
│   ├── changelog.rst
│   ├── conf.py
│   ├── index.rst
│   └── license.rst
├── requirements.txt
├── setup.cfg
├── setup.py
├── src
│   └── pydeequ
│       ├── __init__.py
│       ├── analyzers.py
│       ├── base.py
│       ├── checks.py
│       ├── examples
│       │   ├── __init__.py
│       │   ├── analyzer_example.py
│       │   ├── basic_usage.py
│       │   ├── basic_usage2.py
│       │   ├── metrics_repo.py
│       │   ├── profiler_example.py
│       │   └── suggestions_example.py
│       ├── exceptions.py
│       ├── jvm_conversions.py
│       ├── metricsrepo.py
│       ├── profiler.py
│       └── suggestions.py
├── tests
│   └── integration
│       ├── test_analyzers.py
│       ├── test_constraints.py
│       └── test_runners.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | # .coveragerc to control coverage.py
2 | [run]
3 | branch = True
4 | source = pydeequ
5 | omit = src/pydeequ/examples/*
6 |
7 | [paths]
8 | source =
9 | src/
10 | */site-packages/
11 |
12 | [report]
13 | # Regexes for lines to exclude from consideration
14 | exclude_lines =
15 | # Have to re-enable the standard pragma
16 | pragma: no cover
17 |
18 | # Don't complain about missing debug-only code:
19 | def __repr__
20 | if self\.debug
21 |
22 | # Don't complain if tests don't hit defensive assertion code:
23 | raise AssertionError
24 | raise NotImplementedError
25 |
26 | # Don't complain if non-runnable code isn't run:
27 | if 0:
28 | if __name__ == .__main__.:
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Temporary and binary files
2 | *~
3 | *.py[cod]
4 | *.so
5 | *.cfg
6 | !.isort.cfg
7 | !setup.cfg
8 | *.orig
9 | *.log
10 | *.pot
11 | __pycache__/*
12 | .cache/*
13 | .*.swp
14 | */.ipynb_checkpoints/*
15 | .DS_Store
16 | metastore_db/
17 | spark-warehouse/
18 | .vscode
19 |
20 | # Project files
21 | .ropeproject
22 | .project
23 | .pydevproject
24 | .settings
25 | .idea
26 | tags
27 |
28 | # Package files
29 | *.egg
30 | *.eggs/
31 | .installed.cfg
32 | *.egg-info
33 |
34 | # Unittest and coverage
35 | htmlcov/*
36 | .coverage
37 | .tox
38 | junit.xml
39 | coverage.xml
40 | .pytest_cache/
41 |
42 | # Build and docs folder/files
43 | build/*
44 | dist/*
45 | sdist/*
46 | docs/api/*
47 | docs/_rst/*
48 | docs/_build/*
49 | cover/*
50 | MANIFEST
51 |
52 | # Per-project virtualenvs
53 | .venv*/
54 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | __This repository will be merged into the official [awslabs/python-deequ](https://github.com/awslabs/python-deequ) project. Please fork and contribute to that project instead, as many of the issues in this version of pydeequ have already been solved there.__
2 |
--------------------------------------------------------------------------------
/coverage.svg:
--------------------------------------------------------------------------------
(coverage badge SVG; markup not captured in this dump)
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = ../build/sphinx/
9 | AUTODOCDIR = api
10 | AUTODOCBUILD = sphinx-apidoc
11 | PROJECT = pydeequ
12 | MODULEDIR = ../src/pydeequ
13 |
14 | # User-friendly check for sphinx-build
15 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
16 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
17 | endif
18 |
19 | # Internal variables.
20 | PAPEROPT_a4 = -D latex_paper_size=a4
21 | PAPEROPT_letter = -D latex_paper_size=letter
22 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
23 | # the i18n builder cannot share the environment and doctrees with the others
24 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
25 |
26 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext doc-requirements
27 |
28 | help:
29 | @echo "Please use \`make <target>' where <target> is one of"
30 | @echo " html to make standalone HTML files"
31 | @echo " dirhtml to make HTML files named index.html in directories"
32 | @echo " singlehtml to make a single large HTML file"
33 | @echo " pickle to make pickle files"
34 | @echo " json to make JSON files"
35 | @echo " htmlhelp to make HTML files and a HTML help project"
36 | @echo " qthelp to make HTML files and a qthelp project"
37 | @echo " devhelp to make HTML files and a Devhelp project"
38 | @echo " epub to make an epub"
39 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
40 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
41 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
42 | @echo " text to make text files"
43 | @echo " man to make manual pages"
44 | @echo " texinfo to make Texinfo files"
45 | @echo " info to make Texinfo files and run them through makeinfo"
46 | @echo " gettext to make PO message catalogs"
47 | @echo " changes to make an overview of all changed/added/deprecated items"
48 | @echo " xml to make Docutils-native XML files"
49 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
50 | @echo " linkcheck to check all external links for integrity"
51 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
52 |
53 | clean:
54 | rm -rf $(BUILDDIR)/* $(AUTODOCDIR)
55 |
56 | $(AUTODOCDIR): $(MODULEDIR)
57 | mkdir -p $@
58 | $(AUTODOCBUILD) -f -o $@ $^
59 |
60 | doc-requirements: $(AUTODOCDIR)
61 |
62 | html: doc-requirements
63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
64 | @echo
65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
66 |
67 | dirhtml: doc-requirements
68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
69 | @echo
70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
71 |
72 | singlehtml: doc-requirements
73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
74 | @echo
75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
76 |
77 | pickle: doc-requirements
78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
79 | @echo
80 | @echo "Build finished; now you can process the pickle files."
81 |
82 | json: doc-requirements
83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
84 | @echo
85 | @echo "Build finished; now you can process the JSON files."
86 |
87 | htmlhelp: doc-requirements
88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
89 | @echo
90 | @echo "Build finished; now you can run HTML Help Workshop with the" \
91 | ".hhp project file in $(BUILDDIR)/htmlhelp."
92 |
93 | qthelp: doc-requirements
94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
95 | @echo
96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/$(PROJECT).qhcp"
99 | @echo "To view the help file:"
100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/$(PROJECT).qhc"
101 |
102 | devhelp: doc-requirements
103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | @echo
105 | @echo "Build finished."
106 | @echo "To view the help file:"
107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/$(PROJECT)"
108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/$(PROJECT)"
109 | @echo "# devhelp"
110 |
111 | epub: doc-requirements
112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | @echo
114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 |
116 | patch-latex:
117 | find _build/latex -iname "*.tex" | xargs -- \
118 | sed -i'' 's~includegraphics{~includegraphics\[keepaspectratio,max size={\\textwidth}{\\textheight}\]{~g'
119 |
120 | latex: doc-requirements
121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
122 | $(MAKE) patch-latex
123 | @echo
124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
126 | "(use \`make latexpdf' here to do that automatically)."
127 |
128 | latexpdf: doc-requirements
129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
130 | $(MAKE) patch-latex
131 | @echo "Running LaTeX files through pdflatex..."
132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
134 |
135 | latexpdfja: doc-requirements
136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | @echo "Running LaTeX files through platex and dvipdfmx..."
138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 |
141 | text: doc-requirements
142 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
143 | @echo
144 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
145 |
146 | man: doc-requirements
147 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
148 | @echo
149 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
150 |
151 | texinfo: doc-requirements
152 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
153 | @echo
154 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
155 | @echo "Run \`make' in that directory to run these through makeinfo" \
156 | "(use \`make info' here to do that automatically)."
157 |
158 | info: doc-requirements
159 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
160 | @echo "Running Texinfo files through makeinfo..."
161 | make -C $(BUILDDIR)/texinfo info
162 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
163 |
164 | gettext: doc-requirements
165 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
166 | @echo
167 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
168 |
169 | changes: doc-requirements
170 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
171 | @echo
172 | @echo "The overview file is in $(BUILDDIR)/changes."
173 |
174 | linkcheck: doc-requirements
175 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
176 | @echo
177 | @echo "Link check complete; look for any errors in the above output " \
178 | "or in $(BUILDDIR)/linkcheck/output.txt."
179 |
180 | doctest: doc-requirements
181 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
182 | @echo "Testing of doctests in the sources finished, look at the " \
183 | "results in $(BUILDDIR)/doctest/output.txt."
184 |
185 | xml: doc-requirements
186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
187 | @echo
188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
189 |
190 | pseudoxml: doc-requirements
191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
192 | @echo
193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
194 |
--------------------------------------------------------------------------------
/docs/_static/.gitignore:
--------------------------------------------------------------------------------
1 | # Empty directory
2 |
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. _authors:
2 | .. include:: ../AUTHORS.rst
3 |
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | .. _changes:
2 | .. include:: ../CHANGELOG.rst
3 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is execfile()d with the current directory set to its containing dir.
4 | #
5 | # Note that not all possible configuration values are present in this
6 | # autogenerated file.
7 | #
8 | # All configuration values have a default; values that are commented out
9 | # serve to show the default.
10 |
11 | import os
12 | import sys
13 | import inspect
14 | import shutil
15 |
16 | __location__ = os.path.join(os.getcwd(), os.path.dirname(
17 | inspect.getfile(inspect.currentframe())))
18 |
19 | # If extensions (or modules to document with autodoc) are in another directory,
20 | # add these directories to sys.path here. If the directory is relative to the
21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
22 | sys.path.insert(0, os.path.join(__location__, '../src'))
23 |
24 | # -- Run sphinx-apidoc ------------------------------------------------------
25 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running
26 | # `sphinx-build -b html . _build/html`. See Issue:
27 | # https://github.com/rtfd/readthedocs.org/issues/1139
28 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using
29 | # setup.py install" in the RTD Advanced Settings.
30 | # Additionally it helps us to avoid running apidoc manually
31 |
32 | try: # for Sphinx >= 1.7
33 | from sphinx.ext import apidoc
34 | except ImportError:
35 | from sphinx import apidoc
36 |
37 | output_dir = os.path.join(__location__, "api")
38 | module_dir = os.path.join(__location__, "../src/pydeequ")
39 | try:
40 | shutil.rmtree(output_dir)
41 | except FileNotFoundError:
42 | pass
43 |
44 | try:
45 | import sphinx
46 | from pkg_resources import parse_version
47 |
48 | cmd_line_template = "sphinx-apidoc -f -o {outputdir} {moduledir}"
49 | cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir)
50 |
51 | args = cmd_line.split(" ")
52 | if parse_version(sphinx.__version__) >= parse_version('1.7'):
53 | args = args[1:]
54 |
55 | apidoc.main(args)
56 | except Exception as e:
57 | print("Running `sphinx-apidoc` failed!\n{}".format(e))
58 |
59 | # -- General configuration -----------------------------------------------------
60 |
61 | # If your documentation needs a minimal Sphinx version, state it here.
62 | # needs_sphinx = '1.0'
63 |
64 | # Add any Sphinx extension module names here, as strings. They can be extensions
65 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
66 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo',
67 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage',
68 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax',
69 | 'sphinx.ext.napoleon']
70 |
71 | # Add any paths that contain templates here, relative to this directory.
72 | templates_path = ['_templates']
73 |
74 | # The suffix of source filenames.
75 | source_suffix = '.rst'
76 |
77 | # The encoding of source files.
78 | # source_encoding = 'utf-8-sig'
79 |
80 | # The master toctree document.
81 | master_doc = 'index'
82 |
83 | # General information about the project.
84 | project = u'pydeequ'
85 | copyright = u'2020, margitai.i'
86 |
87 | # The version info for the project you're documenting, acts as replacement for
88 | # |version| and |release|, also used in various other places throughout the
89 | # built documents.
90 | #
91 | # The short X.Y version.
92 | version = '' # Is set by calling `setup.py docs`
93 | # The full version, including alpha/beta/rc tags.
94 | release = '' # Is set by calling `setup.py docs`
95 |
96 | # The language for content autogenerated by Sphinx. Refer to documentation
97 | # for a list of supported languages.
98 | # language = None
99 |
100 | # There are two options for replacing |today|: either, you set today to some
101 | # non-false value, then it is used:
102 | # today = ''
103 | # Else, today_fmt is used as the format for a strftime call.
104 | # today_fmt = '%B %d, %Y'
105 |
106 | # List of patterns, relative to source directory, that match files and
107 | # directories to ignore when looking for source files.
108 | exclude_patterns = ['_build']
109 |
110 | # The reST default role (used for this markup: `text`) to use for all documents.
111 | # default_role = None
112 |
113 | # If true, '()' will be appended to :func: etc. cross-reference text.
114 | # add_function_parentheses = True
115 |
116 | # If true, the current module name will be prepended to all description
117 | # unit titles (such as .. function::).
118 | # add_module_names = True
119 |
120 | # If true, sectionauthor and moduleauthor directives will be shown in the
121 | # output. They are ignored by default.
122 | # show_authors = False
123 |
124 | # The name of the Pygments (syntax highlighting) style to use.
125 | pygments_style = 'sphinx'
126 |
127 | # A list of ignored prefixes for module index sorting.
128 | # modindex_common_prefix = []
129 |
130 | # If true, keep warnings as "system message" paragraphs in the built documents.
131 | # keep_warnings = False
132 |
133 |
134 | # -- Options for HTML output ---------------------------------------------------
135 |
136 | # The theme to use for HTML and HTML Help pages. See the documentation for
137 | # a list of builtin themes.
138 | html_theme = 'alabaster'
139 |
140 | # Theme options are theme-specific and customize the look and feel of a theme
141 | # further. For a list of options available for each theme, see the
142 | # documentation.
143 | html_theme_options = {
144 | 'sidebar_width': '300px',
145 | 'page_width': '1200px'
146 | }
147 |
148 | # Add any paths that contain custom themes here, relative to this directory.
149 | # html_theme_path = []
150 |
151 | # The name for this set of Sphinx documents. If None, it defaults to
152 | # " v documentation".
153 | try:
154 | from pydeequ import __version__ as version
155 | except ImportError:
156 | pass
157 | else:
158 | release = version
159 |
160 | # A shorter title for the navigation bar. Default is the same as html_title.
161 | # html_short_title = None
162 |
163 | # The name of an image file (relative to this directory) to place at the top
164 | # of the sidebar.
165 | # html_logo = ""
166 |
167 | # The name of an image file (within the static path) to use as favicon of the
168 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
169 | # pixels large.
170 | # html_favicon = None
171 |
172 | # Add any paths that contain custom static files (such as style sheets) here,
173 | # relative to this directory. They are copied after the builtin static files,
174 | # so a file named "default.css" will overwrite the builtin "default.css".
175 | html_static_path = ['_static']
176 |
177 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
178 | # using the given strftime format.
179 | # html_last_updated_fmt = '%b %d, %Y'
180 |
181 | # If true, SmartyPants will be used to convert quotes and dashes to
182 | # typographically correct entities.
183 | # html_use_smartypants = True
184 |
185 | # Custom sidebar templates, maps document names to template names.
186 | # html_sidebars = {}
187 |
188 | # Additional templates that should be rendered to pages, maps page names to
189 | # template names.
190 | # html_additional_pages = {}
191 |
192 | # If false, no module index is generated.
193 | # html_domain_indices = True
194 |
195 | # If false, no index is generated.
196 | # html_use_index = True
197 |
198 | # If true, the index is split into individual pages for each letter.
199 | # html_split_index = False
200 |
201 | # If true, links to the reST sources are added to the pages.
202 | # html_show_sourcelink = True
203 |
204 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
205 | # html_show_sphinx = True
206 |
207 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
208 | # html_show_copyright = True
209 |
210 | # If true, an OpenSearch description file will be output, and all pages will
211 | # contain a tag referring to it. The value of this option must be the
212 | # base URL from which the finished HTML is served.
213 | # html_use_opensearch = ''
214 |
215 | # This is the file name suffix for HTML files (e.g. ".xhtml").
216 | # html_file_suffix = None
217 |
218 | # Output file base name for HTML help builder.
219 | htmlhelp_basename = 'pydeequ-doc'
220 |
221 |
222 | # -- Options for LaTeX output --------------------------------------------------
223 |
224 | latex_elements = {
225 | # The paper size ('letterpaper' or 'a4paper').
226 | # 'papersize': 'letterpaper',
227 |
228 | # The font size ('10pt', '11pt' or '12pt').
229 | # 'pointsize': '10pt',
230 |
231 | # Additional stuff for the LaTeX preamble.
232 | # 'preamble': '',
233 | }
234 |
235 | # Grouping the document tree into LaTeX files. List of tuples
236 | # (source start file, target name, title, author, documentclass [howto/manual]).
237 | latex_documents = [
238 | ('index', 'user_guide.tex', u'pydeequ Documentation',
239 | u'margitai.i', 'manual'),
240 | ]
241 |
242 | # The name of an image file (relative to this directory) to place at the top of
243 | # the title page.
244 | # latex_logo = ""
245 |
246 | # For "manual" documents, if this is true, then toplevel headings are parts,
247 | # not chapters.
248 | # latex_use_parts = False
249 |
250 | # If true, show page references after internal links.
251 | # latex_show_pagerefs = False
252 |
253 | # If true, show URL addresses after external links.
254 | # latex_show_urls = False
255 |
256 | # Documents to append as an appendix to all manuals.
257 | # latex_appendices = []
258 |
259 | # If false, no module index is generated.
260 | # latex_domain_indices = True
261 |
262 | # -- External mapping ------------------------------------------------------------
263 | python_version = '.'.join(map(str, sys.version_info[0:2]))
264 | intersphinx_mapping = {
265 | 'sphinx': ('http://www.sphinx-doc.org/en/stable', None),
266 | 'python': ('https://docs.python.org/' + python_version, None),
267 | 'matplotlib': ('https://matplotlib.org', None),
268 | 'numpy': ('https://docs.scipy.org/doc/numpy', None),
269 | 'sklearn': ('http://scikit-learn.org/stable', None),
270 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None),
271 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None),
272 | }
273 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | =======
2 | pydeequ
3 | =======
4 |
5 | This is the documentation of **pydeequ**.
6 |
7 | .. note::
8 |
9 | This is the main page of your project's `Sphinx`_ documentation.
10 | It is formatted in `reStructuredText`_. Add additional pages
11 | by creating rst-files in ``docs`` and adding them to the `toctree`_ below.
12 | Then use `references`_ to link them from this page, e.g.
13 | :ref:`authors` and :ref:`changes`.
14 |
15 | It is also possible to refer to the documentation of other Python packages
16 | with the `Python domain syntax`_. By default you can reference the
17 | documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_,
18 | `Pandas`_, `Scikit-Learn`_. You can add more by extending the
19 | ``intersphinx_mapping`` in your Sphinx's ``conf.py``.
20 |
21 | The pretty useful extension `autodoc`_ is activated by default and lets
22 | you include documentation from docstrings. Docstrings can be written in
23 | `Google style`_ (recommended!), `NumPy style`_ and `classical style`_.
24 |
25 |
26 | Contents
27 | ========
28 |
29 | .. toctree::
30 | :maxdepth: 2
31 |
32 | License
33 | Authors
34 | Changelog
35 | Module Reference
36 |
37 |
38 | Indices and tables
39 | ==================
40 |
41 | * :ref:`genindex`
42 | * :ref:`modindex`
43 | * :ref:`search`
44 |
45 | .. _toctree: http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html
46 | .. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
47 | .. _references: http://www.sphinx-doc.org/en/stable/markup/inline.html
48 | .. _Python domain syntax: http://sphinx-doc.org/domains.html#the-python-domain
49 | .. _Sphinx: http://www.sphinx-doc.org/
50 | .. _Python: http://docs.python.org/
51 | .. _Numpy: http://docs.scipy.org/doc/numpy
52 | .. _SciPy: http://docs.scipy.org/doc/scipy/reference/
53 | .. _matplotlib: https://matplotlib.org/contents.html#
54 | .. _Pandas: http://pandas.pydata.org/pandas-docs/stable
55 | .. _Scikit-Learn: http://scikit-learn.org/stable
56 | .. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html
57 | .. _Google style: https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings
58 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html
59 | .. _classical style: http://www.sphinx-doc.org/en/stable/domains.html#info-field-lists
60 |
--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | .. _license:
2 |
3 | =======
4 | License
5 | =======
6 |
7 | .. include:: ../LICENSE.txt
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # =============================================================================
2 | # DEPRECATION WARNING:
3 | #
4 | # The file `requirements.txt` does not influence the package dependencies and
5 | # will not be automatically created in the next version of PyScaffold (v4.x).
6 | #
7 | # Please have a look at the docs for better alternatives
8 | # (`Dependency Management` section).
9 | # =============================================================================
10 | #
11 | # Add your pinned requirements so that they can be easily installed with:
12 | # pip install -r requirements.txt
13 | # Remember to also add them in setup.cfg but unpinned.
14 | # Example:
15 | # numpy==1.13.3
16 | # scipy==1.0
17 | #
18 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # This file is used to configure your project.
2 | # Read more about the various options under:
3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files
4 |
5 | [metadata]
6 | name = pydeequ
7 | description = Python API for Deequ
8 | author = Istvan Margitai
9 | author-email = margitai.i@gmail.com
10 | license = apache
11 | long-description = file: README.md
12 | long-description-content-type = text/markdown; charset=UTF-8
13 | url = https://github.com/margitaii/pydeequ
14 | #project-urls =
15 | # Documentation = https://pyscaffold.org/
16 | # Change if running only on Windows, Mac or Linux (comma-separated)
17 | platforms = any
18 | # Add here all kinds of additional classifiers as defined under
19 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers
20 | classifiers =
21 | Development Status :: 4 - Beta
22 | Programming Language :: Python
23 |
24 | [options]
25 | zip_safe = False
26 | packages = find:
27 | include_package_data = True
28 | package_dir =
29 | =src
30 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
31 | setup_requires = pyscaffold>=3.2a0,<3.3a0
32 | # Add here dependencies of your project (semicolon/line-separated), e.g.
33 | # install_requires = numpy; scipy
34 | # The usage of test_requires is discouraged, see `Dependency Management` docs
35 | # tests_require = pytest; pytest-cov
36 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4
37 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
38 |
39 | [options.packages.find]
40 | where = src
41 | exclude =
42 | tests
43 |
44 | [options.extras_require]
45 | # Add here additional requirements for extra features, to install with:
46 | # `pip install pydeequ[PDF]` like:
47 | # PDF = ReportLab; RXP
48 | # Add here test requirements (semicolon/line-separated)
49 | testing =
50 | pytest
51 | pytest-cov
52 |
53 | [options.entry_points]
54 | # Add here console scripts like:
55 | # console_scripts =
56 | # script_name = pydeequ.module:function
57 | # For example:
58 | # console_scripts =
59 | # fibonacci = pydeequ.skeleton:run
60 | # And any other entry points, for example:
61 | # pyscaffold.cli =
62 | # awesome = pyscaffoldext.awesome.extension:AwesomeExtension
63 |
64 | [test]
65 | # py.test options when running `python setup.py test`
66 | # addopts = --verbose
67 | extras = True
68 |
69 | [tool:pytest]
70 | # Options for py.test:
71 | # Specify command line options as you would do when invoking py.test directly.
72 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
73 | # in order to write a coverage file that can be read by Jenkins.
74 | addopts =
75 | --cov pydeequ --cov-report term-missing
76 | --verbose
77 | norecursedirs =
78 | dist
79 | build
80 | .tox
81 | testpaths = tests
82 |
83 | [aliases]
84 | dists = bdist_wheel
85 |
86 | [bdist_wheel]
87 | # Use this option if your package is pure-python
88 | universal = 1
89 |
90 | [build_sphinx]
91 | source_dir = docs
92 | build_dir = build/sphinx
93 |
94 | [devpi:upload]
95 | # Options for the devpi: PyPI server and packaging tool
96 | # VCS export must be deactivated since we are using setuptools-scm
97 | no-vcs = 1
98 | formats = bdist_wheel
99 |
100 | [flake8]
101 | # Some sane defaults for the code style checker flake8
102 | exclude =
103 | .tox
104 | build
105 | dist
106 | .eggs
107 | docs/conf.py
108 |
109 | [pyscaffold]
110 | # PyScaffold's parameters when the project was created.
111 | # This will be used when updating. Do not change!
112 | version = 3.2.3
113 | package = pydeequ
114 | extensions =
115 | tox
116 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Setup file for pydeequ.
4 | Use setup.cfg to configure your project.
5 |
6 | This file was generated with PyScaffold 3.2.3.
7 | PyScaffold helps you to put up the scaffold of your new Python project.
8 | Learn more under: https://pyscaffold.org/
9 | """
10 | import sys
11 | import os
12 |
13 | from pkg_resources import VersionConflict, require
14 | from setuptools import setup
15 |
16 | try:
17 | require('setuptools>=38.3')
18 | except VersionConflict:
19 | print("Error: version of setuptools is too old (<38.3)!")
20 | sys.exit(1)
21 |
22 | def setup_package():
23 | needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv)
24 | sphinx = ['sphinx'] if needs_sphinx else []
25 |
26 | setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx,
27 | use_pyscaffold=True)
28 |
29 | if __name__ == "__main__":
30 | setup_package()
31 |
--------------------------------------------------------------------------------
/src/pydeequ/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from pkg_resources import get_distribution, DistributionNotFound
3 |
4 | try:
5 | # Change here if project is renamed and does not equal the package name
6 | dist_name = __name__
7 | __version__ = get_distribution(dist_name).version
8 | except DistributionNotFound:
9 | __version__ = 'unknown'
10 | finally:
11 | del get_distribution, DistributionNotFound
12 |
--------------------------------------------------------------------------------
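A quick aside (not a file from the repository): the pkg_resources shim above means the package reports the installed distribution's version, or 'unknown' when pydeequ has not been installed, e.g. when the sources are only on sys.path.

# Illustration of the version shim in src/pydeequ/__init__.py
import pydeequ

print(pydeequ.__version__)  # the installed version, or 'unknown' if not installed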
/src/pydeequ/analyzers.py:
--------------------------------------------------------------------------------
1 | import py4j.java_gateway as jg
2 |
3 | from pydeequ.exceptions import JavaClassNotFoundException
4 | import pydeequ.jvm_conversions as jc
5 |
6 | class BaseAnalyzer(object):
7 | """
8 | Analyzer base class.
9 | """
10 | def set_jvm(self, jvm):
11 | self._jvm = jvm
12 | return self
13 |
14 | @property
15 | def jvmdeequAnalyzers(self):
16 | if self._jvm:
17 | return self._jvm.com.amazon.deequ.analyzers
18 | else:
19 | raise ValueError("Run set_jvm() method first.")
20 |
21 | class ApproxCountDistinct(BaseAnalyzer):
22 | """
23 | Compute approximated count distinct with HyperLogLogPlusPlus.
24 |
25 | @param column Which column to compute this aggregation on.
26 | """
27 |
28 | def __init__(self, column):
29 | self.column = column
30 |
31 | @property
32 | def jvmAnalyzer(self):
33 | return self.jvmdeequAnalyzers.ApproxCountDistinct(
34 | self.column,
35 | getattr(self.jvmdeequAnalyzers.ApproxCountDistinct, "apply$default$2")()
36 | )
37 |
38 |
39 | class ApproxQuantile(BaseAnalyzer):
40 | """
41 | Approximate quantile analyzer. The allowed relative error compared to the exact quantile can be
42 | configured with the `relativeError` parameter. A `relativeError` of 0.0 yields the exact
43 | quantile, at the cost of increased computational load.
44 |
45 | @param column Column in DataFrame for which the approximate quantile is analyzed.
46 | @param quantile Computed Quantile. Must be in the interval [0, 1], where 0.5 would be the
47 | median.
48 | @param relativeError Relative target precision to achieve in the quantile computation.
49 | Must be in the interval [0, 1].
50 | @param where Additional filter to apply before the analyzer is run.
51 | """
52 |
53 | def __init__(self, column, quantile, relativeError = 0.01):
54 | self.column = column
55 | self.quantile = quantile
56 | self.relativeError = relativeError
57 |
58 | @property
59 | def jvmAnalyzer(self):
60 | return self.jvmdeequAnalyzers.ApproxQuantile(
61 | self.column,
62 | self.quantile,
63 | self.relativeError,
64 | getattr(self.jvmdeequAnalyzers.ApproxQuantile, "apply$default$4")()
65 | )
66 |
67 | class Completeness(BaseAnalyzer):
68 | """
69 | Fraction of non-null values in a column.
70 |
71 | Args:
72 | column Column in DataFrame
73 | """
74 |
75 | def __init__(self, column):
76 | self.column = column
77 |
78 | @property
79 | def jvmAnalyzer(self):
80 | return self.jvmdeequAnalyzers.Completeness(
81 | self.column,
82 | getattr(self.jvmdeequAnalyzers.Completeness, "apply$default$2")()
83 | )
84 |
85 | class Compliance(BaseAnalyzer):
86 | """
87 | Compliance measures the fraction of rows that comply with the given column constraint.
88 | E.g. if the constraint is "att1>3" and the data frame has 5 rows with an att1 value greater than
89 | 3 and 10 rows at or below 3, a DoubleMetric with value 0.33 would be returned.
90 | @param instance Unlike other column analyzers (e.g. completeness) this analyzer cannot
91 | infer the metric instance name from the column name.
92 | Also, the constraint given here can refer to multiple columns,
93 | so a metric instance name should be provided,
94 | describing what the analysis is being done for.
95 | @param predicate SQL-predicate to apply per row
96 | @param where Additional filter to apply before the analyzer is run.
97 | """
98 | def __init__(self, instance, predicate):
99 | self.instance = instance
100 | self.predicate = predicate
101 |
102 | @property
103 | def jvmAnalyzer(self):
104 | return self.jvmdeequAnalyzers.Compliance(
105 | self.instance,
106 | self.predicate,
107 | getattr(self.jvmdeequAnalyzers.Compliance, "apply$default$3")()
108 | )
109 |
110 | class Correlation(BaseAnalyzer):
111 | """
112 | Computes the Pearson correlation coefficient between the two given columns.
113 | @param firstColumn First input column for computation
114 | @param secondColumn Second input column for computation
115 | """
116 | def __init__(self, firstColumn, secondColumn):
117 | self.firstColumn = firstColumn
118 | self.secondColumn = secondColumn
119 |
120 | @property
121 | def jvmAnalyzer(self):
122 | return self.jvmdeequAnalyzers.Correlation(
123 | self.firstColumn,
124 | self.secondColumn,
125 | getattr(self.jvmdeequAnalyzers.Correlation, "apply$default$3")()
126 | )
127 |
128 | class CountDistinct(BaseAnalyzer):
129 | """
130 | Number of distinct values
131 | """
132 | def __init__(self, column):
133 | if isinstance(column, str):
134 | self.column = [column]
135 | elif isinstance(column, list):
136 | self.column = column
137 | else:
138 | raise ValueError("'column' must be string or list of strings.")
139 |
140 | @property
141 | def jvmAnalyzer(self):
142 | return self.jvmdeequAnalyzers.CountDistinct(
143 | jc.iterable_to_scala_seq(self._jvm, self.column)
144 | )
145 |
146 | class DataType(BaseAnalyzer):
147 | """
148 | Distribution of data types such as Boolean, Fractional, Integral, and String.
149 | """
150 | def __init__(self, column):
151 | self.column = column
152 |
153 | @property
154 | def jvmAnalyzer(self):
155 | return self.jvmdeequAnalyzers.DataType(
156 | self.column,
157 | getattr(self.jvmdeequAnalyzers.DataType, "apply$default$2")()
158 | )
159 |
160 | class Distinctness(BaseAnalyzer):
161 | """
162 | Distinctness is the fraction of distinct values of a column(s).
163 | @param columns the column(s) for which to compute distinctness
164 | """
165 | def __init__(self, columns):
166 | if isinstance(columns, str):
167 | self.columns = [columns]
168 | elif isinstance(columns, list):
169 | self.columns = columns
170 | else:
171 | raise ValueError("'columns' must be string or list of strings.")
172 |
173 | @property
174 | def jvmAnalyzer(self):
175 | return self.jvmdeequAnalyzers.Distinctness(
176 | jc.iterable_to_scala_seq(self._jvm, self.columns),
177 | getattr(self.jvmdeequAnalyzers.Distinctness, "apply$default$2")()
178 | )
179 |
180 | class Entropy(BaseAnalyzer):
181 | """
182 | Entropy is a measure of the level of information contained in a message. Given the probability
183 | distribution over values in a column, it describes how many bits are required to identify a
184 | value.
185 | """
186 | def __init__(self, column):
187 | self.column = column
188 |
189 | @property
190 | def jvmAnalyzer(self):
191 | return self.jvmdeequAnalyzers.Entropy(
192 | self.column,
193 | getattr(self.jvmdeequAnalyzers.Entropy, "apply$default$2")()
194 | )
195 |
196 | class Histogram(BaseAnalyzer):
197 | """
198 | Histogram is the summary of values in a column of a DataFrame. Groups the given column's values,
199 | and calculates the number of rows with that specific value and the fraction of this value.
200 |
201 | @param column Column to do histogram analysis on
202 | """
203 | def __init__(self, column):
204 | self.column = column
205 |
206 | @property
207 | def jvmAnalyzer(self):
208 | return self.jvmdeequAnalyzers.Histogram(
209 | self.column,
210 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$2")(),
211 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$3")(),
212 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$4")()
213 | )
214 |
215 | class Maximum(BaseAnalyzer):
216 | """
217 | Maximum value.
218 | """
219 | def __init__(self, column):
220 | self.column = column
221 |
222 | @property
223 | def jvmAnalyzer(self):
224 | return self.jvmdeequAnalyzers.Maximum(
225 | self.column,
226 | getattr(self.jvmdeequAnalyzers.Maximum, "apply$default$2")()
227 | )
228 |
229 | class MaxLength(BaseAnalyzer):
230 | """Maximum length of the column's string values."""
231 |
232 | def __init__(self, column):
233 | self.column = column
234 |
235 | @property
236 | def jvmAnalyzer(self):
237 | return self.jvmdeequAnalyzers.MaxLength(
238 | self.column,
239 | getattr(self.jvmdeequAnalyzers.MaxLength, "apply$default$2")()
240 | )
241 |
242 | class Mean(BaseAnalyzer):
243 | """
244 | Mean value, null values are excluded.
245 | """
246 | def __init__(self, column):
247 | self.column = column
248 |
249 | @property
250 | def jvmAnalyzer(self):
251 | return self.jvmdeequAnalyzers.Mean(
252 | self.column,
253 | getattr(self.jvmdeequAnalyzers.Mean, "apply$default$2")()
254 | )
255 |
256 | class Minimum(BaseAnalyzer):
257 | """
258 | Minimum value.
259 | """
260 | def __init__(self, column):
261 | self.column = column
262 |
263 | @property
264 | def jvmAnalyzer(self):
265 | return self.jvmdeequAnalyzers.Minimum(
266 | self.column,
267 | getattr(self.jvmdeequAnalyzers.Minimum, "apply$default$2")()
268 | )
269 |
270 | class MinLength(BaseAnalyzer):
271 | """Minimum length of the column's string values."""
272 |
273 | def __init__(self, column):
274 | self.column = column
275 |
276 | @property
277 | def jvmAnalyzer(self):
278 | return self.jvmdeequAnalyzers.MinLength(
279 | self.column,
280 | getattr(self.jvmdeequAnalyzers.MinLength, "apply$default$2")()
281 | )
282 |
283 | class MutualInformation(BaseAnalyzer):
284 | """
285 | Mutual Information describes how much information about one column can be inferred from another
286 | column.
287 |
288 | If two columns are independent of each other, then nothing can be inferred from one column about
289 | the other, and mutual information is zero. If there is a functional dependency of one column to
290 | another and vice versa, then all of the information in the two columns is shared, and the mutual
291 | information is the entropy of each column.
292 | """
293 | def __init__(self, columns):
294 | if not isinstance(columns, list):
295 | raise ValueError("'columns' must be a list of strings.")
296 | self.columns = columns
297 |
298 | @property
299 | def jvmAnalyzer(self):
300 | return self.jvmdeequAnalyzers.MutualInformation(
301 | jc.iterable_to_scala_seq(self._jvm, self.columns),
302 | getattr(self.jvmdeequAnalyzers.MutualInformation, "apply$default$2")()
303 | )
304 |
305 | # class PatternMatch
306 |
307 | class Size(BaseAnalyzer):
308 | """
309 | Size is the number of rows in a DataFrame.
310 | """
311 | @property
312 | def jvmAnalyzer(self):
313 | return self.jvmdeequAnalyzers.Size(
314 | getattr(self.jvmdeequAnalyzers.Size, "apply$default$1")()
315 | )
316 |
317 | class StandardDeviation(BaseAnalyzer):
318 | """
319 | Standard deviation implementation.
320 | """
321 | def __init__(self, column):
322 | self.column = column
323 |
324 | @property
325 | def jvmAnalyzer(self):
326 | return self.jvmdeequAnalyzers.StandardDeviation(
327 | self.column,
328 | getattr(self.jvmdeequAnalyzers.StandardDeviation, "apply$default$2")()
329 | )
330 |
331 | class Sum(BaseAnalyzer):
332 | """Sum of all values of a numeric column."""
333 |
334 | def __init__(self, column):
335 | self.column = column
336 |
337 | @property
338 | def jvmAnalyzer(self):
339 | return self.jvmdeequAnalyzers.Sum(
340 | self.column,
341 | getattr(self.jvmdeequAnalyzers.Sum, "apply$default$2")()
342 | )
343 |
344 | class Uniqueness(BaseAnalyzer):
345 | """
346 | Fraction of unique values over the number of all values of
347 | a column. Unique values occur exactly once.
348 | Example: [a, a, b] contains one unique value b,
349 | so uniqueness is 1/3.
350 | """
351 | def __init__(self, columns):
352 | if not isinstance(columns, list):
353 | raise ValueError("'columns' must be a list of strings.")
354 | self.columns = columns
355 |
356 | @property
357 | def jvmAnalyzer(self):
358 | return self.jvmdeequAnalyzers.Uniqueness(
359 | jc.iterable_to_scala_seq(self._jvm, self.columns),
360 | getattr(self.jvmdeequAnalyzers.Uniqueness, "apply$default$2")()
361 | )
362 |
363 | class UniqueValueRatio(BaseAnalyzer):
364 | """
365 | Fraction of unique values over the number of all distinct
366 | values of a column. Unique values occur exactly once.
367 | Distinct values occur at least once.
368 | Example: [a, a, b] contains one unique value b,
369 | and two distinct values a and b, so the unique value
370 | ratio is 1/2.
371 | """
372 | def __init__(self, columns):
373 | if not isinstance(columns, list):
375 | raise ValueError("'columns' must be a list of strings.")
375 | self.columns = columns
376 |
377 | @property
378 | def jvmAnalyzer(self):
379 | return self.jvmdeequAnalyzers.UniqueValueRatio(
380 | jc.iterable_to_scala_seq(self._jvm, self.columns),
381 | getattr(self.jvmdeequAnalyzers.UniqueValueRatio, "apply$default$2")()
382 | )
383 |
384 |
--------------------------------------------------------------------------------
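A minimal usage sketch for the analyzer wrappers above (an aside, not a file from the repository). It assumes an active SparkSession with the deequ jar on the driver classpath, and the column name `review_id` is made up for illustration. Each wrapper only needs the py4j JVM handle before its `jvmAnalyzer` property can construct the underlying com.amazon.deequ analyzer:

from pyspark.sql import SparkSession

from pydeequ.analyzers import ApproxQuantile, Completeness

spark = SparkSession.builder.getOrCreate()  # deequ jar assumed to be on the classpath
jvm = spark.sparkContext._jvm

# Wire the wrappers to the JVM, then read back the deequ analyzer objects.
completeness = Completeness("review_id").set_jvm(jvm)
median = ApproxQuantile("review_id", quantile=0.5, relativeError=0.01).set_jvm(jvm)

jvm_completeness = completeness.jvmAnalyzer  # com.amazon.deequ.analyzers.Completeness
jvm_median = median.jvmAnalyzer              # com.amazon.deequ.analyzers.ApproxQuantile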
/src/pydeequ/base.py:
--------------------------------------------------------------------------------
1 | import py4j.java_gateway as jg
2 |
3 | from pyspark.sql import DataFrame
4 |
5 | from pydeequ.exceptions import JavaClassNotFoundException
6 | import pydeequ.jvm_conversions as jc
7 |
8 | class BaseWrapper(object):
9 | def __init__(self, SparkSession):
10 | self.spark = SparkSession
11 |
12 | @property
13 | def _jsparkSession(self):
14 | return self.spark._jsparkSession
15 |
16 | @property
17 | def _jvm(self):
18 | return self.spark.sparkContext._jvm
19 |
20 | @property
21 | def _gateway(self):
22 | return self.spark.sparkContext._gateway
23 |
24 | class BaseBuilder(BaseWrapper):
25 | def __init__(self, SparkSession, dataFrame):
26 | super().__init__(SparkSession)
27 | self._dataFrame = dataFrame
28 |
29 | @property
30 | def dataFrame(self):
31 | return self._dataFrame
32 |
33 | class VerificationRunBuilder(BaseBuilder):
34 | """
35 | A class to build a VerificationRun using a fluent API.
36 | """
37 | def __init__(self, SparkSession, dataFrame):
38 | """
39 | Args:
40 | SparkSession (pyspark.sql.SparkSession)
41 | dataFrame (pyspark.sql.dataframe.DataFrame)
42 | """
43 | super().__init__(SparkSession, dataFrame)
44 | run_builder = self._jvm.com.amazon.deequ.VerificationRunBuilder
45 | self.jvmVerificationRunBuilder = run_builder(
46 | self.dataFrame._jdf
47 | )
48 |
49 |
50 | def addCheck(self, check):
51 | """
52 | Add a single check to the run.
53 |
54 | Args:
55 | check (pydeequ.check.Check):
56 | A check object to be executed during the run
57 | """
58 | jvmCheck = check.jvmCheck
59 | self.jvmVerificationRunBuilder.addCheck(jvmCheck)
60 | return self
61 |
62 | def run(self):
63 | result = self.jvmVerificationRunBuilder.run()
64 |
65 | jvmVerificationResult = self._jvm.com.amazon.deequ \
66 | .VerificationResult
67 | try:
68 | df = jvmVerificationResult.checkResultsAsDataFrame(
69 | self._jsparkSession,
70 | result,
71 | getattr(jvmVerificationResult,
72 | "checkResultsAsDataFrame$default$3")()
73 | )
74 | return df
75 | except Exception:
76 | self.spark.sparkContext._gateway.close()
77 | self.spark.stop()
78 | raise AttributeError
79 |
80 | def useRepository(self, metricsRepo):
81 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \
82 | .useRepository(
83 | metricsRepo.jvmMetricsRepo
84 | )
85 | return self
86 |
87 | def saveOrAppendResult(self, resultKey):
88 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \
89 | .saveOrAppendResult(
90 | resultKey.jvmResultKey
91 | )
92 | return self
93 |
94 | class VerificationSuite(BaseWrapper):
95 | """
96 | Responsible for running checks and required analysis and return the
97 | results.
98 | """
99 | def __init__(self, SparkSession):
100 | """
101 | Args:
102 |             SparkSession (pyspark.sql.SparkSession): active SparkSession
103 | """
104 | super().__init__(SparkSession)
105 | self._start_callback_server()
106 |
107 | def _start_callback_server(self):
108 | callback = self._gateway.get_callback_server()
109 | if callback is None:
110 | self._gateway.start_callback_server()
111 | elif callback.is_shutdown:
112 | callback.close()
113 | self._gateway.restart_callback_server()
114 |
115 | def onData(self, dataFrame):
116 | """
117 | Starting point to construct a VerificationRun.
118 |
119 | Args:
120 | dataFrame (pyspark.sql.dataframe.DataFrame):
121 | spark dataFrame on which the checks will be verified.
122 | """
123 | return VerificationRunBuilder(self.spark, dataFrame)
124 |
125 | class _AnalyzerContext(BaseWrapper):
126 |     """ Wraps a JVM AnalyzerContext holding the metrics computed by an analysis run.
127 |     """
128 | def __init__(self, SparkSession, jvmAnalyzerContext):
129 | """ Initializes the AnalyzerContext python object with a JVM object.
130 |
131 | Args:
132 |             SparkSession (pyspark.sql.SparkSession): active SparkSession
133 | jvmAnalyzerContext (JavaObject):
134 | """
135 | super().__init__(SparkSession)
136 | self.jvmAnalyzerContext = jvmAnalyzerContext
137 |
138 | def successMetricsAsDataFrame(self):
139 | try:
140 | df = self.jvmAnalyzerContext.successMetricsAsDataFrame(
141 | self._jsparkSession,
142 | self.jvmAnalyzerContext,
143 | getattr(self.jvmAnalyzerContext,
144 | "successMetricsAsDataFrame$default$3")()
145 | )
146 | out = DataFrame(df, self.spark)
147 | return out
148 | except Exception:
149 | self.spark.sparkContext._gateway.close()
150 | self.spark.stop()
151 | raise AttributeError
152 |
153 | def successMetricsAsJson(self):
154 | try:
155 | jf = self.jvmAnalyzerContext.successMetricsAsJson(
156 | self.jvmAnalyzerContext,
157 | getattr(self.jvmAnalyzerContext,
158 | "successMetricsAsJson$default$2")()
159 | )
160 |
161 | return jf
162 | except Exception:
163 | self.spark.sparkContext._gateway.close()
164 | self.spark.stop()
165 | raise AttributeError
166 |
167 | class AnalysisRunBuilder(BaseBuilder):
168 | """
169 | A class to build an AnalysisRun using a fluent API.
170 | """
171 | def __init__(self, SparkSession, dataFrame):
172 | """
173 | Args:
174 | SparkSession (pyspark.sql.SparkSession)
175 | dataFrame (pyspark.sql.dataframe.DataFrame)
176 | """
177 | super().__init__(SparkSession, dataFrame)
178 | run_builder = self._jvm.com.amazon.deequ.analyzers.runners.AnalysisRunBuilder
179 | self.jvmAnalysisRunBuilder = run_builder(
180 | self.dataFrame._jdf
181 | )
182 |
183 | def addAnalyzer(self, analyzer):
184 | """
185 | Add a single analyzer to the run.
186 |
187 | Args:
188 | analyzer (pydeequ.analyzer.Analyzer):
189 | An analyzer object to be executed during the run
190 | """
191 | analyzer.set_jvm(self._jvm)
192 | jvmAnalyzer = analyzer.jvmAnalyzer
193 | self.jvmAnalysisRunBuilder.addAnalyzer(jvmAnalyzer)
194 | return self
195 |
196 | def run(self):
197 | """ Returns an AnalyzerContext python object
198 | """
199 | jvmContext = self.jvmAnalysisRunBuilder.run()
200 | return_context = _AnalyzerContext(
201 | self.spark,
202 | jvmContext)
203 | return return_context
204 |
205 | class AnalysisRunner(BaseWrapper):
206 | """
207 | Responsible for running metrics calculations.
208 | """
209 | def onData(self, dataFrame):
210 | """
211 |         Starting point to construct an AnalysisRun.
212 |
213 | Args:
214 | dataFrame (pyspark.sql.dataframe.DataFrame):
215 | spark dataFrame on which the checks will be verified.
216 | """
217 | return AnalysisRunBuilder(self.spark, dataFrame)
218 |
219 |
220 | class ConstraintSuggestionRunBuilder(BaseBuilder):
221 | """
222 | A class to build a ConstraintSuggestionRun using a fluent API.
223 | """
224 | def __init__(self, SparkSession, dataFrame):
225 | """
226 | Args:
227 | SparkSession (pyspark.sql.SparkSession)
228 | dataFrame (pyspark.sql.dataframe.DataFrame)
229 | """
230 | super().__init__(SparkSession, dataFrame)
231 | run_builder = self._jvm.com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder
232 | self.jvmConstraintSuggestionRunBuilder = run_builder(
233 | self.dataFrame._jdf
234 | )
235 |
236 | def addConstraintRule(self, constraint):
237 | """
238 | Add a single rule for suggesting constraints based on ColumnProfiles to the run.
239 |
240 | Args:
241 |             constraint (pydeequ.suggestions.Rules): rule used to suggest constraints from column profiles
242 | """
243 | jvmRule = constraint._jvmRule
244 | self.jvmConstraintSuggestionRunBuilder.addConstraintRule(jvmRule())
245 | return self
246 |
247 | def run(self):
248 | result = self.jvmConstraintSuggestionRunBuilder.run()
249 |
250 | jvmSuggestionResult = self._jvm.com.amazon.deequ \
251 | .suggestions.ConstraintSuggestionResult
252 | try:
253 | df = jvmSuggestionResult.getConstraintSuggestionsAsJson(
254 | result
255 | )
256 | return df
257 |         except Exception:
258 | self.spark.sparkContext._gateway.close()
259 | self.spark.stop()
260 | raise AttributeError
261 |
262 | class ConstraintSuggestionRunner(BaseWrapper):
263 |     """ Responsible for running constraint suggestion analysis.
264 |     """
265 | def onData(self, dataFrame):
266 | """
267 | Starting point to construct a run on constraint suggestions.
268 |
269 | Args:
270 | dataFrame (pyspark.sql.dataframe.DataFrame):
271 | spark dataFrame on which the checks will be verified.
272 | """
273 | return ConstraintSuggestionRunBuilder(self.spark, dataFrame)
274 |
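Note: a minimal end-to-end sketch of the verification flow implemented above, mirroring examples/basic_usage.py. It assumes `spark` is an existing SparkSession with the deequ jar available and `df` is any Spark DataFrame; the check description is made up.

# Illustrative sketch: VerificationSuite builds a VerificationRunBuilder,
# checks are attached, and run() returns the JVM result DataFrame, which
# callers wrap into a PySpark DataFrame (as the bundled examples do).
from pyspark.sql import DataFrame

from pydeequ.base import VerificationSuite
from pydeequ.checks import Check

raw_result = (VerificationSuite(spark)
              .onData(df)
              .addCheck(Check(spark, 'error', 'smoke test')
                        .hasSize(lambda n: n > 0))
              .run())

DataFrame(raw_result, spark).show()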
--------------------------------------------------------------------------------
/src/pydeequ/checks.py:
--------------------------------------------------------------------------------
1 |
2 | import py4j.java_gateway as jg
3 |
4 | from pydeequ.exceptions import JavaClassNotFoundException
5 | import pydeequ.jvm_conversions as jc
6 | import pdb
7 |
8 | def is_one(x):
9 |     """ Helper function for default assertions.
10 | """
11 | return x == 1
12 |
13 | class Check(object):
14 | """
15 | A class representing a list of constraints that can be applied to a given
16 |     Spark DataFrame (org.apache.spark.sql.DataFrame). In order to run the checks, use the
17 | VerificationSuite.run to run your checks along with other Checks and
18 | Analysis objects. When run with VerificationSuite, Analyzers required by
19 |     multiple checks/analysis blocks are optimized to run once.
20 | """
21 | def __init__(self, SparkSession, level='error', description=None,
22 | jvmCheck=None):
23 | """
24 | Args:
25 |             SparkSession (pyspark.sql.SparkSession): active SparkSession
26 | level (str): 'error' (default), 'warning'
27 | Assertion level of the check group. If any of the constraints
28 | fail this level is used for the status of the check
29 | description (str): The name describes the check block. Generally
30 | will be used to show in the logs
31 | """
32 | self.spark = SparkSession
33 | self._level = level
34 | self._description = description
35 | if jvmCheck:
36 | self.jvmCheck = jvmCheck
37 | else:
38 | deequ_check = self._jvm.com.amazon.deequ.checks.Check
39 | if not isinstance(deequ_check, jg.JavaClass):
40 | raise JavaClassNotFoundException("com.amazon.deequ.checks.Check")
41 | self.jvmCheck = deequ_check(
42 | self._jvm_level,
43 | self._description,
44 | getattr(deequ_check, "apply$default$3")()
45 | )
46 |
47 | @property
48 | def _jvm(self):
49 | return self.spark.sparkContext._jvm
50 |
51 | @property
52 | def level(self):
53 | return self._level
54 |
55 | @property
56 | def description(self):
57 | return self._description
58 |
59 | @property
60 | def _jvm_level(self):
61 | if self._level == 'error':
62 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Error()
63 | elif self._level == 'warning':
64 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Warning()
65 | else:
66 | raise ValueError("Invalid 'level'")
67 |
68 | def hasSize(self, assertion):
69 | """
70 | Creates a constraint that calculates the data frame size and runs the
71 | assertion on it.
72 | Args:
73 | assertion (function):
74 | Returns:
75 | checks.Check object including this constraint
76 | """
77 | function = jc.scala_function1(self.spark.sparkContext._gateway,
78 | assertion)
79 | jvmConstraint = self.jvmCheck.hasSize(
80 | function,
81 | getattr(self.jvmCheck, "hasSize$default$2")()
82 | )
83 | return Check(
84 | self.spark,
85 | self.level,
86 | self.description,
87 | jvmConstraint
88 | )
89 |
90 | def isUnique(self, column):
91 | """
92 | Creates a constraint that asserts on a column uniqueness.
93 | Args:
94 | column (str): Column to run the assertion on
95 | Returns:
96 | checks.Check object including this constraint
97 | """
98 | jvmConstraint = self.jvmCheck.isUnique(
99 | column,
100 | getattr(self.jvmCheck, "isUnique$default$2")()
101 | )
102 | return Check(
103 | self.spark,
104 | self.level,
105 | self.description,
106 | jvmConstraint
107 | )
108 |
109 | def hasCompleteness(self, column, assertion):
110 | """
111 |         Creates a constraint that asserts on a column's completeness.
112 | Uses the given history selection strategy to retrieve historical completeness values on this
113 | column from the history provider.
114 |
115 | @param column Column to run the assertion on
116 | @param assertion Function that receives a double input parameter and returns a boolean
117 | @param hint A hint to provide additional context why a constraint could have failed
118 | """
119 | function = jc.scala_function1(self.spark.sparkContext._gateway,
120 | assertion)
121 | jvmConstraint = self.jvmCheck.hasCompleteness(
122 | column,
123 | function,
124 | getattr(self.jvmCheck, "hasCompleteness$default$3")()
125 | )
126 | return Check(
127 | self.spark,
128 | self.level,
129 | self.description,
130 | jvmConstraint
131 | )
132 |
133 | def hasUniqueness(self, columns, assertion):
134 | """
135 | Creates a constraint that asserts on uniqueness in a single or combined set of key columns.
136 |
137 | @param columns Key columns
138 | @param assertion Function that receives a double input parameter and returns a boolean.
139 | Refers to the fraction of unique values
140 | @param hint A hint to provide additional context why a constraint could have failed
141 | """
142 | if (not isinstance(columns, list)):
143 | # Single column is provided
144 | columns = [columns]
145 | function = jc.scala_function1(self.spark.sparkContext._gateway,
146 | assertion)
147 | jvmConstraint = self.jvmCheck.hasUniqueness(
148 | jc.iterable_to_scala_seq(self._jvm, columns),
149 | function
150 | )
151 | return Check(
152 | self.spark,
153 | self.level,
154 | self.description,
155 | jvmConstraint
156 | )
157 |
158 | def hasDistinctness(self, columns, assertion):
159 | """
160 | Creates a constraint on the distinctness in a single or combined set of key columns.
161 |
162 | @param columns columns
163 | @param assertion Function that receives a double input parameter and returns a boolean.
164 | Refers to the fraction of distinct values.
165 | @param hint A hint to provide additional context why a constraint could have failed
166 | """
167 | if (not isinstance(columns, list)):
168 | # Single column is provided
169 | columns = [columns]
170 | function = jc.scala_function1(self.spark.sparkContext._gateway,
171 | assertion)
172 | jvmConstraint = self.jvmCheck.hasDistinctness(
173 | jc.iterable_to_scala_seq(self._jvm, columns),
174 | function,
175 | getattr(self.jvmCheck, "hasDistinctness$default$3")()
176 | )
177 | return Check(
178 | self.spark,
179 | self.level,
180 | self.description,
181 | jvmConstraint
182 | )
183 |
184 | def hasUniqueValueRatio(self, columns, assertion):
185 | """
186 | Creates a constraint on the unique value ratio in a single or combined set of key columns.
187 |
188 | @param columns columns
189 | @param assertion Function that receives a double input parameter and returns a boolean.
190 | Refers to the fraction of distinct values.
191 | @param hint A hint to provide additional context why a constraint could have failed
192 | """
193 | if (not isinstance(columns, list)):
194 | # Single column is provided
195 | columns = [columns]
196 | function = jc.scala_function1(self.spark.sparkContext._gateway,
197 | assertion)
198 | jvmConstraint = self.jvmCheck.hasUniqueValueRatio(
199 | jc.iterable_to_scala_seq(self._jvm, columns),
200 | function,
201 | getattr(self.jvmCheck, "hasUniqueValueRatio$default$3")()
202 | )
203 | return Check(
204 | self.spark,
205 | self.level,
206 | self.description,
207 | jvmConstraint
208 | )
209 |
210 | def hasNumberOfDistinctValues(self, column, assertion,
211 | binningUdf = None, maxBins = None):
212 | """
213 | Creates a constraint that asserts on the number of distinct values a column has.
214 |
215 | @param column Column to run the assertion on
216 | @param assertion Function that receives a long input parameter and returns a boolean
217 | @param binningUdf An optional binning function
218 |         @param maxBins Histogram details are only provided for the N column values with
219 |                        the top counts; maxBins sets N
220 | @param hint A hint to provide additional context why a constraint could have failed
221 | """
222 | function = jc.scala_function1(self.spark.sparkContext._gateway,
223 | assertion)
224 | jvmConstraint = self.jvmCheck.hasNumberOfDistinctValues(
225 | column,
226 | function,
227 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$3")(),
228 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$4")(),
229 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$5")()
230 | )
231 | return Check(
232 | self.spark,
233 | self.level,
234 | self.description,
235 | jvmConstraint
236 | )
237 |
238 | def hasHistogramValues(self, column, assertion,
239 | binningUdf = None, maxBins = None):
240 | """
241 | Creates a constraint that asserts on column's value distribution.
242 |
243 | @param column Column to run the assertion on
244 | @param assertion Function that receives a Distribution input parameter and returns a boolean.
245 | E.g
246 | .hasHistogramValues("att2", _.absolutes("f") == 3)
247 | .hasHistogramValues("att2",
248 | _.ratios(Histogram.NullFieldReplacement) == 2/6.0)
249 | @param binningUdf An optional binning function
250 |         @param maxBins Histogram details are only provided for the N column values with
251 |                        the top counts; maxBins sets N
252 | @param hint A hint to provide additional context why a constraint could have failed
253 | """
254 | function = jc.scala_function1(self.spark.sparkContext._gateway,
255 | assertion)
256 | jvmConstraint = self.jvmCheck.hasHistogramValues(
257 | column,
258 | function,
259 | getattr(self.jvmCheck, "hasHistogramValues$default$3")(),
260 | getattr(self.jvmCheck, "hasHistogramValues$default$4")(),
261 | getattr(self.jvmCheck, "hasHistogramValues$default$5")()
262 | )
263 | return Check(
264 | self.spark,
265 | self.level,
266 | self.description,
267 | jvmConstraint
268 | )
269 |
270 | def hasEntropy(self, column, assertion):
271 | """
272 | Creates a constraint that asserts on a column entropy.
273 |
274 | @param column Column to run the assertion on
275 | @param assertion Function that receives a double input parameter and returns a boolean
276 | @param hint A hint to provide additional context why a constraint could have failed
277 | """
278 | function = jc.scala_function1(self.spark.sparkContext._gateway,
279 | assertion)
280 | jvmConstraint = self.jvmCheck.hasEntropy(
281 | column,
282 | function,
283 | getattr(self.jvmCheck, "hasEntropy$default$3")()
284 | )
285 | return Check(
286 | self.spark,
287 | self.level,
288 | self.description,
289 | jvmConstraint
290 | )
291 |
292 | def hasMutualInformation(self, columnA, columnB, assertion):
293 | """
294 | Creates a constraint that asserts on a mutual information between two columns.
295 |
296 | @param columnA First column for mutual information calculation
297 | @param columnB Second column for mutual information calculation
298 | @param assertion Function that receives a double input parameter and returns a boolean
299 | @param hint A hint to provide additional context why a constraint could have failed
300 | """
301 | function = jc.scala_function1(self.spark.sparkContext._gateway,
302 | assertion)
303 | jvmConstraint = self.jvmCheck.hasMutualInformation(
304 | columnA,
305 | columnB,
306 | function,
307 | getattr(self.jvmCheck, "hasMutualInformation$default$4")()
308 | )
309 | return Check(
310 | self.spark,
311 | self.level,
312 | self.description,
313 | jvmConstraint
314 | )
315 |
316 | def hasApproxQuantile(self, column, quantile, assertion):
317 | """
318 | Creates a constraint that asserts on an approximated quantile
319 |
320 | @param column Column to run the assertion on
321 | @param quantile Which quantile to assert on
322 | @param assertion Function that receives a double input parameter (the computed quantile)
323 | and returns a boolean
324 | @param hint A hint to provide additional context why a constraint could have failed
325 | """
326 | function = jc.scala_function1(self.spark.sparkContext._gateway,
327 | assertion)
328 | jvmConstraint = self.jvmCheck.hasApproxQuantile(
329 | column,
330 | quantile,
331 | function,
332 | getattr(self.jvmCheck, "hasApproxQuantile$default$4")()
333 | )
334 | return Check(
335 | self.spark,
336 | self.level,
337 | self.description,
338 | jvmConstraint
339 | )
340 |
341 | def hasMinLength(self, column, assertion):
342 | """
343 | Creates a constraint that asserts on the minimum length of the column
344 |
345 | @param column Column to run the assertion on
346 | @param assertion Function that receives a double input parameter and returns a boolean
347 | @param hint A hint to provide additional context why a constraint could have failed
348 | """
349 | function = jc.scala_function1(self.spark.sparkContext._gateway,
350 | assertion)
351 | jvmConstraint = self.jvmCheck.hasMinLength(
352 | column,
353 | function,
354 | getattr(self.jvmCheck, "hasMinLength$default$3")()
355 | )
356 | return Check(
357 | self.spark,
358 | self.level,
359 | self.description,
360 | jvmConstraint
361 | )
362 |
363 |
364 | def hasMaxLength(self, column, assertion):
365 | """
366 | Creates a constraint that asserts on the maximum length of the column
367 |
368 | @param column Column to run the assertion on
369 | @param assertion Function that receives a double input parameter and returns a boolean
370 | @param hint A hint to provide additional context why a constraint could have failed
371 | """
372 | function = jc.scala_function1(self.spark.sparkContext._gateway,
373 | assertion)
374 | jvmConstraint = self.jvmCheck.hasMaxLength(
375 | column,
376 | function,
377 | getattr(self.jvmCheck, "hasMaxLength$default$3")()
378 | )
379 | return Check(
380 | self.spark,
381 | self.level,
382 | self.description,
383 | jvmConstraint
384 | )
385 |
386 | def hasMin(self, column, assertion):
387 | """
388 | Creates a constraint that asserts on the minimum of the column
389 |
390 | @param column Column to run the assertion on
391 | @param assertion Function that receives a double input parameter and returns a boolean
392 | @param hint A hint to provide additional context why a constraint could have failed
393 | """
394 | function = jc.scala_function1(self.spark.sparkContext._gateway,
395 | assertion)
396 | jvmConstraint = self.jvmCheck.hasMin(
397 | column,
398 | function,
399 | getattr(self.jvmCheck, "hasMin$default$3")()
400 | )
401 | return Check(
402 | self.spark,
403 | self.level,
404 | self.description,
405 | jvmConstraint
406 | )
407 |
408 | def hasMax(self, column, assertion):
409 | """
410 | Creates a constraint that asserts on the maximum of the column
411 |
412 | @param column Column to run the assertion on
413 | @param assertion Function that receives a double input parameter and returns a boolean
414 | @param hint A hint to provide additional context why a constraint could have failed
415 | """
416 | function = jc.scala_function1(self.spark.sparkContext._gateway,
417 | assertion)
418 | jvmConstraint = self.jvmCheck.hasMax(
419 | column,
420 | function,
421 | getattr(self.jvmCheck, "hasMax$default$3")()
422 | )
423 | return Check(
424 | self.spark,
425 | self.level,
426 | self.description,
427 | jvmConstraint
428 | )
429 |
430 | def hasMean(self, column, assertion):
431 | """
432 | Creates a constraint that asserts on the mean of the column
433 |
434 | @param column Column to run the assertion on
435 | @param assertion Function that receives a double input parameter and returns a boolean
436 | @param hint A hint to provide additional context why a constraint could have failed
437 | """
438 | function = jc.scala_function1(self.spark.sparkContext._gateway,
439 | assertion)
440 | jvmConstraint = self.jvmCheck.hasMean(
441 | column,
442 | function,
443 | getattr(self.jvmCheck, "hasMean$default$3")()
444 | )
445 | return Check(
446 | self.spark,
447 | self.level,
448 | self.description,
449 | jvmConstraint
450 | )
451 |
452 | def hasSum(self, column, assertion):
453 | """
454 | Creates a constraint that asserts on the sum of the column
455 |
456 | @param column Column to run the assertion on
457 | @param assertion Function that receives a double input parameter and returns a boolean
458 | @param hint A hint to provide additional context why a constraint could have failed
459 | """
460 | function = jc.scala_function1(self.spark.sparkContext._gateway,
461 | assertion)
462 | jvmConstraint = self.jvmCheck.hasSum(
463 | column,
464 | function,
465 | getattr(self.jvmCheck, "hasSum$default$3")()
466 | )
467 | return Check(
468 | self.spark,
469 | self.level,
470 | self.description,
471 | jvmConstraint
472 | )
473 | def hasStandardDeviation(self, column, assertion):
474 | """
475 | Creates a constraint that asserts on the standard deviation of the column
476 |
477 | @param column Column to run the assertion on
478 | @param assertion Function that receives a double input parameter and returns a boolean
479 | @param hint A hint to provide additional context why a constraint could have failed
480 | """
481 | function = jc.scala_function1(self.spark.sparkContext._gateway,
482 | assertion)
483 | jvmConstraint = self.jvmCheck.hasStandardDeviation(
484 | column,
485 | function,
486 | getattr(self.jvmCheck, "hasStandardDeviation$default$3")()
487 | )
488 | return Check(
489 | self.spark,
490 | self.level,
491 | self.description,
492 | jvmConstraint
493 | )
494 | def hasApproxCountDistinct(self, column, assertion):
495 | """
496 | Creates a constraint that asserts on the approximate count distinct of the given column
497 |
498 | @param column Column to run the assertion on
499 | @param assertion Function that receives a double input parameter and returns a boolean
500 | @param hint A hint to provide additional context why a constraint could have failed
501 | """
502 | function = jc.scala_function1(self.spark.sparkContext._gateway,
503 | assertion)
504 | jvmConstraint = self.jvmCheck.hasApproxCountDistinct(
505 | column,
506 | function,
507 | getattr(self.jvmCheck, "hasApproxCountDistinct$default$3")()
508 | )
509 | return Check(
510 | self.spark,
511 | self.level,
512 | self.description,
513 | jvmConstraint
514 | )
515 |
516 | def hasCorrelation(self, columnA, columnB, assertion):
517 | """
518 | Creates a constraint that asserts on the pearson correlation between two columns.
519 |
520 | @param columnA First column for correlation calculation
521 | @param columnB Second column for correlation calculation
522 | @param assertion Function that receives a double input parameter and returns a boolean
523 | @param hint A hint to provide additional context why a constraint could have failed
524 | """
525 | function = jc.scala_function1(self.spark.sparkContext._gateway,
526 | assertion)
527 | jvmConstraint = self.jvmCheck.hasCorrelation(
528 | columnA,
529 | columnB,
530 | function,
531 | getattr(self.jvmCheck, "hasCorrelation$default$4")()
532 | )
533 | return Check(
534 | self.spark,
535 | self.level,
536 | self.description,
537 | jvmConstraint
538 | )
539 |
540 | def satisfies(self, columnCondition, constraintName, assertion):
541 | """
542 | Creates a constraint that runs the given condition on the data frame.
543 |
544 | @param columnCondition Data frame column which is a combination of expression and the column
545 | name. It has to comply with Spark SQL syntax.
546 | Can be written in an exact same way with conditions inside the
547 |                                Can be written in exactly the same way as conditions inside the
548 | @param constraintName A name that summarizes the check being made. This name is being used to
549 | name the metrics for the analysis being done.
550 | @param assertion Function that receives a double input parameter and returns a boolean
551 | @param hint A hint to provide additional context why a constraint could have failed
552 | """
553 | function = jc.scala_function1(self.spark.sparkContext._gateway,
554 | assertion)
555 | jvmConstraint = self.jvmCheck.satisfies(
556 | columnCondition,
557 | constraintName,
558 | function,
559 | getattr(self.jvmCheck, "satisfies$default$4")()
560 | )
561 | return Check(
562 | self.spark,
563 | self.level,
564 | self.description,
565 | jvmConstraint
566 | )
567 |
568 | def hasPattern(self, column, pattern, assertion = is_one):
569 | """
570 | Checks for pattern compliance. Given a column name and a regular expression, defines a
571 | Check on the average compliance of the column's values to the regular expression.
572 |
573 | @param column Name of the column that should be checked.
574 | @param pattern The columns values will be checked for a match against this pattern.
575 | @param assertion Function that receives a double input parameter and returns a boolean
576 | @param hint A hint to provide additional context why a constraint could have failed
577 | """
578 | # function = jc.scala_function1(self.spark.sparkContext._gateway,
579 | # assertion)
580 | # pattern = jc.scala_regex(self.spark.sparkContext._gateway, pattern)
581 | # jvmConstraint = self.jvmCheck.hasPattern(
582 | # column,
583 | # pattern,
584 | # function,
585 | # getattr(self.jvmCheck, "hasPattern$default$4")(),
586 | # getattr(self.jvmCheck, "hasPattern$default$5")()
587 | # )
588 | # return Check(
589 | # self.spark,
590 | # self.level,
591 | # self.description,
592 | # jvmConstraint
593 | # )
594 | pass
595 |
596 | def hasDataType(self, column, dataType, assertion):
597 | """
598 | Check to run against the fraction of rows that conform to the given data type.
599 |
600 | @param column Name of the columns that should be checked.
601 | @param dataType Data type that the columns should be compared against.
602 | @param assertion Function that receives a double input parameter and returns a boolean
603 | @param hint A hint to provide additional context why a constraint could have failed
604 | """
605 | _jconstDataTypes = self._jvm.com.amazon.deequ.constraints.ConstrainableDataTypes
606 | dataTypes = {
607 | 'null': _jconstDataTypes.Null(),
608 | 'boolean': _jconstDataTypes.Boolean(),
609 | 'string': _jconstDataTypes.String(),
610 | 'numeric': _jconstDataTypes.Numeric(),
611 | 'fractional': _jconstDataTypes.Fractional(),
612 | 'integer': _jconstDataTypes.Integral()
613 | }
614 | function = jc.scala_function1(self.spark.sparkContext._gateway,
615 | assertion)
616 | jvmConstraint = self.jvmCheck.hasDataType(
617 | column,
618 | dataTypes[dataType],
619 | function,
620 | getattr(self.jvmCheck, "hasDataType$default$4")()
621 | )
622 | return Check(
623 | self.spark,
624 | self.level,
625 | self.description,
626 | jvmConstraint
627 | )
628 |
629 | def isPositive(self, column, assertion = is_one):
630 | """
631 | Creates a constraint that asserts that a column contains positive values
632 |
633 | @param column Column to run the assertion on
634 | @param assertion Function that receives a double input parameter and returns a boolean
635 | @param hint A hint to provide additional context why a constraint could have failed
636 | """
637 | function = jc.scala_function1(self.spark.sparkContext._gateway,
638 | assertion)
639 | jvmConstraint = self.jvmCheck.isPositive(
640 | column,
641 | function,
642 | getattr(self.jvmCheck, "isPositive$default$3")()
643 | )
644 | return Check(
645 | self.spark,
646 | self.level,
647 | self.description,
648 | jvmConstraint
649 | )
650 |
651 |
652 | def isNonNegative(self, column, assertion = is_one):
653 | """
654 | Creates a constraint that asserts that a column contains no negative values
655 |
656 | @param column Column to run the assertion on
657 | @param assertion Function that receives a double input parameter and returns a boolean
658 | @param hint A hint to provide additional context why a constraint could have failed
659 | """
660 | function = jc.scala_function1(self.spark.sparkContext._gateway,
661 | assertion)
662 | jvmConstraint = self.jvmCheck.isNonNegative(
663 | column,
664 | function,
665 | getattr(self.jvmCheck, "isNonNegative$default$3")()
666 | )
667 | return Check(
668 | self.spark,
669 | self.level,
670 | self.description,
671 | jvmConstraint
672 | )
673 |
674 | def isLessThan(self, columnA, columnB, assertion = is_one):
675 | """
676 | Asserts that, in each row, the value of columnA is less than the value of columnB
677 |
678 | @param columnA Column to run the assertion on
679 | @param columnB Column to run the assertion on
680 | @param assertion Function that receives a double input parameter and returns a boolean
681 | @param hint A hint to provide additional context why a constraint could have failed
682 | """
683 | function = jc.scala_function1(self.spark.sparkContext._gateway,
684 | assertion)
685 | jvmConstraint = self.jvmCheck.isLessThan(
686 | columnA,
687 | columnB,
688 | function,
689 | getattr(self.jvmCheck, "isLessThan$default$4")()
690 | )
691 | return Check(
692 | self.spark,
693 | self.level,
694 | self.description,
695 | jvmConstraint
696 | )
697 |
698 | def isLessThanOrEqualTo(self, columnA, columnB, assertion = is_one):
699 | """
700 | Asserts that, in each row, the value of columnA is less than or equal to the value of columnB
701 |
702 | @param columnA Column to run the assertion on
703 | @param columnB Column to run the assertion on
704 | @param assertion Function that receives a double input parameter and returns a boolean
705 | @param hint A hint to provide additional context why a constraint could have failed
706 | """
707 | function = jc.scala_function1(self.spark.sparkContext._gateway,
708 | assertion)
709 | jvmConstraint = self.jvmCheck.isLessThanOrEqualTo(
710 | columnA,
711 | columnB,
712 | function,
713 | getattr(self.jvmCheck, "isLessThanOrEqualTo$default$4")()
714 | )
715 | return Check(
716 | self.spark,
717 | self.level,
718 | self.description,
719 | jvmConstraint
720 | )
721 |
722 | def isGreaterThan(self, columnA, columnB, assertion = is_one):
723 | """
724 | Asserts that, in each row, the value of columnA is greater than the value of columnB
725 |
726 | @param columnA Column to run the assertion on
727 | @param columnB Column to run the assertion on
728 | @param assertion Function that receives a double input parameter and returns a boolean
729 | @param hint A hint to provide additional context why a constraint could have failed
730 | """
731 | function = jc.scala_function1(self.spark.sparkContext._gateway,
732 | assertion)
733 | jvmConstraint = self.jvmCheck.isGreaterThan(
734 | columnA,
735 | columnB,
736 | function,
737 | getattr(self.jvmCheck, "isGreaterThan$default$4")()
738 | )
739 | return Check(
740 | self.spark,
741 | self.level,
742 | self.description,
743 | jvmConstraint
744 | )
745 |
746 | def isGreaterThanOrEqualTo(self, columnA, columnB, assertion = is_one):
747 | """
748 |         Asserts that, in each row, the value of columnA is greater than or equal to the value of
749 | columnB
750 |
751 | @param columnA Column to run the assertion on
752 | @param columnB Column to run the assertion on
753 | @param assertion Function that receives a double input parameter and returns a boolean
754 | @param hint A hint to provide additional context why a constraint could have failed
755 | """
756 | function = jc.scala_function1(self.spark.sparkContext._gateway,
757 | assertion)
758 | jvmConstraint = self.jvmCheck.isGreaterThanOrEqualTo(
759 | columnA,
760 | columnB,
761 | function,
762 | getattr(self.jvmCheck, "isGreaterThanOrEqualTo$default$4")()
763 | )
764 | return Check(
765 | self.spark,
766 | self.level,
767 | self.description,
768 | jvmConstraint
769 | )
770 |
771 | def isContainedIn(self, column, allowedValues, assertion = is_one):
772 | """
773 | Asserts that every non-null value in a column is contained in a set of predefined values
774 |
775 | @param column Column to run the assertion on
776 | @param allowedValues Allowed values for the column
777 | @param assertion Function that receives a double input parameter and returns a boolean
778 | @param hint A hint to provide additional context why a constraint could have failed
779 | """
780 |         if not isinstance(allowedValues, list):
781 | raise ValueError("'allowedValues' must be a list of strings.")
782 | function = jc.scala_function1(self.spark.sparkContext._gateway,
783 | assertion)
784 | scalaArray = jc.iterable_to_scala_array(self._jvm, allowedValues)
785 | jvmConstraint = self.jvmCheck.isContainedIn(
786 | column,
787 | scalaArray,
788 | function,
789 | getattr(self.jvmCheck, "isContainedIn$default$6")()
790 | )
791 | return Check(
792 | self.spark,
793 | self.level,
794 | self.description,
795 | jvmConstraint
796 | )
797 |
798 | def isInInterval(self,
799 | column,
800 | lowerBound,
801 | upperBound,
802 | includeLowerBound = True,
803 | includeUpperBound = True):
804 | """
805 | Asserts that the non-null values in a numeric column fall into the predefined interval
806 |
807 |         @param column Column to run the assertion on
808 |         @param lowerBound lower bound of the interval
809 |         @param upperBound upper bound of the interval
810 |         @param includeLowerBound is a value equal to the lower bound allowed?
811 | @param includeUpperBound is a value equal to the upper bound allowed?
812 | @param hint A hint to provide additional context why a constraint could have failed
813 | """
814 | jvmConstraint = self.jvmCheck.isContainedIn(
815 | column,
816 | lowerBound,
817 | upperBound,
818 | includeLowerBound,
819 | includeUpperBound,
820 | getattr(self.jvmCheck, "isContainedIn$default$6")()
821 | )
822 | return Check(
823 | self.spark,
824 | self.level,
825 | self.description,
826 | jvmConstraint
827 | )
828 |
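Note: every constraint method above returns a new Check wrapping the accumulated JVM constraints, so calls chain freely, and each assertion is a plain Python callable that receives the computed metric (usually a float). A minimal sketch, assuming `spark` is a SparkSession with deequ available and `df` has columns 'id' and 'status' (hypothetical names):

# Illustrative sketch of chaining constraints on a Check.
from pyspark.sql import DataFrame

from pydeequ.base import VerificationSuite
from pydeequ.checks import Check

check = (Check(spark, level='warning', description='example checks')
         .isUnique('id')                                 # built-in assertion: fraction == 1
         .hasCompleteness('status', lambda f: f >= 0.9)  # assertion receives a float
         .isContainedIn('status', ['OPEN', 'CLOSED']))   # allowedValues must be a list

result = VerificationSuite(spark).onData(df).addCheck(check).run()
DataFrame(result, spark).show()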
--------------------------------------------------------------------------------
/src/pydeequ/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | test_data = [("thingA", 13.0, "IN_TRANSIT", "true", 5.0),
3 | ("thingA", 5.0, "DELAYED", "false", 20.0),
4 | ("thingB", None, "DELAYED", None, 12.0),
5 | ("thingC", None, "IN_TRANSIT", "false", 2.0),
6 | ("thingD", 1.0, "DELAYED", "true", None),
7 | ("thingC", 7.0, "UNKNOWN", None, None),
8 | ("thingC", 20.0, "UNKNOWN", None, 3.5),
9 | ("thingE", 20.0, "DELAYED", "false", 8.2)]
10 |
11 |
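Note: the examples and tests pass test_data to createDataFrame without a schema, so PySpark assigns the default column names _1 through _5 that the checks and analyzers reference. The sketch below is only illustrative; the explicit names are hypothetical and not used anywhere in the package.

# Illustrative sketch: naming the example columns explicitly.
from pyspark.sql import SparkSession

from pydeequ.examples import test_data

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame(
    test_data,
    ['item', 'amount', 'status', 'flag', 'weight'])  # hypothetical names; the examples use _1.._5
df.printSchema()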
--------------------------------------------------------------------------------
/src/pydeequ/examples/analyzer_example.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession, DataFrame
2 |
3 | from pydeequ.base import AnalysisRunner
4 | import pydeequ.analyzers as analyzers
5 | from pydeequ.examples import test_data
6 |
7 | def main():
8 | # SparkSession startup
9 | spark = (SparkSession
10 | .builder
11 | .master('local[*]')
12 | .config('spark.jars.packages',
13 | 'com.amazon.deequ:deequ:1.0.5')
14 |              .appName('analyzer-example')
15 | .getOrCreate())
16 | df = spark.createDataFrame(test_data)
17 |
18 | r = AnalysisRunner(spark) \
19 | .onData(df) \
20 | .addAnalyzer(analyzers.Size()) \
21 | .addAnalyzer(analyzers.Completeness('_3')) \
22 | .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \
23 | .addAnalyzer(analyzers.Mean('_2')) \
24 | .addAnalyzer(analyzers.Compliance('top values', '_2 > 15')) \
25 | .addAnalyzer(analyzers.Correlation('_2', '_5')) \
26 | .run()
27 |
28 | df = DataFrame(r, spark)
29 | df.show(df.count(), False)
30 |
31 | # SparkSession and Java Gateway teardown
32 | spark.sparkContext._gateway.close()
33 | spark.stop()
34 |
35 | if __name__ == "__main__":
36 | main()
--------------------------------------------------------------------------------
/src/pydeequ/examples/basic_usage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from pyspark.sql import SparkSession, DataFrame
4 |
5 | from pydeequ.base import VerificationSuite
6 | from pydeequ.checks import Check
7 | from pydeequ.examples import test_data
8 |
9 | def main():
10 | # SparkSession startup
11 | spark = (SparkSession
12 | .builder
13 | .master('local[*]')
14 | .config('spark.jars.packages',
15 | 'com.amazon.deequ:deequ:1.0.5')
16 | .appName('constrain-example')
17 | .getOrCreate())
18 | df = spark.createDataFrame(test_data)
19 |
20 |     # Constraint verification
21 | r = (VerificationSuite(spark)
22 | .onData(df)
23 | .addCheck(Check(spark, 'error', 'examples')
24 | .hasSize(lambda x: x == 8)
25 | .isUnique('_2')
26 | .hasCompleteness('_2', lambda x: x >= 0.75)
27 | .hasUniqueness('_1', lambda x: x == 3/8)
28 | .hasDistinctness('_1', lambda x: x == 5/8)
29 | .hasUniqueValueRatio('_2', lambda x: x == 0.8)
30 | .hasNumberOfDistinctValues('_2', lambda x: x == 6)
31 | #.hasHistogram
32 | .hasEntropy('_3', lambda x: x > 1)
33 | #.hasMutualInformation('_2', '_3', lambda x: x > 0.5)
34 | .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
35 | .hasMinLength('_1', lambda x: x == 6)
36 | .hasMaxLength('_3', lambda x: x == 10)
37 | .hasMin('_2', lambda x: x == 1)
38 | .hasMax('_2', lambda x: x == 20)
39 | .hasMean('_2', lambda x: x > 10)
40 | .hasSum('_2', lambda x: x > 50)
41 | .hasStandardDeviation('_2', lambda x: x > 5)
42 | .hasApproxCountDistinct('_2', lambda x: x == 5)
43 | .hasCorrelation('_2', '_5', lambda x: x == 1)
44 | .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
45 | #.hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
46 | #.hasDataType("_1", "string", lambda x: x == 1)
47 | .isPositive('_2')
48 | .isNonNegative('_2')
49 | .isLessThan('_5', '_2', lambda x: x == 0.375)
50 | .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
51 | .isGreaterThan('_5', '_2', lambda x: x == 0.125)
52 | .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
53 | #.isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
54 | .isInInterval('_5', 1.0, 50.0)
55 | )
56 | .run()
57 | )
58 | df = DataFrame(r, spark)
59 | df.show(df.count(), False)
60 |
61 | # SparkSession and Java Gateway teardown
62 | spark.sparkContext._gateway.close()
63 | spark.stop()
64 |
65 | if __name__ == '__main__':
66 | main()
67 |
--------------------------------------------------------------------------------
/src/pydeequ/examples/basic_usage2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from pyspark.sql import SparkSession, DataFrame
4 |
5 | from pydeequ.base import VerificationSuite
6 | from pydeequ.checks import Check
7 | from pydeequ.examples import test_data
8 |
9 | def main():
10 | # SparkSession startup
11 | spark = (SparkSession
12 | .builder
13 | .master('local[*]')
14 | .config('spark.jars.packages',
15 | 'com.amazon.deequ:deequ:1.0.5')
16 | .appName('constrain-example')
17 | .getOrCreate())
18 | df = spark.createDataFrame(test_data)
19 | df.show()
20 | print(df._jdf.__doc__)
21 |
22 | #spark.stop()
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/src/pydeequ/examples/metrics_repo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pyspark.sql import SparkSession
3 | from pydeequ.examples import test_data
4 | from pydeequ.base import AnalysisRunner, VerificationSuite
5 | import pydeequ.analyzers as analyzers
6 | from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository
7 | from pydeequ.checks import Check
8 |
9 | def main():
10 | # SparkSession startup
11 | spark = (SparkSession
12 | .builder
13 | .master('local[*]')
14 | .config('spark.jars.packages',
15 | 'com.amazon.deequ:deequ:1.0.5')
16 | .appName('suggestions-example')
17 | .getOrCreate())
18 | df = spark.createDataFrame(test_data)
19 | # Analysis run
20 | a = (AnalysisRunner(spark)
21 | .onData(df)
22 |         .addAnalyzer(analyzers.Size())
23 |         .run())
24 | key = ResultKey(spark, 100000, {'key1': 'value1'})
25 | myrepo = FileSystemMetricsRepository(spark, '../test.json')
26 | myrepo.save(key, a)
27 |
28 | # Verification run
29 |     key2 = ResultKey(spark, 100000, {'key1': 'value2', 'key2': 'value3'})
30 |
31 |
32 |     v = (VerificationSuite(spark)
33 | .onData(df)
34 | .addCheck(Check(spark, 'error', 'examples')
35 | .hasSize(lambda x: x == 8)
36 | .isUnique('_2'))
37 | .useRepository(myrepo)
38 | .saveOrAppendResult(key2)
39 | .run()
40 | )
41 |
42 | myrepo.load().withTagValues({'key1': 'value1'}).after(99000) \
43 | .getMetricsAsDF().show()
44 |
45 | # SparkSession and Java Gateway teardown
46 | spark.sparkContext._gateway.close()
47 | spark.stop()
48 |
49 | if __name__ == "__main__":
50 | main()
--------------------------------------------------------------------------------
/src/pydeequ/examples/profiler_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import json
4 | from pyspark.sql import SparkSession, DataFrame
5 |
6 | from pydeequ.profiler import ColumnProfilerRunner
7 | from pydeequ.examples import test_data
8 |
9 | def main():
10 | # SparkSession startup
11 | spark = (SparkSession
12 | .builder
13 | .master('local[*]')
14 | .config('spark.jars.packages',
15 | 'com.amazon.deequ:deequ:1.0.5')
16 | .appName('profiler-example')
17 | .getOrCreate())
18 | df = spark.createDataFrame(test_data)
19 |
20 |     # Column profiling run
21 | r = (ColumnProfilerRunner()
22 | .onData(df)
23 | .run())
24 |
25 | parsed = json.loads(r)
26 | print(json.dumps(parsed, indent = 4))
27 |
28 | # SparkSession and Java Gateway teardown
29 | spark.sparkContext._gateway.close()
30 | spark.stop()
31 |
32 | if __name__ == "__main__":
33 | main()
34 |
--------------------------------------------------------------------------------
/src/pydeequ/examples/suggestions_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import json
4 | from pyspark.sql import SparkSession, DataFrame
5 |
6 | from pydeequ.base import ConstraintSuggestionRunner
7 | from pydeequ.suggestions import Rules
8 | from pydeequ.examples import test_data
9 |
10 | def main():
11 | # SparkSession startup
12 | spark = (SparkSession
13 | .builder
14 | .master('local[*]')
15 | .config('spark.jars.packages',
16 | 'com.amazon.deequ:deequ:1.0.5')
17 | .appName('suggestions-example')
18 | .getOrCreate())
19 | df = spark.createDataFrame(test_data)
20 |
21 |     # Constraint suggestion run
22 | r = (ConstraintSuggestionRunner(spark)
23 | .onData(df)
24 | .addConstraintRule(Rules.CategoricalRangeRule(spark))
25 | .run())
26 |
27 | parsed = json.loads(r)
28 | print(json.dumps(parsed, indent = 4))
29 |
30 | # SparkSession and Java Gateway teardown
31 | spark.sparkContext._gateway.close()
32 | spark.stop()
33 |
34 |
35 | if __name__ == "__main__":
36 | main()
--------------------------------------------------------------------------------
/src/pydeequ/exceptions.py:
--------------------------------------------------------------------------------
1 | class JavaClassNotFoundException(Exception):
2 | """
3 | Raise if required Java class is not found by py4j
4 | """
5 |
6 | def __init__(self, java_class):
7 | Exception.__init__(self)
8 | self.java_class = java_class
9 |
10 | def __str__(self):
11 | return "%s. Did you forget to add the jar to the class path?" % (
12 | self.java_class
13 | )
14 |
15 | def __repr__(self):
16 | return "%s: %s" % (self.__class__.__name__, self.java_class)
17 |
--------------------------------------------------------------------------------
/src/pydeequ/jvm_conversions.py:
--------------------------------------------------------------------------------
1 | def iterable_to_scala_list(jvm, iterable):
2 | return jvm.scala.collection.JavaConversions.\
3 | iterableAsScalaIterable(iterable).\
4 | toList()
5 |
6 | def iterable_to_scala_set(jvm, iterable):
7 | return jvm.scala.collection.JavaConversions.\
8 | iterableAsScalaIterable(iterable).\
9 | toSet()
10 |
11 | def iterable_to_scala_seq(jvm, iterable):
12 | return jvm.scala.collection.JavaConversions.\
13 | iterableAsScalaIterable(iterable).\
14 | toSeq()
15 |
16 | def simple_date_format(jvm, s):
17 | return jvm.java.text.SimpleDateFormat(s)
18 |
19 | def tuple2(jvm, t):
20 | return jvm.scala.Tuple2(*t)
21 |
22 | def option(jvm, java_obj):
23 | return jvm.scala.Option.apply(java_obj)
24 |
25 | def scala_none(jvm):
26 | return getattr(getattr(jvm.scala, "None$"), "MODULE$")
27 |
28 | def dict_to_scala_map(jvm, keyvaluepairs):
29 | return jvm.scala.collection.JavaConverters.\
30 | mapAsScalaMapConverter(keyvaluepairs).\
31 | asScala().toMap(jvm.scala.Predef.conforms())
32 |
33 | class scala_function1:
34 | def __init__(self, gateway, lambda_function):
35 | self.gateway = gateway
36 | self.lambda_function = lambda_function
37 |
38 | def apply(self, arg):
39 | return self.lambda_function(arg)
40 |
41 | class Java:
42 | implements = ["scala.Function1"]
43 |
44 |
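Note: scala_function1 is the bridge that lets plain Python lambdas be handed to deequ as scala.Function1 instances; it works over the py4j callback server, which VerificationSuite starts. A short sketch of the helpers, assuming `spark` is an active SparkSession (column names and tags are made up):

# Illustrative sketch of the conversion helpers above.
import pydeequ.jvm_conversions as jc

jvm = spark.sparkContext._jvm
gateway = spark.sparkContext._gateway

# Python list -> Scala Seq, as used by the multi-column analyzers.
cols = jc.iterable_to_scala_seq(jvm, ['col_a', 'col_b'])

# Python dict -> Scala Map, as used by ResultKey tags.
tags = jc.dict_to_scala_map(jvm, {'team': 'data-eng'})

# Python lambda -> scala.Function1; the JVM calls apply(metric_value)
# back across the py4j callback bridge when evaluating an assertion.
assertion = jc.scala_function1(gateway, lambda value: value >= 0.9)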
--------------------------------------------------------------------------------
/src/pydeequ/metricsrepo.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import DataFrame
2 |
3 | from pydeequ.base import BaseWrapper
4 | import pydeequ.jvm_conversions as jc
5 |
6 | class ResultKey(BaseWrapper):
7 | """ Unique identifier of Analysis result.
8 | """
9 | def __init__(self, SparkSession, dataSetDate, tags):
10 | """
11 |         :param int dataSetDate: timestamp of the analysis result (e.g. epoch milliseconds)
12 | :param dict tags: Key-value store of tags
13 | """
14 | super().__init__(SparkSession)
15 | self.dataSetDate = dataSetDate
16 | self.tags = tags
17 | result_key = self._jvm.com.amazon.deequ.repository.ResultKey
18 | self.jvmResultKey = result_key(
19 | self.dataSetDate,
20 | jc.dict_to_scala_map(self._jvm, self.tags)
21 | )
22 |
23 | class FileSystemMetricsRepository(BaseWrapper):
24 | """ FS based repository class
25 | """
26 | def __init__(self, SparkSession, path):
27 | super().__init__(SparkSession)
28 | self.path = path
29 | fs_repo = self._jvm.com.amazon.deequ.repository.fs.\
30 | FileSystemMetricsRepository
31 | self.jvmMetricsRepo = fs_repo(
32 | self._jsparkSession,
33 | self.path
34 | )
35 |
36 | def save(self, resultKey, analyserContext):
37 | """ Save Analysis results (metrics).
38 |
39 | :param ResultKey resultKey: unique identifier of Analysis results
40 | :param AnalyzerContext analyserContext:
41 | """
42 | return self.jvmMetricsRepo.save(
43 | resultKey.jvmResultKey,
44 | analyserContext.jvmAnalyzerContext
45 | )
46 |
47 | def load(self):
48 | """ Get a builder class to construct a loading query to get
49 | analysis results
50 | """
51 | return FSRepoResultsLoader(self.spark, self.path)
52 |
53 | class FSRepoResultsLoader(BaseWrapper):
54 | def __init__(self, SparkSession, path):
55 | super().__init__(SparkSession)
56 | self.path = path
57 | fs_repo_loader = self._jvm.com.amazon.deequ.repository.fs.\
58 | FileSystemMetricsRepositoryMultipleResultsLoader
59 | self.jvmFSMetricsRepoLoader = fs_repo_loader(
60 | self._jsparkSession,
61 | self.path
62 | )
63 |
64 | def withTagValues(self, tagValues):
65 | self.tagValues = tagValues
66 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \
67 | .withTagValues(
68 | jc.dict_to_scala_map(self._jvm, tagValues)
69 | )
70 | return self
71 |
72 | def before(self, dateTime):
73 |         self._before = dateTime  # avoid shadowing the before() method itself
74 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \
75 | .before(
76 | dateTime
77 | )
78 | return self
79 |
80 | def after(self, dateTime):
81 |         self._after = dateTime  # avoid shadowing the after() method itself
82 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \
83 | .after(
84 | dateTime
85 | )
86 | return self
87 |
88 | def getMetricsAsDF(self):
89 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsDataFrame
90 | df = jvmGetter(
91 | self._jsparkSession,
92 | getattr(self.jvmFSMetricsRepoLoader,
93 | "getSuccessMetricsAsDataFrame$default$2")()
94 | )
95 | return DataFrame(df, self.spark)
96 |
97 | def getMetricsAsJson(self):
98 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsJson
99 | jf = jvmGetter(
100 | getattr(self.jvmFSMetricsRepoLoader,
101 | "getSuccessMetricsAsJson$default$1")()
102 | )
103 | return jf
104 |
105 |
106 |
107 |
108 |
109 |
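Note: a compact sketch of persisting metrics and querying them back, mirroring examples/metrics_repo.py. It assumes `spark` is a SparkSession with deequ available and `context` is the object returned by AnalysisRunBuilder.run(); the path and tag values are illustrative.

# Illustrative sketch: save analysis metrics under a ResultKey, then load
# them back filtered by tag and timestamp.
import time

from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository

repo = FileSystemMetricsRepository(spark, '/tmp/deequ_metrics.json')
key = ResultKey(spark, int(time.time() * 1000), {'dataset': 'orders'})

repo.save(key, context)

cutoff = int(time.time() * 1000) - 24 * 60 * 60 * 1000  # last 24 hours
(repo.load()
     .withTagValues({'dataset': 'orders'})
     .after(cutoff)
     .getMetricsAsDF()
     .show())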
--------------------------------------------------------------------------------
/src/pydeequ/profiler.py:
--------------------------------------------------------------------------------
1 | from pydeequ.exceptions import JavaClassNotFoundException
2 | import pydeequ.jvm_conversions as jc
3 | import pdb
4 |
5 | class ColumnProfilerRunBuilder:
6 | """
7 | Builds profiling runner.
8 | """
9 | def __init__(self, dataFrame):
10 | """
11 | Args:
12 | dataFrame (pyspark.sql.dataframe.DataFrame):
13 | """
14 | self._sc = dataFrame._sc
15 | self._dataFrame = dataFrame
16 | run_builder = self._jvm.com.amazon.deequ \
17 | .profiles.ColumnProfilerRunBuilder
18 | self.jvmColumnProfilerRunBuilder = run_builder(
19 | self._dataFrame._jdf
20 | )
21 |
22 | @property
23 | def _jvm(self):
24 | return self._sc._jvm
25 |
26 | @property
27 | def dataFrame(self):
28 | return self._dataFrame
29 |
30 | def run(self):
31 | result = self.jvmColumnProfilerRunBuilder.run()
32 |
33 | seqColumnProfiles = result.profiles().values().toSeq()
34 | jf = result.toJson(
35 | seqColumnProfiles
36 | )
37 |
38 | return jf
39 |
40 | class ColumnProfilerRunner():
41 | """
42 | Responsible for running data profiling.
43 | """
44 | def onData(self, dataFrame):
45 | """
46 | Starting point to construct a profiling runner.
47 |
48 | Args:
49 | dataFrame (pyspark.sql.dataframe.DataFrame):
50 | """
51 | return ColumnProfilerRunBuilder(dataFrame)
52 |
53 |
--------------------------------------------------------------------------------
/src/pydeequ/suggestions.py:
--------------------------------------------------------------------------------
1 | import py4j.java_gateway as jg
2 | import pdb
3 |
4 | from pydeequ.exceptions import JavaClassNotFoundException
5 | import pydeequ.jvm_conversions as jc
6 |
7 |
8 | class Rules:
9 | """
10 | Constraint rules
11 | """
12 |
13 | def __init__(self, spark, _jvmRule):
14 | self.spark = spark
15 | self._jvmRule = _jvmRule
16 |
17 | @property
18 | def _jvm(self):
19 | return self.spark.sparkContext._jvm
20 |
21 | @classmethod
22 | def CompleteIfCompleteRule(cls, spark):
23 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CompleteIfCompleteRule
24 | return cls(spark, _jvmRule)
25 |
26 | @classmethod
27 | def RetainCompletenessRule(cls, spark):
28 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainCompletenessRule
29 | return cls(spark, _jvmRule)
30 |
31 | @classmethod
32 | def RetainTypeRule(cls, spark):
33 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainTypeRule
34 | return cls(spark, _jvmRule)
35 |
36 | @classmethod
37 | def CategoricalRangeRule(cls, spark):
38 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CategoricalRangeRule
39 | return cls(spark, _jvmRule)
40 |
41 | @classmethod
42 | def FractionalCategoricalRangeRule(cls, spark):
43 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule
44 | return cls(spark, _jvmRule)
45 |
46 | @classmethod
47 | def NonNegativeNumbersRule(cls, spark):
48 | _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.NonNegativeNumbersRule
49 | return cls(spark, _jvmRule)
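Note: each Rules classmethod only stores a reference to the JVM rule class; ConstraintSuggestionRunBuilder.addConstraintRule instantiates it when the rule is added. A sketch combining several rules, assuming `spark` is a SparkSession with deequ available and `df` is a Spark DataFrame:

# Illustrative sketch: run several suggestion rules and pretty-print the JSON.
import json

from pydeequ.base import ConstraintSuggestionRunner
from pydeequ.suggestions import Rules

suggestions_json = (ConstraintSuggestionRunner(spark)
                    .onData(df)
                    .addConstraintRule(Rules.CompleteIfCompleteRule(spark))
                    .addConstraintRule(Rules.NonNegativeNumbersRule(spark))
                    .run())

print(json.dumps(json.loads(suggestions_json), indent=2))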
--------------------------------------------------------------------------------
/tests/integration/test_analyzers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql import DataFrame, Row
5 |
6 | from pydeequ.base import AnalysisRunner
7 | from pydeequ.examples import test_data
8 | from pydeequ import analyzers
9 |
10 | class AnalysisRunnerTest(unittest.TestCase):
11 |
12 | @classmethod
13 | def setUpClass(cls):
14 | cls.spark = (SparkSession
15 | .builder
16 | .master('local[*]')
17 | .config('spark.jars.packages',
18 | 'com.amazon.deequ:deequ:1.0.5')
19 | .appName('pytest-pyspark-local-testing')
20 | .getOrCreate())
21 | cls.df = cls.spark.createDataFrame(test_data)
22 | cls.runner = AnalysisRunner(cls.spark)
23 |
24 | @classmethod
25 | def tearDownClass(cls):
26 | cls.spark.sparkContext._gateway.close()
27 | cls.spark.stop()
28 |
29 | def test_ApproxCountDistinct(self):
30 | out = self.runner.onData(self.df) \
31 | .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \
32 | .run().successMetricsAsDataFrame()
33 | out = out.select('value').collect()
34 | self.assertEqual(out, [Row(value=5.0)])
35 |
36 | def test_ApproxQuantile(self):
37 | out = self.runner.onData(self.df) \
38 | .addAnalyzer(analyzers.ApproxQuantile('_2', 0.75)) \
39 | .run().successMetricsAsDataFrame()
40 | out = out.select('value').collect()
41 | self.assertEqual(out, [Row(value=20)])
42 |
43 | def test_Completeness(self):
44 | out = self.runner.onData(self.df) \
45 | .addAnalyzer(analyzers.Completeness('_2')) \
46 | .run().successMetricsAsDataFrame()
47 | out = out.select('value').collect()
48 | self.assertEqual(out, [Row(value=0.75)])
49 |
50 | def test_Compliance(self):
51 | out = self.runner.onData(self.df) \
52 | .addAnalyzer(analyzers.Compliance('top _2', '_2 > 15')) \
53 | .run().successMetricsAsDataFrame()
54 | out = out.select('value').collect()
55 | self.assertEqual(out, [Row(value=0.25)])
56 |
57 | def test_Correlation(self):
58 | out = self.runner.onData(self.df) \
59 | .addAnalyzer(analyzers.Correlation('_2', '_5')) \
60 | .run().successMetricsAsDataFrame()
61 | out = out.select('value').collect()
62 | self.assertLess(out, [Row(value=-0.8)])
63 |
64 | def test_CountDistinct(self):
65 | out = self.runner.onData(self.df) \
66 | .addAnalyzer(analyzers.CountDistinct('_3')) \
67 | .run().successMetricsAsDataFrame()
68 | out = out.select('value').collect()
69 | self.assertEqual(out, [Row(value=3)])
70 |
71 | def test_DataType(self):
72 | out = self.runner.onData(self.df) \
73 | .addAnalyzer(analyzers.DataType('_3')) \
74 | .run().successMetricsAsDataFrame()
75 | out = out.select('value').collect()
76 | self.assertEqual(out, [Row(value=5.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=8.0), Row(value=1.0)])
77 |
78 | def test_Distinctness(self):
79 | out = self.runner.onData(self.df) \
80 | .addAnalyzer(analyzers.Distinctness('_3')) \
81 | .run().successMetricsAsDataFrame()
82 | out = out.select('value').collect()
83 | self.assertEqual(out, [Row(value=0.375)])
84 |
85 | def test_Entropy(self):
86 | out = self.runner.onData(self.df) \
87 | .addAnalyzer(analyzers.Entropy('_3')) \
88 | .run().successMetricsAsDataFrame()
89 | out = out.select('value').collect()
90 | self.assertGreater(out, [Row(value=1)])
91 |
92 | def test_Histogram(self):
93 | out = self.runner.onData(self.df) \
94 | .addAnalyzer(analyzers.Histogram('_3')) \
95 | .run().successMetricsAsDataFrame()
96 | out = out.select('value').collect()
97 | self.assertEqual(out, [Row(value=3.0), Row(value=4.0), Row(value=0.5), Row(value=2.0), Row(value=0.25), Row(value=2.0), Row(value=0.25)])
98 |
99 | def test_Maximum(self):
100 | out = self.runner.onData(self.df) \
101 | .addAnalyzer(analyzers.Maximum('_2')) \
102 | .run().successMetricsAsDataFrame()
103 | out = out.select('value').collect()
104 | self.assertEqual(out, [Row(value=20)])
105 |
106 | def test_MaxLength(self):
107 | out = self.runner.onData(self.df) \
108 | .addAnalyzer(analyzers.MaxLength('_1')) \
109 | .run().successMetricsAsDataFrame()
110 | out = out.select('value').collect()
111 | self.assertEqual(out, [Row(value=6)])
112 |
113 | def test_Mean(self):
114 | out = self.runner.onData(self.df) \
115 | .addAnalyzer(analyzers.Mean('_2')) \
116 | .run().successMetricsAsDataFrame()
117 | out = out.select('value').collect()
118 | self.assertEqual(out, [Row(value=11)])
119 |
120 | def test_Minimum(self):
121 | out = self.runner.onData(self.df) \
122 | .addAnalyzer(analyzers.Minimum('_2')) \
123 | .run().successMetricsAsDataFrame()
124 | out = out.select('value').collect()
125 | self.assertEqual(out, [Row(value=1)])
126 |
127 | def test_MinLength(self):
128 | out = self.runner.onData(self.df) \
129 |             .addAnalyzer(analyzers.MinLength('_1')) \
130 | .run().successMetricsAsDataFrame()
131 | out = out.select('value').collect()
132 | self.assertEqual(out, [Row(value=6)])
133 |
134 | def test_MutualInformation(self):
135 | out = self.runner.onData(self.df) \
136 | .addAnalyzer(analyzers.MutualInformation(['_1', '_3'])) \
137 | .run().successMetricsAsDataFrame()
138 | out = out.select('value').collect()
139 | self.assertGreater(out, [Row(value=0.5)])
140 |
141 | def test_Size(self):
142 | out = self.runner.onData(self.df) \
143 | .addAnalyzer(analyzers.Size()) \
144 | .run().successMetricsAsDataFrame()
145 | out = out.select('value').collect()
146 | self.assertEqual(out, [Row(value=8)])
147 |
148 | def test_StandardDeviation(self):
149 | out = self.runner.onData(self.df) \
150 | .addAnalyzer(analyzers.StandardDeviation('_2')) \
151 | .run().successMetricsAsDataFrame()
152 | out = out.select('value').collect()
153 | self.assertGreater(out, [Row(value=7)])
154 |
155 | def test_Sum(self):
156 | out = self.runner.onData(self.df) \
157 | .addAnalyzer(analyzers.Sum('_2')) \
158 | .run().successMetricsAsDataFrame()
159 | out = out.select('value').collect()
160 | self.assertGreater(out, [Row(value=10)])
161 |
162 | def test_Uniqueness(self):
163 | out = self.runner.onData(self.df) \
164 | .addAnalyzer(analyzers.Uniqueness(['_1'])) \
165 | .run().successMetricsAsDataFrame()
166 | out = out.select('value').collect()
167 | self.assertEqual(out, [Row(value=0.375)])
168 |
169 | def test_UniqueValueRatio(self):
170 | out = self.runner.onData(self.df) \
171 | .addAnalyzer(analyzers.UniqueValueRatio(['_1'])) \
172 | .run().successMetricsAsDataFrame()
173 | out = out.select('value').collect()
174 | self.assertEqual(out, [Row(value=0.6)])
175 |
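    # A sketch, not part of the original suite: it assumes addAnalyzer() can be
    # chained to compute several metrics in one pass, mirroring deequ's
    # AnalysisRunBuilder. Expected values follow test_Minimum and test_Maximum
    # above; sorting avoids depending on metric row order.
    def test_MultipleAnalyzers(self):
        out = self.runner.onData(self.df) \
            .addAnalyzer(analyzers.Minimum('_2')) \
            .addAnalyzer(analyzers.Maximum('_2')) \
            .run().successMetricsAsDataFrame()
        out = out.select('value').collect()
        self.assertEqual(sorted(row.value for row in out), [1, 20])
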
176 | if __name__ == '__main__':
177 | unittest.main()
178 |
--------------------------------------------------------------------------------
/tests/integration/test_constraints.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession, DataFrame, Row
4 |
5 | from pydeequ.base import VerificationSuite
6 | from pydeequ.checks import Check
7 | from pydeequ.examples import test_data
8 |
9 | class ConstraintTest(unittest.TestCase):
10 |
11 | @classmethod
12 | def setUpClass(cls):
13 | cls.spark = (SparkSession
14 | .builder
15 | .master('local[*]')
16 | .config('spark.jars.packages',
17 | 'com.amazon.deequ:deequ:1.0.5')
18 | .appName('pytest-pyspark-local-testing')
19 | .getOrCreate())
20 | cls.df = cls.spark.createDataFrame(test_data)
21 | cls.suite = VerificationSuite(cls.spark)
22 |         cls.success = Row(constraint_status='Success')
23 |         cls.failure = Row(constraint_status='Failure')
24 |
25 | @classmethod
26 | def tearDownClass(cls):
27 | cls.spark.sparkContext._gateway.close()
28 | cls.spark.stop()
29 |
30 | def test_hasSize(self):
31 | chk = Check(self.spark) \
32 | .hasSize(lambda x: x == 8)
33 | out = self.suite.onData(self.df).addCheck(chk).run()
34 | out = DataFrame(out, self.spark).select('constraint_status').collect()
35 | self.assertEqual(out, [self.success])
36 |
37 | def test_isUnique(self):
38 | chk = Check(self.spark) \
39 | .isUnique('_1')
40 | out = self.suite.onData(self.df).addCheck(chk).run()
41 | out = DataFrame(out, self.spark).select('constraint_status').collect()
42 | self.assertEqual(out, [self.failure])
43 |
44 | def test_hasCompleteness(self):
45 | chk = Check(self.spark) \
46 | .hasCompleteness('_2', lambda x: x >= 0.75)
47 | out = self.suite.onData(self.df).addCheck(chk).run()
48 | out = DataFrame(out, self.spark).select('constraint_status').collect()
49 | self.assertEqual(out, [self.success])
50 |
51 | def test_hasUniqueness(self):
52 | chk = Check(self.spark) \
53 | .hasUniqueness('_1', lambda x: x == 3/8)
54 | out = self.suite.onData(self.df).addCheck(chk).run()
55 | out = DataFrame(out, self.spark).select('constraint_status').collect()
56 | self.assertEqual(out, [self.success])
57 |
58 | def test_hasDistinctness(self):
59 | chk = Check(self.spark) \
60 | .hasDistinctness('_1', lambda x: x == 5/8)
61 | out = self.suite.onData(self.df).addCheck(chk).run()
62 | out = DataFrame(out, self.spark).select('constraint_status').collect()
63 | self.assertEqual(out, [self.success])
64 |
65 | def test_hasUniqueValueRatio(self):
66 | chk = Check(self.spark) \
67 | .hasUniqueValueRatio('_2', lambda x: x == 0.8)
68 | out = self.suite.onData(self.df).addCheck(chk).run()
69 | out = DataFrame(out, self.spark).select('constraint_status').collect()
70 | self.assertEqual(out, [self.success])
71 |
72 | def test_hasNumberOfDistinctValues(self):
73 | chk = Check(self.spark) \
74 | .hasNumberOfDistinctValues('_2', lambda x: x == 6)
75 | out = self.suite.onData(self.df).addCheck(chk).run()
76 | out = DataFrame(out, self.spark).select('constraint_status').collect()
77 | self.assertEqual(out, [self.success])
78 |
79 | # .hasHistogram
80 |
81 | def test_hasEntropy(self):
82 | chk = Check(self.spark) \
83 | .hasEntropy('_3', lambda x: x > 1)
84 | out = self.suite.onData(self.df).addCheck(chk).run()
85 | out = DataFrame(out, self.spark).select('constraint_status').collect()
86 | self.assertEqual(out, [self.success])
87 |
88 | # .hasMutualInformation
89 |
90 | def test_hasApproxQuantile(self):
91 | chk = Check(self.spark) \
92 | .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
93 | out = self.suite.onData(self.df).addCheck(chk).run()
94 | out = DataFrame(out, self.spark).select('constraint_status').collect()
95 | self.assertEqual(out, [self.success])
96 |
97 | def test_hasMinLength(self):
98 | chk = Check(self.spark) \
99 | .hasMinLength('_1', lambda x: x == 6)
100 | out = self.suite.onData(self.df).addCheck(chk).run()
101 | out = DataFrame(out, self.spark).select('constraint_status').collect()
102 | self.assertEqual(out, [self.success])
103 |
104 | def test_hasMaxLength(self):
105 | chk = Check(self.spark) \
106 | .hasMaxLength('_3', lambda x: x == 10)
107 | out = self.suite.onData(self.df).addCheck(chk).run()
108 | out = DataFrame(out, self.spark).select('constraint_status').collect()
109 | self.assertEqual(out, [self.success])
110 |
111 | def test_hasMin(self):
112 | chk = Check(self.spark) \
113 | .hasMin('_2', lambda x: x == 1)
114 | out = self.suite.onData(self.df).addCheck(chk).run()
115 | out = DataFrame(out, self.spark).select('constraint_status').collect()
116 | self.assertEqual(out, [self.success])
117 |
118 | def test_hasMax(self):
119 | chk = Check(self.spark) \
120 | .hasMax('_2', lambda x: x == 20)
121 | out = self.suite.onData(self.df).addCheck(chk).run()
122 | out = DataFrame(out, self.spark).select('constraint_status').collect()
123 | self.assertEqual(out, [self.success])
124 |
125 | def test_hasMean(self):
126 | chk = Check(self.spark) \
127 | .hasMean('_2', lambda x: x > 10)
128 | out = self.suite.onData(self.df).addCheck(chk).run()
129 | out = DataFrame(out, self.spark).select('constraint_status').collect()
130 | self.assertEqual(out, [self.success])
131 |
132 | def test_hasSum(self):
133 | chk = Check(self.spark) \
134 | .hasSum('_2', lambda x: x > 50)
135 | out = self.suite.onData(self.df).addCheck(chk).run()
136 | out = DataFrame(out, self.spark).select('constraint_status').collect()
137 | self.assertEqual(out, [self.success])
138 |
139 | def test_hasStandardDeviation(self):
140 | chk = Check(self.spark) \
141 | .hasStandardDeviation('_2', lambda x: x > 5)
142 | out = self.suite.onData(self.df).addCheck(chk).run()
143 | out = DataFrame(out, self.spark).select('constraint_status').collect()
144 | self.assertEqual(out, [self.success])
145 |
146 |     def test_hasApproxCountDistinct(self):
147 | chk = Check(self.spark) \
148 | .hasApproxCountDistinct('_2', lambda x: x == 5)
149 | out = self.suite.onData(self.df).addCheck(chk).run()
150 | out = DataFrame(out, self.spark).select('constraint_status').collect()
151 | self.assertEqual(out, [self.success])
152 |
153 | def test_hasCorrelation(self):
154 | chk = Check(self.spark) \
155 | .hasCorrelation('_2', '_2', lambda x: x == 1)
156 | out = self.suite.onData(self.df).addCheck(chk).run()
157 | out = DataFrame(out, self.spark).select('constraint_status').collect()
158 | self.assertEqual(out, [self.success])
159 |
160 | def test_satisfies(self):
161 | chk = Check(self.spark) \
162 | .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
163 | out = self.suite.onData(self.df).addCheck(chk).run()
164 | out = DataFrame(out, self.spark).select('constraint_status').collect()
165 | self.assertEqual(out, [self.success])
166 |
167 |
168 | #.hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
169 | #.hasDataType("_1", "string", lambda x: x == 1)
170 |
171 | def test_isPositive(self):
172 | chk = Check(self.spark) \
173 | .isPositive('_2')
174 | out = self.suite.onData(self.df).addCheck(chk).run()
175 | out = DataFrame(out, self.spark).select('constraint_status').collect()
176 | self.assertEqual(out, [self.success])
177 |
178 | def test_isNonNegative(self):
179 | chk = Check(self.spark) \
180 | .isNonNegative('_2')
181 | out = self.suite.onData(self.df).addCheck(chk).run()
182 | out = DataFrame(out, self.spark).select('constraint_status').collect()
183 | self.assertEqual(out, [self.success])
184 |
185 | def test_isLessThan(self):
186 | chk = Check(self.spark) \
187 | .isLessThan('_5', '_2', lambda x: x == 0.375)
188 | out = self.suite.onData(self.df).addCheck(chk).run()
189 | out = DataFrame(out, self.spark).select('constraint_status').collect()
190 | self.assertEqual(out, [self.success])
191 |
192 | def test_isLessThanOrEqualTo(self):
193 | chk = Check(self.spark) \
194 | .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
195 | out = self.suite.onData(self.df).addCheck(chk).run()
196 | out = DataFrame(out, self.spark).select('constraint_status').collect()
197 | self.assertEqual(out, [self.success])
198 |
199 | def test_isGreaterThan(self):
200 | chk = Check(self.spark) \
201 | .isGreaterThan('_5', '_2', lambda x: x == 0.125)
202 | out = self.suite.onData(self.df).addCheck(chk).run()
203 | out = DataFrame(out, self.spark).select('constraint_status').collect()
204 | self.assertEqual(out, [self.success])
205 |
206 | def test_isGreaterThanOrEqualTo(self):
207 | chk = Check(self.spark) \
208 | .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
209 | out = self.suite.onData(self.df).addCheck(chk).run()
210 | out = DataFrame(out, self.spark).select('constraint_status').collect()
211 | self.assertEqual(out, [self.success])
212 |
213 | #.isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
214 |
215 | def test_isInInterval(self):
216 | chk = Check(self.spark) \
217 | .isInInterval('_5', 1.0, 50.0)
218 | out = self.suite.onData(self.df).addCheck(chk).run()
219 | out = DataFrame(out, self.spark).select('constraint_status').collect()
220 | self.assertEqual(out, [self.success])
221 |
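    # A sketch based on the commented-out isContainedIn example above; it
    # assumes this wrapper's Check exposes isContainedIn(column, allowed_values)
    # like the underlying deequ Check. Because '_3' may hold values outside the
    # listed set, the constraint's pass/fail status is not assumed here; the
    # test only checks that a single constraint result is produced.
    def test_isContainedIn(self):
        chk = Check(self.spark) \
            .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
        out = self.suite.onData(self.df).addCheck(chk).run()
        out = DataFrame(out, self.spark).select('constraint_status').collect()
        self.assertEqual(len(out), 1)
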
222 | if __name__ == '__main__':
223 | unittest.main()
224 |
--------------------------------------------------------------------------------
/tests/integration/test_runners.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 |
5 | from pydeequ.base import VerificationSuite, AnalysisRunner, ConstraintSuggestionRunner
6 | from pydeequ.profiler import ColumnProfilerRunner
7 | from pydeequ.examples import test_data
8 |
9 | class VerificationTest(unittest.TestCase):
10 |
11 | @classmethod
12 | def setUpClass(cls):
13 | cls.spark = (SparkSession
14 | .builder
15 | .master('local[*]')
16 | .config('spark.jars.packages',
17 | 'com.amazon.deequ:deequ:1.0.5')
18 | .appName('pytest-pyspark-local-testing')
19 | .getOrCreate())
20 | cls.df = cls.spark.createDataFrame(test_data)
21 |
22 | @classmethod
23 | def tearDownClass(cls):
24 | cls.spark.sparkContext._gateway.close()
25 | cls.spark.stop()
26 |
27 | def test_VerificationSuiteArgs(self):
28 | suiterunner = VerificationSuite(self.spark).onData(self.df)
29 | # check dataframe prop
30 | self.assertEqual(suiterunner.dataFrame.columns,
31 | ['_1', '_2', '_3', '_4', '_5']
32 | )
33 | # check _jsparkSession prop
34 | self.assertEqual(suiterunner._jsparkSession.getClass().toString(),
35 | 'class org.apache.spark.sql.SparkSession'
36 | )
37 | # check _jvm prop
38 | self.assertEqual(suiterunner._jvm,
39 | self.spark.sparkContext._jvm
40 | )
41 | # check jvmVerificationRunBuilder
42 | self.assertEqual(suiterunner.jvmVerificationRunBuilder.getClass().toString(),
43 | "class com.amazon.deequ.VerificationRunBuilder"
44 | )
45 |
46 |     def test_AnalysisRunnerArgs(self):
47 | runner = AnalysisRunner(self.spark).onData(self.df)
48 | # check dataframe prop
49 | self.assertEqual(runner.dataFrame.columns,
50 | ['_1', '_2', '_3', '_4', '_5']
51 | )
52 | # check _jsparkSession prop
53 | self.assertEqual(runner._jsparkSession.getClass().toString(),
54 | 'class org.apache.spark.sql.SparkSession'
55 | )
56 | # check _jvm prop
57 | self.assertEqual(runner._jvm,
58 | self.spark.sparkContext._jvm
59 | )
60 | # check jvmAnalysisRunBuilder
61 | self.assertEqual(runner.jvmAnalysisRunBuilder.getClass().toString(),
62 | "class com.amazon.deequ.analyzers.runners.AnalysisRunBuilder"
63 | )
64 |
65 | def test_ProfilerRunnerArgs(self):
66 | profilerrunner = ColumnProfilerRunner().onData(self.df)
67 | # check dataframe prop
68 | self.assertEqual(profilerrunner.dataFrame.columns,
69 | ['_1', '_2', '_3', '_4', '_5']
70 | )
71 | # check _jvm prop
72 | self.assertEqual(profilerrunner._jvm,
73 | self.spark.sparkContext._jvm
74 | )
75 | # check jvmColumnProfilerRunBuilder
76 | self.assertEqual(profilerrunner.jvmColumnProfilerRunBuilder.getClass().toString(),
77 | "class com.amazon.deequ.profiles.ColumnProfilerRunBuilder"
78 | )
79 |
80 | def test_SuggestionRunnerArgs(self):
81 | suggestionrunner = ConstraintSuggestionRunner(self.spark).onData(self.df)
82 | # check dataframe prop
83 | self.assertEqual(suggestionrunner.dataFrame.columns,
84 | ['_1', '_2', '_3', '_4', '_5']
85 | )
86 | # check _jvm prop
87 | self.assertEqual(suggestionrunner._jvm,
88 | self.spark.sparkContext._jvm
89 | )
90 |         # check jvmConstraintSuggestionRunBuilder
91 | self.assertEqual(suggestionrunner.jvmConstraintSuggestionRunBuilder.getClass().toString(),
92 | "class com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder"
93 | )
94 |
95 | if __name__ == '__main__':
96 | unittest.main()
97 |
98 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox configuration file
2 | # Read more under https://tox.readthedocs.org/
3 | # THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
4 |
5 | [tox]
6 | minversion = 2.4
7 | envlist = default
8 |
9 | [testenv]
10 | setenv = TOXINIDIR = {toxinidir}
11 | passenv =
12 | HOME
13 | commands =
14 | py.test {posargs}
15 | extras =
16 | all
17 | testing
18 |
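# Typical invocations (pytest arguments are forwarded through {posargs}):
#   tox                          # run the default env
#   tox -- -k test_analyzers     # pass '-k test_analyzers' on to py.test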
--------------------------------------------------------------------------------