├── .coveragerc ├── .gitignore ├── LICENSE.txt ├── README.md ├── coverage.svg ├── docs ├── Makefile ├── _static │ └── .gitignore ├── authors.rst ├── changelog.rst ├── conf.py ├── index.rst └── license.rst ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── pydeequ │ ├── __init__.py │ ├── analyzers.py │ ├── base.py │ ├── checks.py │ ├── examples │ ├── __init__.py │ ├── analyzer_example.py │ ├── basic_usage.py │ ├── basic_usage2.py │ ├── metrics_repo.py │ ├── profiler_example.py │ └── suggestions_example.py │ ├── exceptions.py │ ├── jvm_conversions.py │ ├── metricsrepo.py │ ├── profiler.py │ └── suggestions.py ├── tests └── integration │ ├── test_analyzers.py │ ├── test_constraints.py │ └── test_runners.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = pydeequ 5 | omit = src/pydeequ/examples/* 6 | 7 | [paths] 8 | source = 9 | src/ 10 | */site-packages/ 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary and binary files 2 | *~ 3 | *.py[cod] 4 | *.so 5 | *.cfg 6 | !.isort.cfg 7 | !setup.cfg 8 | *.orig 9 | *.log 10 | *.pot 11 | __pycache__/* 12 | .cache/* 13 | .*.swp 14 | */.ipynb_checkpoints/* 15 | .DS_Store 16 | metastore_db/ 17 | spark-warehouse/ 18 | .vscode 19 | 20 | # Project files 21 | .ropeproject 22 | .project 23 | .pydevproject 24 | .settings 25 | .idea 26 | tags 27 | 28 | # Package files 29 | *.egg 30 | *.eggs/ 31 | .installed.cfg 32 | *.egg-info 33 | 34 | # Unittest and coverage 35 | htmlcov/* 36 | .coverage 37 | .tox 38 | junit.xml 39 | coverage.xml 40 | .pytest_cache/ 41 | 42 | # Build and docs folder/files 43 | build/* 44 | dist/* 45 | sdist/* 46 | docs/api/* 47 | docs/_rst/* 48 | docs/_build/* 49 | cover/* 50 | MANIFEST 51 | 52 | # Per-project virtualenvs 53 | .venv*/ 54 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | __This repository will be merged with the official [AWS Lab/python-deequ](https://github.com/awslabs/python-deequ) project. Please fork and contribute to that project because many issues of this pydeequ version are solved there.__ 2 | -------------------------------------------------------------------------------- /coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 70% 19 | 70% 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = ../build/sphinx/ 9 | AUTODOCDIR = api 10 | AUTODOCBUILD = sphinx-apidoc 11 | PROJECT = pydeequ 12 | MODULEDIR = ../src/pydeequ 13 | 14 | # User-friendly check for sphinx-build 15 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1) 16 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 17 | endif 18 | 19 | # Internal variables. 20 | PAPEROPT_a4 = -D latex_paper_size=a4 21 | PAPEROPT_letter = -D latex_paper_size=letter 22 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 23 | # the i18n builder cannot share the environment and doctrees with the others 24 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
25 | 26 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext doc-requirements 27 | 28 | help: 29 | @echo "Please use \`make ' where is one of" 30 | @echo " html to make standalone HTML files" 31 | @echo " dirhtml to make HTML files named index.html in directories" 32 | @echo " singlehtml to make a single large HTML file" 33 | @echo " pickle to make pickle files" 34 | @echo " json to make JSON files" 35 | @echo " htmlhelp to make HTML files and a HTML help project" 36 | @echo " qthelp to make HTML files and a qthelp project" 37 | @echo " devhelp to make HTML files and a Devhelp project" 38 | @echo " epub to make an epub" 39 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 40 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 41 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 42 | @echo " text to make text files" 43 | @echo " man to make manual pages" 44 | @echo " texinfo to make Texinfo files" 45 | @echo " info to make Texinfo files and run them through makeinfo" 46 | @echo " gettext to make PO message catalogs" 47 | @echo " changes to make an overview of all changed/added/deprecated items" 48 | @echo " xml to make Docutils-native XML files" 49 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 50 | @echo " linkcheck to check all external links for integrity" 51 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 52 | 53 | clean: 54 | rm -rf $(BUILDDIR)/* $(AUTODOCDIR) 55 | 56 | $(AUTODOCDIR): $(MODULEDIR) 57 | mkdir -p $@ 58 | $(AUTODOCBUILD) -f -o $@ $^ 59 | 60 | doc-requirements: $(AUTODOCDIR) 61 | 62 | html: doc-requirements 63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 66 | 67 | dirhtml: doc-requirements 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | singlehtml: doc-requirements 73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 74 | @echo 75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 76 | 77 | pickle: doc-requirements 78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 79 | @echo 80 | @echo "Build finished; now you can process the pickle files." 81 | 82 | json: doc-requirements 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | htmlhelp: doc-requirements 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | qthelp: doc-requirements 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/$(PROJECT).qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/$(PROJECT).qhc" 101 | 102 | devhelp: doc-requirements 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 
106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $HOME/.local/share/devhelp/$(PROJECT)" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $HOME/.local/share/devhelp/$(PROJEC)" 109 | @echo "# devhelp" 110 | 111 | epub: doc-requirements 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | patch-latex: 117 | find _build/latex -iname "*.tex" | xargs -- \ 118 | sed -i'' 's~includegraphics{~includegraphics\[keepaspectratio,max size={\\textwidth}{\\textheight}\]{~g' 119 | 120 | latex: doc-requirements 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | $(MAKE) patch-latex 123 | @echo 124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 126 | "(use \`make latexpdf' here to do that automatically)." 127 | 128 | latexpdf: doc-requirements 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | $(MAKE) patch-latex 131 | @echo "Running LaTeX files through pdflatex..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | latexpdfja: doc-requirements 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through platex and dvipdfmx..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | text: doc-requirements 142 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 143 | @echo 144 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 145 | 146 | man: doc-requirements 147 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 148 | @echo 149 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 150 | 151 | texinfo: doc-requirements 152 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 153 | @echo 154 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 155 | @echo "Run \`make' in that directory to run these through makeinfo" \ 156 | "(use \`make info' here to do that automatically)." 157 | 158 | info: doc-requirements 159 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 160 | @echo "Running Texinfo files through makeinfo..." 161 | make -C $(BUILDDIR)/texinfo info 162 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 163 | 164 | gettext: doc-requirements 165 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 166 | @echo 167 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 168 | 169 | changes: doc-requirements 170 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 171 | @echo 172 | @echo "The overview file is in $(BUILDDIR)/changes." 173 | 174 | linkcheck: doc-requirements 175 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 176 | @echo 177 | @echo "Link check complete; look for any errors in the above output " \ 178 | "or in $(BUILDDIR)/linkcheck/output.txt." 179 | 180 | doctest: doc-requirements 181 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 182 | @echo "Testing of doctests in the sources finished, look at the " \ 183 | "results in $(BUILDDIR)/doctest/output.txt." 184 | 185 | xml: doc-requirements 186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 187 | @echo 188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 
189 | 190 | pseudoxml: doc-requirements 191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 192 | @echo 193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 194 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | # Empty directory 2 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. _authors: 2 | .. include:: ../AUTHORS.rst 3 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | .. include:: ../CHANGELOG.rst 3 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 10 | 11 | import os 12 | import sys 13 | import inspect 14 | import shutil 15 | 16 | __location__ = os.path.join(os.getcwd(), os.path.dirname( 17 | inspect.getfile(inspect.currentframe()))) 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | sys.path.insert(0, os.path.join(__location__, '../src')) 23 | 24 | # -- Run sphinx-apidoc ------------------------------------------------------ 25 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running 26 | # `sphinx-build -b html . _build/html`. See Issue: 27 | # https://github.com/rtfd/readthedocs.org/issues/1139 28 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using 29 | # setup.py install" in the RTD Advanced Settings. 30 | # Additionally it helps us to avoid running apidoc manually 31 | 32 | try: # for Sphinx >= 1.7 33 | from sphinx.ext import apidoc 34 | except ImportError: 35 | from sphinx import apidoc 36 | 37 | output_dir = os.path.join(__location__, "api") 38 | module_dir = os.path.join(__location__, "../src/pydeequ") 39 | try: 40 | shutil.rmtree(output_dir) 41 | except FileNotFoundError: 42 | pass 43 | 44 | try: 45 | import sphinx 46 | from pkg_resources import parse_version 47 | 48 | cmd_line_template = "sphinx-apidoc -f -o {outputdir} {moduledir}" 49 | cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir) 50 | 51 | args = cmd_line.split(" ") 52 | if parse_version(sphinx.__version__) >= parse_version('1.7'): 53 | args = args[1:] 54 | 55 | apidoc.main(args) 56 | except Exception as e: 57 | print("Running `sphinx-apidoc` failed!\n{}".format(e)) 58 | 59 | # -- General configuration ----------------------------------------------------- 60 | 61 | # If your documentation needs a minimal Sphinx version, state it here. 62 | # needs_sphinx = '1.0' 63 | 64 | # Add any Sphinx extension module names here, as strings. 
They can be extensions 65 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 66 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 67 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 68 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax', 69 | 'sphinx.ext.napoleon'] 70 | 71 | # Add any paths that contain templates here, relative to this directory. 72 | templates_path = ['_templates'] 73 | 74 | # The suffix of source filenames. 75 | source_suffix = '.rst' 76 | 77 | # The encoding of source files. 78 | # source_encoding = 'utf-8-sig' 79 | 80 | # The master toctree document. 81 | master_doc = 'index' 82 | 83 | # General information about the project. 84 | project = u'pydeequ' 85 | copyright = u'2020, margitai.i' 86 | 87 | # The version info for the project you're documenting, acts as replacement for 88 | # |version| and |release|, also used in various other places throughout the 89 | # built documents. 90 | # 91 | # The short X.Y version. 92 | version = '' # Is set by calling `setup.py docs` 93 | # The full version, including alpha/beta/rc tags. 94 | release = '' # Is set by calling `setup.py docs` 95 | 96 | # The language for content autogenerated by Sphinx. Refer to documentation 97 | # for a list of supported languages. 98 | # language = None 99 | 100 | # There are two options for replacing |today|: either, you set today to some 101 | # non-false value, then it is used: 102 | # today = '' 103 | # Else, today_fmt is used as the format for a strftime call. 104 | # today_fmt = '%B %d, %Y' 105 | 106 | # List of patterns, relative to source directory, that match files and 107 | # directories to ignore when looking for source files. 108 | exclude_patterns = ['_build'] 109 | 110 | # The reST default role (used for this markup: `text`) to use for all documents. 111 | # default_role = None 112 | 113 | # If true, '()' will be appended to :func: etc. cross-reference text. 114 | # add_function_parentheses = True 115 | 116 | # If true, the current module name will be prepended to all description 117 | # unit titles (such as .. function::). 118 | # add_module_names = True 119 | 120 | # If true, sectionauthor and moduleauthor directives will be shown in the 121 | # output. They are ignored by default. 122 | # show_authors = False 123 | 124 | # The name of the Pygments (syntax highlighting) style to use. 125 | pygments_style = 'sphinx' 126 | 127 | # A list of ignored prefixes for module index sorting. 128 | # modindex_common_prefix = [] 129 | 130 | # If true, keep warnings as "system message" paragraphs in the built documents. 131 | # keep_warnings = False 132 | 133 | 134 | # -- Options for HTML output --------------------------------------------------- 135 | 136 | # The theme to use for HTML and HTML Help pages. See the documentation for 137 | # a list of builtin themes. 138 | html_theme = 'alabaster' 139 | 140 | # Theme options are theme-specific and customize the look and feel of a theme 141 | # further. For a list of options available for each theme, see the 142 | # documentation. 143 | html_theme_options = { 144 | 'sidebar_width': '300px', 145 | 'page_width': '1200px' 146 | } 147 | 148 | # Add any paths that contain custom themes here, relative to this directory. 149 | # html_theme_path = [] 150 | 151 | # The name for this set of Sphinx documents. If None, it defaults to 152 | # " v documentation". 
153 | try: 154 | from pydeequ import __version__ as version 155 | except ImportError: 156 | pass 157 | else: 158 | release = version 159 | 160 | # A shorter title for the navigation bar. Default is the same as html_title. 161 | # html_short_title = None 162 | 163 | # The name of an image file (relative to this directory) to place at the top 164 | # of the sidebar. 165 | # html_logo = "" 166 | 167 | # The name of an image file (within the static path) to use as favicon of the 168 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 169 | # pixels large. 170 | # html_favicon = None 171 | 172 | # Add any paths that contain custom static files (such as style sheets) here, 173 | # relative to this directory. They are copied after the builtin static files, 174 | # so a file named "default.css" will overwrite the builtin "default.css". 175 | html_static_path = ['_static'] 176 | 177 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 178 | # using the given strftime format. 179 | # html_last_updated_fmt = '%b %d, %Y' 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # html_use_smartypants = True 184 | 185 | # Custom sidebar templates, maps document names to template names. 186 | # html_sidebars = {} 187 | 188 | # Additional templates that should be rendered to pages, maps page names to 189 | # template names. 190 | # html_additional_pages = {} 191 | 192 | # If false, no module index is generated. 193 | # html_domain_indices = True 194 | 195 | # If false, no index is generated. 196 | # html_use_index = True 197 | 198 | # If true, the index is split into individual pages for each letter. 199 | # html_split_index = False 200 | 201 | # If true, links to the reST sources are added to the pages. 202 | # html_show_sourcelink = True 203 | 204 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 205 | # html_show_sphinx = True 206 | 207 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 208 | # html_show_copyright = True 209 | 210 | # If true, an OpenSearch description file will be output, and all pages will 211 | # contain a tag referring to it. The value of this option must be the 212 | # base URL from which the finished HTML is served. 213 | # html_use_opensearch = '' 214 | 215 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 216 | # html_file_suffix = None 217 | 218 | # Output file base name for HTML help builder. 219 | htmlhelp_basename = 'pydeequ-doc' 220 | 221 | 222 | # -- Options for LaTeX output -------------------------------------------------- 223 | 224 | latex_elements = { 225 | # The paper size ('letterpaper' or 'a4paper'). 226 | # 'papersize': 'letterpaper', 227 | 228 | # The font size ('10pt', '11pt' or '12pt'). 229 | # 'pointsize': '10pt', 230 | 231 | # Additional stuff for the LaTeX preamble. 232 | # 'preamble': '', 233 | } 234 | 235 | # Grouping the document tree into LaTeX files. List of tuples 236 | # (source start file, target name, title, author, documentclass [howto/manual]). 237 | latex_documents = [ 238 | ('index', 'user_guide.tex', u'pydeequ Documentation', 239 | u'margitai.i', 'manual'), 240 | ] 241 | 242 | # The name of an image file (relative to this directory) to place at the top of 243 | # the title page. 244 | # latex_logo = "" 245 | 246 | # For "manual" documents, if this is true, then toplevel headings are parts, 247 | # not chapters. 
248 | # latex_use_parts = False 249 | 250 | # If true, show page references after internal links. 251 | # latex_show_pagerefs = False 252 | 253 | # If true, show URL addresses after external links. 254 | # latex_show_urls = False 255 | 256 | # Documents to append as an appendix to all manuals. 257 | # latex_appendices = [] 258 | 259 | # If false, no module index is generated. 260 | # latex_domain_indices = True 261 | 262 | # -- External mapping ------------------------------------------------------------ 263 | python_version = '.'.join(map(str, sys.version_info[0:2])) 264 | intersphinx_mapping = { 265 | 'sphinx': ('http://www.sphinx-doc.org/en/stable', None), 266 | 'python': ('https://docs.python.org/' + python_version, None), 267 | 'matplotlib': ('https://matplotlib.org', None), 268 | 'numpy': ('https://docs.scipy.org/doc/numpy', None), 269 | 'sklearn': ('http://scikit-learn.org/stable', None), 270 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 271 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 272 | } 273 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | pydeequ 3 | ======= 4 | 5 | This is the documentation of **pydeequ**. 6 | 7 | .. note:: 8 | 9 | This is the main page of your project's `Sphinx`_ documentation. 10 | It is formatted in `reStructuredText`_. Add additional pages 11 | by creating rst-files in ``docs`` and adding them to the `toctree`_ below. 12 | Use then `references`_ in order to link them from this page, e.g. 13 | :ref:`authors` and :ref:`changes`. 14 | 15 | It is also possible to refer to the documentation of other Python packages 16 | with the `Python domain syntax`_. By default you can reference the 17 | documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_, 18 | `Pandas`_, `Scikit-Learn`_. You can add more by extending the 19 | ``intersphinx_mapping`` in your Sphinx's ``conf.py``. 20 | 21 | The pretty useful extension `autodoc`_ is activated by default and lets 22 | you include documentation from docstrings. Docstrings can be written in 23 | `Google style`_ (recommended!), `NumPy style`_ and `classical style`_. 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | License 33 | Authors 34 | Changelog 35 | Module Reference 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | .. _toctree: http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html 46 | .. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html 47 | .. _references: http://www.sphinx-doc.org/en/stable/markup/inline.html 48 | .. _Python domain syntax: http://sphinx-doc.org/domains.html#the-python-domain 49 | .. _Sphinx: http://www.sphinx-doc.org/ 50 | .. _Python: http://docs.python.org/ 51 | .. _Numpy: http://docs.scipy.org/doc/numpy 52 | .. _SciPy: http://docs.scipy.org/doc/scipy/reference/ 53 | .. _matplotlib: https://matplotlib.org/contents.html# 54 | .. _Pandas: http://pandas.pydata.org/pandas-docs/stable 55 | .. _Scikit-Learn: http://scikit-learn.org/stable 56 | .. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html 57 | .. _Google style: https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings 58 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html 59 | .. 
_classical style: http://www.sphinx-doc.org/en/stable/domains.html#info-field-lists 60 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. _license: 2 | 3 | ======= 4 | License 5 | ======= 6 | 7 | .. include:: ../LICENSE.txt 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # DEPRECATION WARNING: 3 | # 4 | # The file `requirements.txt` does not influence the package dependencies and 5 | # will not be automatically created in the next version of PyScaffold (v4.x). 6 | # 7 | # Please have look at the docs for better alternatives 8 | # (`Dependency Management` section). 9 | # ============================================================================= 10 | # 11 | # Add your pinned requirements so that they can be easily installed with: 12 | # pip install -r requirements.txt 13 | # Remember to also add them in setup.cfg but unpinned. 14 | # Example: 15 | # numpy==1.13.3 16 | # scipy==1.0 17 | # 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = pydeequ 7 | description = Python API for Deequ 8 | author = Istvan Margitai 9 | author-email = margitai.i@gmail.com 10 | license = apache 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8 13 | url = https://github.com/margitaii/pydeequ 14 | #project-urls = 15 | # Documentation = https://pyscaffold.org/ 16 | # Change if running only on Windows, Mac or Linux (comma-separated) 17 | platforms = any 18 | # Add here all kinds of additional classifiers as defined under 19 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 20 | classifiers = 21 | Development Status :: 4 - Beta 22 | Programming Language :: Python 23 | 24 | [options] 25 | zip_safe = False 26 | packages = find: 27 | include_package_data = True 28 | package_dir = 29 | =src 30 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 31 | setup_requires = pyscaffold>=3.2a0,<3.3a0 32 | # Add here dependencies of your project (semicolon/line-separated), e.g. 33 | # install_requires = numpy; scipy 34 | # The usage of test_requires is discouraged, see `Dependency Management` docs 35 | # tests_require = pytest; pytest-cov 36 | # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 37 | # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* 38 | 39 | [options.packages.find] 40 | where = src 41 | exclude = 42 | tests 43 | 44 | [options.extras_require] 45 | # Add here additional requirements for extra features, to install with: 46 | # `pip install pydeequ[PDF]` like: 47 | # PDF = ReportLab; RXP 48 | # Add here test requirements (semicolon/line-separated) 49 | testing = 50 | pytest 51 | pytest-cov 52 | 53 | [options.entry_points] 54 | # Add here console scripts like: 55 | # console_scripts = 56 | # script_name = pydeequ.module:function 57 | # For example: 58 | # console_scripts = 59 | # fibonacci = pydeequ.skeleton:run 60 | # And any other entry points, for example: 61 | # pyscaffold.cli = 62 | # awesome = pyscaffoldext.awesome.extension:AwesomeExtension 63 | 64 | [test] 65 | # py.test options when running `python setup.py test` 66 | # addopts = --verbose 67 | extras = True 68 | 69 | [tool:pytest] 70 | # Options for py.test: 71 | # Specify command line options as you would do when invoking py.test directly. 72 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 73 | # in order to write a coverage file that can be read by Jenkins. 74 | addopts = 75 | --cov pydeequ --cov-report term-missing 76 | --verbose 77 | norecursedirs = 78 | dist 79 | build 80 | .tox 81 | testpaths = tests 82 | 83 | [aliases] 84 | dists = bdist_wheel 85 | 86 | [bdist_wheel] 87 | # Use this option if your package is pure-python 88 | universal = 1 89 | 90 | [build_sphinx] 91 | source_dir = docs 92 | build_dir = build/sphinx 93 | 94 | [devpi:upload] 95 | # Options for the devpi: PyPI server and packaging tool 96 | # VCS export must be deactivated since we are using setuptools-scm 97 | no-vcs = 1 98 | formats = bdist_wheel 99 | 100 | [flake8] 101 | # Some sane defaults for the code style checker flake8 102 | exclude = 103 | .tox 104 | build 105 | dist 106 | .eggs 107 | docs/conf.py 108 | 109 | [pyscaffold] 110 | # PyScaffold's parameters when the project was created. 111 | # This will be used when updating. Do not change! 112 | version = 3.2.3 113 | package = pydeequ 114 | extensions = 115 | tox 116 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Setup file for pydeequ. 4 | Use setup.cfg to configure your project. 5 | 6 | This file was generated with PyScaffold 3.2.3. 7 | PyScaffold helps you to put up the scaffold of your new Python project. 
8 | Learn more under: https://pyscaffold.org/ 9 | """ 10 | import sys 11 | import os 12 | 13 | from pkg_resources import VersionConflict, require 14 | from setuptools import setup 15 | 16 | try: 17 | require('setuptools>=38.3') 18 | except VersionConflict: 19 | print("Error: version of setuptools is too old (<38.3)!") 20 | sys.exit(1) 21 | 22 | def setup_package(): 23 | needs_sphinx = {'build_sphinx', 'upload_docs'}.intersection(sys.argv) 24 | sphinx = ['sphinx'] if needs_sphinx else [] 25 | 26 | setup(setup_requires=['six', 'pyscaffold>=2.5a0,<2.6a0'] + sphinx, 27 | use_pyscaffold=True) 28 | 29 | if __name__ == "__main__": 30 | setup_package() 31 | -------------------------------------------------------------------------------- /src/pydeequ/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pkg_resources import get_distribution, DistributionNotFound 3 | 4 | try: 5 | # Change here if project is renamed and does not equal the package name 6 | dist_name = __name__ 7 | __version__ = get_distribution(dist_name).version 8 | except DistributionNotFound: 9 | __version__ = 'unknown' 10 | finally: 11 | del get_distribution, DistributionNotFound 12 | -------------------------------------------------------------------------------- /src/pydeequ/analyzers.py: -------------------------------------------------------------------------------- 1 | import py4j.java_gateway as jg 2 | 3 | from pydeequ.exceptions import JavaClassNotFoundException 4 | import pydeequ.jvm_conversions as jc 5 | 6 | class BaseAnalyzer(object): 7 | """ 8 | Analyzer baseclass 9 | """ 10 | def set_jvm(self, jvm): 11 | self._jvm = jvm 12 | return self 13 | 14 | @property 15 | def jvmdeequAnalyzers(self): 16 | if (self._jvm): 17 | return self._jvm.com.amazon.deequ.analyzers 18 | else: 19 | raise ValueError("Run set_jvm() method first.") 20 | 21 | class ApproxCountDistinct(BaseAnalyzer): 22 | """ 23 | Compute approximated count distinct with HyperLogLogPlusPlus. 24 | 25 | @param column Which column to compute this aggregation on. 26 | """ 27 | 28 | def __init__(self, column): 29 | self.column = column 30 | 31 | @property 32 | def jvmAnalyzer(self): 33 | return self.jvmdeequAnalyzers.ApproxCountDistinct( 34 | self.column, 35 | getattr(self.jvmdeequAnalyzers.ApproxCountDistinct, "apply$default$2")() 36 | ) 37 | 38 | 39 | class ApproxQuantile(BaseAnalyzer): 40 | """ 41 | Approximate quantile analyzer. The allowed relative error compared to the exact quantile can be 42 | configured with `relativeError` parameter. A `relativeError` = 0.0 would yield the exact 43 | quantile while increasing the computational load. 44 | 45 | @param column Column in DataFrame for which the approximate quantile is analyzed. 46 | @param quantile Computed Quantile. Must be in the interval [0, 1], where 0.5 would be the 47 | median. 48 | @param relativeError Relative target precision to achieve in the quantile computation. 49 | Must be in the interval [0, 1]. 50 | @param where Additional filter to apply before the analyzer is run. 
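    Example (sketch): assumes an active SparkSession ``spark`` started with a
    matching deequ jar on its classpath, and a DataFrame ``df`` with a numeric
    "sales" column; both names are placeholders.

        from pydeequ.base import AnalysisRunner
        from pydeequ.analyzers import ApproxQuantile

        # approximate median of "sales" with the default 1% relative error
        context = (
            AnalysisRunner(spark)
            .onData(df)
            .addAnalyzer(ApproxQuantile("sales", quantile=0.5))
            .run()
        )
        context.successMetricsAsDataFrame().show()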
51 | """ 52 | 53 | def __init__(self, column, quantile, relativeError = 0.01): 54 | self.column = column 55 | self.quantile = quantile 56 | self.relativeError = relativeError 57 | 58 | @property 59 | def jvmAnalyzer(self): 60 | return self.jvmdeequAnalyzers.ApproxQuantile( 61 | self.column, 62 | self.quantile, 63 | self.relativeError, 64 | getattr(self.jvmdeequAnalyzers.ApproxQuantile, "apply$default$4")() 65 | ) 66 | 67 | class Completeness(BaseAnalyzer): 68 | """ 69 | Fraction of non-null values in a column. 70 | 71 | Args: 72 | column Column in DataFrame 73 | """ 74 | 75 | def __init__(self, column): 76 | self.column = column 77 | 78 | @property 79 | def jvmAnalyzer(self): 80 | return self.jvmdeequAnalyzers.Completeness( 81 | self.column, 82 | getattr(self.jvmdeequAnalyzers.Completeness, "apply$default$2")() 83 | ) 84 | 85 | class Compliance(BaseAnalyzer): 86 | """ 87 | Compliance is a measure of the fraction of rows that complies with the given column constraint. 88 | E.g if the constraint is "att1>3" and data frame has 5 rows with att1 column value greater than 89 | 3 and 10 rows under 3; a DoubleMetric would be returned with 0.33 value 90 | @param instance Unlike other column analyzers (e.g completeness) this analyzer can not 91 | infer to the metric instance name from column name. 92 | Also the constraint given here can be referring to multiple columns, 93 | so metric instance name should be provided, 94 | describing what the analysis being done for. 95 | @param predicate SQL-predicate to apply per row 96 | @param where Additional filter to apply before the analyzer is run. 97 | """ 98 | def __init__(self, instance, predicate): 99 | self.instance = instance 100 | self.predicate = predicate 101 | 102 | @property 103 | def jvmAnalyzer(self): 104 | return self.jvmdeequAnalyzers.Compliance( 105 | self.instance, 106 | self.predicate, 107 | getattr(self.jvmdeequAnalyzers.Compliance, "apply$default$3")() 108 | ) 109 | 110 | class Correlation(BaseAnalyzer): 111 | """ 112 | Computes the pearson correlation coefficient between the two given columns 113 | @param firstColumn First input column for computation 114 | @param secondColumn Second input column for computation 115 | """ 116 | def __init__(self, firstColumn, secondColumn): 117 | self.firstColumn = firstColumn 118 | self.secondColumn = secondColumn 119 | 120 | @property 121 | def jvmAnalyzer(self): 122 | return self.jvmdeequAnalyzers.Correlation( 123 | self.firstColumn, 124 | self.secondColumn, 125 | getattr(self.jvmdeequAnalyzers.Correlation, "apply$default$3")() 126 | ) 127 | 128 | class CountDistinct(BaseAnalyzer): 129 | """ 130 | Number of distinct values 131 | """ 132 | def __init__(self, column): 133 | if isinstance(column, str): 134 | self.column = [column] 135 | elif isinstance(column, list): 136 | self.column = column 137 | else: 138 | raise ValueError("'column' must be string or list of strings.") 139 | 140 | @property 141 | def jvmAnalyzer(self): 142 | return self.jvmdeequAnalyzers.CountDistinct( 143 | jc.iterable_to_scala_seq(self._jvm, self.column) 144 | ) 145 | 146 | class DataType(BaseAnalyzer): 147 | """ 148 | Distribution of data types such as Boolean, Fractional, Integral, and String. 
149 | """ 150 | def __init__(self, column): 151 | self.column = column 152 | 153 | @property 154 | def jvmAnalyzer(self): 155 | return self.jvmdeequAnalyzers.DataType( 156 | self.column, 157 | getattr(self.jvmdeequAnalyzers.DataType, "apply$default$2")() 158 | ) 159 | 160 | class Distinctness(BaseAnalyzer): 161 | """ 162 | Distinctness is the fraction of distinct values of a column(s). 163 | @param columns the column(s) for which to compute distinctness 164 | """ 165 | def __init__(self, columns): 166 | if isinstance(columns, str): 167 | self.columns = [columns] 168 | elif isinstance(columns, list): 169 | self.columns = columns 170 | else: 171 | raise ValueError("'columns' must be string or list of strings.") 172 | 173 | @property 174 | def jvmAnalyzer(self): 175 | return self.jvmdeequAnalyzers.Distinctness( 176 | jc.iterable_to_scala_seq(self._jvm, self.columns), 177 | getattr(self.jvmdeequAnalyzers.DataType, "apply$default$2")() 178 | ) 179 | 180 | class Entropy(BaseAnalyzer): 181 | """ 182 | Entropy is a measure of the level of information contained in a message. Given the probability 183 | distribution over values in a column, it describes how many bits are required to identify a 184 | value. 185 | """ 186 | def __init__(self, column): 187 | self.column = column 188 | 189 | @property 190 | def jvmAnalyzer(self): 191 | return self.jvmdeequAnalyzers.Entropy( 192 | self.column, 193 | getattr(self.jvmdeequAnalyzers.Entropy, "apply$default$2")() 194 | ) 195 | 196 | class Histogram(BaseAnalyzer): 197 | """ 198 | Histogram is the summary of values in a column of a DataFrame. Groups the given column's values, 199 | and calculates the number of rows with that specific value and the fraction of this value. 200 | 201 | @param column Column to do histogram analysis on 202 | """ 203 | def __init__(self, column): 204 | self.column = column 205 | 206 | @property 207 | def jvmAnalyzer(self): 208 | return self.jvmdeequAnalyzers.Histogram( 209 | self.column, 210 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$2")(), 211 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$3")(), 212 | getattr(self.jvmdeequAnalyzers.Histogram, "apply$default$4")() 213 | ) 214 | 215 | class Maximum(BaseAnalyzer): 216 | """ 217 | Maximum value. 218 | """ 219 | def __init__(self, column): 220 | self.column = column 221 | 222 | @property 223 | def jvmAnalyzer(self): 224 | return self.jvmdeequAnalyzers.Maximum( 225 | self.column, 226 | getattr(self.jvmdeequAnalyzers.Maximum, "apply$default$2")() 227 | ) 228 | 229 | class MaxLength(BaseAnalyzer): 230 | """ 231 | """ 232 | def __init__(self, column): 233 | self.column = column 234 | 235 | @property 236 | def jvmAnalyzer(self): 237 | return self.jvmdeequAnalyzers.MaxLength( 238 | self.column, 239 | getattr(self.jvmdeequAnalyzers.MaxLength, "apply$default$2")() 240 | ) 241 | 242 | class Mean(BaseAnalyzer): 243 | """ 244 | Mean value, null values are excluded. 245 | """ 246 | def __init__(self, column): 247 | self.column = column 248 | 249 | @property 250 | def jvmAnalyzer(self): 251 | return self.jvmdeequAnalyzers.Mean( 252 | self.column, 253 | getattr(self.jvmdeequAnalyzers.Mean, "apply$default$2")() 254 | ) 255 | 256 | class Minimum(BaseAnalyzer): 257 | """ 258 | Minimum value. 
259 | """ 260 | def __init__(self, column): 261 | self.column = column 262 | 263 | @property 264 | def jvmAnalyzer(self): 265 | return self.jvmdeequAnalyzers.Minimum( 266 | self.column, 267 | getattr(self.jvmdeequAnalyzers.Minimum, "apply$default$2")() 268 | ) 269 | 270 | class MinLength(BaseAnalyzer): 271 | """ 272 | """ 273 | def __init__(self, column): 274 | self.column = column 275 | 276 | @property 277 | def jvmAnalyzer(self): 278 | return self.jvmdeequAnalyzers.MinLength( 279 | self.column, 280 | getattr(self.jvmdeequAnalyzers.MinLength, "apply$default$2")() 281 | ) 282 | 283 | class MutualInformation(BaseAnalyzer): 284 | """ 285 | Mutual Information describes how much information about one column can be inferred from another 286 | column. 287 | 288 | If two columns are independent of each other, then nothing can be inferred from one column about 289 | the other, and mutual information is zero. If there is a functional dependency of one column to 290 | another and vice versa, then all information of the two columns are shared, and mutual 291 | information is the entropy of each column. 292 | """ 293 | def __init__(self, columns): 294 | if not isinstance(columns, list): 295 | raise ValueError("'columns' mus be a list of strings.") 296 | self.columns = columns 297 | 298 | @property 299 | def jvmAnalyzer(self): 300 | return self.jvmdeequAnalyzers.MutualInformation( 301 | jc.iterable_to_scala_seq(self._jvm, self.columns), 302 | getattr(self.jvmdeequAnalyzers.MutualInformation, "apply$default$2")() 303 | ) 304 | 305 | #class PattenMatch 306 | 307 | class Size(BaseAnalyzer): 308 | """ 309 | Size is the number of rows in a DataFrame. 310 | """ 311 | @property 312 | def jvmAnalyzer(self): 313 | return self.jvmdeequAnalyzers.Size( 314 | getattr(self.jvmdeequAnalyzers.Size, "apply$default$1")() 315 | ) 316 | 317 | class StandardDeviation(BaseAnalyzer): 318 | """ 319 | Standard deviation implementation. 320 | """ 321 | def __init__(self, column): 322 | self.column = column 323 | 324 | @property 325 | def jvmAnalyzer(self): 326 | return self.jvmdeequAnalyzers.StandardDeviation( 327 | self.column, 328 | getattr(self.jvmdeequAnalyzers.StandardDeviation, "apply$default$2")() 329 | ) 330 | 331 | class Sum(BaseAnalyzer): 332 | """ 333 | """ 334 | def __init__(self, column): 335 | self.column = column 336 | 337 | @property 338 | def jvmAnalyzer(self): 339 | return self.jvmdeequAnalyzers.Sum( 340 | self.column, 341 | getattr(self.jvmdeequAnalyzers.Sum, "apply$default$2")() 342 | ) 343 | 344 | class Uniqueness(BaseAnalyzer): 345 | """ 346 | Fraction of unique values over the number of all values of 347 | a column. Unique values occur exactly once. 348 | Example: [a, a, b] contains one unique value b, 349 | so uniqueness is 1/3. 350 | """ 351 | def __init__(self, columns): 352 | if not isinstance(columns, list): 353 | raise ValueError("'columns' mus be a list of strings.") 354 | self.columns = columns 355 | 356 | @property 357 | def jvmAnalyzer(self): 358 | return self.jvmdeequAnalyzers.Uniqueness( 359 | jc.iterable_to_scala_seq(self._jvm, self.columns), 360 | getattr(self.jvmdeequAnalyzers.Uniqueness, "apply$default$2")() 361 | ) 362 | 363 | class UniqueValueRatio(BaseAnalyzer): 364 | """ 365 | Fraction of unique values over the number of all distinct 366 | values of a column. Unique values occur exactly once. 367 | Distinct values occur at least once. 368 | Example: [a, a, b] contains one unique value b, 369 | and two distinct values a and b, so the unique value 370 | ratio is 1/2. 
371 | """ 372 | def __init__(self, columns): 373 | if not isinstance(columns, list): 374 | raise ValueError("'columns' mus be a list of strings.") 375 | self.columns = columns 376 | 377 | @property 378 | def jvmAnalyzer(self): 379 | return self.jvmdeequAnalyzers.UniqueValueRatio( 380 | jc.iterable_to_scala_seq(self._jvm, self.columns), 381 | getattr(self.jvmdeequAnalyzers.UniqueValueRatio, "apply$default$2")() 382 | ) 383 | 384 | -------------------------------------------------------------------------------- /src/pydeequ/base.py: -------------------------------------------------------------------------------- 1 | import py4j.java_gateway as jg 2 | 3 | from pyspark.sql import DataFrame 4 | 5 | from pydeequ.exceptions import JavaClassNotFoundException 6 | import pydeequ.jvm_conversions as jc 7 | 8 | class BaseWrapper(object): 9 | def __init__(self, SparkSession): 10 | self.spark = SparkSession 11 | 12 | @property 13 | def _jsparkSession(self): 14 | return self.spark._jsparkSession 15 | 16 | @property 17 | def _jvm(self): 18 | return self.spark.sparkContext._jvm 19 | 20 | @property 21 | def _gateway(self): 22 | return self.spark.sparkContext._gateway 23 | 24 | class BaseBuilder(BaseWrapper): 25 | def __init__(self, SparkSession, dataFrame): 26 | super().__init__(SparkSession) 27 | self._dataFrame = dataFrame 28 | 29 | @property 30 | def dataFrame(self): 31 | return self._dataFrame 32 | 33 | class VerificationRunBuilder(BaseBuilder): 34 | """ 35 | A class to build a VerificationRun using a fluent API. 36 | """ 37 | def __init__(self, SparkSession, dataFrame): 38 | """ 39 | Args: 40 | SparkSession (pyspark.sql.SparkSession) 41 | dataFrame (pyspark.sql.dataframe.DataFrame) 42 | """ 43 | super().__init__(SparkSession, dataFrame) 44 | run_builder = self._jvm.com.amazon.deequ.VerificationRunBuilder 45 | self.jvmVerificationRunBuilder = run_builder( 46 | self.dataFrame._jdf 47 | ) 48 | 49 | 50 | def addCheck(self, check): 51 | """ 52 | Add a single check to the run. 53 | 54 | Args: 55 | check (pydeequ.check.Check): 56 | A check object to be executed during the run 57 | """ 58 | jvmCheck = check.jvmCheck 59 | self.jvmVerificationRunBuilder.addCheck(jvmCheck) 60 | return self 61 | 62 | def run(self): 63 | result = self.jvmVerificationRunBuilder.run() 64 | 65 | jvmVerificationResult = self._jvm.com.amazon.deequ \ 66 | .VerificationResult 67 | try: 68 | df = jvmVerificationResult.checkResultsAsDataFrame( 69 | self._jsparkSession, 70 | result, 71 | getattr(jvmVerificationResult, 72 | "checkResultsAsDataFrame$default$3")() 73 | ) 74 | return df 75 | except Exception: 76 | self.spark.sparkContext._gateway.close() 77 | self.spark.stop() 78 | raise AttributeError 79 | 80 | def useRepository(self, metricsRepo): 81 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \ 82 | .useRepository( 83 | metricsRepo.jvmMetricsRepo 84 | ) 85 | return self 86 | 87 | def saveOrAppendResult(self, resultKey): 88 | self.jvmVerificationRunBuilder = self.jvmVerificationRunBuilder \ 89 | .saveOrAppendResult( 90 | resultKey.jvmResultKey 91 | ) 92 | return self 93 | 94 | class VerificationSuite(BaseWrapper): 95 | """ 96 | Responsible for running checks and required analysis and return the 97 | results. 
98 | """ 99 | def __init__(self, SparkSession): 100 | """ 101 | Args: 102 | SparkSession (): 103 | """ 104 | super().__init__(SparkSession) 105 | self._start_callback_server() 106 | 107 | def _start_callback_server(self): 108 | callback = self._gateway.get_callback_server() 109 | if callback is None: 110 | self._gateway.start_callback_server() 111 | elif callback.is_shutdown: 112 | callback.close() 113 | self._gateway.restart_callback_server() 114 | 115 | def onData(self, dataFrame): 116 | """ 117 | Starting point to construct a VerificationRun. 118 | 119 | Args: 120 | dataFrame (pyspark.sql.dataframe.DataFrame): 121 | spark dataFrame on which the checks will be verified. 122 | """ 123 | return VerificationRunBuilder(self.spark, dataFrame) 124 | 125 | class _AnalyzerContext(BaseWrapper): 126 | """ 127 | """ 128 | def __init__(self, SparkSession, jvmAnalyzerContext): 129 | """ Initializes the AnalyzerContext python object with a JVM object. 130 | 131 | Args: 132 | SparkSession (): 133 | jvmAnalyzerContext (JavaObject): 134 | """ 135 | super().__init__(SparkSession) 136 | self.jvmAnalyzerContext = jvmAnalyzerContext 137 | 138 | def successMetricsAsDataFrame(self): 139 | try: 140 | df = self.jvmAnalyzerContext.successMetricsAsDataFrame( 141 | self._jsparkSession, 142 | self.jvmAnalyzerContext, 143 | getattr(self.jvmAnalyzerContext, 144 | "successMetricsAsDataFrame$default$3")() 145 | ) 146 | out = DataFrame(df, self.spark) 147 | return out 148 | except Exception: 149 | self.spark.sparkContext._gateway.close() 150 | self.spark.stop() 151 | raise AttributeError 152 | 153 | def successMetricsAsJson(self): 154 | try: 155 | jf = self.jvmAnalyzerContext.successMetricsAsJson( 156 | self.jvmAnalyzerContext, 157 | getattr(self.jvmAnalyzerContext, 158 | "successMetricsAsJson$default$2")() 159 | ) 160 | 161 | return jf 162 | except Exception: 163 | self.spark.sparkContext._gateway.close() 164 | self.spark.stop() 165 | raise AttributeError 166 | 167 | class AnalysisRunBuilder(BaseBuilder): 168 | """ 169 | A class to build an AnalysisRun using a fluent API. 170 | """ 171 | def __init__(self, SparkSession, dataFrame): 172 | """ 173 | Args: 174 | SparkSession (pyspark.sql.SparkSession) 175 | dataFrame (pyspark.sql.dataframe.DataFrame) 176 | """ 177 | super().__init__(SparkSession, dataFrame) 178 | run_builder = self._jvm.com.amazon.deequ.analyzers.runners.AnalysisRunBuilder 179 | self.jvmAnalysisRunBuilder = run_builder( 180 | self.dataFrame._jdf 181 | ) 182 | 183 | def addAnalyzer(self, analyzer): 184 | """ 185 | Add a single analyzer to the run. 186 | 187 | Args: 188 | analyzer (pydeequ.analyzer.Analyzer): 189 | An analyzer object to be executed during the run 190 | """ 191 | analyzer.set_jvm(self._jvm) 192 | jvmAnalyzer = analyzer.jvmAnalyzer 193 | self.jvmAnalysisRunBuilder.addAnalyzer(jvmAnalyzer) 194 | return self 195 | 196 | def run(self): 197 | """ Returns an AnalyzerContext python object 198 | """ 199 | jvmContext = self.jvmAnalysisRunBuilder.run() 200 | return_context = _AnalyzerContext( 201 | self.spark, 202 | jvmContext) 203 | return return_context 204 | 205 | class AnalysisRunner(BaseWrapper): 206 | """ 207 | Responsible for running metrics calculations. 208 | """ 209 | def onData(self, dataFrame): 210 | """ 211 | Starting point to construct an Analysisrun. 212 | 213 | Args: 214 | dataFrame (pyspark.sql.dataframe.DataFrame): 215 | spark dataFrame on which the checks will be verified. 
216 | """ 217 | return AnalysisRunBuilder(self.spark, dataFrame) 218 | 219 | 220 | class ConstraintSuggestionRunBuilder(BaseBuilder): 221 | """ 222 | A class to build a ConstraintSuggestionRun using a fluent API. 223 | """ 224 | def __init__(self, SparkSession, dataFrame): 225 | """ 226 | Args: 227 | SparkSession (pyspark.sql.SparkSession) 228 | dataFrame (pyspark.sql.dataframe.DataFrame) 229 | """ 230 | super().__init__(SparkSession, dataFrame) 231 | run_builder = self._jvm.com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder 232 | self.jvmConstraintSuggestionRunBuilder = run_builder( 233 | self.dataFrame._jdf 234 | ) 235 | 236 | def addConstraintRule(self, constraint): 237 | """ 238 | Add a single rule for suggesting constraints based on ColumnProfiles to the run. 239 | 240 | Args: 241 | constraintRule 242 | """ 243 | jvmRule = constraint._jvmRule 244 | self.jvmConstraintSuggestionRunBuilder.addConstraintRule(jvmRule()) 245 | return self 246 | 247 | def run(self): 248 | result = self.jvmConstraintSuggestionRunBuilder.run() 249 | 250 | jvmSuggestionResult = self._jvm.com.amazon.deequ \ 251 | .suggestions.ConstraintSuggestionResult 252 | try: 253 | df = jvmSuggestionResult.getConstraintSuggestionsAsJson( 254 | result 255 | ) 256 | return df 257 | except: 258 | self.spark.sparkContext._gateway.close() 259 | self.spark.stop() 260 | raise AttributeError 261 | 262 | class ConstraintSuggestionRunner(BaseWrapper): 263 | """ 264 | """ 265 | def onData(self, dataFrame): 266 | """ 267 | Starting point to construct a run on constraint suggestions. 268 | 269 | Args: 270 | dataFrame (pyspark.sql.dataframe.DataFrame): 271 | spark dataFrame on which the checks will be verified. 272 | """ 273 | return ConstraintSuggestionRunBuilder(self.spark, dataFrame) 274 | -------------------------------------------------------------------------------- /src/pydeequ/checks.py: -------------------------------------------------------------------------------- 1 | 2 | import py4j.java_gateway as jg 3 | 4 | from pydeequ.exceptions import JavaClassNotFoundException 5 | import pydeequ.jvm_conversions as jc 6 | import pdb 7 | 8 | def is_one(x): 9 | """ Helper function for default asseritons. 10 | """ 11 | return x == 1 12 | 13 | class Check(object): 14 | """ 15 | A class representing a list of constraints that can be applied to a given 16 | [[org.apache.spark.sql.DataFrame]]. In order to run the checks, use the 17 | VerificationSuite.run to run your checks along with other Checks and 18 | Analysis objects. When run with VerificationSuite, Analyzers required by 19 | multiple checks/analysis blocks is optimized to run once. 20 | """ 21 | def __init__(self, SparkSession, level='error', description=None, 22 | jvmCheck=None): 23 | """ 24 | Args: 25 | sparkContext (pyspark.context.SparkContext): active SparkContext 26 | level (str): 'error' (default), 'warning' 27 | Assertion level of the check group. If any of the constraints 28 | fail this level is used for the status of the check 29 | description (str): The name describes the check block. 
Generally 30 | will be used to show in the logs 31 | """ 32 | self.spark = SparkSession 33 | self._level = level 34 | self._description = description 35 | if jvmCheck: 36 | self.jvmCheck = jvmCheck 37 | else: 38 | deequ_check = self._jvm.com.amazon.deequ.checks.Check 39 | if not isinstance(deequ_check, jg.JavaClass): 40 | raise JavaClassNotFoundException("com.amazon.deequ.checks.Check") 41 | self.jvmCheck = deequ_check( 42 | self._jvm_level, 43 | self._description, 44 | getattr(deequ_check, "apply$default$3")() 45 | ) 46 | 47 | @property 48 | def _jvm(self): 49 | return self.spark.sparkContext._jvm 50 | 51 | @property 52 | def level(self): 53 | return self._level 54 | 55 | @property 56 | def description(self): 57 | return self._description 58 | 59 | @property 60 | def _jvm_level(self): 61 | if self._level == 'error': 62 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Error() 63 | elif self._level == 'warning': 64 | return self._jvm.com.amazon.deequ.checks.CheckLevel.Warning() 65 | else: 66 | raise ValueError("Invalid 'level'") 67 | 68 | def hasSize(self, assertion): 69 | """ 70 | Creates a constraint that calculates the data frame size and runs the 71 | assertion on it. 72 | Args: 73 | assertion (function): 74 | Returns: 75 | checks.Check object including this constraint 76 | """ 77 | function = jc.scala_function1(self.spark.sparkContext._gateway, 78 | assertion) 79 | jvmConstraint = self.jvmCheck.hasSize( 80 | function, 81 | getattr(self.jvmCheck, "hasSize$default$2")() 82 | ) 83 | return Check( 84 | self.spark, 85 | self.level, 86 | self.description, 87 | jvmConstraint 88 | ) 89 | 90 | def isUnique(self, column): 91 | """ 92 | Creates a constraint that asserts on a column uniqueness. 93 | Args: 94 | column (str): Column to run the assertion on 95 | Returns: 96 | checks.Check object including this constraint 97 | """ 98 | jvmConstraint = self.jvmCheck.isUnique( 99 | column, 100 | getattr(self.jvmCheck, "isUnique$default$2")() 101 | ) 102 | return Check( 103 | self.spark, 104 | self.level, 105 | self.description, 106 | jvmConstraint 107 | ) 108 | 109 | def hasCompleteness(self, column, assertion): 110 | """ 111 | Creates a constraint that asserts on a column completion. 112 | Uses the given history selection strategy to retrieve historical completeness values on this 113 | column from the history provider. 114 | 115 | @param column Column to run the assertion on 116 | @param assertion Function that receives a double input parameter and returns a boolean 117 | @param hint A hint to provide additional context why a constraint could have failed 118 | """ 119 | function = jc.scala_function1(self.spark.sparkContext._gateway, 120 | assertion) 121 | jvmConstraint = self.jvmCheck.hasCompleteness( 122 | column, 123 | function, 124 | getattr(self.jvmCheck, "hasCompleteness$default$3")() 125 | ) 126 | return Check( 127 | self.spark, 128 | self.level, 129 | self.description, 130 | jvmConstraint 131 | ) 132 | 133 | def hasUniqueness(self, columns, assertion): 134 | """ 135 | Creates a constraint that asserts on uniqueness in a single or combined set of key columns. 136 | 137 | @param columns Key columns 138 | @param assertion Function that receives a double input parameter and returns a boolean. 
139 | Refers to the fraction of unique values 140 | @param hint A hint to provide additional context why a constraint could have failed 141 | """ 142 | if (not isinstance(columns, list)): 143 | # Single column is provided 144 | columns = [columns] 145 | function = jc.scala_function1(self.spark.sparkContext._gateway, 146 | assertion) 147 | jvmConstraint = self.jvmCheck.hasUniqueness( 148 | jc.iterable_to_scala_seq(self._jvm, columns), 149 | function 150 | ) 151 | return Check( 152 | self.spark, 153 | self.level, 154 | self.description, 155 | jvmConstraint 156 | ) 157 | 158 | def hasDistinctness(self, columns, assertion): 159 | """ 160 | Creates a constraint on the distinctness in a single or combined set of key columns. 161 | 162 | @param columns columns 163 | @param assertion Function that receives a double input parameter and returns a boolean. 164 | Refers to the fraction of distinct values. 165 | @param hint A hint to provide additional context why a constraint could have failed 166 | """ 167 | if (not isinstance(columns, list)): 168 | # Single column is provided 169 | columns = [columns] 170 | function = jc.scala_function1(self.spark.sparkContext._gateway, 171 | assertion) 172 | jvmConstraint = self.jvmCheck.hasDistinctness( 173 | jc.iterable_to_scala_seq(self._jvm, columns), 174 | function, 175 | getattr(self.jvmCheck, "hasDistinctness$default$3")() 176 | ) 177 | return Check( 178 | self.spark, 179 | self.level, 180 | self.description, 181 | jvmConstraint 182 | ) 183 | 184 | def hasUniqueValueRatio(self, columns, assertion): 185 | """ 186 | Creates a constraint on the unique value ratio in a single or combined set of key columns. 187 | 188 | @param columns columns 189 | @param assertion Function that receives a double input parameter and returns a boolean. 190 | Refers to the fraction of distinct values. 191 | @param hint A hint to provide additional context why a constraint could have failed 192 | """ 193 | if (not isinstance(columns, list)): 194 | # Single column is provided 195 | columns = [columns] 196 | function = jc.scala_function1(self.spark.sparkContext._gateway, 197 | assertion) 198 | jvmConstraint = self.jvmCheck.hasUniqueValueRatio( 199 | jc.iterable_to_scala_seq(self._jvm, columns), 200 | function, 201 | getattr(self.jvmCheck, "hasUniqueValueRatio$default$3")() 202 | ) 203 | return Check( 204 | self.spark, 205 | self.level, 206 | self.description, 207 | jvmConstraint 208 | ) 209 | 210 | def hasNumberOfDistinctValues(self, column, assertion, 211 | binningUdf = None, maxBins = None): 212 | """ 213 | Creates a constraint that asserts on the number of distinct values a column has. 214 | 215 | @param column Column to run the assertion on 216 | @param assertion Function that receives a long input parameter and returns a boolean 217 | @param binningUdf An optional binning function 218 | @param maxBins Histogram details is only provided for N column values with top counts. 
219 | maxBins sets the N 220 | @param hint A hint to provide additional context why a constraint could have failed 221 | """ 222 | function = jc.scala_function1(self.spark.sparkContext._gateway, 223 | assertion) 224 | jvmConstraint = self.jvmCheck.hasNumberOfDistinctValues( 225 | column, 226 | function, 227 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$3")(), 228 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$4")(), 229 | getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$5")() 230 | ) 231 | return Check( 232 | self.spark, 233 | self.level, 234 | self.description, 235 | jvmConstraint 236 | ) 237 | 238 | def hasHistogramValues(self, column, assertion, 239 | binningUdf = None, maxBins = None): 240 | """ 241 | Creates a constraint that asserts on column's value distribution. 242 | 243 | @param column Column to run the assertion on 244 | @param assertion Function that receives a Distribution input parameter and returns a boolean. 245 | E.g 246 | .hasHistogramValues("att2", _.absolutes("f") == 3) 247 | .hasHistogramValues("att2", 248 | _.ratios(Histogram.NullFieldReplacement) == 2/6.0) 249 | @param binningUdf An optional binning function 250 | @param maxBins Histogram details is only provided for N column values with top counts. 251 | maxBins sets the N 252 | @param hint A hint to provide additional context why a constraint could have failed 253 | """ 254 | function = jc.scala_function1(self.spark.sparkContext._gateway, 255 | assertion) 256 | jvmConstraint = self.jvmCheck.hasHistogramValues( 257 | column, 258 | function, 259 | getattr(self.jvmCheck, "hasHistogramValues$default$3")(), 260 | getattr(self.jvmCheck, "hasHistogramValues$default$4")(), 261 | getattr(self.jvmCheck, "hasHistogramValues$default$5")() 262 | ) 263 | return Check( 264 | self.spark, 265 | self.level, 266 | self.description, 267 | jvmConstraint 268 | ) 269 | 270 | def hasEntropy(self, column, assertion): 271 | """ 272 | Creates a constraint that asserts on a column entropy. 273 | 274 | @param column Column to run the assertion on 275 | @param assertion Function that receives a double input parameter and returns a boolean 276 | @param hint A hint to provide additional context why a constraint could have failed 277 | """ 278 | function = jc.scala_function1(self.spark.sparkContext._gateway, 279 | assertion) 280 | jvmConstraint = self.jvmCheck.hasEntropy( 281 | column, 282 | function, 283 | getattr(self.jvmCheck, "hasEntropy$default$3")() 284 | ) 285 | return Check( 286 | self.spark, 287 | self.level, 288 | self.description, 289 | jvmConstraint 290 | ) 291 | 292 | def hasMutualInformation(self, columnA, columnB, assertion): 293 | """ 294 | Creates a constraint that asserts on a mutual information between two columns. 
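        Example (illustrative sketch; assumes an active SparkSession `spark`, and the
        level, description and threshold are placeholders):

            check = (Check(spark, 'warning', 'dependency check')
                     .hasMutualInformation('_2', '_3', lambda mi: mi > 0.5))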
295 | 296 | @param columnA First column for mutual information calculation 297 | @param columnB Second column for mutual information calculation 298 | @param assertion Function that receives a double input parameter and returns a boolean 299 | @param hint A hint to provide additional context why a constraint could have failed 300 | """ 301 | function = jc.scala_function1(self.spark.sparkContext._gateway, 302 | assertion) 303 | jvmConstraint = self.jvmCheck.hasMutualInformation( 304 | columnA, 305 | columnB, 306 | function, 307 | getattr(self.jvmCheck, "hasMutualInformation$default$4")() 308 | ) 309 | return Check( 310 | self.spark, 311 | self.level, 312 | self.description, 313 | jvmConstraint 314 | ) 315 | 316 | def hasApproxQuantile(self, column, quantile, assertion): 317 | """ 318 | Creates a constraint that asserts on an approximated quantile 319 | 320 | @param column Column to run the assertion on 321 | @param quantile Which quantile to assert on 322 | @param assertion Function that receives a double input parameter (the computed quantile) 323 | and returns a boolean 324 | @param hint A hint to provide additional context why a constraint could have failed 325 | """ 326 | function = jc.scala_function1(self.spark.sparkContext._gateway, 327 | assertion) 328 | jvmConstraint = self.jvmCheck.hasApproxQuantile( 329 | column, 330 | quantile, 331 | function, 332 | getattr(self.jvmCheck, "hasApproxQuantile$default$4")() 333 | ) 334 | return Check( 335 | self.spark, 336 | self.level, 337 | self.description, 338 | jvmConstraint 339 | ) 340 | 341 | def hasMinLength(self, column, assertion): 342 | """ 343 | Creates a constraint that asserts on the minimum length of the column 344 | 345 | @param column Column to run the assertion on 346 | @param assertion Function that receives a double input parameter and returns a boolean 347 | @param hint A hint to provide additional context why a constraint could have failed 348 | """ 349 | function = jc.scala_function1(self.spark.sparkContext._gateway, 350 | assertion) 351 | jvmConstraint = self.jvmCheck.hasMinLength( 352 | column, 353 | function, 354 | getattr(self.jvmCheck, "hasMinLength$default$3")() 355 | ) 356 | return Check( 357 | self.spark, 358 | self.level, 359 | self.description, 360 | jvmConstraint 361 | ) 362 | 363 | 364 | def hasMaxLength(self, column, assertion): 365 | """ 366 | Creates a constraint that asserts on the maximum length of the column 367 | 368 | @param column Column to run the assertion on 369 | @param assertion Function that receives a double input parameter and returns a boolean 370 | @param hint A hint to provide additional context why a constraint could have failed 371 | """ 372 | function = jc.scala_function1(self.spark.sparkContext._gateway, 373 | assertion) 374 | jvmConstraint = self.jvmCheck.hasMaxLength( 375 | column, 376 | function, 377 | getattr(self.jvmCheck, "hasMaxLength$default$3")() 378 | ) 379 | return Check( 380 | self.spark, 381 | self.level, 382 | self.description, 383 | jvmConstraint 384 | ) 385 | 386 | def hasMin(self, column, assertion): 387 | """ 388 | Creates a constraint that asserts on the minimum of the column 389 | 390 | @param column Column to run the assertion on 391 | @param assertion Function that receives a double input parameter and returns a boolean 392 | @param hint A hint to provide additional context why a constraint could have failed 393 | """ 394 | function = jc.scala_function1(self.spark.sparkContext._gateway, 395 | assertion) 396 | jvmConstraint = self.jvmCheck.hasMin( 397 | column, 398 | 
function, 399 | getattr(self.jvmCheck, "hasMin$default$3")() 400 | ) 401 | return Check( 402 | self.spark, 403 | self.level, 404 | self.description, 405 | jvmConstraint 406 | ) 407 | 408 | def hasMax(self, column, assertion): 409 | """ 410 | Creates a constraint that asserts on the maximum of the column 411 | 412 | @param column Column to run the assertion on 413 | @param assertion Function that receives a double input parameter and returns a boolean 414 | @param hint A hint to provide additional context why a constraint could have failed 415 | """ 416 | function = jc.scala_function1(self.spark.sparkContext._gateway, 417 | assertion) 418 | jvmConstraint = self.jvmCheck.hasMax( 419 | column, 420 | function, 421 | getattr(self.jvmCheck, "hasMax$default$3")() 422 | ) 423 | return Check( 424 | self.spark, 425 | self.level, 426 | self.description, 427 | jvmConstraint 428 | ) 429 | 430 | def hasMean(self, column, assertion): 431 | """ 432 | Creates a constraint that asserts on the mean of the column 433 | 434 | @param column Column to run the assertion on 435 | @param assertion Function that receives a double input parameter and returns a boolean 436 | @param hint A hint to provide additional context why a constraint could have failed 437 | """ 438 | function = jc.scala_function1(self.spark.sparkContext._gateway, 439 | assertion) 440 | jvmConstraint = self.jvmCheck.hasMean( 441 | column, 442 | function, 443 | getattr(self.jvmCheck, "hasMean$default$3")() 444 | ) 445 | return Check( 446 | self.spark, 447 | self.level, 448 | self.description, 449 | jvmConstraint 450 | ) 451 | 452 | def hasSum(self, column, assertion): 453 | """ 454 | Creates a constraint that asserts on the sum of the column 455 | 456 | @param column Column to run the assertion on 457 | @param assertion Function that receives a double input parameter and returns a boolean 458 | @param hint A hint to provide additional context why a constraint could have failed 459 | """ 460 | function = jc.scala_function1(self.spark.sparkContext._gateway, 461 | assertion) 462 | jvmConstraint = self.jvmCheck.hasSum( 463 | column, 464 | function, 465 | getattr(self.jvmCheck, "hasSum$default$3")() 466 | ) 467 | return Check( 468 | self.spark, 469 | self.level, 470 | self.description, 471 | jvmConstraint 472 | ) 473 | def hasStandardDeviation(self, column, assertion): 474 | """ 475 | Creates a constraint that asserts on the standard deviation of the column 476 | 477 | @param column Column to run the assertion on 478 | @param assertion Function that receives a double input parameter and returns a boolean 479 | @param hint A hint to provide additional context why a constraint could have failed 480 | """ 481 | function = jc.scala_function1(self.spark.sparkContext._gateway, 482 | assertion) 483 | jvmConstraint = self.jvmCheck.hasStandardDeviation( 484 | column, 485 | function, 486 | getattr(self.jvmCheck, "hasStandardDeviation$default$3")() 487 | ) 488 | return Check( 489 | self.spark, 490 | self.level, 491 | self.description, 492 | jvmConstraint 493 | ) 494 | def hasApproxCountDistinct(self, column, assertion): 495 | """ 496 | Creates a constraint that asserts on the approximate count distinct of the given column 497 | 498 | @param column Column to run the assertion on 499 | @param assertion Function that receives a double input parameter and returns a boolean 500 | @param hint A hint to provide additional context why a constraint could have failed 501 | """ 502 | function = jc.scala_function1(self.spark.sparkContext._gateway, 503 | assertion) 504 | 
jvmConstraint = self.jvmCheck.hasApproxCountDistinct( 505 | column, 506 | function, 507 | getattr(self.jvmCheck, "hasApproxCountDistinct$default$3")() 508 | ) 509 | return Check( 510 | self.spark, 511 | self.level, 512 | self.description, 513 | jvmConstraint 514 | ) 515 | 516 | def hasCorrelation(self, columnA, columnB, assertion): 517 | """ 518 | Creates a constraint that asserts on the pearson correlation between two columns. 519 | 520 | @param columnA First column for correlation calculation 521 | @param columnB Second column for correlation calculation 522 | @param assertion Function that receives a double input parameter and returns a boolean 523 | @param hint A hint to provide additional context why a constraint could have failed 524 | """ 525 | function = jc.scala_function1(self.spark.sparkContext._gateway, 526 | assertion) 527 | jvmConstraint = self.jvmCheck.hasCorrelation( 528 | columnA, 529 | columnB, 530 | function, 531 | getattr(self.jvmCheck, "hasCorrelation$default$4")() 532 | ) 533 | return Check( 534 | self.spark, 535 | self.level, 536 | self.description, 537 | jvmConstraint 538 | ) 539 | 540 | def satisfies(self, columnCondition, constraintName, assertion): 541 | """ 542 | Creates a constraint that runs the given condition on the data frame. 543 | 544 | @param columnCondition Data frame column which is a combination of expression and the column 545 | name. It has to comply with Spark SQL syntax. 546 | Can be written in an exact same way with conditions inside the 547 | `WHERE` clause. 548 | @param constraintName A name that summarizes the check being made. This name is being used to 549 | name the metrics for the analysis being done. 550 | @param assertion Function that receives a double input parameter and returns a boolean 551 | @param hint A hint to provide additional context why a constraint could have failed 552 | """ 553 | function = jc.scala_function1(self.spark.sparkContext._gateway, 554 | assertion) 555 | jvmConstraint = self.jvmCheck.satisfies( 556 | columnCondition, 557 | constraintName, 558 | function, 559 | getattr(self.jvmCheck, "satisfies$default$4")() 560 | ) 561 | return Check( 562 | self.spark, 563 | self.level, 564 | self.description, 565 | jvmConstraint 566 | ) 567 | 568 | def hasPattern(self, column, pattern, assertion = is_one): 569 | """ 570 | Checks for pattern compliance. Given a column name and a regular expression, defines a 571 | Check on the average compliance of the column's values to the regular expression. 572 | 573 | @param column Name of the column that should be checked. 574 | @param pattern The columns values will be checked for a match against this pattern. 575 | @param assertion Function that receives a double input parameter and returns a boolean 576 | @param hint A hint to provide additional context why a constraint could have failed 577 | """ 578 | # function = jc.scala_function1(self.spark.sparkContext._gateway, 579 | # assertion) 580 | # pattern = jc.scala_regex(self.spark.sparkContext._gateway, pattern) 581 | # jvmConstraint = self.jvmCheck.hasPattern( 582 | # column, 583 | # pattern, 584 | # function, 585 | # getattr(self.jvmCheck, "hasPattern$default$4")(), 586 | # getattr(self.jvmCheck, "hasPattern$default$5")() 587 | # ) 588 | # return Check( 589 | # self.spark, 590 | # self.level, 591 | # self.description, 592 | # jvmConstraint 593 | # ) 594 | pass 595 | 596 | def hasDataType(self, column, dataType, assertion): 597 | """ 598 | Check to run against the fraction of rows that conform to the given data type. 
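        Supported `dataType` strings are 'null', 'boolean', 'string', 'numeric',
        'fractional' and 'integer'. Example (illustrative sketch; assumes an active
        SparkSession `spark`, and the level, description and assertion are
        placeholders):

            check = (Check(spark, 'error', 'type check')
                     .hasDataType('_1', 'string', lambda fraction: fraction == 1.0))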
599 | 600 | @param column Name of the columns that should be checked. 601 | @param dataType Data type that the columns should be compared against. 602 | @param assertion Function that receives a double input parameter and returns a boolean 603 | @param hint A hint to provide additional context why a constraint could have failed 604 | """ 605 | _jconstDataTypes = self._jvm.com.amazon.deequ.constraints.ConstrainableDataTypes 606 | dataTypes = { 607 | 'null': _jconstDataTypes.Null(), 608 | 'boolean': _jconstDataTypes.Boolean(), 609 | 'string': _jconstDataTypes.String(), 610 | 'numeric': _jconstDataTypes.Numeric(), 611 | 'fractional': _jconstDataTypes.Fractional(), 612 | 'integer': _jconstDataTypes.Integral() 613 | } 614 | function = jc.scala_function1(self.spark.sparkContext._gateway, 615 | assertion) 616 | jvmConstraint = self.jvmCheck.hasDataType( 617 | column, 618 | dataTypes[dataType], 619 | function, 620 | getattr(self.jvmCheck, "hasDataType$default$4")() 621 | ) 622 | return Check( 623 | self.spark, 624 | self.level, 625 | self.description, 626 | jvmConstraint 627 | ) 628 | 629 | def isPositive(self, column, assertion = is_one): 630 | """ 631 | Creates a constraint that asserts that a column contains positive values 632 | 633 | @param column Column to run the assertion on 634 | @param assertion Function that receives a double input parameter and returns a boolean 635 | @param hint A hint to provide additional context why a constraint could have failed 636 | """ 637 | function = jc.scala_function1(self.spark.sparkContext._gateway, 638 | assertion) 639 | jvmConstraint = self.jvmCheck.isPositive( 640 | column, 641 | function, 642 | getattr(self.jvmCheck, "isPositive$default$3")() 643 | ) 644 | return Check( 645 | self.spark, 646 | self.level, 647 | self.description, 648 | jvmConstraint 649 | ) 650 | 651 | 652 | def isNonNegative(self, column, assertion = is_one): 653 | """ 654 | Creates a constraint that asserts that a column contains no negative values 655 | 656 | @param column Column to run the assertion on 657 | @param assertion Function that receives a double input parameter and returns a boolean 658 | @param hint A hint to provide additional context why a constraint could have failed 659 | """ 660 | function = jc.scala_function1(self.spark.sparkContext._gateway, 661 | assertion) 662 | jvmConstraint = self.jvmCheck.isNonNegative( 663 | column, 664 | function, 665 | getattr(self.jvmCheck, "isNonNegative$default$3")() 666 | ) 667 | return Check( 668 | self.spark, 669 | self.level, 670 | self.description, 671 | jvmConstraint 672 | ) 673 | 674 | def isLessThan(self, columnA, columnB, assertion = is_one): 675 | """ 676 | Asserts that, in each row, the value of columnA is less than the value of columnB 677 | 678 | @param columnA Column to run the assertion on 679 | @param columnB Column to run the assertion on 680 | @param assertion Function that receives a double input parameter and returns a boolean 681 | @param hint A hint to provide additional context why a constraint could have failed 682 | """ 683 | function = jc.scala_function1(self.spark.sparkContext._gateway, 684 | assertion) 685 | jvmConstraint = self.jvmCheck.isLessThan( 686 | columnA, 687 | columnB, 688 | function, 689 | getattr(self.jvmCheck, "isLessThan$default$4")() 690 | ) 691 | return Check( 692 | self.spark, 693 | self.level, 694 | self.description, 695 | jvmConstraint 696 | ) 697 | 698 | def isLessThanOrEqualTo(self, columnA, columnB, assertion = is_one): 699 | """ 700 | Asserts that, in each row, the value of columnA is 
less than or equal to the value of columnB 701 | 702 | @param columnA Column to run the assertion on 703 | @param columnB Column to run the assertion on 704 | @param assertion Function that receives a double input parameter and returns a boolean 705 | @param hint A hint to provide additional context why a constraint could have failed 706 | """ 707 | function = jc.scala_function1(self.spark.sparkContext._gateway, 708 | assertion) 709 | jvmConstraint = self.jvmCheck.isLessThanOrEqualTo( 710 | columnA, 711 | columnB, 712 | function, 713 | getattr(self.jvmCheck, "isLessThanOrEqualTo$default$4")() 714 | ) 715 | return Check( 716 | self.spark, 717 | self.level, 718 | self.description, 719 | jvmConstraint 720 | ) 721 | 722 | def isGreaterThan(self, columnA, columnB, assertion = is_one): 723 | """ 724 | Asserts that, in each row, the value of columnA is greater than the value of columnB 725 | 726 | @param columnA Column to run the assertion on 727 | @param columnB Column to run the assertion on 728 | @param assertion Function that receives a double input parameter and returns a boolean 729 | @param hint A hint to provide additional context why a constraint could have failed 730 | """ 731 | function = jc.scala_function1(self.spark.sparkContext._gateway, 732 | assertion) 733 | jvmConstraint = self.jvmCheck.isGreaterThan( 734 | columnA, 735 | columnB, 736 | function, 737 | getattr(self.jvmCheck, "isGreaterThan$default$4")() 738 | ) 739 | return Check( 740 | self.spark, 741 | self.level, 742 | self.description, 743 | jvmConstraint 744 | ) 745 | 746 | def isGreaterThanOrEqualTo(self, columnA, columnB, assertion = is_one): 747 | """ 748 | Asserts that, in each row, the value of columnA is greather than or equal to the value of 749 | columnB 750 | 751 | @param columnA Column to run the assertion on 752 | @param columnB Column to run the assertion on 753 | @param assertion Function that receives a double input parameter and returns a boolean 754 | @param hint A hint to provide additional context why a constraint could have failed 755 | """ 756 | function = jc.scala_function1(self.spark.sparkContext._gateway, 757 | assertion) 758 | jvmConstraint = self.jvmCheck.isGreaterThanOrEqualTo( 759 | columnA, 760 | columnB, 761 | function, 762 | getattr(self.jvmCheck, "isGreaterThanOrEqualTo$default$4")() 763 | ) 764 | return Check( 765 | self.spark, 766 | self.level, 767 | self.description, 768 | jvmConstraint 769 | ) 770 | 771 | def isContainedIn(self, column, allowedValues, assertion = is_one): 772 | """ 773 | Asserts that every non-null value in a column is contained in a set of predefined values 774 | 775 | @param column Column to run the assertion on 776 | @param allowedValues Allowed values for the column 777 | @param assertion Function that receives a double input parameter and returns a boolean 778 | @param hint A hint to provide additional context why a constraint could have failed 779 | """ 780 | if (isinstance(allowedValues, list) == False): 781 | raise ValueError("'allowedValues' must be a list of strings.") 782 | function = jc.scala_function1(self.spark.sparkContext._gateway, 783 | assertion) 784 | scalaArray = jc.iterable_to_scala_array(self._jvm, allowedValues) 785 | jvmConstraint = self.jvmCheck.isContainedIn( 786 | column, 787 | scalaArray, 788 | function, 789 | getattr(self.jvmCheck, "isContainedIn$default$6")() 790 | ) 791 | return Check( 792 | self.spark, 793 | self.level, 794 | self.description, 795 | jvmConstraint 796 | ) 797 | 798 | def isInInterval(self, 799 | column, 800 | lowerBound, 801 | 
upperBound, 802 | includeLowerBound = True, 803 | includeUpperBound = True): 804 | """ 805 | Asserts that the non-null values in a numeric column fall into the predefined interval 806 | 807 | @param column column to run the assertion 808 | @param lowerBound lower bound of the interval 809 | @param upperBound upper bound of the interval 810 | @param includeLowerBound is a value equal to the lower bound allows? 811 | @param includeUpperBound is a value equal to the upper bound allowed? 812 | @param hint A hint to provide additional context why a constraint could have failed 813 | """ 814 | jvmConstraint = self.jvmCheck.isContainedIn( 815 | column, 816 | lowerBound, 817 | upperBound, 818 | includeLowerBound, 819 | includeUpperBound, 820 | getattr(self.jvmCheck, "isContainedIn$default$6")() 821 | ) 822 | return Check( 823 | self.spark, 824 | self.level, 825 | self.description, 826 | jvmConstraint 827 | ) 828 | -------------------------------------------------------------------------------- /src/pydeequ/examples/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | test_data = [("thingA", 13.0, "IN_TRANSIT", "true", 5.0), 3 | ("thingA", 5.0, "DELAYED", "false", 20.0), 4 | ("thingB", None, "DELAYED", None, 12.0), 5 | ("thingC", None, "IN_TRANSIT", "false", 2.0), 6 | ("thingD", 1.0, "DELAYED", "true", None), 7 | ("thingC", 7.0, "UNKNOWN", None, None), 8 | ("thingC", 20.0, "UNKNOWN", None, 3.5), 9 | ("thingE", 20.0, "DELAYED", "false", 8.2)] 10 | 11 | -------------------------------------------------------------------------------- /src/pydeequ/examples/analyzer_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession, DataFrame 2 | 3 | from pydeequ.base import AnalysisRunner 4 | import pydeequ.analyzers as analyzers 5 | from pydeequ.examples import test_data 6 | 7 | def main(): 8 | # SparkSession startup 9 | spark = (SparkSession 10 | .builder 11 | .master('local[*]') 12 | .config('spark.jars.packages', 13 | 'com.amazon.deequ:deequ:1.0.5') 14 | .appName('profiler-example') 15 | .getOrCreate()) 16 | df = spark.createDataFrame(test_data) 17 | 18 | r = AnalysisRunner(spark) \ 19 | .onData(df) \ 20 | .addAnalyzer(analyzers.Size()) \ 21 | .addAnalyzer(analyzers.Completeness('_3')) \ 22 | .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \ 23 | .addAnalyzer(analyzers.Mean('_2')) \ 24 | .addAnalyzer(analyzers.Compliance('top values', '_2 > 15')) \ 25 | .addAnalyzer(analyzers.Correlation('_2', '_5')) \ 26 | .run() 27 | 28 | df = DataFrame(r, spark) 29 | df.show(df.count(), False) 30 | 31 | # SparkSession and Java Gateway teardown 32 | spark.sparkContext._gateway.close() 33 | spark.stop() 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /src/pydeequ/examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pyspark.sql import SparkSession, DataFrame 4 | 5 | from pydeequ.base import VerificationSuite 6 | from pydeequ.checks import Check 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('constrain-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | 20 | # Constrain verification 21 | r = 
(VerificationSuite(spark) 22 | .onData(df) 23 | .addCheck(Check(spark, 'error', 'examples') 24 | .hasSize(lambda x: x == 8) 25 | .isUnique('_2') 26 | .hasCompleteness('_2', lambda x: x >= 0.75) 27 | .hasUniqueness('_1', lambda x: x == 3/8) 28 | .hasDistinctness('_1', lambda x: x == 5/8) 29 | .hasUniqueValueRatio('_2', lambda x: x == 0.8) 30 | .hasNumberOfDistinctValues('_2', lambda x: x == 6) 31 | #.hasHistogram 32 | .hasEntropy('_3', lambda x: x > 1) 33 | #.hasMutualInformation('_2', '_3', lambda x: x > 0.5) 34 | .hasApproxQuantile('_2', 0.5, lambda x: x == 7) 35 | .hasMinLength('_1', lambda x: x == 6) 36 | .hasMaxLength('_3', lambda x: x == 10) 37 | .hasMin('_2', lambda x: x == 1) 38 | .hasMax('_2', lambda x: x == 20) 39 | .hasMean('_2', lambda x: x > 10) 40 | .hasSum('_2', lambda x: x > 50) 41 | .hasStandardDeviation('_2', lambda x: x > 5) 42 | .hasApproxCountDistinct('_2', lambda x: x == 5) 43 | .hasCorrelation('_2', '_5', lambda x: x == 1) 44 | .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25) 45 | #.hasPattern("_1", "thing([A-Z])", lambda x: x == 1) 46 | #.hasDataType("_1", "string", lambda x: x == 1) 47 | .isPositive('_2') 48 | .isNonNegative('_2') 49 | .isLessThan('_5', '_2', lambda x: x == 0.375) 50 | .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375) 51 | .isGreaterThan('_5', '_2', lambda x: x == 0.125) 52 | .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125) 53 | #.isContainedIn('_3', ['DELAYED', 'INTRANSIT']) 54 | .isInInterval('_5', 1.0, 50.0) 55 | ) 56 | .run() 57 | ) 58 | df = DataFrame(r, spark) 59 | df.show(df.count(), False) 60 | 61 | # SparkSession and Java Gateway teardown 62 | spark.sparkContext._gateway.close() 63 | spark.stop() 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /src/pydeequ/examples/basic_usage2.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pyspark.sql import SparkSession, DataFrame 4 | 5 | from pydeequ.base import VerificationSuite 6 | from pydeequ.checks import Check 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('constrain-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | df.show() 20 | print(df._jdf.__doc__) 21 | 22 | #spark.stop() 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /src/pydeequ/examples/metrics_repo.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | from pydeequ.examples import test_data 4 | from pydeequ import AnalysisRunner, VerificationSuite 5 | import pydeequ.analyzers as analyzers 6 | from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository 7 | from pydeequ.checks import Check 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('suggestions-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | # Analysis run 20 | a = (AnalysisRunner(spark) 21 | .onData(df) 22 | .addAnalyzer(analyzers.Size())) \ 23 | .run() 24 | key = ResultKey(spark, 100000, {'key1': 'value1'}) 25 | myrepo = 
FileSystemMetricsRepository(spark, '../test.json') 26 | myrepo.save(key, a) 27 | 28 | # Verification run 29 | key2 = repo.ResultKey(spark, 100000, {'key1': 'value2', 'key2':'value3'}) 30 | 31 | 32 | v = (base.VerificationSuite(spark) 33 | .onData(df) 34 | .addCheck(Check(spark, 'error', 'examples') 35 | .hasSize(lambda x: x == 8) 36 | .isUnique('_2')) 37 | .useRepository(myrepo) 38 | .saveOrAppendResult(key2) 39 | .run() 40 | ) 41 | 42 | myrepo.load().withTagValues({'key1': 'value1'}).after(99000) \ 43 | .getMetricsAsDF().show() 44 | 45 | # SparkSession and Java Gateway teardown 46 | spark.sparkContext._gateway.close() 47 | spark.stop() 48 | 49 | if __name__ == "__main__": 50 | main() -------------------------------------------------------------------------------- /src/pydeequ/examples/profiler_example.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | import json 4 | from pyspark.sql import SparkSession, DataFrame 5 | 6 | from pydeequ.profiler import ColumnProfilerRunner 7 | from pydeequ.examples import test_data 8 | 9 | def main(): 10 | # SparkSession startup 11 | spark = (SparkSession 12 | .builder 13 | .master('local[*]') 14 | .config('spark.jars.packages', 15 | 'com.amazon.deequ:deequ:1.0.5') 16 | .appName('profiler-example') 17 | .getOrCreate()) 18 | df = spark.createDataFrame(test_data) 19 | 20 | # Constrain verification 21 | r = (ColumnProfilerRunner() 22 | .onData(df) 23 | .run()) 24 | 25 | parsed = json.loads(r) 26 | print(json.dumps(parsed, indent = 4)) 27 | 28 | # SparkSession and Java Gateway teardown 29 | spark.sparkContext._gateway.close() 30 | spark.stop() 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /src/pydeequ/examples/suggestions_example.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash python3 2 | 3 | import json 4 | from pyspark.sql import SparkSession, DataFrame 5 | 6 | from pydeequ.base import ConstraintSuggestionRunner 7 | from pydeequ.suggestions import Rules 8 | from pydeequ.examples import test_data 9 | 10 | def main(): 11 | # SparkSession startup 12 | spark = (SparkSession 13 | .builder 14 | .master('local[*]') 15 | .config('spark.jars.packages', 16 | 'com.amazon.deequ:deequ:1.0.5') 17 | .appName('suggestions-example') 18 | .getOrCreate()) 19 | df = spark.createDataFrame(test_data) 20 | 21 | # Constrain verification 22 | r = (ConstraintSuggestionRunner(spark) 23 | .onData(df) 24 | .addConstraintRule(Rules.CategoricalRangeRule(spark)) 25 | .run()) 26 | 27 | parsed = json.loads(r) 28 | print(json.dumps(parsed, indent = 4)) 29 | 30 | # SparkSession and Java Gateway teardown 31 | spark.sparkContext._gateway.close() 32 | spark.stop() 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /src/pydeequ/exceptions.py: -------------------------------------------------------------------------------- 1 | class JavaClassNotFoundException(Exception): 2 | """ 3 | Raise if required Java class is not found by py4j 4 | """ 5 | 6 | def __init__(self, java_class): 7 | Exception.__init__(self) 8 | self.java_class = java_class 9 | 10 | def __str__(self): 11 | return "%s. Did you forget to add the jar to the class path?" 
% ( 12 | self.java_class 13 | ) 14 | 15 | def __repr__(self): 16 | return "%s: %s" % (self.__class__.__name__, self.java_class) 17 | -------------------------------------------------------------------------------- /src/pydeequ/jvm_conversions.py: -------------------------------------------------------------------------------- 1 | def iterable_to_scala_list(jvm, iterable): 2 | return jvm.scala.collection.JavaConversions.\ 3 | iterableAsScalaIterable(iterable).\ 4 | toList() 5 | 6 | def iterable_to_scala_set(jvm, iterable): 7 | return jvm.scala.collection.JavaConversions.\ 8 | iterableAsScalaIterable(iterable).\ 9 | toSet() 10 | 11 | def iterable_to_scala_seq(jvm, iterable): 12 | return jvm.scala.collection.JavaConversions.\ 13 | iterableAsScalaIterable(iterable).\ 14 | toSeq() 15 | 16 | def simple_date_format(jvm, s): 17 | return jvm.java.text.SimpleDateFormat(s) 18 | 19 | def tuple2(jvm, t): 20 | return jvm.scala.Tuple2(*t) 21 | 22 | def option(jvm, java_obj): 23 | return jvm.scala.Option.apply(java_obj) 24 | 25 | def scala_none(jvm): 26 | return getattr(getattr(jvm.scala, "None$"), "MODULE$") 27 | 28 | def dict_to_scala_map(jvm, keyvaluepairs): 29 | return jvm.scala.collection.JavaConverters.\ 30 | mapAsScalaMapConverter(keyvaluepairs).\ 31 | asScala().toMap(jvm.scala.Predef.conforms()) 32 | 33 | class scala_function1: 34 | def __init__(self, gateway, lambda_function): 35 | self.gateway = gateway 36 | self.lambda_function = lambda_function 37 | 38 | def apply(self, arg): 39 | return self.lambda_function(arg) 40 | 41 | class Java: 42 | implements = ["scala.Function1"] 43 | 44 | -------------------------------------------------------------------------------- /src/pydeequ/metricsrepo.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame 2 | 3 | from pydeequ.base import BaseWrapper 4 | import pydeequ.jvm_conversions as jc 5 | 6 | class ResultKey(BaseWrapper): 7 | """ Unique identifier of Analysis result. 8 | """ 9 | def __init__(self, SparkSession, dataSetDate, tags): 10 | """ 11 | :param double dataSetDate: A date related to the Analysis result 12 | :param dict tags: Key-value store of tags 13 | """ 14 | super().__init__(SparkSession) 15 | self.dataSetDate = dataSetDate 16 | self.tags = tags 17 | result_key = self._jvm.com.amazon.deequ.repository.ResultKey 18 | self.jvmResultKey = result_key( 19 | self.dataSetDate, 20 | jc.dict_to_scala_map(self._jvm, self.tags) 21 | ) 22 | 23 | class FileSystemMetricsRepository(BaseWrapper): 24 | """ FS based repository class 25 | """ 26 | def __init__(self, SparkSession, path): 27 | super().__init__(SparkSession) 28 | self.path = path 29 | fs_repo = self._jvm.com.amazon.deequ.repository.fs.\ 30 | FileSystemMetricsRepository 31 | self.jvmMetricsRepo = fs_repo( 32 | self._jsparkSession, 33 | self.path 34 | ) 35 | 36 | def save(self, resultKey, analyserContext): 37 | """ Save Analysis results (metrics). 
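        Example (illustrative sketch, following src/pydeequ/examples/metrics_repo.py;
        assumes an active SparkSession `spark` with the deequ jar on the classpath
        and a DataFrame `df`; the repository path, timestamp and tags are
        placeholders):

            from pydeequ.base import AnalysisRunner
            import pydeequ.analyzers as analyzers
            from pydeequ.metricsrepo import ResultKey, FileSystemMetricsRepository

            context = (AnalysisRunner(spark)
                       .onData(df)
                       .addAnalyzer(analyzers.Size())
                       .run())
            repo = FileSystemMetricsRepository(spark, '/tmp/metrics.json')
            key = ResultKey(spark, 100000, {'dataset': 'example'})
            repo.save(key, context)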
38 | 39 | :param ResultKey resultKey: unique identifier of Analysis results 40 | :param AnalyzerContext analyserContext: 41 | """ 42 | return self.jvmMetricsRepo.save( 43 | resultKey.jvmResultKey, 44 | analyserContext.jvmAnalyzerContext 45 | ) 46 | 47 | def load(self): 48 | """ Get a builder class to construct a loading query to get 49 | analysis results 50 | """ 51 | return FSRepoResultsLoader(self.spark, self.path) 52 | 53 | class FSRepoResultsLoader(BaseWrapper): 54 | def __init__(self, SparkSession, path): 55 | super().__init__(SparkSession) 56 | self.path = path 57 | fs_repo_loader = self._jvm.com.amazon.deequ.repository.fs.\ 58 | FileSystemMetricsRepositoryMultipleResultsLoader 59 | self.jvmFSMetricsRepoLoader = fs_repo_loader( 60 | self._jsparkSession, 61 | self.path 62 | ) 63 | 64 | def withTagValues(self, tagValues): 65 | self.tagValues = tagValues 66 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 67 | .withTagValues( 68 | jc.dict_to_scala_map(self._jvm, tagValues) 69 | ) 70 | return self 71 | 72 | def before(self, dateTime): 73 | self.before = dateTime 74 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 75 | .before( 76 | dateTime 77 | ) 78 | return self 79 | 80 | def after(self, dateTime): 81 | self.after = dateTime 82 | self.jvmFSMetricsRepoLoader = self.jvmFSMetricsRepoLoader \ 83 | .after( 84 | dateTime 85 | ) 86 | return self 87 | 88 | def getMetricsAsDF(self): 89 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsDataFrame 90 | df = jvmGetter( 91 | self._jsparkSession, 92 | getattr(self.jvmFSMetricsRepoLoader, 93 | "getSuccessMetricsAsDataFrame$default$2")() 94 | ) 95 | return DataFrame(df, self.spark) 96 | 97 | def getMetricsAsJson(self): 98 | jvmGetter = self.jvmFSMetricsRepoLoader.getSuccessMetricsAsJson 99 | jf = jvmGetter( 100 | getattr(self.jvmFSMetricsRepoLoader, 101 | "getSuccessMetricsAsJson$default$1")() 102 | ) 103 | return jf 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/pydeequ/profiler.py: -------------------------------------------------------------------------------- 1 | from pydeequ.exceptions import JavaClassNotFoundException 2 | import pydeequ.jvm_conversions as jc 3 | import pdb 4 | 5 | class ColumnProfilerRunBuilder: 6 | """ 7 | Builds profiling runner. 8 | """ 9 | def __init__(self, dataFrame): 10 | """ 11 | Args: 12 | dataFrame (pyspark.sql.dataframe.DataFrame): 13 | """ 14 | self._sc = dataFrame._sc 15 | self._dataFrame = dataFrame 16 | run_builder = self._jvm.com.amazon.deequ \ 17 | .profiles.ColumnProfilerRunBuilder 18 | self.jvmColumnProfilerRunBuilder = run_builder( 19 | self._dataFrame._jdf 20 | ) 21 | 22 | @property 23 | def _jvm(self): 24 | return self._sc._jvm 25 | 26 | @property 27 | def dataFrame(self): 28 | return self._dataFrame 29 | 30 | def run(self): 31 | result = self.jvmColumnProfilerRunBuilder.run() 32 | 33 | seqColumnProfiles = result.profiles().values().toSeq() 34 | jf = result.toJson( 35 | seqColumnProfiles 36 | ) 37 | 38 | return jf 39 | 40 | class ColumnProfilerRunner(): 41 | """ 42 | Responsible for running data profiling. 43 | """ 44 | def onData(self, dataFrame): 45 | """ 46 | Starting point to construct a profiling runner. 
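        Example (illustrative sketch, mirroring src/pydeequ/examples/profiler_example.py;
        assumes a DataFrame `df` created from a SparkSession with the deequ jar on
        the classpath):

            import json
            from pydeequ.profiler import ColumnProfilerRunner

            profile_json = ColumnProfilerRunner().onData(df).run()
            print(json.dumps(json.loads(profile_json), indent=4))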
47 | 
48 |         Args:
49 |             dataFrame (pyspark.sql.dataframe.DataFrame): the DataFrame to profile.
50 |         """
51 |         return ColumnProfilerRunBuilder(dataFrame)
52 | 
53 | 
--------------------------------------------------------------------------------
/src/pydeequ/suggestions.py:
--------------------------------------------------------------------------------
1 | import py4j.java_gateway as jg
2 | 
3 | from pydeequ.exceptions import JavaClassNotFoundException
4 | import pydeequ.jvm_conversions as jc
5 | 
6 | 
7 | class Rules:
8 |     """
9 |     Constraint suggestion rules, wrapping com.amazon.deequ.suggestions.rules.
10 |     """
11 | 
12 |     def __init__(self, spark, _jvmRule):
13 |         self.spark = spark
14 |         self._jvmRule = _jvmRule
15 | 
16 |     @property
17 |     def _jvm(self):
18 |         return self.spark.sparkContext._jvm
19 | 
20 |     @classmethod
21 |     def CompleteIfCompleteRule(cls, spark):
22 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CompleteIfCompleteRule
23 |         return cls(spark, _jvmRule)
24 | 
25 |     @classmethod
26 |     def RetainCompletenessRule(cls, spark):
27 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainCompletenessRule
28 |         return cls(spark, _jvmRule)
29 | 
30 |     @classmethod
31 |     def RetainTypeRule(cls, spark):
32 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.RetainTypeRule
33 |         return cls(spark, _jvmRule)
34 | 
35 |     @classmethod
36 |     def CategoricalRangeRule(cls, spark):
37 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.CategoricalRangeRule
38 |         return cls(spark, _jvmRule)
39 | 
40 |     @classmethod
41 |     def FractionalCategoricalRangeRule(cls, spark):
42 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule
43 |         return cls(spark, _jvmRule)
44 | 
45 |     @classmethod
46 |     def NonNegativeNumbersRule(cls, spark):
47 |         _jvmRule = spark.sparkContext._jvm.com.amazon.deequ.suggestions.rules.NonNegativeNumbersRule
48 |         return cls(spark, _jvmRule)
--------------------------------------------------------------------------------
/tests/integration/test_analyzers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql import DataFrame, Row
5 | 
6 | from pydeequ.base import AnalysisRunner
7 | from pydeequ.examples import test_data
8 | from pydeequ import analyzers
9 | 
10 | class AnalysisRunnerTest(unittest.TestCase):
11 | 
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         cls.spark = (SparkSession
15 |                      .builder
16 |                      .master('local[*]')
17 |                      .config('spark.jars.packages',
18 |                              'com.amazon.deequ:deequ:1.0.5')
19 |                      .appName('pytest-pyspark-local-testing')
20 |                      .getOrCreate())
21 |         cls.df = cls.spark.createDataFrame(test_data)
22 |         cls.runner = AnalysisRunner(cls.spark)
23 | 
24 |     @classmethod
25 |     def tearDownClass(cls):
26 |         cls.spark.sparkContext._gateway.close()
27 |         cls.spark.stop()
28 | 
29 |     def test_ApproxCountDistinct(self):
30 |         out = self.runner.onData(self.df) \
31 |             .addAnalyzer(analyzers.ApproxCountDistinct('_1')) \
32 |             .run().successMetricsAsDataFrame()
33 |         out = out.select('value').collect()
34 |         self.assertEqual(out, [Row(value=5.0)])
35 | 
36 |     def test_ApproxQuantile(self):
37 |         out = self.runner.onData(self.df) \
38 |             .addAnalyzer(analyzers.ApproxQuantile('_2', 0.75)) \
39 |             .run().successMetricsAsDataFrame()
40 |         out = out.select('value').collect()
41 |         self.assertEqual(out, [Row(value=20)])
42 | 
43 |     def test_Completeness(self):
44 |         out = self.runner.onData(self.df) \
45 |             .addAnalyzer(analyzers.Completeness('_2')) \
46 |             .run().successMetricsAsDataFrame()
47 |         out = out.select('value').collect()
48 |         self.assertEqual(out, [Row(value=0.75)])
49 | 
50 |     def test_Compliance(self):
51 |         out = self.runner.onData(self.df) \
52 |             .addAnalyzer(analyzers.Compliance('top _2', '_2 > 15')) \
53 |             .run().successMetricsAsDataFrame()
54 |         out = out.select('value').collect()
55 |         self.assertEqual(out, [Row(value=0.25)])
56 | 
57 |     def test_Correlation(self):
58 |         out = self.runner.onData(self.df) \
59 |             .addAnalyzer(analyzers.Correlation('_2', '_5')) \
60 |             .run().successMetricsAsDataFrame()
61 |         out = out.select('value').collect()
62 |         self.assertLess(out, [Row(value=-0.8)])
63 | 
64 |     def test_CountDistinct(self):
65 |         out = self.runner.onData(self.df) \
66 |             .addAnalyzer(analyzers.CountDistinct('_3')) \
67 |             .run().successMetricsAsDataFrame()
68 |         out = out.select('value').collect()
69 |         self.assertEqual(out, [Row(value=3)])
70 | 
71 |     def test_DataType(self):
72 |         out = self.runner.onData(self.df) \
73 |             .addAnalyzer(analyzers.DataType('_3')) \
74 |             .run().successMetricsAsDataFrame()
75 |         out = out.select('value').collect()
76 |         self.assertEqual(out, [Row(value=5.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=0.0), Row(value=8.0), Row(value=1.0)])
77 | 
78 |     def test_Distinctness(self):
79 |         out = self.runner.onData(self.df) \
80 |             .addAnalyzer(analyzers.Distinctness('_3')) \
81 |             .run().successMetricsAsDataFrame()
82 |         out = out.select('value').collect()
83 |         self.assertEqual(out, [Row(value=0.375)])
84 | 
85 |     def test_Entropy(self):
86 |         out = self.runner.onData(self.df) \
87 |             .addAnalyzer(analyzers.Entropy('_3')) \
88 |             .run().successMetricsAsDataFrame()
89 |         out = out.select('value').collect()
90 |         self.assertGreater(out, [Row(value=1)])
91 | 
92 |     def test_Histogram(self):
93 |         out = self.runner.onData(self.df) \
94 |             .addAnalyzer(analyzers.Histogram('_3')) \
95 |             .run().successMetricsAsDataFrame()
96 |         out = out.select('value').collect()
97 |         self.assertEqual(out, [Row(value=3.0), Row(value=4.0), Row(value=0.5), Row(value=2.0), Row(value=0.25), Row(value=2.0), Row(value=0.25)])
98 | 
99 |     def test_Maximum(self):
100 |         out = self.runner.onData(self.df) \
101 |             .addAnalyzer(analyzers.Maximum('_2')) \
102 |             .run().successMetricsAsDataFrame()
103 |         out = out.select('value').collect()
104 |         self.assertEqual(out, [Row(value=20)])
105 | 
106 |     def test_MaxLength(self):
107 |         out = self.runner.onData(self.df) \
108 |             .addAnalyzer(analyzers.MaxLength('_1')) \
109 |             .run().successMetricsAsDataFrame()
110 |         out = out.select('value').collect()
111 |         self.assertEqual(out, [Row(value=6)])
112 | 
113 |     def test_Mean(self):
114 |         out = self.runner.onData(self.df) \
115 |             .addAnalyzer(analyzers.Mean('_2')) \
116 |             .run().successMetricsAsDataFrame()
117 |         out = out.select('value').collect()
118 |         self.assertEqual(out, [Row(value=11)])
119 | 
120 |     def test_Minimum(self):
121 |         out = self.runner.onData(self.df) \
122 |             .addAnalyzer(analyzers.Minimum('_2')) \
123 |             .run().successMetricsAsDataFrame()
124 |         out = out.select('value').collect()
125 |         self.assertEqual(out, [Row(value=1)])
126 | 
127 |     def test_MinLength(self):
128 |         out = self.runner.onData(self.df) \
129 |             .addAnalyzer(analyzers.MinLength('_1')) \
130 |             .run().successMetricsAsDataFrame()
131 |         out = out.select('value').collect()
132 |         self.assertEqual(out, [Row(value=6)])
133 | 
134 |     def test_MutualInformation(self):
135 |         out = self.runner.onData(self.df) \
136 |             .addAnalyzer(analyzers.MutualInformation(['_1', '_3'])) \
137 |             .run().successMetricsAsDataFrame()
138 |         out = out.select('value').collect()
139 |         self.assertGreater(out, [Row(value=0.5)])
140 | 
141 |     def test_Size(self):
142 |         out = self.runner.onData(self.df) \
143 |             .addAnalyzer(analyzers.Size()) \
144 |             .run().successMetricsAsDataFrame()
145 |         out = out.select('value').collect()
146 |         self.assertEqual(out, [Row(value=8)])
147 | 
148 |     def test_StandardDeviation(self):
149 |         out = self.runner.onData(self.df) \
150 |             .addAnalyzer(analyzers.StandardDeviation('_2')) \
151 |             .run().successMetricsAsDataFrame()
152 |         out = out.select('value').collect()
153 |         self.assertGreater(out, [Row(value=7)])
154 | 
155 |     def test_Sum(self):
156 |         out = self.runner.onData(self.df) \
157 |             .addAnalyzer(analyzers.Sum('_2')) \
158 |             .run().successMetricsAsDataFrame()
159 |         out = out.select('value').collect()
160 |         self.assertGreater(out, [Row(value=10)])
161 | 
162 |     def test_Uniqueness(self):
163 |         out = self.runner.onData(self.df) \
164 |             .addAnalyzer(analyzers.Uniqueness(['_1'])) \
165 |             .run().successMetricsAsDataFrame()
166 |         out = out.select('value').collect()
167 |         self.assertEqual(out, [Row(value=0.375)])
168 | 
169 |     def test_UniqueValueRatio(self):
170 |         out = self.runner.onData(self.df) \
171 |             .addAnalyzer(analyzers.UniqueValueRatio(['_1'])) \
172 |             .run().successMetricsAsDataFrame()
173 |         out = out.select('value').collect()
174 |         self.assertEqual(out, [Row(value=0.6)])
175 | 
176 | if __name__ == '__main__':
177 |     unittest.main()
178 | 
--------------------------------------------------------------------------------
/tests/integration/test_constraints.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession, DataFrame, Row
4 | 
5 | from pydeequ.base import VerificationSuite
6 | from pydeequ.checks import Check
7 | from pydeequ.examples import test_data
8 | 
9 | class ConstraintTest(unittest.TestCase):
10 | 
11 |     @classmethod
12 |     def setUpClass(cls):
13 |         cls.spark = (SparkSession
14 |                      .builder
15 |                      .master('local[*]')
16 |                      .config('spark.jars.packages',
17 |                              'com.amazon.deequ:deequ:1.0.5')
18 |                      .appName('pytest-pyspark-local-testing')
19 |                      .getOrCreate())
20 |         cls.df = cls.spark.createDataFrame(test_data)
21 |         cls.suite = VerificationSuite(cls.spark)
22 |         cls.success = Row(constraint_status='Success')
23 |         cls.failure = Row(constraint_status='Failure')
24 | 
25 |     @classmethod
26 |     def tearDownClass(cls):
27 |         cls.spark.sparkContext._gateway.close()
28 |         cls.spark.stop()
29 | 
30 |     def test_hasSize(self):
31 |         chk = Check(self.spark) \
32 |             .hasSize(lambda x: x == 8)
33 |         out = self.suite.onData(self.df).addCheck(chk).run()
34 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
35 |         self.assertEqual(out, [self.success])
36 | 
37 |     def test_isUnique(self):
38 |         chk = Check(self.spark) \
39 |             .isUnique('_1')
40 |         out = self.suite.onData(self.df).addCheck(chk).run()
41 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
42 |         self.assertEqual(out, [self.failure])
43 | 
44 |     def test_hasCompleteness(self):
45 |         chk = Check(self.spark) \
46 |             .hasCompleteness('_2', lambda x: x >= 0.75)
47 |         out = self.suite.onData(self.df).addCheck(chk).run()
48 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
49 |         self.assertEqual(out, [self.success])
50 | 
51 |     def test_hasUniqueness(self):
52 |         chk = Check(self.spark) \
53 |             .hasUniqueness('_1', lambda x: x == 3/8)
54 |         out = self.suite.onData(self.df).addCheck(chk).run()
55 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
56 |         self.assertEqual(out, [self.success])
57 | 
58 |     def test_hasDistinctness(self):
59 |         chk = Check(self.spark) \
60 |             .hasDistinctness('_1', lambda x: x == 5/8)
61 |         out = self.suite.onData(self.df).addCheck(chk).run()
62 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
63 |         self.assertEqual(out, [self.success])
64 | 
65 |     def test_hasUniqueValueRatio(self):
66 |         chk = Check(self.spark) \
67 |             .hasUniqueValueRatio('_2', lambda x: x == 0.8)
68 |         out = self.suite.onData(self.df).addCheck(chk).run()
69 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
70 |         self.assertEqual(out, [self.success])
71 | 
72 |     def test_hasNumberOfDistinctValues(self):
73 |         chk = Check(self.spark) \
74 |             .hasNumberOfDistinctValues('_2', lambda x: x == 6)
75 |         out = self.suite.onData(self.df).addCheck(chk).run()
76 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
77 |         self.assertEqual(out, [self.success])
78 | 
79 |     # .hasHistogram
80 | 
81 |     def test_hasEntropy(self):
82 |         chk = Check(self.spark) \
83 |             .hasEntropy('_3', lambda x: x > 1)
84 |         out = self.suite.onData(self.df).addCheck(chk).run()
85 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
86 |         self.assertEqual(out, [self.success])
87 | 
88 |     # .hasMutualInformation
89 | 
90 |     def test_hasApproxQuantile(self):
91 |         chk = Check(self.spark) \
92 |             .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
93 |         out = self.suite.onData(self.df).addCheck(chk).run()
94 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
95 |         self.assertEqual(out, [self.success])
96 | 
97 |     def test_hasMinLength(self):
98 |         chk = Check(self.spark) \
99 |             .hasMinLength('_1', lambda x: x == 6)
100 |         out = self.suite.onData(self.df).addCheck(chk).run()
101 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
102 |         self.assertEqual(out, [self.success])
103 | 
104 |     def test_hasMaxLength(self):
105 |         chk = Check(self.spark) \
106 |             .hasMaxLength('_3', lambda x: x == 10)
107 |         out = self.suite.onData(self.df).addCheck(chk).run()
108 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
109 |         self.assertEqual(out, [self.success])
110 | 
111 |     def test_hasMin(self):
112 |         chk = Check(self.spark) \
113 |             .hasMin('_2', lambda x: x == 1)
114 |         out = self.suite.onData(self.df).addCheck(chk).run()
115 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
116 |         self.assertEqual(out, [self.success])
117 | 
118 |     def test_hasMax(self):
119 |         chk = Check(self.spark) \
120 |             .hasMax('_2', lambda x: x == 20)
121 |         out = self.suite.onData(self.df).addCheck(chk).run()
122 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
123 |         self.assertEqual(out, [self.success])
124 | 
125 |     def test_hasMean(self):
126 |         chk = Check(self.spark) \
127 |             .hasMean('_2', lambda x: x > 10)
128 |         out = self.suite.onData(self.df).addCheck(chk).run()
129 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
130 |         self.assertEqual(out, [self.success])
131 | 
132 |     def test_hasSum(self):
133 |         chk = Check(self.spark) \
134 |             .hasSum('_2', lambda x: x > 50)
135 |         out = self.suite.onData(self.df).addCheck(chk).run()
136 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
137 |         self.assertEqual(out, [self.success])
138 | 
139 |     def test_hasStandardDeviation(self):
140 |         chk = Check(self.spark) \
141 |             .hasStandardDeviation('_2', lambda x: x > 5)
142 |         out = self.suite.onData(self.df).addCheck(chk).run()
143 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
144 |         self.assertEqual(out, [self.success])
145 | 
146 |     def test_hasApproxCountDistinct(self):
147 |         chk = Check(self.spark) \
148 |             .hasApproxCountDistinct('_2', lambda x: x == 5)
149 |         out = self.suite.onData(self.df).addCheck(chk).run()
150 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
151 |         self.assertEqual(out, [self.success])
152 | 
153 |     def test_hasCorrelation(self):
154 |         chk = Check(self.spark) \
155 |             .hasCorrelation('_2', '_2', lambda x: x == 1)
156 |         out = self.suite.onData(self.df).addCheck(chk).run()
157 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
158 |         self.assertEqual(out, [self.success])
159 | 
160 |     def test_satisfies(self):
161 |         chk = Check(self.spark) \
162 |             .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
163 |         out = self.suite.onData(self.df).addCheck(chk).run()
164 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
165 |         self.assertEqual(out, [self.success])
166 | 
167 | 
168 |     # .hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
169 |     # .hasDataType("_1", "string", lambda x: x == 1)
170 | 
171 |     def test_isPositive(self):
172 |         chk = Check(self.spark) \
173 |             .isPositive('_2')
174 |         out = self.suite.onData(self.df).addCheck(chk).run()
175 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
176 |         self.assertEqual(out, [self.success])
177 | 
178 |     def test_isNonNegative(self):
179 |         chk = Check(self.spark) \
180 |             .isNonNegative('_2')
181 |         out = self.suite.onData(self.df).addCheck(chk).run()
182 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
183 |         self.assertEqual(out, [self.success])
184 | 
185 |     def test_isLessThan(self):
186 |         chk = Check(self.spark) \
187 |             .isLessThan('_5', '_2', lambda x: x == 0.375)
188 |         out = self.suite.onData(self.df).addCheck(chk).run()
189 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
190 |         self.assertEqual(out, [self.success])
191 | 
192 |     def test_isLessThanOrEqualTo(self):
193 |         chk = Check(self.spark) \
194 |             .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
195 |         out = self.suite.onData(self.df).addCheck(chk).run()
196 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
197 |         self.assertEqual(out, [self.success])
198 | 
199 |     def test_isGreaterThan(self):
200 |         chk = Check(self.spark) \
201 |             .isGreaterThan('_5', '_2', lambda x: x == 0.125)
202 |         out = self.suite.onData(self.df).addCheck(chk).run()
203 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
204 |         self.assertEqual(out, [self.success])
205 | 
206 |     def test_isGreaterThanOrEqualTo(self):
207 |         chk = Check(self.spark) \
208 |             .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
209 |         out = self.suite.onData(self.df).addCheck(chk).run()
210 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
211 |         self.assertEqual(out, [self.success])
212 | 
213 |     # .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
214 | 
215 |     def test_isInInterval(self):
216 |         chk = Check(self.spark) \
217 |             .isInInterval('_5', 1.0, 50.0)
218 |         out = self.suite.onData(self.df).addCheck(chk).run()
219 |         out = DataFrame(out, self.spark).select('constraint_status').collect()
220 |         self.assertEqual(out, [self.success])
221 | 
222 | if __name__ == '__main__':
223 |     unittest.main()
224 | 
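The constraint tests above all follow one pattern: build a Check with a single constraint and an assertion lambda, attach it to a VerificationSuite run, and wrap the returned JVM result in a PySpark DataFrame to read constraint_status. The standalone sketch below restates that pattern outside the unittest harness; the toy rows, column names, and threshold are illustrative stand-ins (the tests themselves use pydeequ.examples.test_data), not part of the repository.

# Minimal usage sketch of the Check / VerificationSuite API exercised above.
# Assumes only what the tests assume: pydeequ importable and the deequ 1.0.5
# jar resolvable through spark.jars.packages.
from pyspark.sql import SparkSession, DataFrame, Row

from pydeequ.base import VerificationSuite
from pydeequ.checks import Check

spark = (SparkSession
         .builder
         .master('local[*]')
         .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
         .appName('pydeequ-check-sketch')
         .getOrCreate())

# Hypothetical toy data standing in for pydeequ.examples.test_data.
df = spark.createDataFrame([
    Row(_1='thingA', _2=13.0, _3='IN_TRANSIT'),
    Row(_1='thingB', _2=5.0, _3='DELAYED'),
])

# One constraint per Check, exactly as in the tests.
chk = Check(spark).hasCompleteness('_2', lambda x: x >= 0.75)

# run() hands back a JVM object; wrapping it in a PySpark DataFrame exposes
# the constraint_status column the tests assert on.
out = VerificationSuite(spark).onData(df).addCheck(chk).run()
DataFrame(out, spark).select('constraint_status').show()

spark.stop()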
--------------------------------------------------------------------------------
/tests/integration/test_runners.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from pyspark.sql import SparkSession
4 | 
5 | from pydeequ.base import VerificationSuite, AnalysisRunner, ConstraintSuggestionRunner
6 | from pydeequ.profiler import ColumnProfilerRunner
7 | from pydeequ.examples import test_data
8 | 
9 | class VerificationTest(unittest.TestCase):
10 | 
11 |     @classmethod
12 |     def setUpClass(cls):
13 |         cls.spark = (SparkSession
14 |                      .builder
15 |                      .master('local[*]')
16 |                      .config('spark.jars.packages',
17 |                              'com.amazon.deequ:deequ:1.0.5')
18 |                      .appName('pytest-pyspark-local-testing')
19 |                      .getOrCreate())
20 |         cls.df = cls.spark.createDataFrame(test_data)
21 | 
22 |     @classmethod
23 |     def tearDownClass(cls):
24 |         cls.spark.sparkContext._gateway.close()
25 |         cls.spark.stop()
26 | 
27 |     def test_VerificationSuiteArgs(self):
28 |         suiterunner = VerificationSuite(self.spark).onData(self.df)
29 |         # check dataframe prop
30 |         self.assertEqual(suiterunner.dataFrame.columns,
31 |                          ['_1', '_2', '_3', '_4', '_5']
32 |                          )
33 |         # check _jsparkSession prop
34 |         self.assertEqual(suiterunner._jsparkSession.getClass().toString(),
35 |                          'class org.apache.spark.sql.SparkSession'
36 |                          )
37 |         # check _jvm prop
38 |         self.assertEqual(suiterunner._jvm,
39 |                          self.spark.sparkContext._jvm
40 |                          )
41 |         # check jvmVerificationRunBuilder
42 |         self.assertEqual(suiterunner.jvmVerificationRunBuilder.getClass().toString(),
43 |                          "class com.amazon.deequ.VerificationRunBuilder"
44 |                          )
45 | 
46 |     def test_AnalyzerRunnerArgs(self):
47 |         runner = AnalysisRunner(self.spark).onData(self.df)
48 |         # check dataframe prop
49 |         self.assertEqual(runner.dataFrame.columns,
50 |                          ['_1', '_2', '_3', '_4', '_5']
51 |                          )
52 |         # check _jsparkSession prop
53 |         self.assertEqual(runner._jsparkSession.getClass().toString(),
54 |                          'class org.apache.spark.sql.SparkSession'
55 |                          )
56 |         # check _jvm prop
57 |         self.assertEqual(runner._jvm,
58 |                          self.spark.sparkContext._jvm
59 |                          )
60 |         # check jvmAnalysisRunBuilder
61 |         self.assertEqual(runner.jvmAnalysisRunBuilder.getClass().toString(),
62 |                          "class com.amazon.deequ.analyzers.runners.AnalysisRunBuilder"
63 |                          )
64 | 
65 |     def test_ProfilerRunnerArgs(self):
66 |         profilerrunner = ColumnProfilerRunner().onData(self.df)
67 |         # check dataframe prop
68 |         self.assertEqual(profilerrunner.dataFrame.columns,
69 |                          ['_1', '_2', '_3', '_4', '_5']
70 |                          )
71 |         # check _jvm prop
72 |         self.assertEqual(profilerrunner._jvm,
73 |                          self.spark.sparkContext._jvm
74 |                          )
75 |         # check jvmColumnProfilerRunBuilder
76 |         self.assertEqual(profilerrunner.jvmColumnProfilerRunBuilder.getClass().toString(),
77 |                          "class com.amazon.deequ.profiles.ColumnProfilerRunBuilder"
78 |                          )
79 | 
80 |     def test_SuggestionRunnerArgs(self):
81 |         suggestionrunner = ConstraintSuggestionRunner(self.spark).onData(self.df)
82 |         # check dataframe prop
83 |         self.assertEqual(suggestionrunner.dataFrame.columns,
84 |                          ['_1', '_2', '_3', '_4', '_5']
85 |                          )
86 |         # check _jvm prop
87 |         self.assertEqual(suggestionrunner._jvm,
88 |                          self.spark.sparkContext._jvm
89 |                          )
90 |         # check jvmConstraintSuggestionRunBuilder
91 |         self.assertEqual(suggestionrunner.jvmConstraintSuggestionRunBuilder.getClass().toString(),
92 |                          "class com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder"
93 |                          )
94 | 
95 | if __name__ == '__main__':
96 |     unittest.main()
97 | 
98 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox configuration file
2 | # Read more under https://tox.readthedocs.org/
3 | # THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
4 | 
5 | [tox]
6 | minversion = 2.4
7 | envlist = default
8 | 
9 | [testenv]
10 | setenv = TOXINIDIR = {toxinidir}
11 | passenv =
12 |     HOME
13 | commands =
14 |     py.test {posargs}
15 | extras =
16 |     all
17 |     testing
18 | 
--------------------------------------------------------------------------------
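On the metrics side, the analyzer tests reduce to the same fluent shape: AnalysisRunner(spark).onData(df), one or more addAnalyzer calls, then run().successMetricsAsDataFrame(). The sketch below mirrors that; the toy rows are illustrative, and chaining two addAnalyzer calls assumes the builder returns itself (the tests above only ever chain one analyzer before run(), so treat that as an assumption rather than documented behaviour).

# Analyzer usage sketch; same Spark and deequ setup assumptions as the tests.
from pyspark.sql import SparkSession, Row

from pydeequ.base import AnalysisRunner
from pydeequ import analyzers

spark = (SparkSession
         .builder
         .master('local[*]')
         .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
         .appName('pydeequ-analyzer-sketch')
         .getOrCreate())

# Hypothetical toy data standing in for pydeequ.examples.test_data.
df = spark.createDataFrame([
    Row(_1='thingA', _2=13.0),
    Row(_1='thingB', _2=None),
    Row(_1='thingC', _2=20.0),
])

metrics = (AnalysisRunner(spark)
           .onData(df)
           .addAnalyzer(analyzers.Size())               # row count
           .addAnalyzer(analyzers.Completeness('_2'))   # fraction of non-null _2 values
           .run()
           .successMetricsAsDataFrame())

metrics.show(truncate=False)
spark.stop()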