├── docs ├── _static │ ├── .keep │ ├── example.sqlite3 │ ├── data_prep_poll.png │ ├── development_build.jpg │ ├── reference_data_example.zip │ ├── failure_message_example.zip │ ├── example.csv │ ├── tutorial │ │ ├── estimated_totals.csv │ │ ├── test_movies_df.py │ │ ├── movies.csv │ │ ├── test_movies_df_unit.py │ │ ├── test_country_of_birth.py │ │ ├── test_country_of_birth_unit.py │ │ ├── modified_test_country_of_birth.py │ │ ├── modified_test_country_of_birth_unit.py │ │ ├── test_intro1.py │ │ ├── test_intro2.py │ │ ├── modified_country_of_birth.csv │ │ ├── country_of_birth.csv │ │ ├── test_intro1_unit.py │ │ └── test_intro2_unit.py │ ├── test_users.py │ ├── test_users_unit.py │ ├── excel_autoformat.csv │ ├── theme_overrides.css │ ├── test_validation.py │ ├── mydata.csv │ ├── users.csv │ └── test_errors.py ├── _build │ └── .gitignore ├── _templates │ └── layout.html ├── discussion │ ├── terminology.rst │ ├── project-history.rst │ ├── index.rst │ ├── validate-vs-accept.rst │ ├── organizing-tests.rst │ └── data-preparation.rst ├── requirements.txt ├── how-to │ ├── install.rst │ ├── run-tests.rst │ ├── index.rst │ ├── negative-matches.rst │ ├── reorder-acceptances.rst │ ├── get-started.rst │ ├── date-time-str.rst │ ├── fuzzy-matching.rst │ ├── sequences.rst │ ├── phone-numbers.rst │ ├── excel-auto-formatting.rst │ └── customize-differences.rst ├── intro │ └── index.rst ├── reference │ ├── index.rst │ └── unittest-support.rst ├── _ext │ └── autodoc_classinstance.py ├── index.rst └── tutorial │ └── testing-pandas.rst ├── tests ├── __init__.py ├── sample_files │ ├── sample_text_utf8.csv │ ├── sample_excel1997.xls │ ├── sample_excel2007.xlsx │ ├── sample_dbase.dbf │ ├── sample_text_iso88591.csv │ ├── test_sources_excel.xlsx │ └── sample_multiworksheet.xlsx ├── _io.py ├── past_api07_sources_base.py ├── past_api09.py ├── _contextlib.py ├── past_api07_sources_excel.py ├── test_past_subprocesses.py ├── past_api07_error.py ├── common.py ├── test_pandas_integration.py ├── 
past_api07_sources_pandas.py ├── test_runner.py ├── test_utils_misc.py ├── past_api00.py ├── past_api07_sources_sqlite.py └── past_api09_load_csv.py ├── setup.cfg ├── datatest ├── _vendor │ └── __init__.py ├── _compatibility │ ├── __init__.py │ ├── itertools.py │ ├── abc.py │ ├── statistics.py │ ├── textwrap.py │ ├── contextlib.py │ ├── collections │ │ └── abc.py │ ├── builtins.py │ ├── functools.py │ └── decimal.py ├── __past__ │ ├── api_dev0.py │ ├── api_dev1.py │ ├── api_dev2.py │ ├── api010.py │ ├── squint │ │ └── __init__.py │ ├── __init__.py │ ├── api07_error.py │ ├── api09.py │ ├── api00.py │ ├── load_csv.py │ └── api06.py ├── __main__.py ├── __init__.py ├── _excepthook.py ├── _working_directory.py ├── main.py └── _normalize.py ├── MANIFEST.in ├── requirements-dev.txt ├── AUTHORS ├── .readthedocs.yml ├── LICENSE ├── .travis.yml ├── .gitignore ├── run-tests.sh ├── run-tests.bat └── release-checklist.rst /docs/_static/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /datatest/_vendor/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /datatest/_compatibility/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- 
/tests/sample_files/sample_text_utf8.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | utf8,α 3 | -------------------------------------------------------------------------------- /docs/_build/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /docs/_static/example.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/example.sqlite3 -------------------------------------------------------------------------------- /docs/_static/data_prep_poll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/data_prep_poll.png -------------------------------------------------------------------------------- /datatest/__past__/api_dev0.py: -------------------------------------------------------------------------------- 1 | """alias for api00""" 2 | from __future__ import absolute_import 3 | from .api00 import * 4 | -------------------------------------------------------------------------------- /datatest/__past__/api_dev1.py: -------------------------------------------------------------------------------- 1 | """alias for api06""" 2 | from __future__ import absolute_import 3 | from .api06 import * 4 | -------------------------------------------------------------------------------- /datatest/__past__/api_dev2.py: -------------------------------------------------------------------------------- 1 | """alias for api07""" 2 | from __future__ import absolute_import 3 | from .api07 import * 4 | -------------------------------------------------------------------------------- /docs/_static/development_build.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/development_build.jpg -------------------------------------------------------------------------------- /docs/_static/reference_data_example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/reference_data_example.zip -------------------------------------------------------------------------------- /tests/sample_files/sample_excel1997.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_excel1997.xls -------------------------------------------------------------------------------- /docs/_static/failure_message_example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/failure_message_example.zip -------------------------------------------------------------------------------- /tests/sample_files/sample_excel2007.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_excel2007.xlsx -------------------------------------------------------------------------------- /docs/_static/example.csv: -------------------------------------------------------------------------------- 1 | "A","B","C" 2 | "x","foo",20 3 | "x","foo",30 4 | "y","foo",10 5 | "y","bar",20 6 | "z","bar",10 7 | "z","bar",10 8 | -------------------------------------------------------------------------------- /tests/sample_files/sample_dbase.dbf: -------------------------------------------------------------------------------- 1 | aCOL1CCOL2N dBASE1 
-------------------------------------------------------------------------------- /tests/sample_files/sample_text_iso88591.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_text_iso88591.csv -------------------------------------------------------------------------------- /tests/sample_files/test_sources_excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/test_sources_excel.xlsx -------------------------------------------------------------------------------- /tests/sample_files/sample_multiworksheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_multiworksheet.xlsx -------------------------------------------------------------------------------- /datatest/__past__/api010.py: -------------------------------------------------------------------------------- 1 | """Backward compatibility for version 0.10 API.""" 2 | from __future__ import absolute_import 3 | 4 | # This is a stub for future use. 
5 | 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include AUTHORS 3 | include LICENSE 4 | include requirements.txt 5 | recursive-include datatest *.py 6 | include tests *.py 7 | include tests/sample_files *.* 8 | -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {# 4 | {% block menu %} 5 | {{ super() }} 6 | Package Index 7 | {% endblock %} 8 | #} 9 | -------------------------------------------------------------------------------- /docs/_static/tutorial/estimated_totals.csv: -------------------------------------------------------------------------------- 1 | state/territory,population 2 | Australian Capital Territory,389785 3 | Jervis Bay Territory,388 4 | New South Wales,7507350 5 | Northern Territory,226412 6 | Queensland,4721503 7 | South Australia,1637325 8 | Tasmania,514245 9 | Victoria,5849330 10 | Western Australia,2451380 11 | -------------------------------------------------------------------------------- /datatest/__past__/squint/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """squint: simple query interface for tabular data 3 | 4 | PYTEST_DONT_REWRITE 5 | """ 6 | from __future__ import absolute_import 7 | 8 | from .query import BaseElement 9 | from .query import Select 10 | from .query import Query 11 | from .query import Result 12 | -------------------------------------------------------------------------------- /docs/discussion/terminology.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. currentmodule:: datatest 4 | 5 | .. 
meta:: 6 | :description: A discussion about the language and vocabulary used in datatest. 7 | :keywords: data, validation, quality, glossary, terms 8 | 9 | 10 | #################### 11 | Notes on Terminology 12 | #################### 13 | 14 | -------------------------------------------------------------------------------- /docs/discussion/project-history.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. currentmodule:: datatest 4 | 5 | .. meta:: 6 | :description: A brief discussion on the history and origins of datatest at the NCEC. 7 | :keywords: datatest, history, NCEC, National Committee for an Effective Congress 8 | 9 | 10 | ################ 11 | Datatest History 12 | ################ 13 | 14 | -------------------------------------------------------------------------------- /datatest/_compatibility/itertools.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for itertools (Python standard library)""" 2 | from __future__ import absolute_import 3 | from itertools import * 4 | 5 | try: 6 | filterfalse # New in Python 3. 7 | except NameError: 8 | filterfalse = ifilterfalse 9 | 10 | 11 | try: 12 | zip_longest 13 | except NameError: 14 | zip_longest = izip_longest 15 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # ============================== 2 | # Requirements for Read The Docs 3 | # ============================== 4 | # 5 | # The following requirements are additional dependencies that 6 | # https://readthedocs.io needs to install so it can properly 7 | # generate the documentation for datatest. 
8 | 9 | sphinx>=2.1.0 10 | sphinx-tabs 11 | sphinx_rtd_theme>=0.3.1 12 | 13 | -------------------------------------------------------------------------------- /datatest/_compatibility/abc.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for abc (Python standard library)""" 2 | from __future__ import absolute_import 3 | from abc import * 4 | 5 | 6 | try: 7 | ABC # New in version 3.4. 8 | ABC.__slots__ # New in version 3.7 9 | except (NameError, AttributeError): 10 | # Using Python 2 and 3 compatible syntax. 11 | ABC = ABCMeta('ABC', (object,), {'__slots__': ()}) 12 | -------------------------------------------------------------------------------- /docs/how-to/install.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to Install Datatest 6 | :keywords: installing, datatest, python 7 | 8 | 9 | ####################### 10 | How to Install Datatest 11 | ####################### 12 | 13 | .. include:: ../../README.rst 14 | :start-after: start-inclusion-marker-install 15 | :end-before: end-inclusion-marker-install 16 | -------------------------------------------------------------------------------- /datatest/__past__/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Backwards compatibility for phased-out features and behaviors. 3 | 4 | To use a feature that is no longer supported in the current version of 5 | datatest, use the following: 6 | 7 | from datatest.__past__ import api 8 | 9 | For example, importing 'api07' would provide backwards compatibility 10 | for the API as implemented in the 0.7 version of datatest. 
11 | """ 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # ======================== 2 | # Development Dependencies 3 | # ======================== 4 | # 5 | # These are not installation requirements! 6 | # 7 | # The following dependencies are only required for 8 | # testing, building, and documentation generation. 9 | # 10 | # pip install -r requirements-dev.txt 11 | 12 | dbfread 13 | ipython 14 | numpy 15 | pandas 16 | squint 17 | xlrd==1.2.0 18 | sphinx>=2.1.0 19 | sphinx-tabs 20 | sphinx_rtd_theme>=0.3.1 21 | twine 22 | wheel 23 | 24 | -------------------------------------------------------------------------------- /tests/_io.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for io (Python standard library)""" 2 | from __future__ import absolute_import 3 | from io import * 4 | from sys import version_info as _version_info 5 | 6 | 7 | if _version_info[:2] <= (2, 7): # For version 2.7 and earlier. 
8 | import StringIO as _StringIO 9 | 10 | StringIO = _StringIO.StringIO 11 | class StringIO(_StringIO.StringIO): 12 | def write(self, str): 13 | str = unicode(str) 14 | return _StringIO.StringIO.write(self, str) 15 | -------------------------------------------------------------------------------- /datatest/_compatibility/statistics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | try: 5 | from statistics import * 6 | except ImportError: 7 | 8 | class StatisticsError(ValueError): 9 | pass 10 | 11 | 12 | def median(data): 13 | data = sorted(data) 14 | n = len(data) 15 | if n == 0: 16 | raise StatisticsError('no median for empty data') 17 | if n % 2 == 1: 18 | return data[n // 2] 19 | else: 20 | i = n // 2 21 | return (data[i - 1] + data[i]) / 2 22 | -------------------------------------------------------------------------------- /datatest/__main__.py: -------------------------------------------------------------------------------- 1 | """Main entry point""" 2 | 3 | import sys 4 | if sys.argv[0].endswith('__main__.py'): 5 | import os.path 6 | # We change sys.argv[0] to make help message more useful 7 | # use executable without path, unquoted 8 | # (it's just a hint anyway) 9 | # (if you have spaces in your executable you get what you deserve!) 
10 | executable = os.path.basename(sys.executable) 11 | sys.argv[0] = executable + ' -m datatest' 12 | del os 13 | 14 | __unittest = True 15 | __datatest = True 16 | 17 | 18 | from .main import main, DataTestProgram 19 | 20 | main(module=None) 21 | -------------------------------------------------------------------------------- /datatest/_compatibility/textwrap.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for textwrap (Python standard library)""" 2 | from __future__ import absolute_import 3 | from textwrap import * 4 | 5 | 6 | try: 7 | indent # New in 3.3 8 | except NameError: 9 | def indent(text, prefix, predicate=None): 10 | if predicate is None: 11 | def predicate(line): 12 | return line.strip() 13 | 14 | def prefixed_lines(): 15 | for line in text.splitlines(True): 16 | yield (prefix + line if predicate(line) else line) 17 | return ''.join(prefixed_lines()) 18 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Datatest was originally created at NCEC Services, LLC in 2014 2 | by Shawn Brown as 'dataaudit'. In 2015 the project was largely 3 | rewritten and renamed to 'datatest'. 4 | 5 | Work-for-hire Contributors: 6 | 7 | * Shawn Brown (development lead) 8 | * 9 | 10 | Personal Contributors: 11 | 12 | * Shawn Brown 13 | * 14 | 15 | A big thank you goes out to: 16 | 17 | Heather Blum-Pastor for numerous ideas and feedback. 18 | 19 | Brian Fraher, Bilen Estephanos, and Eric Hawkins who helped spec-out 20 | the initial API on our snowy train ride to New York City in February 21 | of 2014. 
22 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3 22 | install: 23 | - requirements: docs/requirements.txt 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 - 2021 National Committee for an Effective Congress, 2 | NCEC Services LLC, and contributing authors 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use datatest except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | -------------------------------------------------------------------------------- /docs/_static/test_users.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from datatest import working_directory 3 | from datatest import Select 4 | from datatest import validate 5 | 6 | 7 | @pytest.fixture(scope='module') 8 | @working_directory(__file__) 9 | def users(): 10 | return Select('users.csv') 11 | 12 | 13 | @pytest.mark.mandatory 14 | def test_columns(users): 15 | validate(users.fieldnames, {'user_id', 'active'}) 16 | 17 | 18 | def test_user_id(users): 19 | 20 | def is_wellformed(x): # <- Helper function. 21 | return x[:-1].isdigit() and x[-1:].isupper() 22 | 23 | validate(users('user_id'), is_wellformed) 24 | 25 | 26 | def test_active(users): 27 | validate(users({'active'}), {'Y', 'N'}) 28 | -------------------------------------------------------------------------------- /docs/intro/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :description: Table of Contents for Introduction. 4 | :keywords: 5 | :title: Introduction 6 | 7 | .. sectionauthor:: Shawn Brown 8 | 9 | 10 | ############ 11 | Introduction 12 | ############ 13 | 14 | .. epigraph:: 15 | 16 | *"...tidy datasets are all alike but every messy dataset is messy 17 | in its own way"* 18 | ---Hadley Wickham [#f1]_ 19 | 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | tour-of-datatest 25 | Automated Testing 26 | Pipeline Validation 27 | Validating Pandas 28 | 29 | 30 | .. [#f1] Wickham, Hadley. "Tidy Data." Journal of Statistical Software 59, 31 | no. 10, August 2014. 
32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 3.10-dev 6 | - python: 3.9-dev 7 | - python: 3.8-dev 8 | - python: 3.7 9 | - python: 3.6 10 | - python: 3.5 11 | - python: 3.4 12 | - python: 3.3 13 | dist: trusty 14 | - python: 3.2 15 | dist: trusty 16 | # - python: 3.1 # not currently supported by Travis CI 17 | - python: 2.7 18 | - python: 2.6 19 | dist: trusty 20 | - python: pypy3 21 | - python: pypy 22 | 23 | install: true 24 | #install: 25 | # - pip install xlrd 26 | # - pip install pandas 27 | 28 | # command to run tests and check installation 29 | script: 30 | - python setup.py test 31 | - python -c 'import setuptools;print(setuptools.__version__)' 32 | - python setup.py install 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files. 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | 7 | # C extensions. 8 | *.so 9 | 10 | # Distribution / packaging. 11 | .Python 12 | env/ 13 | bin/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # Manifest built with MANIFEST.in 28 | MANIFEST 29 | 30 | # Installer logs. 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports. 35 | htmlcov/ 36 | .tox/ 37 | .coverage 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | 42 | # Translations. 43 | *.mo 44 | 45 | # Sphinx documentation. 
46 | docs/_build/ 47 | 48 | # Environments 49 | .env 50 | .venv 51 | env/ 52 | venv/ 53 | ENV/ 54 | env.bak/ 55 | venv.bak/ 56 | -------------------------------------------------------------------------------- /docs/_static/test_users_unit.py: -------------------------------------------------------------------------------- 1 | from datatest import working_directory 2 | from datatest import Select 3 | from datatest import DataTestCase 4 | from datatest import mandatory 5 | 6 | 7 | def setUpModule(): 8 | global users 9 | with working_directory(__file__): 10 | users = Select('users.csv') 11 | 12 | 13 | class TestUserData(DataTestCase): 14 | 15 | @mandatory 16 | def test_columns(self): 17 | self.assertValid(users.fieldnames, {'user_id', 'active'}) 18 | 19 | def test_user_id(self): 20 | 21 | def is_wellformed(x): # <- Helper function. 22 | return x[:-1].isdigit() and x[-1:].isupper() 23 | 24 | self.assertValid(users('user_id'), is_wellformed) 25 | 26 | def test_active(self): 27 | self.assertValid(users({'active'}), {'Y', 'N'}) 28 | -------------------------------------------------------------------------------- /docs/discussion/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :description: Table of contents for discussion documentation. 4 | :keywords: 5 | :title: Discussion 6 | 7 | .. sectionauthor:: Shawn Brown 8 | 9 | 10 | ########## 11 | Discussion 12 | ########## 13 | 14 | .. epigraph:: 15 | 16 | *"The right information cannot be extracted from the wrong data."* 17 | ---Russell Ackoff [#f1]_ 18 | 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | 23 | Organizing Tests 24 | Tips and Tricks 25 | data-preparation 26 | 27 | .. 28 | OMIT UNFINISHED PAGES: 29 | validate-vs-accept 30 | terminology 31 | project-history 32 | 33 | 34 | .. [#f1] Ackoff, Russell L. "Ackoff's Best", New York: John Wiley & Sons, Inc., 35 | 1999. p. 172. 
36 | -------------------------------------------------------------------------------- /docs/reference/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :description: Table of Contents for Reference. 4 | :keywords: 5 | :title: Reference 6 | 7 | 8 | #################################### 9 | Reference 10 | #################################### 11 | 12 | .. epigraph:: 13 | 14 | *"A tool is best if it does the job required with a minimum of 15 | effort, with a minimum of complexity, and with a minimum of power."* 16 | ---Peter Drucker [#f1]_ 17 | 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | 22 | Datatest Core 23 | Data Handling 24 | unittest-support 25 | 26 | See the :ref:`Package Index ` for a full list of classes 27 | and objects. 28 | 29 | 30 | .. [#f1] Drucker, Peter F. "Management: Tasks, Responsibilities, Practices", 31 | New York: Harper & Row, 1973. p. 224. 32 | -------------------------------------------------------------------------------- /docs/discussion/validate-vs-accept.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. currentmodule:: datatest 4 | 5 | .. meta:: 6 | :description: A discussion about when it's appropriate to assert 7 | data requirements and when it's appropriate to accept 8 | deviations. 9 | :keywords: data, validation, quality, acceptance 10 | 11 | 12 | ######################## 13 | Validation vs Acceptance 14 | ######################## 15 | 16 | .. 17 | validate adherance to a lose requirement 18 | or accept specified deviation 19 | 20 | what's the difference? 21 | does it matter? 
22 | 23 | quicker to validate loose requirement 24 | than it is to generate a bunch of differences that must then be accepted 25 | but unless executing time is prohibitive, favor semantic accuracy over 26 | misleading-optimization 27 | 28 | -------------------------------------------------------------------------------- /docs/_static/excel_autoformat.csv: -------------------------------------------------------------------------------- 1 | A,B 2 | 106,ABY-22 3 | 109,ACZ-31 4 | 116,AFA-34 5 | 129,AFV-02 6 | 184,AFY-16 7 | 191,AGF-30 8 | 200,AGK-06 9 | 204,AGW-29 10 | 244,AGZ-08 11 | 252,AHB-28 12 | 255,AIZ-04 13 | 256,ALE-49 14 | 284,AMR-41 15 | 292,AOJ-35 16 | 294,AOX-18 17 | 295,APR-10 18 | 298,AQV-25 19 | 314,ATF-21 20 | 325,AUP-48 21 | 333,AVV-32 22 | 342,AXB-44 23 | 361,AXP-47 24 | 385,APE-07 25 | 391,AZL-36 26 | 414,BAF-37 27 | 418,BES-24 28 | 429,BEW-17 29 | 430,BGO-39 30 | 442,BGW-42 31 | 454,BKE-45 32 | 461,BMO-46 33 | 511,BNT-03 34 | 569,BNW-05 35 | 591,BNX-27 36 | 622,BPD-12 37 | 635,BVD-26 38 | 691,BWP-38 39 | 692,CMO-40 40 | 703,CPX-14 41 | 725,CQO-09 42 | 746,CSA-11 43 | 792,CSD-15 44 | 810,CSN-13 45 | 819,CUT-19 46 | 836,CWK-43 47 | 874,CYL-23 48 | 887,DBB-01 49 | 895,DEC-20 50 | 906,DNZ-33 51 | 981,DVH-50 52 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_movies_df.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | import pandas as pd 5 | import datatest as dt 6 | 7 | 8 | @pytest.fixture(scope='module') 9 | @dt.working_directory(__file__) 10 | def df(): 11 | return pd.read_csv('movies.csv') 12 | 13 | 14 | @pytest.mark.mandatory 15 | def test_columns(df): 16 | dt.validate( 17 | df.columns, 18 | {'title', 'rating', 'year', 'runtime'}, 19 | ) 20 | 21 | 22 | def test_title(df): 23 | dt.validate.regex(df['title'], r'^[A-Z]') 24 | 25 | 26 | def test_rating(df): 27 | dt.validate.superset( 
28 | df['rating'], 29 | {'G', 'PG', 'PG-13', 'R', 'NC-17', 'Not Rated'}, 30 | ) 31 | 32 | 33 | def test_year(df): 34 | dt.validate(df['year'], int) 35 | 36 | 37 | def test_runtime(df): 38 | dt.validate(df['runtime'], int) 39 | -------------------------------------------------------------------------------- /docs/_static/tutorial/movies.csv: -------------------------------------------------------------------------------- 1 | title,rating,year,runtime 2 | Almost Famous,R,2000,122 3 | American Pie,R,1999,95 4 | Back to the Future,PG,1985,116 5 | Blade Runner,R,1982,117 6 | Blood for Dracula,R,1974,106 7 | Blue Velvet,R,1986,120 8 | The Breakfast Club,R,1985,97 9 | Clueless,PG-13,1995,97 10 | Cool Hand Luke,GP,1967,127 11 | The Craft,R,1996,101 12 | Doctor Zhivago,PG-13,1965,197 13 | el Topo,Not Rated,1970,125 14 | Evil Dead,NC-17,1981,85 15 | Ghostbusters,PG,1984,105 16 | Grease,PG-13,1978,110 17 | Heathers,R,1988,103 18 | Labyrinth,PG,1986,101 19 | The Lost Boys,R,1987,97 20 | Mean Girls,PG-13,2004,97 21 | Millennium Actress,PG,2001,87 22 | My Neighbor Totoro,G,1988,86 23 | Napoleon Dynamite,PG,2004,96 24 | Pee-wee's Big Adventure,PG,1985,91 25 | Pretty in Pink,PG-13,1986,97 26 | The Princess Bride,PG,1987,98 27 | Psycho,R,1960,109 28 | Stand by Me,R,1986,89 29 | Super 8,PG-13,2011,112 30 | superbad,R,2007,113 31 | WarGames,PG,1983,114 32 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_movies_df_unit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pandas as pd 4 | import datatest as dt 5 | 6 | 7 | def setUpModule(): 8 | global df 9 | with dt.working_directory(__file__): 10 | df = pd.read_csv('movies.csv') 11 | 12 | 13 | class TestMovies(dt.DataTestCase): 14 | @dt.mandatory 15 | def test_columns(self): 16 | self.assertValid( 17 | df.columns, 18 | {'title', 'rating', 'year', 'runtime'}, 19 | ) 20 | 21 | def 
test_title(self): 22 | self.assertValidRegex(df['title'], r'^[A-Z]') 23 | 24 | def test_rating(self): 25 | self.assertValidSuperset( 26 | df['rating'], 27 | {'G', 'PG', 'PG-13', 'R', 'NC-17', 'Not Rated'}, 28 | ) 29 | 30 | def test_year(self): 31 | self.assertValid(df['year'], int) 32 | 33 | def test_runtime(self): 34 | self.assertValid(df['runtime'], int) 35 | -------------------------------------------------------------------------------- /tests/past_api07_sources_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from . import _unittest as unittest 3 | 4 | from datatest.__past__.api07_sources import MinimalSource 5 | from .mixins import OtherTests 6 | from .mixins import CountTests 7 | 8 | 9 | class TestBaseSource(OtherTests, unittest.TestCase): 10 | fieldnames = ['label1', 'label2', 'value'] 11 | testdata = [['a', 'x', '17'], 12 | ['a', 'x', '13'], 13 | ['a', 'y', '20'], 14 | ['a', 'z', '15'], 15 | ['b', 'z', '5' ], 16 | ['b', 'y', '40'], 17 | ['b', 'x', '25']] 18 | 19 | def setUp(self): 20 | self.datasource = MinimalSource(self.testdata, self.fieldnames) 21 | 22 | 23 | class TestDataSourceCount(CountTests, unittest.TestCase): 24 | def setUp(self): 25 | """Define self.datasource (base version uses MinimalSource).""" 26 | self.datasource = MinimalSource(self.testdata, self.fieldnames) 27 | -------------------------------------------------------------------------------- /tests/past_api09.py: -------------------------------------------------------------------------------- 1 | """Test API for 0.9.x compatibility.""" 2 | from . import _unittest as unittest 3 | import datatest 4 | from datatest.__past__ import api09 # <- MONKEY PATCH!!! 
5 | 6 | # IMPORT ADDITIONAL TESTS 7 | #from .past_api09_query import * 8 | 9 | 10 | class TestSubsetAndSupersetMethods(unittest.TestCase): 11 | """Semantics were inverted in the following version (0.10.x).""" 12 | 13 | def test_subset(self): 14 | """Check old-style 0.9.x API validate.subset() behavior.""" 15 | data = ['A', 'B', 'C', 'D'] 16 | requirement = set(['A', 'B']) 17 | datatest.validate.subset(data, requirement) 18 | 19 | def test_superset(self): 20 | """Check old-style 0.9.x API validate.superset() behavior.""" 21 | data = ['A', 'B'] 22 | requirement = set(['A', 'B', 'C', 'D']) 23 | datatest.validate.superset(data, requirement) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | else: 29 | raise Exception('This test must be run directly or as a subprocess.') 30 | -------------------------------------------------------------------------------- /datatest/__init__.py: -------------------------------------------------------------------------------- 1 | """Datatest: Test driven data-wrangling and data validation. 2 | 3 | PYTEST_DONT_REWRITE 4 | """ 5 | 6 | from __future__ import absolute_import 7 | 8 | __version__ = '0.12.0.dev1' 9 | 10 | # Datatest Core API (__all__ property defined in submodules) 11 | from .validation import * # Validation error and functions. 12 | from .differences import * # Difference classes. 13 | from .acceptances import accepted 14 | from ._vendor.predicate import Predicate 15 | 16 | # Pandas extensions. 17 | from ._pandas_integration import register_accessors 18 | 19 | # Unittest-style API 20 | from .case import DataTestCase 21 | from .runner import mandatory 22 | from .runner import DataTestRunner 23 | from .main import DataTestProgram 24 | from .main import main 25 | 26 | # Data Handling API 27 | from ._working_directory import working_directory 28 | from ._vendor.repeatingcontainer import RepeatingContainer 29 | 30 | ############################################# 31 | # Register traceback formatting handler. 
32 | ############################################# 33 | from . import _excepthook 34 | import sys as _sys 35 | _sys.excepthook = _excepthook.excepthook 36 | -------------------------------------------------------------------------------- /docs/how-to/run-tests.rst: -------------------------------------------------------------------------------- 1 | 2 | .. py:currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to run tests. 6 | :keywords: datatest, run, tests, unittest, pytest 7 | 8 | 9 | ################ 10 | How to Run Tests 11 | ################ 12 | 13 | ====== 14 | Pytest 15 | ====== 16 | 17 | If you have a pytest style script named ``test_mydata.py``, 18 | you can run it by typing the following at the command line: 19 | 20 | .. code-block:: console 21 | 22 | pytest test_mydata.py 23 | 24 | You invoke pytest just as you would in any other circumstance---see 25 | pytest's standard |pytest-usage|_ for full details. 26 | 27 | 28 | ======== 29 | Unittest 30 | ======== 31 | 32 | If you have a unittest style script named ``test_mydata.py``, 33 | you can run it by typing the following at the command line: 34 | 35 | .. code-block:: console 36 | 37 | python -m datatest test_mydata.py 38 | 39 | Datatest includes a unittest-style test runner that facilitates 40 | incremental testing. It runs tests in declaration order (i.e., 41 | by line-number) and supports the :func:`@mandatory ` 42 | decorator. 43 | 44 | 45 | .. 46 | SUBSTITUTIONS: 47 | 48 | .. |pytest-usage| replace:: Usage and Invocations 49 | .. _pytest-usage: https://docs.pytest.org/en/latest/usage.html 50 | 51 | -------------------------------------------------------------------------------- /datatest/_compatibility/contextlib.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for contextlib (Python standard library)""" 2 | from __future__ import absolute_import 3 | from contextlib import * 4 | from . 
import functools 5 | 6 | 7 | try: 8 | ContextDecorator # New in Python 3.2 9 | except NameError: 10 | # Adapted from Python 3.6 standard libary. 11 | class ContextDecorator(object): 12 | def _recreate_cm(self): # The `_recreate_cm` method is a private 13 | return self # interface for _GeneratorContextManager. 14 | # See issue #11647 for details. 15 | 16 | def __call__(self, func): 17 | @functools.wraps(func) 18 | def inner(*args, **kwds): 19 | with self._recreate_cm(): 20 | return func(*args, **kwds) 21 | return inner 22 | 23 | 24 | try: 25 | suppress # New in Python 3.4 26 | except NameError: 27 | # Adapted from Python 3.6 standard libary. 28 | class suppress(object): 29 | """Context manager to suppress specified exceptions.""" 30 | def __init__(self, *exceptions): 31 | self._exceptions = exceptions 32 | 33 | def __enter__(self): 34 | pass 35 | 36 | def __exit__(self, exctype, excinst, exctb): 37 | return exctype is not None and issubclass(exctype, self._exceptions) 38 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* 2 | Since themes can be loaded after this style sheet is applied, the 3 | declarations below should use the "!important" annotation so they 4 | will take precedence over corresponding declarations defined later. 5 | */ 6 | 7 | 8 | /* 9 | In the sphinx_rtd_theme (version 0.4.2, as of this update), table 10 | cells do not wrap text by default. This can make for unnecessarily 11 | wide tables that scroll off the page. The following declarations 12 | allow lines to wrap when the width is 445px or greater. 
13 | 14 | This solution is adapted from ideas discussed on the following 15 | issue: 16 | 17 | https://github.com/rtfd/sphinx_rtd_theme/issues/117 18 | */ 19 | @media screen and (min-width: 445px) { 20 | .wy-table-responsive table td { 21 | white-space: normal !important; 22 | } 23 | .wy-table-responsive { 24 | overflow: visible !important; 25 | } 26 | } 27 | 28 | 29 | /* 30 | The sphinx_rtd_theme (as of version 0.5.0) does not include styles 31 | for "details" or "summary" elements. 32 | */ 33 | details { 34 | margin-bottom: 1em; 35 | } 36 | 37 | summary { 38 | margin-bottom: 1em; 39 | cursor: pointer; 40 | } 41 | 42 | summary:hover { 43 | background: rgb(240, 240, 240); /* fallback if no "rgba" support */ 44 | background-color: rgba(0, 0, 0, 0.0625); 45 | } 46 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_country_of_birth.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | from datatest import working_directory 4 | from datatest import Select 5 | from datatest import validate 6 | from datatest import accepted 7 | from datatest import Missing, Extra, Deviation, Invalid 8 | 9 | 10 | # Define fixtures. 11 | 12 | @pytest.fixture(scope='module') 13 | @working_directory(__file__) 14 | def detail(): 15 | return Select('country_of_birth.csv') 16 | 17 | 18 | @pytest.fixture(scope='module') 19 | @working_directory(__file__) 20 | def summary(): 21 | return Select('estimated_totals.csv') 22 | 23 | 24 | # Begin tests. 
25 | 26 | @pytest.mark.mandatory 27 | def test_columns(detail, summary): 28 | required_set = set(summary.fieldnames) 29 | 30 | validate(detail.fieldnames, required_set) 31 | 32 | 33 | def test_state_labels(detail, summary): 34 | data = detail({'state/territory'}) 35 | requirement = summary({'state/territory'}) 36 | 37 | validate(data, requirement) 38 | 39 | 40 | def test_population_format(detail): 41 | data = detail({'population'}) 42 | 43 | def integer_format(x): # <- Helper function. 44 | return str(x).isdecimal() 45 | 46 | validate(data, integer_format) 47 | 48 | 49 | def test_population_sums(detail, summary): 50 | data = detail({'state/territory': 'population'}).sum() 51 | requirement = summary({'state/territory': 'population'}).sum() 52 | 53 | validate(data, requirement) 54 | -------------------------------------------------------------------------------- /docs/_static/test_validation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import re 3 | import datatest 4 | 5 | 6 | class TestExample(datatest.DataTestCase): 7 | def test_membership_in_set(self): 8 | data = ['x', 'x', 'y', 'y', 'z', 'z'] 9 | requirement = {'x', 'y', 'z'} # <- set 10 | self.assertValid(data, requirement) 11 | 12 | def test_function_returns_true(self): 13 | data = ['X', 'X', 'Y', 'Y'] 14 | def requirement(x): # <- callable (helper function) 15 | return x.isupper() 16 | self.assertValid(data, requirement) 17 | 18 | def test_regex_matches(self): 19 | data = ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] 20 | requirement = re.compile('^\w\w\w$') # <- regex object 21 | self.assertValid(data, requirement) 22 | 23 | def test_equality(self): 24 | data = ['x', 'x', 'x'] 25 | requirement = 'x' # <- other (not container, callable, or regex) 26 | self.assertValid(data, requirement) 27 | 28 | def test_order(self): 29 | data = ['x', 'x', 'y', 'y', 'z', 'z'] 30 | requirement = ['x', 'x', 'y', 'y', 'z', 'z'] # <- sequence 31 | 
self.assertValid(data, requirement) 32 | 33 | def test_mapping(self): 34 | data = {'x': 'foo', 'y': 'bar'} 35 | requirement = {'x': 'foo', 'y': 'bar'} # <- mapping 36 | self.assertValid(data, requirement) 37 | 38 | 39 | if __name__ == '__main__': 40 | datatest.main() 41 | -------------------------------------------------------------------------------- /docs/how-to/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :description: Table of Contents for How-to Guide. 4 | :keywords: 5 | :title: How-to Guide 6 | 7 | .. py:currentmodule:: datatest 8 | .. moduleauthor:: Shawn Brown 9 | .. sectionauthor:: Shawn Brown 10 | 11 | 12 | ############ 13 | How-to Guide 14 | ############ 15 | 16 | .. epigraph:: 17 | 18 | *"Hell is other people's data."* 19 | ---Jim Harris [#f1]_ 20 | 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | 25 | Install Datatest 26 | Get Started Testing 27 | Run Tests 28 | Column Names 29 | Customize Differences 30 | Data Types 31 | Date and Time Strings 32 | Date and Time Objects 33 | File Names 34 | Test File Properties 35 | Excel Auto-Formatting 36 | Mailing Addresses 37 | Fuzzy Matching 38 | NaN Values 39 | Negative Matches 40 | Outliers 41 | Phone Numbers 42 | Re-order Acceptances 43 | Sequences 44 | 45 | 46 | .. [#f1] Harris, Jim. "Hell is other people’s data", OCDQ (blog), August 06, 2010, 47 | Retrieved from http://www.ocdqblog.com/home/hell-is-other-peoples-data.html 48 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_country_of_birth_unit.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | from datatest import working_directory 4 | from datatest import Select 5 | from datatest import DataTestCase 6 | from datatest import mandatory 7 | from datatest import Missing, Extra, Deviation, Invalid 8 | 9 | 10 | # Define fixtures. 
11 | 12 | def setUpModule(): 13 | global detail 14 | global summary 15 | 16 | with working_directory(__file__): 17 | detail = Select('country_of_birth.csv') 18 | summary = Select('estimated_totals.csv') 19 | 20 | 21 | # Begin tests. 22 | 23 | class TestPopulation(DataTestCase): 24 | 25 | @mandatory 26 | def test_columns(self): 27 | required_set = set(summary.fieldnames) 28 | 29 | self.assertValid(detail.fieldnames, required_set) 30 | 31 | def test_state_labels(self): 32 | data = detail({'state/territory'}) 33 | requirement = summary({'state/territory'}) 34 | 35 | self.assertValid(data, requirement) 36 | 37 | def test_population_format(self): 38 | data = detail({'population'}) 39 | 40 | def integer_format(x): # <- Helper function. 41 | return str(x).isdecimal() 42 | 43 | self.assertValid(data, integer_format) 44 | 45 | def test_population_sums(self): 46 | data = detail({'state/territory': 'population'}).sum() 47 | requirement = summary({'state/territory': 'population'}).sum() 48 | 49 | self.assertValid(data, requirement) 50 | -------------------------------------------------------------------------------- /docs/_static/mydata.csv: -------------------------------------------------------------------------------- 1 | user_id,active 2 | 999,Y 3 | 1000,Y 4 | 1001,N 5 | 1002,N 6 | 1003,Y 7 | 1004,Y 8 | 1005,Y 9 | 1006,N 10 | 1007,Y 11 | 1008,Y 12 | 1009,N 13 | 1010,N 14 | 1011,Y 15 | 1012,Y 16 | 1013,Y 17 | 1014,Y 18 | 1015,Y 19 | 1016,Y 20 | 1017,Y 21 | 1018,Y 22 | 1019,Y 23 | 1020,Y 24 | 1021,N 25 | 1022,N 26 | 1023,Y 27 | 1024,N 28 | 1025,Y 29 | 1026,Y 30 | 1027,Y 31 | 1028,N 32 | 1029,N 33 | 1030,N 34 | 1031,N 35 | 1032,Y 36 | 1033,Y 37 | 1034,Y 38 | 1035,N 39 | 1036,Y 40 | 1037,Y 41 | 1038,Y 42 | 1039,Y 43 | 1040,N 44 | 1041,Y 45 | 1042,Y 46 | 1043,Y 47 | 1044,N 48 | 1045,N 49 | 1046,Y 50 | 1047,Y 51 | 1048,N 52 | 1049,N 53 | 1050,N 54 | 1051,N 55 | 1052,Y 56 | 1053,Y 57 | 1054,Y 58 | 1055,Y 59 | 1056,Y 60 | 1057,Y 61 | 1058,Y 62 | 1059,Y 63 | 1060,Y 64 | 1061,Y 65 | 
1062,N 66 | 1063,N 67 | 1064,Y 68 | 1065,Y 69 | 1066,Y 70 | 1067,Y 71 | 1068,Y 72 | 1069,Y 73 | 1070,Y 74 | 1071,Y 75 | 1072,Y 76 | 1073,Y 77 | 1074,N 78 | 1075,Y 79 | 1076,Y 80 | 1077,N 81 | 1078,Y 82 | 1079,Y 83 | 1080,Y 84 | 1081,N 85 | 1082,Y 86 | 1083,Y 87 | 1084,N 88 | 1085,N 89 | 1086,Y 90 | 1087,Y 91 | 1088,Y 92 | 1089,Y 93 | 1090,Y 94 | 1091,N 95 | 1092,Y 96 | 1093,N 97 | 1094,N 98 | 1095,Y 99 | 1096,N 100 | 1097,Y 101 | 1098,Y 102 | 1099,N 103 | 1100,N 104 | 1101,Y 105 | 1102,Y 106 | 1103,Y 107 | 1104,Y 108 | 1105,Y 109 | 1106,N 110 | 1107,N 111 | 1108,Y 112 | 1109,Y 113 | 1110,Y 114 | 1111,N 115 | 1112,N 116 | 1113,Y 117 | 1114,Y 118 | 1115,Y 119 | 1116,Y 120 | 1117,Y 121 | 1118,N 122 | -------------------------------------------------------------------------------- /datatest/_compatibility/collections/abc.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for collections.abc (Python standard library)""" 2 | from __future__ import absolute_import 3 | try: 4 | from collections.abc import * # New in 3.3 5 | except ImportError: 6 | # Previously, the collection ABCs were in the root namespace. 7 | from collections import ( 8 | Container, 9 | Hashable, 10 | Iterable, 11 | Iterator, 12 | Sized, 13 | Callable, 14 | Sequence, 15 | MutableSequence, 16 | Set, 17 | MutableSet, 18 | Mapping, 19 | MutableMapping, 20 | MappingView, 21 | KeysView, 22 | ItemsView, 23 | ValuesView, 24 | ) 25 | 26 | 27 | try: 28 | Collection # New in 3.6 29 | except NameError: 30 | # Adapted from Python 3.6 standard library. 31 | def _check_methods(C, *methods): 32 | mro = C.__mro__ 33 | for method in methods: 34 | for B in mro: 35 | if method in B.__dict__: 36 | if B.__dict__[method] is None: 37 | return NotImplemented 38 | break 39 | else: 40 | return NotImplemented 41 | return True 42 | 43 | 44 | # Adapted from Python 3.6 standard library. 
45 | class Collection(Sized, Iterable, Container): 46 | __slots__ = () 47 | 48 | @classmethod 49 | def __subclasshook__(cls, C): 50 | if cls is Collection: 51 | return _check_methods(C, '__len__', '__iter__', '__contains__') 52 | -------------------------------------------------------------------------------- /tests/_contextlib.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for contextlib (Python standard library)""" 2 | from __future__ import absolute_import 3 | from contextlib import * 4 | 5 | 6 | try: 7 | redirect_stderr # New in 3.5 8 | except NameError: 9 | # Adapted from Python 3.5 Standard Library. 10 | import sys as _sys 11 | class _RedirectStream: 12 | _stream = None 13 | 14 | def __init__(self, new_target): 15 | self._new_target = new_target 16 | self._old_targets = [] 17 | 18 | def __enter__(self): 19 | self._old_targets.append(getattr(_sys, self._stream)) 20 | setattr(_sys, self._stream, self._new_target) 21 | return self._new_target 22 | 23 | def __exit__(self, exctype, excinst, exctb): 24 | setattr(_sys, self._stream, self._old_targets.pop()) 25 | 26 | class redirect_stderr(_RedirectStream): 27 | """Context manager for temporarily redirecting stderr to 28 | another file. 29 | """ 30 | _stream = 'stderr' 31 | 32 | 33 | try: 34 | redirect_stdout # New in 3.4 35 | except NameError: 36 | class redirect_stdout(_RedirectStream): 37 | """Context manager for temporarily redirecting stdout to 38 | another file. 
39 | 40 | # How to send help() to stderr 41 | with redirect_stdout(sys.stderr): 42 | help(dir) 43 | 44 | # How to write help() to a file 45 | with open('help.txt', 'w') as f: 46 | with redirect_stdout(f): 47 | help(pow) 48 | """ 49 | _stream = 'stdout' 50 | -------------------------------------------------------------------------------- /datatest/__past__/api07_error.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pprint 3 | 4 | 5 | class DataError(AssertionError): 6 | """Raised when :meth:`assertValid` finds differences between *data* 7 | and *requirement*. 8 | """ 9 | def __init__(self, msg, differences, subject=None, required=None): 10 | """Initialize self, store *differences* for later reference.""" 11 | if not differences: 12 | raise ValueError('Missing differences.') 13 | self._differences = differences 14 | self.msg = msg 15 | self.subject = str(subject) # Subject data source. 16 | self.required = str(required) # Required object or reference source. 17 | self._verbose = False # <- Set by DataTestResult if verbose. 
18 | 19 | return AssertionError.__init__(self, msg) 20 | 21 | @property 22 | def differences(self): 23 | """An iterable (list or dict) of differences.""" 24 | return self._differences 25 | 26 | def __repr__(self): 27 | return self.__class__.__name__ + ': ' + self.__str__() 28 | 29 | def __str__(self): 30 | diff = pprint.pformat(self.differences, width=1) 31 | if any([diff.startswith('{') and diff.endswith('}'), 32 | diff.startswith('[') and diff.endswith(']'), 33 | diff.startswith('(') and diff.endswith(')')]): 34 | diff = diff[1:-1] 35 | 36 | if self._verbose: 37 | msg_extras = '\n\nSUBJECT:\n{0}\nREQUIRED:\n{1}' 38 | msg_extras = msg_extras.format(self.subject, self.required) 39 | else: 40 | msg_extras = '' 41 | 42 | return '{0}:\n {1}{2}'.format(self.msg, diff, msg_extras) 43 | -------------------------------------------------------------------------------- /docs/_static/users.csv: -------------------------------------------------------------------------------- 1 | USER_ID,ACTIVE 2 | 0999F,Y 3 | 1000C,Y 4 | 1001C,n 5 | 1002A,n 6 | 1003C,Y 7 | 1004E,Y 8 | 1005H,Y 9 | 1006E,n 10 | 1007H,Y 11 | 1008A,Y 12 | 1009F,n 13 | 1010D,n 14 | 1011H,Y 15 | 1012H,Y 16 | 1013E,Y 17 | 1014D,Y 18 | 1015C,Y 19 | 1016H,Y 20 | 1017G,Y 21 | 1018A,Y 22 | 1019H,Y 23 | 1020E,Y 24 | 1021H,n 25 | 1022A,n 26 | 1023B,Y 27 | 1024D,n 28 | 1025C,Y 29 | 1026B,Y 30 | 1027H,Y 31 | 1028B,n 32 | 1029A,n 33 | 1030H,n 34 | 1031A,n 35 | 1032G,y 36 | 1033H,y 37 | 1034F,y 38 | 1035F,n 39 | 1036E,y 40 | 1037E,y 41 | 1038G,y 42 | 1039G,y 43 | 1040A,n 44 | 1041A,Y 45 | 1042H,Y 46 | 1043B,Y 47 | 1044G,n 48 | 1045A,n 49 | 1046A,Y 50 | 1047H,Y 51 | 1048D,n 52 | 1049A,n 53 | 1050H,n 54 | 1051A,n 55 | 1052E,Y 56 | 1053A,Y 57 | 1054G,Y 58 | 1055C,Y 59 | 1056a,Y 60 | 1057F,Y 61 | 1058D,Y 62 | 1059H,Y 63 | 1060A,YES 64 | 1061D,YES 65 | 1062E,NO 66 | 1063C,NO 67 | 1064H,YES 68 | 1065A,YES 69 | 1066F,YES 70 | 1067A,YES 71 | 1068F,YES 72 | 1069D,YES 73 | 1070H,YES 74 | 1071E,YES 75 | 1072G,YES 76 | 1073B,YES 77 | 
1074B,NO 78 | 1075B,Y 79 | 1076A,Y 80 | 1077A,n 81 | 1078H,Y 82 | 1079C,Y 83 | 1080F,Y 84 | 1081B,n 85 | 1082F,Y 86 | 1083F,Y 87 | 1084F,n 88 | 1085H,n 89 | 1086G,Y 90 | 1087C,Y 91 | 1088A,Y 92 | 1089A,Y 93 | 1090E,Y 94 | 1091B,n 95 | 1092C,Y 96 | 1093G,n 97 | 1094B,n 98 | 1095C,Y 99 | 1096A,n 100 | 1097E,Y 101 | 1098C,Y 102 | 1099b,n 103 | 1100G,n 104 | 1101B,Y 105 | 1102C,Y 106 | 1103A,Y 107 | 1104H,Y 108 | 1105H,Y 109 | 1106A,n 110 | 1107E,n 111 | 1108E,Y 112 | 1109G,Y 113 | 1110B,Y 114 | 1111F,n 115 | 1112D,n 116 | 1113B,Y 117 | 1114H,Y 118 | 1115A,Y 119 | 1116B,Y 120 | 1117B,Y 121 | 1118D,n 122 | -------------------------------------------------------------------------------- /docs/_static/tutorial/modified_test_country_of_birth.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | from datatest import working_directory 4 | from datatest import Select 5 | from datatest import validate 6 | from datatest import accepted 7 | from datatest import Missing, Extra, Deviation, Invalid 8 | 9 | 10 | # Define fixtures. 11 | 12 | @pytest.fixture(scope='module') 13 | @working_directory(__file__) 14 | def detail(): 15 | return Select('country_of_birth.csv') 16 | 17 | 18 | @pytest.fixture(scope='module') 19 | @working_directory(__file__) 20 | def summary(): 21 | return Select('estimated_totals.csv') 22 | 23 | 24 | # Begin tests. 
25 | 26 | @pytest.mark.mandatory 27 | def test_columns(detail, summary): 28 | required_set = set(summary.fieldnames) 29 | 30 | with accepted(Extra): 31 | validate(detail.fieldnames, required_set) 32 | 33 | 34 | def test_state_labels(detail, summary): 35 | data = detail({'state/territory'}) 36 | requirement = summary({'state/territory'}) 37 | 38 | omitted_territory = accepted([ 39 | Missing('Jervis Bay Territory'), 40 | ]) 41 | 42 | with omitted_territory: 43 | validate(data, requirement) 44 | 45 | 46 | def test_population_format(detail): 47 | data = detail({'population'}) 48 | 49 | def integer_format(x): # <- Helper function. 50 | return str(x).isdecimal() 51 | 52 | validate(data, integer_format) 53 | 54 | 55 | def test_population_sums(detail, summary): 56 | data = detail({'state/territory': 'population'}).sum() 57 | requirement = summary({'state/territory': 'population'}).sum() 58 | 59 | omitted_territory = accepted({ 60 | 'Jervis Bay Territory': Missing(388), 61 | }) 62 | 63 | with accepted.percent(0.03) | omitted_territory: 64 | validate(data, requirement) 65 | -------------------------------------------------------------------------------- /docs/how-to/negative-matches.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to validate negative matches. 6 | :keywords: datatest, negative match 7 | 8 | 9 | ################################ 10 | How to Validate Negative Matches 11 | ################################ 12 | 13 | Sometimes you want to check that data is **not** equal to a specific 14 | value. There are a few different ways to perform this type of negative 15 | matching. 16 | 17 | 18 | Helper Function 19 | =============== 20 | 21 | One obvious way to check for a negative match is to define a helper 22 | function that checks for ``!=`` to a given value: 23 | 24 | .. 
code-block:: python 25 | :linenos: 26 | 27 | from datatest import validate 28 | 29 | data = [...] 30 | 31 | def not_bar(x): 32 | return x != 'bar' 33 | 34 | validate(data, not_bar) 35 | 36 | 37 | Inverted Predicate 38 | ================== 39 | 40 | Datatest provides a :class:`Predicate` class for handling different 41 | kinds of matching. You can invert a Predicate's behavior using the 42 | inversion operator, ``~``: 43 | 44 | .. code-block:: python 45 | :emphasize-lines: 4 46 | :linenos: 47 | 48 | from datatest import validate, Predicate 49 | 50 | data = [...] 51 | validate(data, ~Predicate('bar')) 52 | 53 | 54 | Functional Style 55 | ================ 56 | 57 | If you are accustomed to programming in a functional style, you 58 | could perform a negative match using :func:`functools.partial` and 59 | :func:`operator.ne`: 60 | 61 | .. code-block:: python 62 | :emphasize-lines: 6 63 | :linenos: 64 | 65 | from functools import partial 66 | from operator import ne 67 | from datatest import validate 68 | 69 | data = [...] 70 | validate(data, partial(ne, 'bar')) 71 | 72 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #======================================================================= 3 | # FILE: run-tests.sh 4 | # DESCRIPTION: Runs test suite under all supported versions of Python 5 | # and displays failures when encountered. 6 | #======================================================================= 7 | 8 | #----------------------------------------------------------------------- 9 | # Define function (takes command to run as a single argument). 
10 | #----------------------------------------------------------------------- 11 | run_command () 12 | { 13 | echo "" >&2 14 | echo "======================================================================" >&2 15 | echo "$1" >&2 16 | echo "======================================================================" >&2 17 | $1 # <- Run command. 18 | if [ $? -ne 0 ] # Check exit status of completed command. 19 | then 20 | echo "" >&2 21 | echo "Failed Command: $1" >&2 22 | echo "" >&2 23 | exit 1 # <- EXIT! ($? here holds the last echo's status, not the failure.) 24 | fi 25 | } 26 | 27 | #----------------------------------------------------------------------- 28 | # Run test suite in all supported versions of Python. 29 | #----------------------------------------------------------------------- 30 | run_command "python3.9 -B -m unittest $*" 31 | run_command "python3.8 -B -m unittest $*" 32 | run_command "python3.7 -B -m unittest $*" 33 | run_command "python3.6 -B -m unittest $*" 34 | run_command "python3.5 -B -m unittest $*" 35 | run_command "python3.4 -B -m unittest $*" 36 | #run_command "python3.3 -B -m unittest $*" 37 | #run_command "python3.2 -B -m unittest $*" 38 | #run_command "python3.1 -B tests/discover.py $*" 39 | run_command "python2.7 -B -m unittest discover $*" 40 | run_command "python2.6 -B tests/discover.py $*" 41 | 42 | echo "" >&2 43 | echo "All commands successful." >&2 44 | -------------------------------------------------------------------------------- /datatest/_excepthook.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from .validation import ValidationError 4 | 5 | 6 | if sys.excepthook: 7 | existing_excepthook = sys.excepthook 8 | else: 9 | existing_excepthook = sys.__excepthook__ 10 | 11 | 12 | def _next_is_internal(tb): 13 | """Return True if the next traceback refers to an internal part of 14 | datatest. 
15 | """ 16 | tb_next = tb.tb_next 17 | if not tb_next: 18 | return False 19 | return (tb_next.tb_frame.f_globals.get('__datatest', False) 20 | or tb_next.tb_frame.f_globals.get('__unittest', False)) 21 | 22 | 23 | def excepthook(err_type, err_value, err_traceback): 24 | """Hide calls internal to datatest for ValidationError instances 25 | and print traceback and exception to sys.stderr. 26 | """ 27 | if not issubclass(err_type, ValidationError): 28 | return existing_excepthook(err_type, err_value, err_traceback) 29 | 30 | try: 31 | tb = err_traceback 32 | while tb: 33 | if _next_is_internal(tb): 34 | tb.tb_next = None # <- Only settable in 3.7 and newer. 35 | break 36 | tb = tb.tb_next 37 | 38 | existing_excepthook(err_type, err_value, err_traceback) 39 | 40 | except (AttributeError, TypeError): 41 | # In older versions of Python, "tb_next" is a read-only attribute. 42 | # Trying to set "tb_next" in versions 3.0 through 3.6 will raise an 43 | # AttributeError whereas versions 2.7 and older will raise a TypeError. 44 | limit = 1 45 | tb = err_traceback 46 | while tb: 47 | if _next_is_internal(tb): 48 | break 49 | limit += 1 50 | tb = tb.tb_next 51 | 52 | import traceback 53 | traceback.print_exception(err_type, err_value, err_traceback, limit) 54 | 55 | -------------------------------------------------------------------------------- /docs/_static/tutorial/modified_test_country_of_birth_unit.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | from datatest import working_directory 4 | from datatest import Select 5 | from datatest import DataTestCase 6 | from datatest import mandatory 7 | from datatest import Missing, Extra, Deviation, Invalid 8 | 9 | 10 | # Define fixtures. 11 | 12 | def setUpModule(): 13 | global detail 14 | global summary 15 | 16 | with working_directory(__file__): 17 | detail = Select('country_of_birth.csv') 18 | summary = Select('estimated_totals.csv') 19 | 20 | 21 | # Begin tests. 
22 | 23 | class TestPopulation(DataTestCase): 24 | 25 | @mandatory 26 | def test_columns(self): 27 | required_set = set(summary.fieldnames) 28 | 29 | with self.accepted(Extra): 30 | self.assertValid(detail.fieldnames, required_set) 31 | 32 | def test_state_labels(self): 33 | data = detail({'state/territory'}) 34 | requirement = summary({'state/territory'}) 35 | 36 | omitted_territory = self.accepted([ 37 | Missing('Jervis Bay Territory'), 38 | ]) 39 | 40 | with omitted_territory: 41 | self.assertValid(data, requirement) 42 | 43 | def test_population_format(self): 44 | data = detail({'population'}) 45 | 46 | def integer_format(x): # <- Helper function. 47 | return str(x).isdecimal() 48 | 49 | self.assertValid(data, integer_format) 50 | 51 | def test_population_sums(self): 52 | data = detail({'state/territory': 'population'}).sum() 53 | requirement = summary({'state/territory': 'population'}).sum() 54 | 55 | omitted_territory = self.accepted({ 56 | 'Jervis Bay Territory': Missing(388), 57 | }) 58 | 59 | with self.acceptedPercent(0.03) | omitted_territory: 60 | self.assertValid(data, requirement) 61 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_intro1.py: -------------------------------------------------------------------------------- 1 | """Example tests using pytest-style conventions.""" 2 | 3 | import re 4 | from datatest import validate 5 | 6 | 7 | def test_using_set(): 8 | """Check for set membership.""" 9 | data = ['A', 'B', 'A'] 10 | 11 | requirement = {'A', 'B'} 12 | 13 | validate(data, requirement) 14 | 15 | 16 | def test_using_function(): 17 | """Check that function returns True.""" 18 | data = [2, 4, 6, 8] 19 | 20 | def is_even(x): 21 | return x % 2 == 0 22 | 23 | validate(data, is_even) 24 | 25 | 26 | def test_using_type(): 27 | """Check that values are of the given type.""" 28 | data = [0.0, 1.0, 2.0] 29 | 30 | validate(data, float) 31 | 32 | 33 | def test_using_regex(): 34 | """Check that 
values match the given pattern.""" 35 | data = ['bake', 'cake', 'bake'] 36 | 37 | regex = re.compile('[bc]ake') 38 | 39 | validate(data, regex) 40 | 41 | 42 | def test_using_string(): 43 | """Check that values equal the given string.""" 44 | data = ['foo', 'foo', 'foo'] 45 | 46 | validate(data, 'foo') 47 | 48 | 49 | def test_using_tuple(): 50 | """Check that tuples of values satisfy corresponding tuple of 51 | requirements. 52 | """ 53 | data = [('A', 0.0), ('A', 1.0), ('A', 2.0)] 54 | 55 | requirement = ('A', float) 56 | 57 | validate(data, requirement) 58 | 59 | 60 | def test_using_dict(): 61 | """Check that values satisfy requirements of matching keys.""" 62 | data = { 63 | 'A': 100, 64 | 'B': 200, 65 | 'C': 300, 66 | } 67 | requirement = { 68 | 'A': 100, 69 | 'B': 200, 70 | 'C': 300, 71 | } 72 | validate(data, requirement) 73 | 74 | 75 | def test_using_list(): 76 | """Check that the order of values match the required sequence.""" 77 | data = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 78 | 79 | requirement = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 80 | 81 | validate(data, requirement) 82 | -------------------------------------------------------------------------------- /docs/discussion/organizing-tests.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: A discussion on organizing a data test suite. 6 | :keywords: data, testing, organizing, incremental, validation 7 | 8 | 9 | ####################### 10 | Organizing a Test Suite 11 | ####################### 12 | 13 | Unlike unit testing of software, it's oftentimes not possible to check 14 | data properties as independent "units" in isolation. Later tests often 15 | depend on the success of earlier ones. For example, it's not useful 16 | to try to check the datatype of an "account_id" column if there's 17 | no column of that name. 
And it might not be useful to sum the values 18 | in an "accounts_payable" column when the associated account IDs 19 | contain invalid datatypes. 20 | 21 | Typically, data tests should be run sequentially where broader, general 22 | features are tested first and specific details are tested later (after 23 | their prerequisite tests have passed). This approach is called "top-down, 24 | incremental testing". You can use the following list as a rough guide 25 | of which features to check before others. 26 | 27 | 28 | Order to Check Features 29 | ----------------------- 30 | 31 | 1. data is accessible (by loading a file or connecting to a data source 32 | via a fixture) 33 | 2. names of tables or worksheets (if applicable) 34 | 3. names of columns 35 | 4. categorical columns: controlled vocabulary, set membership, etc. 36 | 5. foreign-keys (if applicable) 37 | 6. well-formedness of text values: date formats, phone numbers, etc. 38 | 7. datatypes: int, float, datetime, etc. 39 | 8. constraints: uniqueness, minimum and maximum values, etc. 40 | 9. accuracy of quantitative columns: compare sums, counts, or averages 41 | against known-good values 42 | 10. internal consistency, cross-column comparisons, etc. 43 | 44 | 45 | .. 
46 | updating for errors discovered later 47 | don't just fix the data error and move on 48 | instead, devise a test that fails, then fix 49 | the data 50 | 51 | -------------------------------------------------------------------------------- /docs/_static/test_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import re 3 | import datatest 4 | 5 | 6 | class TestExample(datatest.DataTestCase): 7 | def test_membership_in_set(self): 8 | data = ['x', 'x2', 'y', 'y', 'z', 'z'] 9 | required_elements = {'x', 'y', 'z'} 10 | self.assertValid(data, required_elements) 11 | 12 | def test_function_returns_true(self): 13 | data = ['X', 'X', 'Y', 'y'] 14 | def uppercase(x): 15 | return x.isupper() 16 | self.assertValid(data, uppercase) 17 | 18 | def test_regex_matches(self): 19 | data = ['foo', 'foo', 'foo', 'bar', 'bar', 'xx'] 20 | three_letters = re.compile('^\w\w\w$') 21 | self.assertValid(data, three_letters) 22 | 23 | def test_equality(self): 24 | data = ['x', 'x', 'Y'] 25 | other_value = 'x' 26 | self.assertValid(data, other_value) 27 | 28 | def test_order(self): 29 | data = ['x', 'X', 'y', 'y', 'z', 'z'] 30 | my_sequence = ['x', 'x', 'y', 'y', 'z', 'z'] 31 | self.assertValid(data, my_sequence) 32 | 33 | def test_mapping1(self): 34 | data = { 35 | 'x': 'foo', 36 | 'y': 'BAZ', 37 | } 38 | required_values = { 39 | 'x': 'foo', 40 | 'y': 'bar', 41 | } 42 | self.assertValid(data, required_values) 43 | 44 | def test_mapping2(self): 45 | data = { 46 | 'x': 11, 47 | 'y': 13, 48 | } 49 | required_values = { 50 | 'x': 10, 51 | 'y': 15, 52 | } 53 | self.assertValid(data, required_values) 54 | 55 | def test_mapping3(self): 56 | data = { 57 | 'x': 10, 58 | 'y': 15, 59 | 'z': 3000, 60 | } 61 | required_values = { 62 | 'x': 10, 63 | 'y': 15, 64 | 'z': 20, 65 | } 66 | self.assertValid(data, required_values) 67 | 68 | 69 | if __name__ == '__main__': 70 | datatest.main() 71 | 
-------------------------------------------------------------------------------- /docs/how-to/reorder-acceptances.rst: -------------------------------------------------------------------------------- 1 | 2 | .. py:currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to re-order acceptances. 6 | :keywords: datatest, order of operations, acceptance, order 7 | 8 | 9 | ########################### 10 | How to Re-Order Acceptances 11 | ########################### 12 | 13 | Individual acceptances can be combined together to create new acceptances 14 | with narrower or broader criteria (see :ref:`composability-docs`). 15 | When acceptances are combined, their criteria are applied in an order 16 | determined by their scope. Element-wise criteria are applied first, 17 | group-wise criteria are applied second, and whole-error criteria are 18 | applied last (see :ref:`order-of-operations-docs`). 19 | 20 | 21 | Implicit Ordering 22 | ----------------- 23 | 24 | In this first example, we have a combined acceptance made from a 25 | whole-error acceptance, :func:`accepted.count`, and a group-wise 26 | acceptance, :func:`accepted([...]) `: 27 | 28 | .. code-block:: python 29 | :linenos: 30 | :lineno-start: 21 31 | 32 | with accepted.count(4) | accepted([Missing('A'), Missing('B')]): 33 | ... 34 | 35 | Since the :ref:`order-of-operations-docs` specifies that whole-error 36 | acceptances are applied *after* group-wise acceptances, the 37 | ``accepted.count(4)`` criteria is applied last even though it's 38 | defined first. 39 | 40 | 41 | Explicit Ordering 42 | ----------------- 43 | 44 | If you want to control this order explicitly, you can use nested 45 | ``with`` statements to change the default behavior: 46 | 47 | .. code-block:: python 48 | :linenos: 49 | :lineno-start: 21 50 | 51 | with accepted([Missing('A'), Missing('B')]): 52 | with accepted.count(4): 53 | ... 
54 | 55 | Using nested ``with`` statements, the inner-most block is applied 56 | first and outer blocks are applied in order until the outer-most 57 | block is applied last. In this example, the ``accepted.count(4)`` 58 | is applied first because it's declared in the inner-most block. 59 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_intro2.py: -------------------------------------------------------------------------------- 1 | """Example of failing tests using pytest-style conventions.""" 2 | 3 | import re 4 | from datatest import validate 5 | from datatest import accepted 6 | 7 | 8 | def test_using_set(): 9 | """Check for set membership.""" 10 | data = ['A', 'B', 'C', 'D'] 11 | 12 | requirement = {'A', 'B'} 13 | 14 | validate(data, requirement) 15 | 16 | 17 | def test_using_function(): 18 | """Check that function returns True.""" 19 | data = [2, 4, 6, 9] 20 | 21 | def is_even(x): 22 | return x % 2 == 0 23 | 24 | validate(data, is_even) 25 | 26 | 27 | def test_using_type(): 28 | """Check that values are of the given type.""" 29 | data = [0.0, 1.0, 2] 30 | 31 | validate(data, float) 32 | 33 | 34 | def test_using_regex(): 35 | """Check that values match the given pattern.""" 36 | data = ['bake', 'cake', 'fake'] 37 | 38 | regex = re.compile('[bc]ake') 39 | 40 | validate(data, regex) 41 | 42 | 43 | def test_using_string(): 44 | """Check that values equal the given string.""" 45 | data = ['foo', 'foo', 'bar'] 46 | 47 | validate(data, 'foo') 48 | 49 | 50 | def test_using_tuple(): 51 | """Check that tuples of values satisfy corresponding tuple of 52 | requirements. 
53 | """ 54 | data = [('A', 1.0), ('A', 2), ('B', 3.0)] 55 | 56 | requirement = ('A', float) 57 | 58 | validate(data, requirement) 59 | 60 | 61 | def test_using_dict(): 62 | """Check that values satisfy requirements of matching keys.""" 63 | data = { 64 | 'A': 100, 65 | 'B': 200, 66 | 'C': 299, 67 | 'D': 405, 68 | } 69 | requirement = { 70 | 'A': 100, 71 | 'B': 200, 72 | 'C': 300, 73 | 'D': 400, 74 | } 75 | validate(data, requirement) 76 | 77 | 78 | def test_using_list(): 79 | """Check that the order of values match the required sequence.""" 80 | data = ['A', 'D', 'XXX', 'YYY', 'E', 'ZZZ', 'G'] 81 | 82 | requirement = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 83 | 84 | validate(data, requirement) 85 | -------------------------------------------------------------------------------- /docs/_static/tutorial/modified_country_of_birth.csv: -------------------------------------------------------------------------------- 1 | state/territory,country_of_birth,population 2 | Australian Capital Territory,Australia,270033 3 | Australian Capital Territory,China,11351 4 | Australian Capital Territory,England,12757 5 | Australian Capital Territory,India,10414 6 | Australian Capital Territory,New Zealand,4734 7 | Australian Capital Territory,other/unknown,84310 8 | Australian Capital Territory,Philippines,3798 9 | New South Wales,Australia,4899090 10 | New South Wales,China,234508 11 | New South Wales,England,226564 12 | New South Wales,India,143459 13 | New South Wales,New Zealand,117136 14 | New South Wales,other/unknown,1772722 15 | New South Wales,Philippines,86749 16 | Northern Territory,Australia,157531 17 | Northern Territory,England,5583 18 | Northern Territory,Greece,1268 19 | Northern Territory,India,3598 20 | Northern Territory,New Zealand,4636 21 | Northern Territory,other/unknown,50303 22 | Northern Territory,Philippines,5914 23 | Queensland,Australia,3343657 24 | Queensland,China,47114 25 | Queensland,England,180775 26 | Queensland,India,49145 27 | Queensland,New 
Zealand,201206 28 | Queensland,other/unknown,841165 29 | Queensland,South Africa,40131 30 | South Australia,Australia,1192546 31 | South Australia,China,24610 32 | South Australia,England,97392 33 | South Australia,India,27594 34 | South Australia,Italy,18544 35 | South Australia,other/unknown,301630 36 | South Australia,Vietnam,14337 37 | Tasmania,Australia,411490 38 | Tasmania,China,3036 39 | Tasmania,England,18776 40 | Tasmania,Netherlands,2193 41 | Tasmania,New Zealand,4977 42 | Tasmania,other/unknown,67210 43 | Tasmania,Scotland,2283 44 | Victoria,Australia,3845493 45 | Victoria,China,160652 46 | Victoria,England,171443 47 | Victoria,India,169802 48 | Victoria,New Zealand,93253 49 | Victoria,other/unknown,1405194 50 | Victoria,Vietnam,80787 51 | Western Australia,Australia,1492842 52 | Western Australia,England,194163 53 | Western Australia,India,49385 54 | Western Australia,New Zealand,79221 55 | Western Australia,other/unknown,586956 56 | Western Australia,Philippines,30835 57 | Western Australia,South Africa,41008 58 | -------------------------------------------------------------------------------- /docs/_static/tutorial/country_of_birth.csv: -------------------------------------------------------------------------------- 1 | state/territory,country_of_birth,pop 2 | Australian Capital Territory,Australia,270033 3 | Australian Capital Territory,China,11351 4 | Australian Capital Territory,England,12757 5 | Australian Capital Territory,India,10414 6 | Australian Capital Territory,New Zealand,4734 7 | Australian Capital Territory,other/unknown,84310 8 | Australian Capital Territory,Philippines,3798 9 | New South Wales,Australia,4899090 10 | New South Wales,China,234508 11 | New South Wales,England,226564 12 | New South Wales,India,143459 13 | New South Wales,New Zealand,117136 14 | New South Wales,other/unknown,1772722 15 | New South Wales,Philippines,86749 16 | Northern Territory,Australia,157531 17 | Northern Territory,England,5583 18 | Northern 
Territory,Greece,1268 19 | Northern Territory,India,3598 20 | Northern Territory,New Zealand,4636 21 | Northern Territory,other/unknown,50303 22 | Northern Territory,Philippines,5914 23 | Queensland,Australia,3343657 24 | Queensland,China,47114 25 | Queensland,England,180775 26 | Queensland,India,49145 27 | Queensland,New Zealand,201206 28 | Queensland,other/unknown,841165 29 | Queensland,South Africa,40131 30 | South Australia,Australia,1192546 31 | South Australia,China,24610 32 | South Australia,England,"England,97392" 33 | South Australia,India,27594 34 | South Australia,Italy,18544 35 | South Australia,other/unknown,301630 36 | South Australia,Vietnam,14337 37 | Tasmania,Australia,411490 38 | Tasmania,China,3036 39 | Tasmania,England,18776 40 | Tasmania,Netherlands,2193 41 | Tasmania,New Zealand,4977 42 | Tasmania,other/unknown,67210 43 | Tasmania,Scotland,2283 44 | Tasmania,SUBTOTAL,509965 45 | Victoria,Australia,3845493 46 | Victoria,China,160652 47 | Victoria,England,171443 48 | Victoria,India,169802 49 | Victoria,New Zealand,93253 50 | Victoria,other/unknown,1405194 51 | Victoria,Vietnam,80787 52 | Western Australia,Australia,1492842 53 | Western Australia,England,194163 54 | Western Australia,India,49385 55 | Western Australia,New Zealand,79221 56 | Western Australia,other/unknown,586956 57 | Western Australia,Philippines,30835 58 | Western Australia,South Africa,41008 59 | -------------------------------------------------------------------------------- /run-tests.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | REM ******************************************************************** 3 | REM File: run-tests.bat 4 | REM Description: Runs test suite under all supported versions of Python 5 | REM and displays failures when encountered. 
6 | REM ******************************************************************** 7 | 8 | GOTO:mainProgram 9 | 10 | REM ******************************************************************** 11 | REM Define function (takes command to run as a single argument). 12 | REM ******************************************************************** 13 | :runCommand 14 | SETLOCAL & IF %GLOBAL_ERRORLEVEL% NEQ 0 ENDLOCAL & GOTO:EOF 15 | ECHO. 16 | ECHO ====================================================================== 17 | ECHO %~1 18 | ECHO ====================================================================== 19 | CALL %~fs1 20 | IF %ERRORLEVEL% NEQ 0 ( 21 | ECHO. 22 | ECHO Failed Command: %~1 23 | ) 24 | ENDLOCAL & SET GLOBAL_ERRORLEVEL=%ERRORLEVEL% 25 | GOTO:EOF 26 | 27 | 28 | REM ******************************************************************** 29 | REM Run test suite in all supported versions of Python. 30 | REM ******************************************************************** 31 | :mainProgram 32 | 33 | SET GLOBAL_ERRORLEVEL=0 34 | 35 | CALL :runCommand "C:\Program Files\Python37\python.exe -B -m unittest %*" 36 | CALL :runCommand "C:\Program Files\Python 3.6\python.exe -B -m unittest %*" 37 | CALL :runCommand "C:\Program Files\Python 3.5\python.exe -B -m unittest %*" 38 | CALL :runCommand "C:\Python34\python.exe -B -m unittest %*" 39 | CALL :runCommand "C:\Python33\python.exe -B -m unittest %*" 40 | CALL :runCommand "C:\Python32\python.exe -B -m unittest %*" 41 | CALL :runCommand "C:\Python31\python.exe -B tests/discover.py %*" 42 | CALL :runCommand "C:\Python27\python.exe -B -m unittest discover %*" 43 | CALL :runCommand "C:\Python26\python.exe -B tests/discover.py %*" 44 | 45 | IF %GLOBAL_ERRORLEVEL% EQU 0 ( 46 | ECHO. 47 | ECHO All commands successful. 
48 | ) 49 | -------------------------------------------------------------------------------- /tests/past_api07_sources_excel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import os 4 | from . import _unittest as unittest 5 | from .mixins import OtherTests 6 | from .mixins import CountTests 7 | 8 | try: 9 | import xlrd 10 | except ImportError: 11 | xlrd = None 12 | 13 | from datatest.__past__.api07_sources import ExcelSource 14 | 15 | workbook_path = os.path.join( 16 | os.path.dirname(__file__), 17 | 'sample_files', 18 | 'test_sources_excel.xlsx', 19 | ) 20 | 21 | 22 | @unittest.skipUnless(xlrd, 'requires xlrd') 23 | class TestExcelSource(OtherTests, unittest.TestCase): 24 | def setUp(self): 25 | global workbook_path 26 | self.datasource = ExcelSource(workbook_path) # <- Defaults to "Sheet 1" 27 | 28 | 29 | @unittest.skipUnless(xlrd, 'requires xlrd') 30 | class TestExcelSourceCount(unittest.TestCase): 31 | #class TestExcelSourceCount(CountTests, unittest.TestCase): 32 | def setUp(self): 33 | global workbook_path 34 | self.datasource = ExcelSource(workbook_path, 'count_data') 35 | 36 | def test_count(self): 37 | count = self.datasource.count 38 | 39 | self.assertEqual(9, count('label1')) 40 | 41 | expected = {'a': 4, 'b': 5} 42 | result = count('label1', ['label1']) 43 | self.assertEqual(expected, result) 44 | 45 | expected = {'a': 3, 'b': 3} # Counts only truthy values (not '' or None). 46 | result = count('label2', ['label1']) 47 | self.assertEqual(expected, result) 48 | 49 | expected = { 50 | ('a', 'x'): 2, 51 | ('a', 'y'): 1, 52 | ('a', ''): 1, 53 | ('b', 'z'): 1, 54 | ('b', 'y'): 1, 55 | ('b', 'x'): 1, 56 | #('b', None): 1, # <- None value has no equivalent in XLSX file. 
57 | #('b', ''): 1, 58 | ('b', ''): 2, 59 | } 60 | result = count('label1', ['label1', 'label2']) 61 | self.assertEqual(expected, result) 62 | 63 | expected = {'x': 2, 'y': 1, '': 1} 64 | result = count('label1', 'label2', label1='a') 65 | self.assertEqual(expected, result) 66 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_intro1_unit.py: -------------------------------------------------------------------------------- 1 | """Example tests using unittest-style conventions.""" 2 | 3 | import re 4 | import datatest 5 | 6 | 7 | class ExampleTests(datatest.DataTestCase): 8 | 9 | def test_using_set(self): 10 | """Check for set membership.""" 11 | data = ['A', 'B', 'A'] 12 | 13 | requirement = {'A', 'B'} 14 | 15 | self.assertValid(data, requirement) 16 | 17 | def test_using_function(self): 18 | """Check that function returns True.""" 19 | data = [2, 4, 6, 8] 20 | 21 | def is_even(x): 22 | return x % 2 == 0 23 | 24 | self.assertValid(data, is_even) 25 | 26 | def test_using_type(self): 27 | """Check that values are of the given type.""" 28 | data = [0.0, 1.0, 2.0] 29 | 30 | self.assertValid(data, float) 31 | 32 | def test_using_regex(self): 33 | """Check that values match the given pattern.""" 34 | data = ['bake', 'cake', 'bake'] 35 | 36 | regex = re.compile('[bc]ake') 37 | 38 | self.assertValid(data, regex) 39 | 40 | def test_using_string(self): 41 | """Check that values equal the given string.""" 42 | data = ['foo', 'foo', 'foo'] 43 | 44 | self.assertValid(data, 'foo') 45 | 46 | def test_using_tuple(self): 47 | """Check that tuples of values satisfy corresponding tuple of 48 | requirements. 
49 | """ 50 | data = [('A', 0.0), ('A', 1.0), ('A', 2.0)] 51 | 52 | requirement = ('A', float) 53 | 54 | self.assertValid(data, requirement) 55 | 56 | def test_using_dict(self): 57 | """Check that values satisfy requirements of matching keys.""" 58 | data = { 59 | 'A': 100, 60 | 'B': 200, 61 | 'C': 300, 62 | } 63 | requirement = { 64 | 'A': 100, 65 | 'B': 200, 66 | 'C': 300, 67 | } 68 | self.assertValid(data, requirement) 69 | 70 | def test_using_list(self): 71 | """Check that the order of values match the required sequence.""" 72 | data = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 73 | 74 | requirement = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 75 | 76 | self.assertValid(data, requirement) 77 | 78 | 79 | if __name__ == '__main__': 80 | datatest.main() 81 | -------------------------------------------------------------------------------- /docs/_static/tutorial/test_intro2_unit.py: -------------------------------------------------------------------------------- 1 | """Example of failing tests using unittest-style conventions.""" 2 | 3 | import re 4 | import datatest 5 | 6 | 7 | class ExampleTests(datatest.DataTestCase): 8 | def test_using_set(self): 9 | """Check for set membership.""" 10 | data = ['A', 'B', 'C', 'D'] 11 | 12 | requirement = {'A', 'B'} 13 | 14 | self.assertValid(data, requirement) 15 | 16 | def test_using_function(self): 17 | """Check that function returns True.""" 18 | data = [2, 4, 6, 9] 19 | 20 | def is_even(x): 21 | return x % 2 == 0 22 | 23 | self.assertValid(data, is_even) 24 | 25 | def test_using_type(self): 26 | """Check that values are of the given type.""" 27 | data = [0.0, 1.0, 2] 28 | 29 | self.assertValid(data, float) 30 | 31 | def test_using_regex(self): 32 | """Check that values match the given pattern.""" 33 | data = ['bake', 'cake', 'fake'] 34 | 35 | regex = re.compile('[bc]ake') 36 | 37 | self.assertValid(data, regex) 38 | 39 | def test_using_string(self): 40 | """Check that values equal the given string.""" 41 | data = ['foo', 'foo', 'bar'] 42 
| 43 | self.assertValid(data, 'foo') 44 | 45 | def test_using_tuple(self): 46 | """Check that tuples of values satisfy corresponding tuple of 47 | requirements. 48 | """ 49 | data = [('A', 1.0), ('A', 2), ('B', 3.0)] 50 | 51 | requirement = ('A', float) 52 | 53 | self.assertValid(data, requirement) 54 | 55 | def test_using_dict(self): 56 | """Check that values satisfy requirements of matching keys.""" 57 | data = { 58 | 'A': 100, 59 | 'B': 200, 60 | 'C': 299, 61 | 'D': 405, 62 | } 63 | requirement = { 64 | 'A': 100, 65 | 'B': 200, 66 | 'C': 300, 67 | 'D': 400, 68 | } 69 | self.assertValid(data, requirement) 70 | 71 | def test_using_list(self): 72 | """Check that the order of values match the required sequence.""" 73 | data = ['A', 'D', 'XXX', 'YYY', 'E', 'ZZZ', 'G'] 74 | 75 | requirement = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] 76 | 77 | self.assertValid(data, requirement) 78 | 79 | 80 | if __name__ == '__main__': 81 | datatest.main() 82 | -------------------------------------------------------------------------------- /tests/test_past_subprocesses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test backwards compatibility modules using separate subprocesses.""" 3 | import subprocess 4 | import sys 5 | 6 | from datatest._compatibility import textwrap 7 | from . import _unittest as unittest 8 | from .common import ignore_deprecations 9 | 10 | 11 | @ignore_deprecations 12 | class TestBackwardsCompatibility(unittest.TestCase): 13 | def assertSubprocess(self, module): 14 | """Run given *module* in separate process--fails if return code 15 | indicates an error. 16 | """ 17 | command = [sys.executable, '-B', '-O', '-m', module] 18 | p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 19 | stdout_bytes, stderr_bytes = p.communicate() # Closes file-like object. 20 | 21 | # A non-zero return code indicates the command was not successful. 
22 | if p.returncode != 0: 23 | output = stdout_bytes + stderr_bytes # Get all output. 24 | output = output.decode('utf-8') # Convert bytes to str. 25 | output = textwrap.wrap(output, width=70) # Get list of wrapped lines. 26 | output = '\n'.join(output) # Join list items as str. 27 | output = textwrap.indent(output, ' ') # Indent lines by 4 spaces. 28 | 29 | msg = '\n'.join([ 30 | 'Subprocess failed:', 31 | output, 32 | '', 33 | 'To run this test directly, use the following command:', 34 | ' '.join(command), 35 | ]) 36 | self.fail(msg) 37 | 38 | def test_api00(self): 39 | """Test compatibility with pre-release alpha API.""" 40 | self.assertSubprocess('tests.past_api00') 41 | 42 | def test_api06(self): 43 | """Test compatibility with first development-release API.""" 44 | self.assertSubprocess('tests.past_api06') 45 | 46 | def test_api07(self): 47 | """Test compatibility with second development-release API.""" 48 | self.assertSubprocess('tests.past_api07') 49 | 50 | def test_api08(self): 51 | """Test compatibility with version 0.8 API.""" 52 | self.assertSubprocess('tests.past_api08') 53 | 54 | def test_api09(self): 55 | """Test compatibility with version 0.9 API.""" 56 | self.assertSubprocess('tests.past_api09') 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /tests/past_api07_error.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from . 
import _unittest as unittest 3 | 4 | from datatest.__past__.api07_diffs import xMissing 5 | from datatest.__past__.api07_error import DataError 6 | 7 | 8 | class TestDataError(unittest.TestCase): 9 | def test_subclass(self): 10 | self.assertTrue(issubclass(DataError, AssertionError)) 11 | 12 | def test_instantiation(self): 13 | DataError('column names', xMissing('foo')) 14 | DataError('column names', [xMissing('foo')]) 15 | DataError('column names', {'foo': xMissing('bar')}) 16 | DataError('column names', {('foo', 'bar'): xMissing('baz')}) 17 | 18 | with self.assertRaises(ValueError, msg='Empty error should raise exception.'): 19 | DataError(msg='', differences={}) 20 | 21 | def test_repr(self): 22 | error = DataError('different columns', [xMissing('foo')]) 23 | pattern = "DataError: different columns:\n xMissing('foo')" 24 | self.assertEqual(repr(error), pattern) 25 | 26 | error = DataError('different columns', xMissing('foo')) 27 | pattern = "DataError: different columns:\n xMissing('foo')" 28 | self.assertEqual(repr(error), pattern) 29 | 30 | # Test pprint lists. 31 | error = DataError('different columns', [xMissing('foo'), 32 | xMissing('bar')]) 33 | pattern = ("DataError: different columns:\n" 34 | " xMissing('foo'),\n" 35 | " xMissing('bar')") 36 | self.assertEqual(repr(error), pattern) 37 | 38 | # Test dictionary. 39 | error = DataError('different columns', {'FOO': xMissing('bar')}) 40 | pattern = ("DataError: different columns:\n" 41 | " 'FOO': xMissing('bar')") 42 | self.assertEqual(repr(error), pattern) 43 | 44 | def test_verbose_repr(self): 45 | reference = 'reference-data-source' 46 | subject = 'subject-data-source' 47 | error = DataError('different columns', [xMissing('foo')], subject, reference) 48 | error._verbose = True # <- Set verbose flag, here! 
49 | 50 | pattern = ("DataError: different columns:\n" 51 | " xMissing('foo')\n" 52 | "\n" 53 | "SUBJECT:\n" 54 | "subject-data-source\n" 55 | "REQUIRED:\n" 56 | "reference-data-source") 57 | self.assertEqual(repr(error), pattern) 58 | -------------------------------------------------------------------------------- /docs/how-to/get-started.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to get started. 6 | :keywords: datatest, example, getting started 7 | 8 | 9 | ############################### 10 | How to Get Started With Testing 11 | ############################### 12 | 13 | Once you have reviewed the tutorials and have a basic understanding 14 | of datatest, you should be ready to start testing your own data. 15 | 16 | 17 | ========================================= 18 | 1. Create a File and Add Some Sample Code 19 | ========================================= 20 | 21 | A simple way to get started is to create a **.py** file in the same folder 22 | as the data you want to test. It's a good idea to follow established testing 23 | conventions and make sure your filename starts with "**test\_**". 24 | 25 | Then, copy one of following the **pytest** or **unittest** code samples 26 | to use as a template for writing your own tests: 27 | 28 | .. raw:: html 29 | 30 |
31 | Pytest Samples 32 | 33 | .. include:: ../intro/automated-testing.rst 34 | :start-after: start-inclusion-marker-pytestsamples 35 | :end-before: end-inclusion-marker-pytestsamples 36 | 37 | .. raw:: html 38 | 39 |
40 | 41 | 42 | .. raw:: html 43 | 44 |
45 | Unittest Samples 46 | 47 | .. include:: ../intro/automated-testing.rst 48 | :start-after: start-inclusion-marker-unittestsamples 49 | :end-before: end-inclusion-marker-unittestsamples 50 | 51 | .. raw:: html 52 | 53 |
54 | 55 | 56 | ========================================== 57 | 2. Adapt the Sample Code to Suit Your Data 58 | ========================================== 59 | 60 | After copying the sample code into your own file, begin adapting 61 | it to suit your data: 62 | 63 | 1. Change the fixture to use your data (instead of "example.csv"). 64 | 2. Update the set in ``test_column_names()`` to require the names your 65 | data should contain (instead of "A", "B", and "C"). 66 | 3. Rename ``test_a()`` and change it to check values in one of the 67 | columns in your data. 68 | 4. Add more tests appropriate for your own data requirements. 69 | 70 | 71 | =================================== 72 | 3. Refactor Your Tests as They Grow 73 | =================================== 74 | 75 | As your tests grow, look to structure them into related groups. Start 76 | by creating separate classes to contain groups of related test cases. 77 | And as you develop more and more classes, create separate modules to 78 | hold groups of related classes. If you are using ``pytest``, move your 79 | fixtures into a ``conftest.py`` file. 80 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import glob 3 | import os 4 | import shutil 5 | import sys 6 | import tempfile 7 | import warnings 8 | from functools import wraps 9 | 10 | from . import _io as io 11 | from . import _unittest as unittest 12 | 13 | 14 | class MkdtempTestCase(unittest.TestCase): 15 | # TestCase changes cwd to temporary location. After testing, 16 | # removes files and restores original cwd. 
17 | @classmethod 18 | def setUpClass(cls): 19 | cls._orig_dir = os.getcwd() 20 | cls._temp_dir = tempfile.mkdtemp() # Requires mkdtemp--cannot 21 | 22 | @classmethod 23 | def tearDownClass(cls): 24 | os.rmdir(cls._temp_dir) 25 | 26 | def setUp(self): 27 | os.chdir(self._temp_dir) 28 | 29 | def tearDown(self): 30 | for path in glob.glob(os.path.join(self._temp_dir, '*')): 31 | if os.path.isdir(path): 32 | shutil.rmtree(path) 33 | else: 34 | os.remove(path) 35 | os.chdir(self._orig_dir) 36 | 37 | 38 | def ignore_deprecations(obj): 39 | """A class and function decorator to ignore DeprecationWarnings.""" 40 | def decorate(func): 41 | @wraps(func) 42 | def wrapper(*args, **kwds): 43 | with warnings.catch_warnings(): 44 | warnings.simplefilter('ignore', DeprecationWarning) 45 | return func(*args, **kwds) 46 | return wrapper 47 | 48 | if isinstance(obj, type): 49 | # If object is a class, decorate its methods. 50 | for key, val in obj.__dict__.items(): 51 | if callable(val): 52 | setattr(obj, key, decorate(val)) 53 | else: 54 | # Else decorate the object itself. 55 | obj = decorate(obj) 56 | 57 | return obj 58 | 59 | 60 | try: 61 | unittest.TestCase.setUpClass # New in 2.7 62 | except AttributeError: 63 | _MkdtempTestCase = MkdtempTestCase 64 | class MkdtempTestCase(_MkdtempTestCase): 65 | def setUp(self): 66 | self.setUpClass.__func__(self) 67 | _MkdtempTestCase.setUp(self) 68 | 69 | def tearDown(self): 70 | _MkdtempTestCase.tearDown(self) 71 | self.tearDownClass.__func__(self) 72 | 73 | 74 | def make_csv_file(fieldnames, datarows): 75 | """Helper function to make CSV file-like object using *fieldnames* 76 | (a list of field names) and *datarows* (a list of lists containing 77 | the row values). 78 | """ 79 | init_string = [] 80 | init_string.append(','.join(fieldnames)) # Concat cells into row. 81 | for row in datarows: 82 | row = [str(cell) for cell in row] 83 | init_string.append(','.join(row)) # Concat cells into row. 
84 | init_string = '\n'.join(init_string) # Concat rows into final string. 85 | return io.StringIO(init_string) 86 | -------------------------------------------------------------------------------- /tests/test_pandas_integration.py: -------------------------------------------------------------------------------- 1 | """Tests for Pandas accessor extensions.""" 2 | from . import _unittest as unittest 3 | 4 | try: 5 | import pandas 6 | except ImportError: 7 | pandas = None 8 | 9 | from datatest import Invalid 10 | from datatest import ValidationError 11 | from datatest import register_accessors 12 | 13 | 14 | @unittest.skipUnless(pandas, 'requires pandas') 15 | class TestAccessorExtensions(unittest.TestCase): 16 | """Test Pandas accessors.""" 17 | def setUp(self): # Change to `setUpClass` when dropping 18 | register_accessors() # support for Python 2.6 and 3.1. 19 | self.df = pandas.DataFrame( 20 | data=[(1, 'x'), (2, 'y'), (3, 'z')], 21 | columns=['A', 'B'], 22 | ) 23 | 24 | def test_dataframe_success(self): 25 | # Should pass without error on success. 26 | self.df.validate((int, str)) 27 | 28 | def test_dataframe_failure(self): 29 | with self.assertRaises(ValidationError) as cm: 30 | is_odd = lambda x: x % 2 == 1 31 | self.df.validate((is_odd, str)) 32 | 33 | actual = cm.exception.differences 34 | expected = [Invalid((2, 'y'))] 35 | self.assertEqual(actual, expected) 36 | 37 | def test_series_success(self): 38 | # Should pass without error on success. 39 | self.df.columns.validate.order(['A', 'B']) # Columns are a Series 40 | self.df['A'].validate(int) # A selected Series of column values. 41 | 42 | def test_series_failure(self): 43 | with self.assertRaises(ValidationError) as cm: 44 | is_odd = lambda x: x % 2 == 1 45 | self.df['A'].validate(is_odd) 46 | 47 | self.assertEqual(cm.exception.differences, [Invalid(2)]) 48 | 49 | def test_index_success(self): 50 | # Should pass without error on success. 
51 | self.df.index.validate(int) 52 | 53 | def test_index_failure(self): 54 | with self.assertRaises(ValidationError) as cm: 55 | is_odd = lambda x: x % 2 == 1 56 | self.df.index.validate(is_odd) 57 | 58 | actual = cm.exception.differences 59 | expected = [Invalid(0), Invalid(2)] 60 | self.assertEqual(actual, expected) 61 | 62 | def test_multiindex_success(self): 63 | # Should pass without error on success. 64 | multi_index = pandas.MultiIndex.from_arrays( 65 | [[1, 1, 2], ['A', 'B', 'C']], 66 | names=('number', 'letter') 67 | ) 68 | multi_index.validate((int, str)) 69 | 70 | def test_multiindex_failure(self): 71 | multi_index = pandas.MultiIndex.from_arrays( 72 | [[1, 1, 2], ['A', 'B', 'C']], 73 | names=('number', 'letter') 74 | ) 75 | with self.assertRaises(ValidationError) as cm: 76 | is_odd = lambda x: x % 2 == 1 77 | multi_index.validate((is_odd, str)) 78 | 79 | self.assertEqual(cm.exception.differences, [Invalid((2, 'C'))]) 80 | -------------------------------------------------------------------------------- /docs/_ext/autodoc_classinstance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """Sphinx Extension: autodoc_classinstance (written by Shawn Brown).""" 4 | from sphinx.domains.python import PyClasslike 5 | from sphinx.ext.autodoc import ClassDocumenter 6 | from sphinx.ext.autodoc import MethodDocumenter 7 | from sphinx.util import inspect 8 | 9 | 10 | class PyClassInstance(PyClasslike): 11 | """ 12 | Description of a class-instance object. 13 | """ 14 | def get_signature_prefix(self, sig): 15 | return '' # Omit "class" prefix for instances. 16 | 17 | 18 | class ClassInstanceDocumenter(ClassDocumenter): 19 | """ 20 | Specialized Documenter subclass for class instances. 
21 | """ 22 | objtype = 'classinstance' 23 | 24 | @classmethod 25 | def can_document_member(cls, member, membername, isattr, parent): 26 | return not isinstance(member, type) 27 | 28 | def import_object(self): 29 | ret = super().import_object() 30 | self.doc_as_attr = False # never document as a data/attribute 31 | return ret 32 | 33 | def format_args(self): 34 | # for instances, the relevant signature is the __call__ method's 35 | callmeth = self.get_attr(self.object, '__call__', None) 36 | if callmeth: 37 | sig = inspect.Signature(callmeth, bound_method=True, has_retval=True) 38 | return sig.format_args() 39 | return None 40 | 41 | 42 | class AlternateMethodDocumenter(MethodDocumenter): 43 | """ 44 | Alternative documenter for methods of classes and class instances. 45 | """ 46 | def add_directive_header(self, sig): 47 | if isinstance(self.parent, type): 48 | # If parent is a class definition, then add header as normal. 49 | super(AlternateMethodDocumenter, self).add_directive_header(sig) 50 | else: 51 | # When parent is an instance, then add a special header 52 | # (calls superclass' superclass method). 53 | super(MethodDocumenter, self).add_directive_header(sig) 54 | 55 | # Tag async methods but do not tag abstract, class, or 56 | # static methods. 57 | parentclass = self.parent.__class__ 58 | obj = parentclass.__dict__.get(self.object_name, self.object) 59 | if inspect.iscoroutinefunction(obj): 60 | sourcename = self.get_sourcename() 61 | self.add_line(' :async:', sourcename) 62 | 63 | 64 | def setup(app): 65 | app.add_directive('classinstance', PyClassInstance) 66 | app.add_directive('py:classinstance', PyClassInstance) 67 | app.add_autodocumenter(ClassInstanceDocumenter) 68 | 69 | # If sphinx.ext.autosummary is used, it will override the 70 | # existing autodocumenters on the 'builder-inited' event. 71 | # Adding AlternateMethodDocumenter after this event makes 72 | # sure it isn't overridden. 
"""Backward compatibility for version 0.9 API."""
from __future__ import absolute_import

import datatest
from datatest._compatibility.collections.abc import Mapping
from datatest._compatibility.collections.abc import Set
from datatest._normalize import normalize
from datatest._utils import IterItems


class RequiredSubset_090(datatest.requirements.GroupRequirement):
    """Implements inverted subset behavior from 0.9.x API: the group
    under test must contain every element of the given *requirement*.
    """
    def __init__(self, requirement):
        if not isinstance(requirement, Set):
            requirement = set(requirement)
        self._set = requirement

    def check_group(self, group):
        """Return a ``(differences, description)`` tuple where
        *differences* lazily yields a Missing() difference for each
        required element not found in *group*.
        """
        # Use set() instead of .copy()--if the caller supplied an
        # immutable Set (e.g. frozenset), .copy() returns another
        # immutable set and discard() below would raise AttributeError.
        missing = set(self._set)
        for element in group:
            if not missing:
                break  # Every required element already seen--stop early.
            missing.discard(element)

        # BUGFIX: `Missing` was referenced as a bare name but never
        # imported in this module, so consuming the generator raised
        # NameError. Reference it through the imported package instead.
        differences = (datatest.Missing(element) for element in missing)
        description = 'must contain all elements of given requirement'
        return differences, description


class RequiredSuperset_090(datatest.requirements.GroupRequirement):
    """Implements inverted superset behavior from 0.9.x API: the group
    under test may only contain elements of the given *requirement*.
    """
    def __init__(self, requirement):
        if not isinstance(requirement, Set):
            requirement = set(requirement)
        self._set = requirement

    def check_group(self, group):
        """Return a ``(differences, description)`` tuple where
        *differences* lazily yields an Extra() difference for each
        element of *group* not present in the requirement.
        """
        superset = self._set
        extras = set()
        for element in group:
            if element not in superset:
                extras.add(element)

        # BUGFIX: `Extra` was referenced as a bare name but never
        # imported (NameError when the generator was consumed)--use
        # the package-level name instead.
        differences = (datatest.Extra(element) for element in extras)
        description = 'may only contain elements of given requirement'
        return differences, description


class ValidateType(datatest.validation.ValidateType):
    """ValidateType subclass whose subset()/superset() methods use the
    inverted (pre-0.10) semantics.
    """
    def subset(self, data, requirement, msg=None):
        """Implements API 0.9.x subset behavior."""
        __tracebackhide__ = datatest.validation._pytest_tracebackhide

        requirement = normalize(requirement, lazy_evaluation=False, default_type=set)

        if isinstance(requirement, (Mapping, IterItems)):
            # Apply the subset requirement group-by-group for mappings.
            factory = RequiredSubset_090
            requirement = datatest.requirements.RequiredMapping(requirement, factory)
        else:
            requirement = RequiredSubset_090(requirement)

        self(data, requirement, msg=msg)

    def superset(self, data, requirement, msg=None):
        """Implements API 0.9.x superset behavior."""
        __tracebackhide__ = datatest.validation._pytest_tracebackhide

        requirement = normalize(requirement, lazy_evaluation=False, default_type=set)

        if isinstance(requirement, (Mapping, IterItems)):
            # Apply the superset requirement group-by-group for mappings.
            factory = RequiredSuperset_090
            requirement = datatest.requirements.RequiredMapping(requirement, factory)
        else:
            requirement = RequiredSuperset_090(requirement)

        self(data, requirement, msg=msg)


# Replace the module-level validate instance with the 0.9-style one.
datatest.validate = ValidateType()
# Map old (0.6.0.dev0) acceptance names onto their newer equivalents.
DataTestCase.allowSpecified = DataTestCase.allowOnly
DataTestCase.allowUnspecified = DataTestCase.allowAny
DataTestCase.allowDeviationPercent = DataTestCase.allowPercentDeviation

# Assertions.
from .api06 import _assertDataCount
DataTestCase.assertValueCount = _assertDataCount

# Map old assertion names onto their newer "Subject" equivalents.
DataTestCase.assertColumnSet = DataTestCase.assertSubjectColumns
DataTestCase.assertValueSet = DataTestCase.assertSubjectSet
DataTestCase.assertValueSum = DataTestCase.assertSubjectSum
DataTestCase.assertValueRegex = DataTestCase.assertSubjectRegex
DataTestCase.assertValueNotRegex = DataTestCase.assertSubjectNotRegex


def _assertColumnSubset(self, ref=None, msg=None):
    """Test that the set of subject columns is a subset of reference
    columns. If *ref* is provided, it is used in-place of the set
    from ``referenceData``.
    """
    # A subset check is an exact-set check where "missing" differences
    # (columns in *ref* but absent from the subject) are acceptable.
    try:
        self.assertColumnSet(ref, msg)
    except datatest.DataAssertionError:
        with self.allowMissing():
            self.assertColumnSet(ref, msg)

DataTestCase.assertColumnSubset = _assertColumnSubset


def _assertColumnSuperset(self, ref=None, msg=None):
    """Test that the set of subject columns is a superset of reference
    columns. If *ref* is provided, it is used in-place of the set
    from ``referenceData``.
    """
    # A superset check accepts "extra" differences (columns in the
    # subject that are absent from *ref*).
    try:
        self.assertColumnSet(ref, msg)
    except datatest.DataAssertionError:
        with self.allowExtra():
            self.assertColumnSet(ref, msg)

DataTestCase.assertColumnSuperset = _assertColumnSuperset


def _assertValueSubset(self, column, ref=None, msg=None, **filter_by):
    """Test that the set of subject values is a subset of reference
    values for the given *column*. If *ref* is provided, it is used
    in place of the set from ``referenceData``.
    """
    # Same retry-with-acceptance pattern as _assertColumnSubset, but
    # for the values of a single column (optionally filtered).
    try:
        self.assertValueSet(column, ref, msg, **filter_by)
    except datatest.DataAssertionError:
        with self.allowMissing():
            self.assertValueSet(column, ref, msg, **filter_by)

DataTestCase.assertValueSubset = _assertValueSubset


def _assertValueSuperset(self, column, ref=None, msg=None, **filter_by):
    """Test that the set of subject values is a superset of reference
    values for the given *column*. If *ref* is provided, it is used
    in place of the set from ``referenceData``.
    """
    try:
        self.assertValueSet(column, ref, msg, **filter_by)
    except datatest.DataAssertionError:
        with self.allowExtra():
            self.assertValueSet(column, ref, msg, **filter_by)

DataTestCase.assertValueSuperset = _assertValueSuperset
It was designed to work with poorly formatted 28 | data by detecting and describing validation failures. 29 | 30 | * |Validate| the format, type, set membership, and more from a variety of data 31 | sources including pandas ``DataFrames`` and ``Series``, NumPy ``ndarrays``, 32 | built-in data structures, etc. 33 | * Smart |comparison behavior| applies the appropriate validation method for 34 | a given data requirement. 35 | * Automatic |data handling| manages the validation of single elements, 36 | sequences, sets, dictionaries, and other containers of elements. 37 | * |Difference objects| characterize the discrepancies and deviations 38 | between a dataset and its requirements. 39 | * |Acceptance managers| distinguish between ideal criteria and acceptable 40 | differences. 41 | 42 | .. |Validate| replace:: :ref:`Validate ` 43 | .. |comparison behavior| replace:: :ref:`comparison behavior ` 44 | .. |data handling| replace:: :ref:`data handling ` 45 | .. |Difference objects| replace:: :ref:`Difference objects ` 46 | .. |Acceptance managers| replace:: :ref:`Acceptance managers ` 47 | 48 | 49 | **Test driven data-wrangling** is a process for taking data from a source 50 | of unverified quality or format and producing a verified, well-formatted 51 | dataset. It repurposes software testing practices for data preparation 52 | and quality assurance projects. **Pipeline validation** monitors the status 53 | and quality of data as it passes through a pipeline and identifies *where* 54 | in a pipeline an error occurs. 55 | 56 | See the project `README `_ file for 57 | full details regarding supported versions, backward compatibility, and 58 | more. 59 | 60 | 61 | ================= 62 | Table of Contents 63 | ================= 64 | 65 | .. toctree:: 66 | :caption: Documentation 67 | :hidden: 68 | 69 | Home 70 | 71 | 72 | .. toctree:: 73 | :maxdepth: 2 74 | 75 | intro/index 76 | how-to/index 77 | reference/index 78 | discussion/index 79 | 80 | .. 
class working_directory(contextlib.ContextDecorator):
    """A context manager to temporarily set the working directory
    to a given *path*. If *path* specifies a file, the file's
    directory is used. When exiting the with-block, the working
    directory is automatically changed back to its previous
    location.

    **Context Manager:**

    You can use Python's :py:obj:`__file__` constant to load data
    relative to a file's current directory:

    .. code-block:: python
        :emphasize-lines: 4

        from datatest import working_directory
        import pandas as pd

        with working_directory(__file__):
            my_df = pd.read_csv('myfile.csv')

    **Decorator:**

    This context manager can also be used as a decorator:

    .. code-block:: python
        :emphasize-lines: 4

        from datatest import working_directory
        import pandas as pd

        @working_directory(__file__)
        def my_df():
            return pd.read_csv('myfile.csv')

    **Explicit Control:**

    In some cases, you may want to forgo the use of a context manager
    or decorator. You can explicitly control directory switching with
    the ``change()`` and ``revert()`` methods:

    .. code-block:: python
        :emphasize-lines: 4,8

        from datatest import working_directory

        work_dir = working_directory(__file__)
        work_dir.change()

        ...

        work_dir.revert()
    """
    def __init__(self, path):
        # A file path is resolved to its containing directory.
        target = os.path.dirname(path) if os.path.isfile(path) else path
        self._working_dir = os.path.abspath(target)
        self._original_dir = None  # Non-None only while context is active.

    def __enter__(self):
        if self._original_dir:
            # Already active--reentering would clobber the saved
            # directory, so refuse loudly.
            raise RuntimeError(
                'cannot reenter {0}, already entered from {1!r}'.format(
                    self.__class__.__name__,
                    self._original_dir,
                )
            )

        self._original_dir = os.path.abspath(os.getcwd())
        os.chdir(self._working_dir)

    def __exit__(self, exc_type, exc_value, traceback):
        if not self._original_dir:
            return  # Not currently entered--nothing to restore.
        os.chdir(self._original_dir)
        self._original_dir = None

    def change(self):
        """Change to the defined working directory (enter the context).

        While operating in a working directory context, you cannot
        enter it again. Calling ``change()`` a second time will raise
        a :py:class:`RuntimeError`---you must first call ``revert()``.
        """
        self.__enter__()

    def revert(self):
        """Revert to the original working directory (exit the context).

        If no context has been entered, calling ``revert()`` will do
        nothing and pass without error.
        """
        self.__exit__(None, None, None)
This critically important 16 | phase is time-consuming, unglamorous, and often poorly structured. 17 | 18 | The :mod:`datatest` package was created to support test driven 19 | data-wrangling and provide a disciplined approach to an otherwise 20 | messy process. 21 | 22 | A datatest suite can facilitate quick edit-test cycles to help guide 23 | the selection, cleaning, integration, and formatting of data. Data tests 24 | can also help to automate check-lists, measure progress, and promote 25 | best practices. 26 | 27 | 28 | ************************** 29 | Test Driven Data-Wrangling 30 | ************************** 31 | 32 | When data is messy, poorly structured, or uses an incompatible format, 33 | it's oftentimes not possible to prepare it using an automated process. 34 | There are a multitude of ways for messy data to confound a processing 35 | system or schema. Dealing with data like this requires a data-wrangling 36 | approach where users are actively involved with making decisions and 37 | judgment calls about cleaning and formatting the data. 38 | 39 | A well-structured suite of data tests can serve as a template to guide 40 | the data-wrangling process. Using a quick edit-test cycle, users can: 41 | 42 | 1. focus on a failing test 43 | 2. make a change to the data or the test 44 | 3. re-run the suite to check that the test now passes 45 | 4. then, move on to the next failing test 46 | 47 | The work of cleaning and formatting data takes place outside of the 48 | datatest package itself. Users can work with the tools they find 49 | the most productive (Excel, `pandas `_, R, 50 | sed, etc.). 51 | 52 | 53 | .. rubric:: Footnotes 54 | 55 | .. [#f1] "Data scientists, according to interviews and expert estimates, spend 56 | from 50 percent to 80 percent of their time mired in this more mundane 57 | labor of collecting and preparing unruly digital data..." Steve Lohraug 58 | in *For Big-Data Scientists, 'Janitor Work' Is Key Hurdle to Insights*.
# -*- coding: utf-8 -*-
import warnings
from .._utils import exhaustible
from .._utils import seekable
from .._utils import file_types
from .get_reader import get_reader
from .temptable import load_data
from .temptable import savepoint


# Encoding tried first when the caller gives none, and the encoding(s)
# tried next when the preferred one fails to decode the file.
preferred_encoding = 'utf-8'
fallback_encoding = ['latin-1']


def load_csv(cursor, table, csvfile, encoding=None, **kwds):
    """Load *csvfile* and insert data into *table*.

    :param cursor: open database cursor used for the inserts
    :param table: name of the target table
    :param csvfile: path or file-like object containing CSV data
    :param encoding: text encoding; when omitted, ``preferred_encoding``
        is tried first and then each entry of ``fallback_encoding``
    :param kwds: passed through to the CSV reader (``restval`` also
        supplies the default value for short rows)
    :raises UnicodeDecodeError: when decoding fails and fallback
        recovery is impossible or also fails

    Each load attempt runs inside a ``savepoint()`` context--presumably
    rolling back partial inserts on failure (see temptable.savepoint;
    confirm against that module).
    """
    global preferred_encoding
    global fallback_encoding

    default = kwds.get('restval', '')  # Used for default column value.

    if encoding:
        # When an encoding is specified, use it to load *csvfile* or
        # fail if there are errors (no fallback recovery):
        with savepoint(cursor):
            reader = get_reader.from_csv(csvfile, encoding, **kwds)
            load_data(cursor, table, reader, default=default)

        return  # <- EXIT!

    # When the encoding is unspecified, try to load *csvfile* using the
    # preferred encoding and failing that, try the fallback encodings:

    if isinstance(csvfile, file_types) and seekable(csvfile):
        position = csvfile.tell()  # Get current position if
    else:                          # csvfile is file-like and
        position = None            # supports random access.

    try:
        with savepoint(cursor):
            reader = get_reader.from_csv(csvfile, preferred_encoding, **kwds)
            load_data(cursor, table, reader, default=default)

        return  # <- EXIT!

    except UnicodeDecodeError as orig_error:
        # A non-seekable, exhaustible source (e.g. a pipe) was partly
        # consumed by the failed attempt--no retry is possible.
        if exhaustible(csvfile) and position is None:
            encoding, object_, start, end, reason = orig_error.args  # Unpack args.
            reason = (
                '{0}: unable to load {1!r}, cannot attempt fallback with '
                '{2!r} type: must specify an appropriate text encoding'
            ).format(reason, csvfile, csvfile.__class__.__name__)
            raise UnicodeDecodeError(encoding, object_, start, end, reason)

        # Accept a single fallback encoding or a list of them.
        if isinstance(fallback_encoding, list):
            fallback_list = fallback_encoding
        else:
            fallback_list = [fallback_encoding]

        for fallback in fallback_list:
            if position is not None:
                csvfile.seek(position)  # Rewind for the retry.

            try:
                with savepoint(cursor):
                    reader = get_reader.from_csv(csvfile, fallback, **kwds)
                    load_data(cursor, table, reader, default=default)

                # Loaded, but only via a fallback--warn so the caller
                # can pin an explicit encoding.
                msg = (
                    '{0}: loaded {1!r} using fallback {2!r}: specify an '
                    'appropriate text encoding to assure correct operation'
                ).format(orig_error, csvfile, fallback)
                warnings.warn(msg)

                return  # <- EXIT!

            except UnicodeDecodeError:
                pass

        # All fallbacks failed--re-raise with a fuller explanation.
        # Note: DO NOT refactor this section using a for-else. I swear...
        encoding, object_, start, end, reason = orig_error.args  # Unpack args.
        reason = (
            '{0}: unable to load {1!r}, fallback recovery unsuccessful: '
            'must specify an appropriate text encoding'
        ).format(reason, csvfile)
        raise UnicodeDecodeError(encoding, object_, start, end, reason)
########################################################################
# Test with DataFrame with no specified index (using default indexing).
########################################################################
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSource(OtherTests, unittest.TestCase):
    # Runs the shared OtherTests suite against a PandasSource wrapping
    # a DataFrame that keeps pandas' default index.
    def setUp(self):
        df = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        self.datasource = PandasSource(df)


@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceCount(CountTests, unittest.TestCase):
    # Runs the shared CountTests suite against the same default-index
    # setup as above.
    def setUp(self):
        df = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        self.datasource = PandasSource(df)


########################################################################
# Test with DataFrame that has a specified index.
########################################################################
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceWithIndex(OtherTests, unittest.TestCase):
    def setUp(self):
        df = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        df = df.set_index(['label1', 'label2'])  # <- Specify index!
        self.datasource = PandasSource(df)


@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceWithIndexCount(CountTests, unittest.TestCase):
    def setUp(self):
        df = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        df = df.set_index(['label1', 'label2'])  # <- Specify index!
        self.datasource = PandasSource(df)

    def test_compound_keys(self):
        # Counts of 'label1' grouped by the two-column compound key.
        expected = {
            ('a', 'x'): 2,
            ('a', 'y'): 1,
            ('a', ''): 1,
            ('b', 'z'): 1,
            ('b', 'y'): 1,
            ('b', 'x'): 1,
            #('b', None): 1,
            # NOTE(review): pandas.np was removed in pandas 2.0; this
            # legacy test presumably only runs against older pandas--
            # confirm the version guard above covers new releases.
            ('b', pandas.np.nan): 1,  # <- Returns nan instead of None (and that's OK!).
            ('b', ''): 1,
        }
        result = self.datasource.count('label1', ['label1', 'label2'])
        self.assertEqual(expected, result)
code-block:: none 46 | 47 | Traceback (most recent call last): 48 | File "example.py", line 17, in 49 | validate(data, strftime_format('%Y-%m-%d')) 50 | datatest.ValidationError: should use date format %Y-%m-%d (2 differences): [ 51 | Invalid('03-17-2021'), 52 | Invalid('2021-02-29'), 53 | ] 54 | 55 | Above, the date ``03-17-2021`` is invalid because it's not well-formed 56 | and ``2021-02-29`` is invalid because 2021 is not a leap-year so the last 57 | day of February is the 28th---there is no February 29th in that calendar 58 | year. 59 | 60 | 61 | Strftime Codes for Common Formats 62 | ================================= 63 | 64 | You can use the following **format codes** with the function 65 | defined earlier to validate many common date and time formats 66 | (e.g., ``strftime_format('%d %B %Y')``): 67 | 68 | ======================== ========================= ======================== 69 | format codes description example 70 | ======================== ========================= ======================== 71 | ``%Y-%m-%d`` YYYY-MM-DD 2021-03-17 72 | ``%m/%d/%Y`` MM/DD/YYYY 3/17/2021 73 | ``%d/%m/%Y`` DD/MM/YYYY 17/03/2021 74 | ``%d.%m.%Y`` DD.MM.YYYY 17.03.2021 75 | ``%d %B %Y`` DD Month YYYY 17 March 2021 76 | ``%b %d, %Y`` Mnth DD, YYYY Mar 17, 2021 77 | ``%a %b %d %H:%M:%S %Y`` WkDay Mnth DD H:M:S YYYY Wed Mar 17 19:42:50 2021 78 | ``%I:%M %p`` 12-hour time 7:42 PM [1]_ 79 | ``%H:%M:%S`` 24-hour time with seconds 19:42:50 80 | ======================== ========================= ======================== 81 | 82 | In Python's :py:mod:`datetime` module, see `strftime() and strptime() Format Codes`_ 83 | for all supported codes. 84 | 85 | .. _`strftime codes`: https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes 86 | .. _`strftime() and strptime() Format Codes`: https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes 87 | 88 | 89 | .. rubric:: Footnotes 90 | 91 | .. 
class TestDataTestResult(unittest.TestCase):
    """Tests for the DataTestResult runner-support class."""

    def test_is_mandatory(self):
        """_is_mandatory() should be True for test cases carrying the
        ``__datatest_mandatory__`` marker and False otherwise.
        """
        testresult = DataTestResult()

        class _TestClass(DataTestCase):  # Dummy class.
            def test_method(_self):
                pass

            def runTest(_self):
                pass

        # Not mandatory.
        testcase = _TestClass()
        self.assertFalse(testresult._is_mandatory(testcase))

        # Mandatory class.
        testcase = _TestClass()
        testcase.__datatest_mandatory__ = True
        self.assertTrue(testresult._is_mandatory(testcase))

        # Mandatory method.
        #TODO!!!: Need to make this test.

        # Check non-test-case behavior.
        not_a_testcase = object()
        self.assertFalse(testresult._is_mandatory(not_a_testcase))

    def test_add_mandatory_message(self):
        """_add_mandatory_message() should fold the stop-early notice
        into the error held inside an (exc_type, exc_value, tb) tuple.
        """
        testresult = DataTestResult()

        err_tuple = (ValidationError,
                     ValidationError([Missing('x')], 'example failure'),
                     '')

        new_tuple = testresult._add_mandatory_message(err_tuple)
        _, err, _ = new_tuple
        self.assertRegex(str(err), 'mandatory test failed, stopping early')
55 | class SampleCase(unittest.TestCase): 56 | def test_reference(self): # <- This line number used as reference. 57 | pass # +1 58 | # +2 59 | @unittest.skip('Testing skip behavior.') # +3 (first check) 60 | def test_skipped(self): # +4 61 | pass # +5 62 | # +6 63 | @mandatory # +7 (second check) 64 | def test_mandatory(self): # +8 65 | pass # +9 66 | 67 | # Get line number of undecorated method--this is uses as a 68 | # reference point from which to determine the required line 69 | # numbers for the decorated methods. 70 | reference_case = SampleCase('test_reference') 71 | _, reference_line_no = _sort_key(reference_case) 72 | 73 | # Starting in Python 3.3, the @functools.wraps() decorator 74 | # added a greatly needed `__wrapped__` attribute that points 75 | # to the original wrapped object. After @unittest.skip() is 76 | # applied, this attribute is needed to get the line number 77 | # of the original object (instead of the line number of the 78 | # decorator). 79 | if sys.version_info >= (3, 3): 80 | # Test line number of skipped method. 81 | skipped_case = SampleCase('test_skipped') 82 | skipped_line_no = reference_line_no + 3 83 | _, line_no = _sort_key(skipped_case) 84 | self.assertEqual(skipped_line_no, line_no) 85 | 86 | # Test line number of mandatory method. 87 | mandatory_case = SampleCase('test_mandatory') 88 | mandatory_line_no = reference_line_no + 7 89 | _, line_no = _sort_key(mandatory_case) 90 | self.assertEqual(mandatory_line_no, line_no) 91 | -------------------------------------------------------------------------------- /datatest/_compatibility/builtins.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for built-in functions""" 2 | from __future__ import absolute_import 3 | 4 | 5 | try: 6 | chr = unichr 7 | except NameError: 8 | pass 9 | 10 | 11 | from io import open as _open 12 | if open == _open: # Starting in 3.1 13 | open = open # <- Declare in local namespace. 
# Compatibility shim: Python 3.3 added the `__isabstractmethod__`
# attribute to `property`. On older versions, provide a subclass that
# computes the flag itself.
try:
    property.__isabstractmethod__  # New in 3.3.
    property = property  # <- Declare in local namespace.
except AttributeError:
    _property = property

    class property(_property):
        """Backport of 3.3+ `property` abstractness reporting.

        A property is abstract when *any* of its accessor functions is
        marked abstract -- CPython checks fget, fset, and fdel, not only
        the getter, so the backport does the same.
        """
        def __init__(self, fget=None, fset=None, fdel=None, doc=None):
            super(property, self).__init__(fget, fset, fdel, doc)
            # Fixed: previously only `fget` was inspected, which
            # mis-reported properties whose setter/deleter is abstract.
            self.__isabstractmethod__ = any(
                getattr(accessor, '__isabstractmethod__', False)
                for accessor in (fget, fset, fdel)
            )
from itertools import chain as _chain

_max = max
_min = min

def _find_extreme(selector, func_name, args, kwds):
    """Shared implementation for the max()/min() fallbacks below.

    *selector* is the real built-in (max or min), *func_name* is used in
    error messages, *args* is the positional-argument tuple, and *kwds*
    may contain 'default' and/or 'key'.  Previously this logic was
    duplicated verbatim in both fallbacks.
    """
    # Reject keyword arguments the built-ins do not accept.
    for key in kwds:
        if key not in ('default', 'key'):
            msg = "'{0}' is an invalid keyword argument for this function"
            raise TypeError(msg.format(key))

    # A single positional argument is the iterable itself; multiple
    # positional arguments are compared directly (as the args tuple).
    if len(args) == 1:
        iterable = args[0]
    else:
        iterable = args

    try:
        first_item = next(iter(iterable))
        if iter(iterable) is iterable:
            # Exhaustible iterator--re-attach the item we just consumed
            # so nothing is lost.
            iterable = _chain([first_item], iterable)
    except StopIteration:
        # Empty input: return 'default' if given, else raise like the
        # built-in does.
        if 'default' not in kwds:
            raise ValueError('{0}() arg is an empty sequence'.format(func_name))
        return kwds['default']

    if 'key' in kwds:
        return selector(iterable, key=kwds['key'])
    return selector(iterable)


def max(*iterable, **kwds):
    """
    max(iterable, *[, default, key])
    max(arg1, arg2, *args, *[, key])

    Fallback supporting the 'default' keyword (new in 3.4).
    """
    return _find_extreme(_max, 'max', iterable, kwds)


def min(*iterable, **kwds):
    """
    min(iterable, *[, default, key])
    min(arg1, arg2, *args, *[, key])

    Fallback supporting the 'default' keyword (new in 3.4).
    """
    return _find_extreme(_min, 'min', iterable, kwds)
class DataTestProgram(_TestProgram):
    """Command-line test program that defaults to DataTestRunner.

    Subclasses unittest.TestProgram, adding an *ignore* flag (stored on
    the instance and forwarded to the runner by runTests()) and using
    DataTestRunner as the default *testRunner*.
    """
    def __init__(self, module='__main__', defaultTest=None, argv=None,
                 testRunner=DataTestRunner, testLoader=_defaultTestLoader,
                 exit=True, verbosity=1, failfast=None, catchbreak=None,
                 buffer=None, ignore=False):
        self.ignore = ignore  # Set before delegating to the base class.
        _TestProgram.__init__(self,
                              module=module,
                              defaultTest=defaultTest,
                              argv=argv,
                              testRunner=testRunner,
                              testLoader=testLoader,
                              exit=exit,
                              verbosity=verbosity,
                              failfast=failfast,
                              catchbreak=catchbreak,
                              buffer=buffer)

    def runTests(self):
        """Instantiate the runner and execute the loaded tests.

        Mirrors unittest.TestProgram.runTests() but passes only the
        keyword options this instance actually has, so it tolerates
        TestProgram attribute differences across Python versions.
        """
        try:
            if self.catchbreak and installHandler:
                installHandler()
        except AttributeError:
            pass  # does not have catchbreak attribute

        if self.testRunner is None:
            self.testRunner = DataTestRunner

        if isinstance(self.testRunner, type):
            try:
                # Forward only the options present on this instance.
                kwds = ['verbosity', 'failfast', 'buffer', 'warnings', 'ignore']
                kwds = [attr for attr in kwds if hasattr(self, attr)]
                kwds = dict((attr, getattr(self, attr)) for attr in kwds)
                testRunner = self.testRunner(**kwds)
            except TypeError:
                # Runner class rejected 'warnings'--retry without it.
                if 'warnings' in kwds:
                    del kwds['warnings']
                testRunner = self.testRunner(**kwds)
        else:
            # assumed to be a TestRunner instance
            testRunner = self.testRunner

        self.result = testRunner.run(self.test)
        if self.exit:
            _sys.exit(not self.result.wasSuccessful())
65 | def __init__(self, module='__main__', defaultTest=None, argv=None, 66 | testRunner=DataTestRunner, testLoader=_defaultTestLoader, 67 | exit=True, ignore=False): 68 | self.ignore = ignore 69 | _TestProgram.__init__(self, 70 | module=module, 71 | defaultTest=defaultTest, 72 | argv=argv, 73 | testRunner=testRunner, 74 | testLoader=testLoader, 75 | exit=exit) 76 | DataTestProgram.__init__ = __init__ 77 | 78 | elif _sys.version_info[:2] == (2, 6): # Patch runTests() for Python 2.6. 79 | def __init__(self, module='__main__', defaultTest=None, argv=None, 80 | testRunner=DataTestRunner, testLoader=_defaultTestLoader, 81 | exit=True, ignore=False): 82 | self.exit = exit # <- 2.6 does not handle exit argument. 83 | self.ignore = ignore 84 | _TestProgram.__init__(self, 85 | module=module, 86 | defaultTest=defaultTest, 87 | argv=argv, 88 | testRunner=testRunner, 89 | testLoader=testLoader) 90 | DataTestProgram.__init__ = __init__ 91 | 92 | 93 | main = DataTestProgram 94 | -------------------------------------------------------------------------------- /docs/how-to/fuzzy-matching.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to assert fuzzy matches. 6 | :keywords: approximate string, fuzzy matching, testing, datatest 7 | 8 | 9 | ############################# 10 | How to Validate Fuzzy Matches 11 | ############################# 12 | 13 | When comparing strings of text, it can sometimes be useful 14 | to check that values are similar instead of asserting that 15 | they are exactly the same. Datatest provides options for 16 | *approximate string matching* (also called "fuzzy 17 | matching"). 18 | 19 | When checking mappings or sequences of values, you can accept 20 | approximate matches with the :meth:`accepted.fuzzy` acceptance: 21 | 22 | .. tabs:: 23 | 24 | .. tab:: Using Acceptance 25 | 26 | .. 
code-block:: python 27 | :emphasize-lines: 19 28 | :linenos: 29 | 30 | from datatest import validate, accepted 31 | 32 | linked_record = { 33 | 'id165': 'Saint Louis', 34 | 'id382': 'Raliegh', 35 | 'id592': 'Austin', 36 | 'id720': 'Cincinatti', 37 | 'id826': 'Philadelphia', 38 | } 39 | 40 | master_record = { 41 | 'id165': 'St. Louis', 42 | 'id382': 'Raleigh', 43 | 'id592': 'Austin', 44 | 'id720': 'Cincinnati', 45 | 'id826': 'Philadelphia', 46 | } 47 | 48 | with accepted.fuzzy(cutoff=0.6): 49 | validate(linked_record, master_record) 50 | 51 | .. tab:: No Acceptance 52 | 53 | .. code-block:: python 54 | :linenos: 55 | 56 | from datatest import validate 57 | 58 | linked_record = { 59 | 'id165': 'Saint Louis', 60 | 'id382': 'Raliegh', 61 | 'id592': 'Austin', 62 | 'id720': 'Cincinatti', 63 | 'id826': 'Philadelphia', 64 | } 65 | 66 | master_record = { 67 | 'id165': 'St. Louis', 68 | 'id382': 'Raleigh', 69 | 'id592': 'Austin', 70 | 'id720': 'Cincinnati', 71 | 'id826': 'Philadelphia', 72 | } 73 | 74 | validate(linked_record, master_record) 75 | 76 | 77 | .. code-block:: none 78 | :emphasize-lines: 5-7 79 | 80 | Traceback (most recent call last): 81 | File "example.py", line 19, in 82 | validate(linked_record, master_record) 83 | datatest.ValidationError: does not satisfy mapping requirements (3 differences): { 84 | 'id165': Invalid('Saint Louis', expected='St. Louis'), 85 | 'id382': Invalid('Raliegh', expected='Raleigh'), 86 | 'id720': Invalid('Cincinatti', expected='Cincinnati'), 87 | } 88 | 89 | 90 | If variation is an inherent, natural feature of the data and 91 | does not necessarily represent a defect, it may be appropriate 92 | to use :meth:`validate.fuzzy` instead of the acceptance shown 93 | previously: 94 | 95 | .. 
if _version_info[:2] <= (2, 7):  # For version 2.7 and earlier.

    def update_wrapper(wrapper,
                       wrapped,
                       assigned=WRAPPER_ASSIGNMENTS,
                       updated=WRAPPER_UPDATES):
        """Backport of update_wrapper() that skips attributes missing
        from *wrapped* instead of raising AttributeError (the issue
        #3445 fix, which landed after Python 2.7).
        """
        for attr in assigned:
            try:                                # <- This try/except
                value = getattr(wrapped, attr)  #    fixes issue #3445
            except AttributeError:              #    in Python 2.7 and
                pass                            #    earlier.
            else:
                setattr(wrapper, attr, value)
        for attr in updated:
            getattr(wrapper, attr).update(getattr(wrapped, attr, {}))
        wrapper.__wrapped__ = wrapped  # Record the original callable.
        return wrapper


    def wraps(wrapped,
              assigned=WRAPPER_ASSIGNMENTS,
              updated=WRAPPER_UPDATES):
        """Decorator factory that applies the patched update_wrapper()
        above (same signature as functools.wraps)."""
        return partial(update_wrapper,  # <- Patched update_wrapper().
                       wrapped=wrapped,
                       assigned=assigned,
                       updated=updated)
    class partialmethod(object):
        """Descriptor that binds positional and keyword arguments to
        *func*, like functools.partial() but usable as a method
        definition inside a class (backport of functools.partialmethod,
        new in Python 3.4; adapted from the 3.6 Standard Library).
        """
        def __init__(self, func, *args, **keywords):
            if not callable(func) and not hasattr(func, "__get__"):
                raise TypeError("{!r} is not callable or a descriptor"
                                .format(func))

            if isinstance(func, partialmethod):
                # Flatten nested partialmethod objects by combining
                # their bound arguments with the new ones.
                self.func = func.func
                self.args = func.args + args
                self.keywords = func.keywords.copy()
                self.keywords.update(keywords)
            else:
                self.func = func
                self.args = args
                self.keywords = keywords

        def __repr__(self):
            args = ", ".join(map(repr, self.args))
            keywords = ", ".join("{}={!r}".format(k, v)
                                 for k, v in self.keywords.items())
            format_string = "{module}.{cls}({func}, {args}, {keywords})"
            return format_string.format(module=self.__class__.__module__,
                                        cls=self.__class__.__qualname__,
                                        func=self.func,
                                        args=args,
                                        keywords=keywords)

        def _make_unbound_method(self):
            # Build a plain function that prepends the bound arguments;
            # used by __get__() when self.func is not a descriptor.
            def _method(*args, **keywords):
                call_keywords = self.keywords.copy()
                call_keywords.update(keywords)
                #cls_or_self, *rest = args
                cls_or_self, rest = args[0], args[1:]  # (Py2-compatible unpacking.)
                call_args = (cls_or_self,) + self.args + tuple(rest)
                return self.func(*call_args, **call_keywords)
            _method.__isabstractmethod__ = self.__isabstractmethod__
            _method._partialmethod = self
            return _method

        def __get__(self, obj, cls):
            # If func is itself a descriptor (e.g. a plain function),
            # bind it first and wrap the bound result with partial().
            get = getattr(self.func, "__get__", None)
            result = None
            if get is not None:
                new_func = get(obj, cls)
                if new_func is not self.func:
                    result = partial(new_func, *self.args, **self.keywords)
                    try:
                        result.__self__ = new_func.__self__
                    except AttributeError:
                        pass
            if result is None:
                # Fall back to an unbound wrapper bound via the plain
                # function descriptor protocol.
                result = self._make_unbound_method().__get__(obj, cls)
            return result

        @property
        def __isabstractmethod__(self):
            # Mirror the abstractness of the wrapped callable.
            return getattr(self.func, "__isabstractmethod__", False)
#. Make final edits to ``CHANGELOG`` (double-check release date and version).
code-block:: console 49 | 50 | pip install -U twine wheel setuptools check-manifest 51 | 52 | #. Check the manifest against the project's root folder: 53 | 54 | .. code-block:: console 55 | 56 | check-manifest . 57 | 58 | #. Remove all existing files in the ``dist/`` folder. 59 | 60 | #. Build new distributions: 61 | 62 | .. code-block:: console 63 | 64 | python setup.py sdist bdist_wheel 65 | 66 | #. Upload distributions to TestPyPI: 67 | 68 | .. code-block:: console 69 | 70 | twine upload --repository testpypi dist/* 71 | 72 | #. View the package's web page on TestPyPI and verify that the information 73 | is correct for the "Project links" and "Meta" sections: 74 | 75 | * https://test.pypi.org/project/datatest 76 | 77 | If you are testing a pre-release version, make sure to use the URL returned 78 | by twine in the previous step (the default URL shows the latest *stable* 79 | version). 80 | 81 | #. Test the installation process from TestPyPI: 82 | 83 | .. code-block:: console 84 | 85 | python -m pip install --index-url https://test.pypi.org/simple/ datatest 86 | 87 | If you're testing a pre-release version, make sure to use the "pip install" 88 | command listed at the top of the project's TestPyPI page. 89 | 90 | #. Upload source and wheel distributions to PyPI: 91 | 92 | .. code-block:: console 93 | 94 | twine upload dist/* 95 | 96 | #. Double check PyPI project page and test installation from PyPI: 97 | 98 | .. code-block:: console 99 | 100 | python -m pip install datatest 101 | 102 | #. Add version tag to upstream repository (also used by readthedocs.org). 103 | 104 | #. Iterate the version number in the development repository to the next 105 | anticipated release and add a "dev" suffix (e.g., N.N.N.dev1). This 106 | version number should conform to the "Version scheme" section of PEP-440. 
107 | Make sure these changes are reflected in the following files: 108 | 109 | * ``datatest/__init__.py`` 110 | * ``docs/conf.py`` 111 | 112 | Commit these changes with a comment like the one below: 113 | 114 | Iterate version number to the next anticipated release. 115 | 116 | This is done so that installations made directly from the development 117 | repository and the "latest" docs are not confused with the just-published 118 | "stable" versions. 119 | 120 | #. Make sure the documentation reflects the new versions: 121 | 122 | * https://datatest.readthedocs.io/ (stable) 123 | * https://datatest.readthedocs.io/en/latest/ (latest) 124 | 125 | If the documentation was not automatically updated, you may need to 126 | login to https://readthedocs.org/ and start the build process manually. 127 | 128 | #. Publish update announcement to relevant mailing lists: 129 | 130 | * python-announce-list@python.org 131 | * testing-in-python@lists.idyll.org 132 | -------------------------------------------------------------------------------- /tests/test_utils_misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from datetime import timedelta 4 | from . import _unittest as unittest 5 | from datatest import _utils 6 | from datatest._utils import IterItems 7 | from datatest._utils import pretty_timedelta_repr 8 | 9 | 10 | class TestIterItems(unittest.TestCase): 11 | def test_type_error(self): 12 | regex = "expected iterable or mapping, got 'int'" 13 | with self.assertRaisesRegex(TypeError, regex): 14 | IterItems(123) 15 | 16 | def test_non_exhaustible(self): 17 | items_list = [('a', 1), ('b', 2)] # <- Non-exhaustible input. 
    def test_dictitems(self):
        """IterItems should wrap dict item views (and, on Python 2, the
        iterator from iteritems()) and behave as a one-shot iterator
        that is empty once consumed."""
        dic = {'a': 1}

        if hasattr(dic, 'iteritems'):  # <- Python 2
            dic_items = dic.iteritems()

            items = IterItems(dic_items)
            self.assertEqual(list(items), [('a', 1)])
            self.assertEqual(list(items), [], msg='already exhausted')

        dic_items = dic.items()

        items = IterItems(dic_items)
        self.assertEqual(list(items), [('a', 1)])
        self.assertEqual(list(items), [], msg='already exhausted')
class TestMakeSentinel(unittest.TestCase):
    """Tests for the _utils._make_token() token/sentinel factory."""

    def test_basic(self):
        """A token exposes the given class name, repr text, and
        docstring, and is truthy by default."""
        sentinel = _utils._make_token(
            'TheName', '', 'The docstring.'
        )
        self.assertEqual(sentinel.__class__.__name__, 'TheName')
        self.assertEqual(repr(sentinel), '')
        self.assertEqual(sentinel.__doc__, 'The docstring.')
        self.assertTrue(bool(sentinel))

    def test_falsy(self):
        """Passing truthy=False should make the token evaluate False."""
        token = _utils._make_token(
            'TheName', '', 'The docstring.', truthy=False
        )
        self.assertFalse(bool(token))
def _normalize_required(self, required, method, *args, **kwds):
    """Return *required* in a form suitable for comparison.

    When *required* is None, fall back to the case's ``referenceData``
    (the pre-0.7 attribute name).  When it is a BaseSource, call the
    named *method* on it with the given arguments and use the result.
    """
    if required is None:  # Fixed: identity check (was `== None`, which
                          # invokes __eq__ and can misfire on objects
                          # with rich comparison behavior).
        required = self.referenceData  # <- OLD NAME!
    if isinstance(required, datatest.BaseSource):
        fn = getattr(required, method)
        required = fn(*args, **kwds)
    return required
DataTestCase._normalize_required = _normalize_required
109 | return not_allowed 110 | return datatest.allow_iter(function, msg) 111 | DataTestCase.allowExtra = _allowExtra 112 | -------------------------------------------------------------------------------- /docs/how-to/sequences.rst: -------------------------------------------------------------------------------- 1 | 2 | .. py:currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to validate sequences. 6 | :keywords: datatest, sequences, order 7 | 8 | 9 | ######################### 10 | How to Validate Sequences 11 | ######################### 12 | 13 | 14 | Index Position 15 | ============== 16 | 17 | To check for a specific sequence, you can pass a list [1]_ as the 18 | *requirement* argument: 19 | 20 | .. code-block:: python 21 | :emphasize-lines: 4 22 | :linenos: 23 | 24 | from datatest import validate 25 | 26 | data = ['A', 'B', 'X', 'C', 'D'] 27 | requirement = ['A', 'B', 'C', 'D'] # <- a list 28 | validate(data, requirement) 29 | 30 | 31 | Elements in the *data* and *requirement* lists are compared by 32 | sequence position. The items at index position 0 are compared to 33 | each other, then items at index position 1 are compared to each 34 | other, and so on: 35 | 36 | .. math:: 37 | 38 | \begin{array}{cccc} 39 | \hline 40 | \textbf{index} & \textbf{data} & \textbf{requirement} & \textbf{result} \\ 41 | \hline 42 | 0 & \textbf{A} & \textbf{A} & \textrm{matches} \\ 43 | 1 & \textbf{B} & \textbf{B} & \textrm{matches} \\ 44 | 2 & \textbf{X} & \textbf{C} & \textrm{doesn't match} \\ 45 | 3 & \textbf{C} & \textbf{D} & \textrm{doesn't match} \\ 46 | 4 & \textbf{D} & no\;value & \textrm{doesn't match} \\ 47 | \hline 48 | \end{array} 49 | 50 | 51 | In this example, there are three differences: 52 | 53 | .. 
When comparing elements by sequence position, one mis-alignment can
create differences for all following elements. If this behavior is
not desirable, you may want to check for *relative order* instead.
math:: 119 | 120 | \begin{array}{cccc} 121 | \hline 122 | \textbf{index} & \textbf{data} & \textbf{requirement} & \textbf{result} \\ 123 | \hline 124 | 0 & \textbf{A} & \textbf{A} & \textrm{matches} \\ 125 | 1 & \textbf{B} & \textbf{B} & \textrm{matches} \\ 126 | 2 & \textbf{X} & no\;value & \textrm{doesn't match} \\ 127 | 3 & \textbf{C} & \textbf{C} & \textrm{matches} \\ 128 | 4 & \textbf{D} & \textbf{D} & \textrm{matches} \\ 129 | \hline 130 | \end{array} 131 | 132 | Differences are reported as two-tuples containing the index (in *data*) 133 | where the difference occurs and the non-matching value. In the earlier 134 | examples, we saw that validating by index position produced three 135 | differences. But in this example, validating the same sequences by 136 | relative order produces only one difference: 137 | 138 | .. code-block:: none 139 | 140 | ValidationError: does not match required order (1 difference): [ 141 | Extra((2, 'X')), 142 | ] 143 | 144 | 145 | .. rubric:: Footnotes 146 | 147 | .. [1] The validate() function will check *data* by index position when the 148 | *requirement* is any iterable object other than a set, mapping, tuple 149 | or string. See the :ref:`Sequence Validation ` 150 | section of the :func:`validate` documentation for full details. 151 | -------------------------------------------------------------------------------- /tests/past_api00.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test backwards compatibility with pre-release API. 3 | 4 | .. note:: Because this sub-module works by monkey-patching the global 5 | ``datatest`` package, these tests should be run in a separate 6 | process. 7 | """ 8 | from . import _unittest as unittest 9 | 10 | import datatest 11 | from datatest.__past__ import api00 # <- MONKEY PATCH!!! 
class TestAttributes(unittest.TestCase):
    def test_api_dev0(self):
        """The monkey-patched package must expose the pre-release API."""
        # Error class.
        self.assertTrue(hasattr(datatest, 'DataAssertionError'))

        case_attrs = [
            # Data source properties.
            'subjectData',
            'referenceData',
            # Acceptance context managers.
            'allowSpecified',
            'allowUnspecified',
            'allowDeviationPercent',
            # Assert methods.
            'assertColumnSet',
            'assertColumnSubset',
            'assertColumnSuperset',
            'assertValueSet',
            'assertValueSubset',
            'assertValueSuperset',
            'assertValueSum',
            'assertValueCount',
            'assertValueRegex',
            'assertValueNotRegex',
        ]
        for attr in case_attrs:
            self.assertTrue(hasattr(datatest.DataTestCase, attr))
55 | 56 | def test_is_superset(self): 57 | regex = "different column names:\n xExtra\(u?'value'\)" 58 | with self.assertRaisesRegex(DataError, regex): 59 | self.assertColumnSubset(ref=['label1']) 60 | 61 | 62 | class TestColumnSuperset(datatest.DataTestCase): 63 | def setUp(self): 64 | self.subjectData = MinimalSource(data=[['a', '65'], ['b', '70']], 65 | fieldnames=['label1', 'value']) 66 | 67 | def test_is_same(self): 68 | self.assertColumnSuperset(ref=['label1', 'value']) # Should pass without error. 69 | 70 | def test_is_superset(self): 71 | self.assertColumnSuperset(ref=['label1']) # Should pass without error. 72 | 73 | def test_is_subset(self): 74 | regex = "different column names:\n xMissing\(u?'label2'\)" 75 | with self.assertRaisesRegex(DataError, regex): 76 | self.assertColumnSuperset(ref=['label1', 'label2', 'value']) 77 | 78 | 79 | class TestValueSubset(DataTestCase): 80 | def setUp(self): 81 | self.subjectData = MinimalSource(data=[['a'], ['b'], ['c']], 82 | fieldnames=['label']) 83 | 84 | def test_is_same(self): 85 | self.assertValueSubset('label', ref=['a', 'b', 'c']) # Should pass without error. 86 | 87 | def test_is_subset(self): 88 | self.assertValueSubset('label', ref=['a', 'b', 'c', 'd']) # Should pass without error. 89 | 90 | def test_is_superset(self): 91 | regex = "different 'label' values:\n xExtra\(u?'c'\)" 92 | with self.assertRaisesRegex(DataError, regex): 93 | self.assertValueSubset('label', ref=['a', 'b']) 94 | 95 | 96 | class TestValueSuperset(DataTestCase): 97 | def setUp(self): 98 | self.subjectData = MinimalSource(data=[['a'], ['b'], ['c']], 99 | fieldnames=['label']) 100 | 101 | def test_is_same(self): 102 | self.assertValueSuperset('label', ref=['a', 'b', 'c']) # Should pass without error. 103 | 104 | def test_is_superset(self): 105 | self.assertValueSuperset('label', ref=['a', 'b']) # Should pass without error. 
106 | 107 | def test_is_subset(self): 108 | regex = "different 'label' values:\n xMissing\(u?'d'\)" 109 | with self.assertRaisesRegex(DataError, regex): 110 | self.assertValueSuperset('label', ref=['a', 'b', 'c', 'd']) 111 | 112 | 113 | if __name__ == '__main__': 114 | unittest.main() 115 | else: 116 | raise Exception('This test must be run directly or as a subprocess.') 117 | -------------------------------------------------------------------------------- /docs/reference/unittest-support.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: datatest API for unittest-style testing 6 | :keywords: datatest, unittest, data-wrangling 7 | 8 | 9 | ################ 10 | Unittest Support 11 | ################ 12 | 13 | Datatest can be used together with the :mod:`unittest` package 14 | from the Python Standard Library. For a quick introduction, see: 15 | 16 | * :ref:`Automated Data Testing: Unittest ` 17 | * :ref:`Unittest Samples ` 18 | 19 | 20 | .. _datatestcase-docs: 21 | 22 | ************ 23 | DataTestCase 24 | ************ 25 | 26 | .. autoclass:: DataTestCase 27 | 28 | **VALIDATION METHODS** 29 | 30 | The assertion methods wrap :func:`validate` and its methods: 31 | 32 | .. code-block:: python 33 | :emphasize-lines: 7 34 | 35 | from datatest import DataTestCase 36 | 37 | class MyTest(DataTestCase): 38 | def test_mydata(self): 39 | data = ... 40 | requirement = ... 41 | self.assertValid(data, requirement) 42 | 43 | .. automethod:: assertValid 44 | 45 | .. automethod:: assertValidPredicate 46 | 47 | .. automethod:: assertValidRegex 48 | 49 | .. automethod:: assertValidApprox 50 | 51 | .. automethod:: assertValidFuzzy 52 | 53 | .. automethod:: assertValidInterval 54 | 55 | .. automethod:: assertValidSet 56 | 57 | .. automethod:: assertValidSubset 58 | 59 | .. automethod:: assertValidSuperset 60 | 61 | .. automethod:: assertValidUnique 62 | 63 | .. 
automethod:: assertValidOrder 64 | 65 | **ACCEPTANCE METHODS** 66 | 67 | The acceptance methods wrap :func:`accepted` and its methods: 68 | 69 | .. code-block:: python 70 | :emphasize-lines: 7 71 | 72 | from datatest import DataTestCase 73 | 74 | class MyTest(DataTestCase): 75 | def test_mydata(self): 76 | data = ... 77 | requirement = ... 78 | with self.accepted(Missing): 79 | self.assertValid(data, requirement) 80 | 81 | .. automethod:: accepted 82 | 83 | .. automethod:: acceptedKeys 84 | 85 | .. automethod:: acceptedArgs 86 | 87 | .. method:: acceptedTolerance(tolerance, /, msg=None) 88 | acceptedTolerance(lower, upper, msg=None) 89 | 90 | Wrapper for :meth:`accepted.tolerance`. 91 | 92 | .. method:: acceptedPercent(tolerance, /, msg=None) 93 | acceptedPercent(lower, upper, msg=None) 94 | 95 | Wrapper for :meth:`accepted.percent`. 96 | 97 | .. automethod:: acceptedFuzzy 98 | 99 | .. automethod:: acceptedCount 100 | 101 | 102 | .. _unittest-style-invocation: 103 | 104 | ********************** 105 | Command-Line Interface 106 | ********************** 107 | 108 | The datatest module can be used from the command line just like 109 | unittest. To run the program with `test discovery 110 | `_ 111 | use the following command:: 112 | 113 | python -m datatest 114 | 115 | Run tests from specific modules, classes, or individual methods with:: 116 | 117 | python -m datatest test_module1 test_module2 118 | python -m datatest test_module.TestClass 119 | python -m datatest test_module.TestClass.test_method 120 | 121 | The syntax and command-line options (``-f``, ``-v``, etc.) are the 122 | same as unittest---see unittest's `command-line documentation 123 | `_ 124 | for full details. 125 | 126 | .. note:: 127 | 128 | Tests are ordered by **file name** and then by **line number** 129 | (within each file) when running datatest from the command-line. 130 | 131 | .. 
132 | Unlike strict unit testing, data preparation tests are often 133 | dependent on one another---this strict order-by-line-number 134 | behavior lets users design test suites appropriately. 135 | For example, asserting the population of a city will always 136 | fail when the 'city' column is missing. So it's appropriate 137 | to validate column names *before* validating the contents of 138 | each column. 139 | 140 | 141 | ******************* 142 | Test Runner Program 143 | ******************* 144 | 145 | .. py:decorator:: mandatory 146 | 147 | A decorator to mark whole test cases or individual methods as 148 | mandatory. If a mandatory test fails, DataTestRunner will stop 149 | immediately (this is similar to the ``--failfast`` command line 150 | argument behavior):: 151 | 152 | @datatest.mandatory 153 | class TestFileFormat(datatest.DataTestCase): 154 | def test_columns(self): 155 | ... 156 | 157 | .. autoclass:: DataTestRunner 158 | :members: 159 | :inherited-members: 160 | 161 | .. autoclass:: DataTestProgram(module='__main__', defaultTest=None, argv=None, testRunner=datatest.DataTestRunner, testLoader=unittest.TestLoader, exit=True, verbosity=1, failfast=None, catchbreak=None, buffer=None, warnings=None) 162 | :members: 163 | :inherited-members: 164 | 165 | | 166 | 167 | .. autoclass:: main 168 | :members: 169 | :inherited-members: 170 | -------------------------------------------------------------------------------- /docs/how-to/phone-numbers.rst: -------------------------------------------------------------------------------- 1 | 2 | .. py:currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to assert telephone number formats. 6 | :keywords: datatest, phone format, validate phone number 7 | 8 | 9 | ############################# 10 | How to Validate Phone Numbers 11 | ############################# 12 | 13 | To check that phone numbers are well-formed, you can use a regular 14 | expression.
15 | 16 | 17 | USA and Canada 18 | ============== 19 | 20 | 21 | .. code-block:: python 22 | 23 | from datatest import validate 24 | 25 | pattern = r'^\(\d{3}\)[ ]\d{3}-\d{4}$' 26 | 27 | data = [ 28 | '(914) 232-9901', 29 | '(914) 737-9938', 30 | '(213) 888-7636', 31 | '(202) 965-2900', 32 | '(858) 651-5050', 33 | ] 34 | 35 | validate.regex(data, pattern, msg='must use phone number format') 36 | 37 | 38 | For other common US and Canadian formats, you can use the regex 39 | patterns: 40 | 41 | .. table:: 42 | :widths: auto 43 | 44 | +-------------------------------+-------------------+ 45 | | pattern | examples | 46 | +===============================+===================+ 47 | | ``^\(\d{3}\)[ ]\d{3}-\d{4}$`` | \(914) 232-9901 | 48 | +-------------------------------+-------------------+ 49 | | ``^\d{3}-\d{3}-\d{4}$`` | 914-232-9901 | 50 | +-------------------------------+-------------------+ 51 | | ``^\+?1-\d{3}-\d{3}-\d{4}$`` | 1-914-232-9901 | 52 | | +-------------------+ 53 | | | +1-914-232-9901 | 54 | +-------------------------------+-------------------+ 55 | 56 | 57 | .. 58 | THESE PHONE NUMBER PATTERNS ARE INCOMPLETE 59 | 60 | China 61 | ===== 62 | 63 | .. code-block:: python 64 | 65 | from datatest import validate 66 | 67 | pattern = r'^\d{3}[ ]\d{3,4}[ ]\d{4}$' 68 | 69 | data = [ 70 | '074 7284 5586', 71 | '400 669 5539', 72 | ] 73 | 74 | validate.regex(data, pattern, msg='must use phone number format') 75 | 76 | 77 | For common variants, you can use the following patterns: 78 | 79 | .. 
table:: 80 | :widths: auto 81 | 82 | +--------------------------------------+-------------------+ 83 | | ``^\d{3}[ ]\d{3,4}[ ]\d{4}$`` | 074 7284 5586 | 84 | | +-------------------+ 85 | | | 400 669 5539 | 86 | +--------------------------------------+-------------------+ 87 | | ``^\+86[ ]\d{3}[ ]\d{3,4}[ ]\d{4}$`` | +86 074 7284 5586 | 88 | | +-------------------+ 89 | | | +86 400 669 5539 | 90 | +--------------------------------------+-------------------+ 91 | 92 | 93 | India 94 | ===== 95 | 96 | .. code-block:: python 97 | 98 | import re 99 | from datatest import validate 100 | 101 | 102 | indian_phone_format = re.compile(r'''^ 103 | (\+91[ ])? # Optional international code. 104 | (\(0\))? # Optional trunk prefix. 105 | # 10 digit codes with area & number splits. 106 | ( 107 | \d{10} # xxxxxxxxxx 108 | | \d{5}[ ]\d{5} # xxxxx xxxxx 109 | | \d{4}[ ]\d{6} # xxxx xxxxxx 110 | | \d{3}[ ]\d{7} # xxx xxxxxxx 111 | | \d{2}[ ]\d{8} # xx xxxxxxxx 112 | ) 113 | $''', re.VERBOSE) 114 | 115 | data = [ 116 | '+91 (0)99999 99999', 117 | '+91 99999 99999', 118 | '9999999999', 119 | '99999 99999', 120 | '9999 999999', 121 | '999 9999999', 122 | '99 99999999', 123 | ] 124 | 125 | validate(data, indian_phone_format, msg='must use phone number format') 126 | 127 | 128 | United Kingdom 129 | ============== 130 | 131 | .. 
code-block:: python 132 | 133 | import re 134 | from datatest import validate 135 | 136 | 137 | uk_phone_format = re.compile(r'''^( 138 | # 10 digit NSNs (leading zero doesn't count) 139 | \(01\d{2}[ ]\d{2}\d\)[ ]\d{2}[ ]\d{3} # (01xx xx) xx xxx 140 | | \(01\d{3}\)[ ]\d{3}[ ]\d{3} # (01xxx) xxx xxx 141 | | \(01\d{2}\)[ ]\d{3}[ ]\d{4} # (01xx) xxx xxxx 142 | | \(02\d\)[ ]\d{4}[ ]\d{4} # (02x) xxxx xxxx 143 | | 0\d{3}[ ]\d{3}[ ]\d{4} # 0xxx xxx xxxx 144 | | 0\d{2}[ ]\d{4}[ ]\d{4} # 0xx xxxx xxxx 145 | | 07\d{3}[ ]\d{3}[ ]\d{3} # 07xxx xxx xxx 146 | 147 | # 9 digit NSNs 148 | | \(0169[ ]77\)[ ]\d{4} # (0169 77) xxxx 149 | | \(01\d{3}\)[ ]\d{2}[ ]\d{3} # (01xxx) xx xxx 150 | | 0500[ ]\d{3}[ ]\d{3} # 0500 xxx xxx 151 | | 0800[ ]\d{3}[ ]\d{3} # 0800 xxx xxx 152 | )$''', re.VERBOSE) 153 | 154 | data = [ 155 | '(01257) 421 282', 156 | '(01736) 759 307', 157 | '(0169 77) 3452', 158 | '0116 319 5885', 159 | '0191 384 6777', 160 | '020 8399 0617', 161 | ] 162 | 163 | validate(data, uk_phone_format, msg='must use phone number format') 164 | 165 | 166 | .. 167 | TO ADD: 168 | Germany 169 | Japan 170 | France 171 | 172 | -------------------------------------------------------------------------------- /docs/how-to/excel-auto-formatting.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to prevent Excel from converting values. 6 | :keywords: datatest, excel, date conversion, scientific notation, leading zeros 7 | 8 | 9 | ####################################### 10 | How to Avoid Excel Automatic Formatting 11 | ####################################### 12 | 13 | When MS Excel opens CSV files (and many other tabular formats), 14 | its default behavior will reformat certain values as dates, 15 | strip leading zeros, convert long numbers into scientific 16 | notation, and more. There are many cases where these kinds 17 | of changes actually corrupt your data. 
18 | 19 | It is possible to control Excel's formatting behavior using its 20 | *Text Import Wizard*. But as long as other users can open and 21 | re-save your CSV files, there may be no good way to guarantee that 22 | someone else won't inadvertently corrupt your data with Excel's 23 | default auto-format behavior. In a situation like this, you can 24 | mitigate problems by avoiding values that Excel likes to auto-format. 25 | 26 | Using the :class:`Predicate` object below, you can check that values 27 | are "Excel safe" and receive a list of differences when values are 28 | vulnerable to inadvertent auto-formatting: 29 | 30 | .. code-block:: python 31 | :emphasize-lines: 44 32 | :linenos: 33 | 34 | import re 35 | from datatest import validate, Predicate 36 | 37 | 38 | # Predicate to check that elements are not subject 39 | # to Excel auto-formatting. 40 | excel_safe = ~Predicate(re.compile(r'''^( 41 | # Date format character combinations. 42 | \d{1,2}-(?:\d{1,2}|\d{4}) 43 | | (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ \-]\d{1,2} 44 | | [01]?[0-9]-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) 45 | 46 | # Time conversions. 47 | | [01]?[0-9][ ]?(AM?|PM?) # Twelve-hour clock. 48 | | \d?\d[ ]*: # HH (hours). 49 | | \d?\d[ ]*(:[ ]*\d\d?){1,2} # HH:MM and HH:MM:SS 50 | 51 | # Numeric conversions. 52 | | 0\d+\.?\d* # Number with leading zeros. 53 | | \d*\.\d*0 # Decimal point with trailing zeros. 54 | | \d*\. # Trailing decimal point. 55 | | \d.?\d*E[+-]?\d+ # Scientific notation. 56 | | \d{16,} # Numbers of 16+ digits get approximated. 57 | 58 | # Whitespace normalization. 59 | | \s.* # Leading whitespace. 60 | | .*\s # Trailing whitespace. 61 | | .*\s\s.* # Irregular whitespace (new in Office 365). 62 | 63 | # Other conversions 64 | | =.+ # Spreadsheet formula. 
65 | 66 | )$''', re.VERBOSE | re.IGNORECASE), name='excel_safe') 67 | 68 | 69 | data = [ 70 | 'AOX-18', 71 | 'APR-23', 72 | 'DBB-01', 73 | 'DEC-20', 74 | 'DNZ-33', 75 | 'DVH-50', 76 | ] 77 | validate(data, excel_safe) 78 | 79 | In the example above, we use ``excel_safe`` as our *requirement*. 80 | The validation fails because our *data* contains two codes that 81 | Excel would auto-convert into date types: 82 | 83 | .. code-block:: none 84 | 85 | ValidationError: does not satisfy excel_safe() (2 differences): [ 86 | Invalid('APR-23'), 87 | Invalid('DEC-20'), 88 | ] 89 | 90 | 91 | Fixing the Data 92 | --------------- 93 | 94 | To address the failure, we need to change the values in *data* so 95 | they are no longer subject to Excel's auto-formatting behavior. 96 | There are a few ways to do this. 97 | 98 | We can prefix the failing values with apostrophes (``'APR-23`` 99 | and ``'DEC-20``). This causes Excel to treat them as text instead 100 | of dates or numbers: 101 | 102 | .. code-block:: python 103 | :emphasize-lines: 5,7 104 | :linenos: 105 | :lineno-start: 34 106 | 107 | ... 108 | 109 | data = [ 110 | "AOX-18", 111 | "'APR-23", 112 | "DBB-01", 113 | "'DEC-20", 114 | "DNZ-33", 115 | "DVH-50", 116 | ] 117 | validate(data, excel_safe) 118 | 119 | 120 | Another approach would be to change the formatting for all of 121 | the values. Below, the hyphens in *data* have been replaced with 122 | underscores (``_``): 123 | 124 | .. code-block:: python 125 | :emphasize-lines: 4-9 126 | :linenos: 127 | :lineno-start: 34 128 | 129 | ... 130 | 131 | data = [ 132 | 'AOX_18', 133 | 'APR_23', 134 | 'DBB_01', 135 | 'DEC_20', 136 | 'DNZ_33', 137 | 'DVH_50', 138 | ] 139 | validate(data, excel_safe) 140 | 141 | 142 | After making the needed changes, the validation will now pass without 143 | error. 144 | 145 | 146 | .. caution:: 147 | 148 | The ``excel_safe`` predicate implements a blacklist approach 149 | to detect values that Excel will automatically convert.
It is 150 | not guaranteed to catch everything and future versions of Excel 151 | could introduce new behaviors. If you discover auto-formatted 152 | values that are not handled by this helper function (or if you 153 | have an idea regarding a workable whitelist approach), please 154 | `file an issue`_ and we will try to improve it. 155 | 156 | 157 | .. _`file an issue`: https://github.com/shawnbrown/datatest/issues 158 | -------------------------------------------------------------------------------- /tests/past_api07_sources_sqlite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sqlite3 3 | from . import _unittest as unittest 4 | 5 | from .mixins import CountTests 6 | from .mixins import OtherTests 7 | 8 | from datatest.__past__.api07_sources import SqliteSource 9 | 10 | 11 | class TestSqliteSourceCount(CountTests, unittest.TestCase): 12 | def setUp(self): 13 | tablename = 'testtable' 14 | connection = sqlite3.connect(':memory:') 15 | cursor = connection.cursor() 16 | cursor.execute("CREATE TABLE testtable (label1, label2, value)") 17 | for values in self.testdata: 18 | cursor.execute("INSERT INTO testtable VALUES (?, ?, ?)", values) 19 | connection.commit() 20 | 21 | self.datasource = SqliteSource(connection, tablename) 22 | 23 | 24 | class TestSqliteSource(OtherTests, unittest.TestCase): 25 | def setUp(self): 26 | tablename = 'testtable' 27 | connection = sqlite3.connect(':memory:') 28 | cursor = connection.cursor() 29 | cursor.execute("CREATE TABLE testtable (label1, label2, value)") 30 | for values in self.testdata: 31 | cursor.execute("INSERT INTO testtable VALUES (?, ?, ?)", values) 32 | connection.commit() 33 | 34 | self.datasource = SqliteSource(connection, tablename) 35 | 36 | def test_where_clause(self): 37 | # No key-word args. 
38 | clause, params = SqliteSource._build_where_clause() 39 | self.assertEqual(clause, '') 40 | self.assertEqual(params, []) 41 | 42 | # Single condition (where label1 equals 'a'). 43 | clause, params = SqliteSource._build_where_clause(label1='a') 44 | self.assertEqual(clause, 'label1=?') 45 | self.assertEqual(params, ['a']) 46 | 47 | # Multiple conditions (where label1 equals 'a' AND label2 equals 'x'). 48 | clause, params = SqliteSource._build_where_clause(label1='a', label2='x') 49 | self.assertEqual(clause, 'label1=? AND label2=?') 50 | self.assertEqual(params, ['a', 'x']) 51 | 52 | # Compound condition (where label1 equals 'a' OR 'b'). 53 | clause, params = SqliteSource._build_where_clause(label1=('a', 'b')) 54 | self.assertEqual(clause, 'label1 IN (?, ?)') 55 | self.assertEqual(params, ['a', 'b']) 56 | 57 | # Mixed conditions (where label1 equals 'a' OR 'b' AND label2 equals 'x'). 58 | clause, params = SqliteSource._build_where_clause(label1=('a', 'b'), label2='x') 59 | self.assertEqual(clause, 'label1 IN (?, ?) AND label2=?') 60 | self.assertEqual(params, ['a', 'b', 'x']) 61 | 62 | def test_normalize_column(self): 63 | result = SqliteSource._normalize_column('foo') 64 | self.assertEqual('"foo"', result) 65 | 66 | result = SqliteSource._normalize_column('foo bar') 67 | self.assertEqual('"foo bar"', result) 68 | 69 | result = SqliteSource._normalize_column('foo "bar" baz') 70 | self.assertEqual('"foo ""bar"" baz"', result) 71 | 72 | def test_from_records(self): 73 | """Test from_records method (wrapper for TemporarySqliteTable class).""" 74 | # Test tuples. 75 | columns = ['foo', 'bar', 'baz'] 76 | data = [ 77 | ('a', 'x', '1'), 78 | ('b', 'y', '2'), 79 | ('c', 'z', '3'), 80 | ] 81 | source = SqliteSource.from_records(data, columns) 82 | 83 | expected = [ 84 | {'foo': 'a', 'bar': 'x', 'baz': '1'}, 85 | {'foo': 'b', 'bar': 'y', 'baz': '2'}, 86 | {'foo': 'c', 'bar': 'z', 'baz': '3'}, 87 | ] 88 | self.assertEqual(expected, list(source)) 89 | 90 | # Test dict. 
91 | columns = ['foo', 'bar', 'baz'] 92 | data = [ 93 | {'foo': 'a', 'bar': 'x', 'baz': '1'}, 94 | {'foo': 'b', 'bar': 'y', 'baz': '2'}, 95 | {'foo': 'c', 'bar': 'z', 'baz': '3'}, 96 | ] 97 | source = SqliteSource.from_records(data, columns) 98 | self.assertEqual(data, list(source)) 99 | 100 | # Test omitted *columns* argument. 101 | data_dict = [ 102 | {'foo': 'a', 'bar': 'x', 'baz': '1'}, 103 | {'foo': 'b', 'bar': 'y', 'baz': '2'}, 104 | {'foo': 'c', 'bar': 'z', 'baz': '3'}, 105 | ] 106 | source = SqliteSource.from_records(data_dict) 107 | self.assertEqual(data_dict, list(source)) 108 | 109 | def test_create_index(self): 110 | cursor = self.datasource._connection.cursor() 111 | 112 | # There should be no indexes initially. 113 | cursor.execute("PRAGMA INDEX_LIST('testtable')") 114 | self.assertEqual(cursor.fetchall(), []) 115 | 116 | # Add single-column index. 117 | self.datasource.create_index('label1') # <- CREATE INDEX! 118 | cursor.execute("PRAGMA INDEX_LIST('testtable')") 119 | results = [tup[1] for tup in cursor.fetchall()] 120 | self.assertEqual(results, ['idx_testtable_label1']) 121 | 122 | # Add multi-column index. 123 | self.datasource.create_index('label2', 'value') # <- CREATE INDEX! 124 | cursor.execute("PRAGMA INDEX_LIST('testtable')") 125 | results = sorted(tup[1] for tup in cursor.fetchall()) 126 | self.assertEqual(results, ['idx_testtable_label1', 'idx_testtable_label2_value']) 127 | 128 | # Duplicate of first, single-column index should have no effect. 129 | self.datasource.create_index('label1') # <- CREATE INDEX! 
130 | cursor.execute("PRAGMA INDEX_LIST('testtable')") 131 | results = sorted(tup[1] for tup in cursor.fetchall()) 132 | self.assertEqual(results, ['idx_testtable_label1', 'idx_testtable_label2_value']) 133 | -------------------------------------------------------------------------------- /datatest/_compatibility/decimal.py: -------------------------------------------------------------------------------- 1 | """compatibility layer for decimal (Python standard library)""" 2 | from __future__ import absolute_import 3 | from decimal import * 4 | 5 | 6 | try: 7 | Decimal.from_float # New in 2.7 8 | except AttributeError: 9 | import math as _math 10 | 11 | def _bit_length(integer): 12 | s = bin(integer) # binary representation: bin(-37) --> '-0b100101' 13 | s = s.lstrip('-0b') # remove leading zeros and minus sign 14 | return len(s) # len('100101') --> 6 15 | 16 | @classmethod 17 | def _from_float(cls, f): 18 | if isinstance(f, int): # handle integer inputs 19 | return cls(f) 20 | if not isinstance(f, float): 21 | raise TypeError("argument must be int or float.") 22 | if _math.isinf(f) or _math.isnan(f): 23 | return cls(repr(f)) 24 | if _math.copysign(1.0, f) == 1.0: 25 | sign = 0 26 | else: 27 | sign = 1 28 | n, d = abs(f).as_integer_ratio() 29 | #k = d.bit_length() - 1 30 | k = _bit_length(d) - 1 31 | result = _dec_from_triple(sign, str(n*5**k), -k) 32 | if cls is Decimal: 33 | return result 34 | else: 35 | return cls(result) 36 | 37 | Decimal.from_float = _from_float 38 | 39 | 40 | if Decimal('1.0') != 1.0: # Changed in Python 3.2 41 | 42 | import numbers as _numbers 43 | from decimal import _dec_from_triple 44 | 45 | 46 | class FloatOperation(DecimalException, TypeError): 47 | """Enable stricter semantics for mixing floats and Decimals.""" 48 | pass 49 | 50 | 51 | # Adapted from Python 3.1 standard library. 
# NOTE(review): in the original module this code is nested inside the
# ``if Decimal('1.0') != 1.0:`` guard (marked "Changed in Python 3.2"),
# i.e. it runs only on interpreters where Decimal does not compare
# equal to float, and backports the modern comparison semantics.

# Wrap Context.__init__ so every new Context carries a FloatOperation
# entry in its `traps` mapping (disabled by default, so mixed
# float/Decimal equality merely sets a flag instead of trapping).
_context_init_orig = Context.__init__
def _context_init_new(self, prec=None, rounding=None,
                      traps=None, flags=None,
                      Emin=None, Emax=None,
                      capitals=None, _clamp=0,
                      _ignored_flags=None):

    # Call original __init__.
    _context_init_orig(self, prec=prec, rounding=rounding, traps=traps,
                       flags=flags, Emin=Emin, Emax=Emax, capitals=capitals,
                       _clamp=_clamp, _ignored_flags=_ignored_flags)

    # Add FloatOperation to `traps` dict.
    self.traps[FloatOperation] = 0

Context.__init__ = _context_init_new


# Adapted from Python 3.4 standard library.
def _convert_for_comparison(self, other, equality_op=False):
    """Coerce *other* so it can be compared against the Decimal *self*.

    Returns a ``(self, other)`` pair of Decimals, or the pair
    ``(NotImplemented, NotImplemented)`` when *other* is an
    unsupported type.  Float operands record the FloatOperation
    signal: equality comparisons set the flag, while ordering
    comparisons (``equality_op=False``) raise the context error.
    """
    if isinstance(other, Decimal):
        return self, other
    if isinstance(other, _numbers.Rational):
        # Scale self by the rational's denominator so both sides can
        # be compared against integer numerators.
        if not self._is_special:
            self = _dec_from_triple(self._sign,
                                    str(int(self._int) * other.denominator),
                                    self._exp)
        return self, Decimal(other.numerator)
    if equality_op and isinstance(other, _numbers.Complex) and other.imag == 0:
        # A complex value with zero imaginary part may take part in
        # equality comparisons via its real component.
        other = other.real
    if isinstance(other, float):
        context = getcontext()
        if equality_op:
            context.flags[FloatOperation] = 1
        else:
            context._raise_error(FloatOperation,
                "strict semantics for mixing floats and Decimals are enabled")
        return self, Decimal.from_float(other)
    return NotImplemented, NotImplemented

def _eq(self, other, context=None):
    """Replacement Decimal.__eq__ (NaN-aware, accepts int/float/Rational)."""
    self, other = _convert_for_comparison(self, other, equality_op=True)
    if other is NotImplemented:
        return other
    if self._check_nans(other, context):
        return False  # Any NaN operand compares unequal.
    return self._cmp(other) == 0
Decimal.__eq__ = _eq

def _ne(self, other, context=None):
    """Replacement Decimal.__ne__ (NaN-aware, accepts int/float/Rational)."""
    self, other = _convert_for_comparison(self, other, equality_op=True)
    if other is NotImplemented:
        return other
    if self._check_nans(other, context):
        return True  # Any NaN operand compares unequal.
    return self._cmp(other) != 0
Decimal.__ne__ = _ne
109 | 110 | def _lt(self, other, context=None): 111 | self, other = _convert_for_comparison(self, other) 112 | if other is NotImplemented: 113 | return other 114 | ans = self._compare_check_nans(other, context) 115 | if ans: 116 | return False 117 | return self._cmp(other) < 0 118 | Decimal.__lt__ = _lt 119 | 120 | def _le(self, other, context=None): 121 | self, other = _convert_for_comparison(self, other) 122 | if other is NotImplemented: 123 | return other 124 | ans = self._compare_check_nans(other, context) 125 | if ans: 126 | return False 127 | return self._cmp(other) <= 0 128 | Decimal.__le__ = _le 129 | 130 | def _gt(self, other, context=None): 131 | self, other = _convert_for_comparison(self, other) 132 | if other is NotImplemented: 133 | return other 134 | ans = self._compare_check_nans(other, context) 135 | if ans: 136 | return False 137 | return self._cmp(other) > 0 138 | Decimal.__gt__ = _gt 139 | 140 | def _ge(self, other, context=None): 141 | self, other = _convert_for_comparison(self, other) 142 | if other is NotImplemented: 143 | return other 144 | ans = self._compare_check_nans(other, context) 145 | if ans: 146 | return False 147 | return self._cmp(other) >= 0 148 | Decimal.__ge__ = _ge 149 | -------------------------------------------------------------------------------- /docs/how-to/customize-differences.rst: -------------------------------------------------------------------------------- 1 | 2 | .. currentmodule:: datatest 3 | 4 | .. meta:: 5 | :description: How to customize error differences. 6 | :keywords: datatest, difference, customize 7 | 8 | 9 | ############################ 10 | How to Customize Differences 11 | ############################ 12 | 13 | When using a helper function for validation, datatest's default 14 | behavior is to produce :class:`Invalid` differences when the 15 | function returns False. But you can customize this behavior 16 | by returning a difference object instead of False. 
The returned 17 | difference is used in place of an automatically generated one. 18 | 19 | 20 | Default Behavior 21 | ================ 22 | 23 | In the following example, the helper function checks that text 24 | values are upper case and have no extra whitespace. If the values 25 | are good, the function returns ``True``; if the values are bad it 26 | returns ``False``: 27 | 28 | .. code-block:: python 29 | :linenos: 30 | :emphasize-lines: 6 31 | 32 | from datatest import validate 33 | 34 | 35 | def wellformed(x): # <- Helper function. 36 | """Must be uppercase and no extra whitespace.""" 37 | return x == ' '.join(x.split()) and x.isupper() 38 | 39 | data = [ 40 | 'CAPE GIRARDEAU', 41 | 'GREENE ', 42 | 'JACKSON', 43 | 'St. Louis', 44 | ] 45 | 46 | validate(data, wellformed) 47 | 48 | 49 | Each time the helper function returns ``False``, an :class:`Invalid` 50 | difference is created: 51 | 52 | .. code-block:: none 53 | :emphasize-lines: 5-6 54 | 55 | Traceback (most recent call last): 56 | File "example.py", line 15, in 57 | validate(data, wellformed) 58 | ValidationError: Must be uppercase and no extra whitespace. (2 differences): [ 59 | Invalid('GREENE '), 60 | Invalid('St. Louis'), 61 | ] 62 | 63 | 64 | Custom Differences 65 | ================== 66 | 67 | In this example, the helper function returns a custom ``BadWhitespace`` 68 | or ``NotUpperCase`` difference for each bad value: 69 | 70 | .. code-block:: python 71 | :linenos: 72 | :emphasize-lines: 15,17 73 | 74 | from datatest import validate, Invalid 75 | 76 | 77 | class BadWhitespace(Invalid): 78 | """For strings with leading, trailing, or irregular whitespace.""" 79 | 80 | 81 | class NotUpperCase(Invalid): 82 | """For strings that aren't upper case.""" 83 | 84 | 85 | def wellformed(x): # <- Helper function.
86 | """Must be uppercase and no extra whitespace.""" 87 | if x != ' '.join(x.split()): 88 | return BadWhitespace(x) 89 | if not x.isupper(): 90 | return NotUpperCase(x) 91 | return True 92 | 93 | 94 | data = [ 95 | 'CAPE GIRARDEAU', 96 | 'GREENE ', 97 | 'JACKSON', 98 | 'St. Louis', 99 | ] 100 | 101 | validate(data, wellformed) 102 | 103 | 104 | These differences are used in the ValidationError: 105 | 106 | .. code-block:: none 107 | :emphasize-lines: 5-6 108 | 109 | Traceback (most recent call last): 110 | File "example.py", line 15, in 111 | validate(data, wellformed) 112 | ValidationError: Must be uppercase and no extra whitespace. (2 differences): [ 113 | BadWhitespace('GREENE '), 114 | NotUpperCase('St. Louis'), 115 | ] 116 | 117 | 118 | .. caution:: 119 | 120 | Typically, you should try to **stick with existing differences** 121 | in your data tests. Only create a custom subclass when its meaning 122 | is evident and doing so helps your data preparation workflow. 123 | 124 | Don't add a custom class when it doesn't benefit your testing 125 | process. At best, you're doing extra work for no added benefit. 126 | And at worst, an ambiguous or needlessly complex subclass can 127 | cause more problems than it solves. 128 | 129 | If you need to resolve ambiguity in a validation, you can split 130 | the check into multiple calls. Below, we perform the same check 131 | demonstrated earlier using two :func:`validate` calls: 132 | 133 | .. code-block:: python 134 | :linenos: 135 | :emphasize-lines: 14,21 136 | 137 | from datatest import validate 138 | 139 | data = [ 140 | 'CAPE GIRARDEAU', 141 | 'GREENE ', 142 | 'JACKSON', 143 | 'St. Louis', 144 | ] 145 | 146 | def no_irregular_whitespace(x): # <- Helper function. 147 | """Must have no irregular whitespace.""" 148 | return x == ' '.join(x.split()) 149 | 150 | validate(data, no_irregular_whitespace) 151 | 152 | 153 | def is_upper_case(x): # <- Helper function.
154 | """Must be upper case.""" 155 | return x.isupper() 156 | 157 | validate(data, is_upper_case) 158 | 159 | 160 | .. 161 | # In the future, after adding a comparator interface to validate(), 162 | # possibly change the example to something like the following. 163 | 164 | from enum import Enum 165 | from datatest import validate, Invalid 166 | 167 | 168 | # Likert Scale 169 | class response(Enum): 170 | STRONGLY_OPPOSE = 1 171 | OPPOSE = 2 172 | NEUTRAL = 3 173 | SUPPORT = 4 174 | STRONGLY_SUPPORT = 5 175 | 176 | 177 | # 7-Point Likert Scale 178 | #class response(Enum): 179 | # STRONGLY_OPPOSE = 1 180 | # OPPOSE = 2 181 | # SOMEWHAT_OPPOSE = 3 182 | # NEUTRAL = 4 183 | # SOMEWHAT_SUPPORT = 5 184 | # SUPPORT = 6 185 | # STRONGLY_SUPPORT = 7 186 | 187 | 188 | class Change(Invalid): 189 | """For differences of 1 point.""" 190 | 191 | 192 | class LargeChange(Invalid): 193 | """For differences of 2 or more points.""" 194 | 195 | 196 | latest_survey = { 197 | 'a': response.SUPPORT, 198 | 'b': response.STRONGLY_OPPOSE, 199 | 'c': response.STRONGLY_SUPPORT, 200 | 'd': response.OPPOSE, 201 | } 202 | 203 | previous_survey = { 204 | 'a': response.SUPPORT, 205 | 'b': response.OPPOSE, 206 | 'c': response.STRONGLY_SUPPORT, 207 | 'd': response.SUPPORT, 208 | } 209 | 210 | validate(latest_survey, previous_survey) 211 | 212 | -------------------------------------------------------------------------------- /tests/past_api09_load_csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sqlite3 4 | import sys 5 | import warnings 6 | from . import _io as io 7 | from . 
# NOTE: These tests exercise load_csv() under both Python 2 and Python 3,
# covering explicit encodings and the implicit 'latin-1' fallback path.
from datatest._compatibility.builtins import *

from datatest._vendor.load_csv import load_csv

try:
    from StringIO import StringIO  # Python 2 only.
except ImportError:
    StringIO = None  # Python 3: sentinel meaning "StringIO tests don't apply".


class TestLoadCsv(unittest.TestCase):
    """Tests for load_csv(), which loads CSV data (given as a file
    path or a file-like stream) into a SQLite table via a cursor.
    """
    def setUp(self):
        # In-memory database; synchronous=OFF plus autocommit mode
        # (isolation_level=None) keeps each test fast.
        connection = sqlite3.connect(':memory:')
        connection.execute('PRAGMA synchronous=OFF')
        connection.isolation_level = None
        self.cursor = connection.cursor()

        # Run from the sample_files directory so tests can refer to
        # sample CSVs by bare filename; restored in tearDown().
        self.original_cwd = os.path.abspath(os.getcwd())
        os.chdir(os.path.join(os.path.dirname(__file__), 'sample_files'))

    def tearDown(self):              # It would be best to use addCleanup()
        os.chdir(self.original_cwd)  # but it is not available in Python 2.6.

    @staticmethod
    def get_stream(string, encoding=None):
        """Accepts a string and returns a file-like stream object.

        In Python 2, Unicode files should be opened in binary-mode
        but in Python 3, they should be opened in text-mode. This
        function emulates the appropriate opening behavior.
        """
        fh = io.BytesIO(string)
        if sys.version_info[0] == 2:
            return fh  # Python 2: binary-mode stream.
        return io.TextIOWrapper(fh, encoding=encoding)  # Python 3: text-mode.

    def test_encoding_with_stream(self):
        # The bytes below are Latin-1 encoded (not valid UTF-8).
        csvfile = self.get_stream((
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        ), encoding='latin-1')
        load_csv(self.cursor, 'testtable1', csvfile, encoding='latin-1')

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_encoding_with_file(self):
        path = 'sample_text_iso88591.csv'
        load_csv(self.cursor, 'testtable', path, encoding='latin-1')

        expected = [
            ('iso88591', chr(0xe6)),  # chr(0xe6) -> æ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable')
        self.assertEqual(list(self.cursor), expected)

    def test_encoding_mismatch(self):
        # Decoding a Latin-1 file as UTF-8 must fail loudly, not corrupt data.
        path = 'sample_text_iso88591.csv'
        wrong_encoding = 'utf-8'  # <- Doesn't match file.

        with self.assertRaises(UnicodeDecodeError):
            load_csv(self.cursor, 'testtable', path, wrong_encoding)

    def test_fallback_with_stream(self):
        with warnings.catch_warnings(record=True):  # Catch warnings issued
            csvfile = self.get_stream((             # when running Python 2.
                b'col1,col2\n'
                b'1,\xe6\n'  # '\xe6' -> æ (ash)
                b'2,\xf0\n'  # '\xf0' -> ð (eth)
                b'3,\xfe\n'  # '\xfe' -> þ (thorn)
            ), encoding='latin-1')
            load_csv(self.cursor, 'testtable1', csvfile)  # <- No encoding arg.

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_StringIO(self):
        if not StringIO:  # <- Python 2.x only.
            return

        # Python 2 StringIO accepts byte strings directly.
        csvfile = StringIO(
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        )

        with warnings.catch_warnings(record=True):
            load_csv(self.cursor, 'testtable1', csvfile)

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_file(self):
        with warnings.catch_warnings(record=True) as warning_list:
            warnings.simplefilter('always')
            path = 'sample_text_iso88591.csv'
            load_csv(self.cursor, 'testtable', path)  # <- No encoding arg.

        # Exactly one fallback warning must have been issued.
        self.assertEqual(len(warning_list), 1)
        expected = "using fallback 'latin-1'"
        self.assertIn(expected, str(warning_list[0].message))

        expected = [
            ('iso88591', chr(0xe6)),  # chr(0xe6) -> æ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_exhaustible_object(self):
        """Exhaustible iterators and unseekable file-like objects
        can only be iterated over once. This means that the usual
        fallback behavior can not be applied and the function must
        raise an exception.
        """
        if not sys.version_info[0] == 2:  # <- Python 2.x only.
            return

        csvfile = self.get_stream((
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        ), encoding='latin-1')
        generator = (x for x in csvfile)  # <- Make stream unseekable.

        with self.assertRaises(UnicodeDecodeError) as cm:
            load_csv(self.cursor, 'testtable', generator)

        error_message = str(cm.exception)
        self.assertIn('cannot attempt fallback', error_message.lower())
group-tab:: Unittest 49 | 50 | The :download:`test_movies_df_unit.py ` 51 | script demonstrates unittest-style tests: 52 | 53 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 54 | :language: python 55 | :lineno-match: 56 | 57 | 58 | You can run these tests, use the following command: 59 | 60 | .. tabs:: 61 | 62 | .. group-tab:: Pytest 63 | 64 | .. code-block:: none 65 | 66 | pytest test_movies_df.py 67 | 68 | .. group-tab:: Unittest 69 | 70 | .. code-block:: none 71 | 72 | python -m datatest test_movies_df_unit.py 73 | 74 | 75 | ======================== 76 | Step by Step Explanation 77 | ======================== 78 | 79 | 80 | 1. Define a test fixture 81 | ------------------------ 82 | 83 | Define a test fixture that loads the CSV file into a 84 | :class:`DataFrame `: 85 | 86 | .. tabs:: 87 | 88 | .. group-tab:: Pytest 89 | 90 | .. literalinclude:: /_static/tutorial/test_movies_df.py 91 | :pyobject: df 92 | :lineno-match: 93 | 94 | .. group-tab:: Unittest 95 | 96 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 97 | :pyobject: setUpModule 98 | :lineno-match: 99 | 100 | 101 | 2. Check column names 102 | --------------------- 103 | 104 | Check that the data includes the expected column names: 105 | 106 | .. tabs:: 107 | 108 | .. group-tab:: Pytest 109 | 110 | .. literalinclude:: /_static/tutorial/test_movies_df.py 111 | :pyobject: test_columns 112 | :lineno-match: 113 | 114 | .. group-tab:: Unittest 115 | 116 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 117 | :pyobject: TestMovies.test_columns 118 | :lineno-match: 119 | 120 | This validation requires that the set of values in ``df.columns`` 121 | matches the required :py:class:`set`. The ``df.columns`` attribute is 122 | an :class:`Index ` object---datatest treats this the same 123 | as any other sequence of values. 124 | 125 | This test is marked ``mandatory`` because it's a prerequisite that must 126 | be satisfied before any of the other tests can pass. 
When a mandatory 127 | test fails, the test suite stops immediately and no more tests are run. 128 | 129 | 130 | 3. Check 'title' values 131 | ----------------------- 132 | 133 | Check that values in the **title** column begin with an upper-case letter: 134 | 135 | .. tabs:: 136 | 137 | .. group-tab:: Pytest 138 | 139 | .. literalinclude:: /_static/tutorial/test_movies_df.py 140 | :pyobject: test_title 141 | :lineno-match: 142 | 143 | .. group-tab:: Unittest 144 | 145 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 146 | :pyobject: TestMovies.test_title 147 | :lineno-match: 148 | 149 | This validation checks that each value in the ``df['title']`` matches 150 | the regular expression ``^[A-Z]``. 151 | 152 | 153 | 4. Check 'rating' values 154 | ------------------------ 155 | 156 | Check that values in the **rating** column match one of the allowed codes: 157 | 158 | .. tabs:: 159 | 160 | .. group-tab:: Pytest 161 | 162 | .. literalinclude:: /_static/tutorial/test_movies_df.py 163 | :pyobject: test_rating 164 | :lineno-match: 165 | 166 | .. group-tab:: Unittest 167 | 168 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 169 | :pyobject: TestMovies.test_rating 170 | :lineno-match: 171 | 172 | This validation checks that the values in ``df['rating']`` are also 173 | contained in the given set. 174 | 175 | 176 | 5. Check 'year' and 'runtime' types 177 | ----------------------------------- 178 | 179 | Check that values in the **year** and **runtime** columns are integers: 180 | 181 | .. tabs:: 182 | 183 | .. group-tab:: Pytest 184 | 185 | .. literalinclude:: /_static/tutorial/test_movies_df.py 186 | :pyobject: test_year 187 | :lineno-match: 188 | 189 | .. literalinclude:: /_static/tutorial/test_movies_df.py 190 | :pyobject: test_runtime 191 | :lineno-match: 192 | 193 | .. group-tab:: Unittest 194 | 195 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py 196 | :pyobject: TestMovies.test_year 197 | :lineno-match: 198 | 199 | .. 
"""Normalize objects for validation."""

import sys
from ._compatibility.collections.abc import Collection
from ._compatibility.collections.abc import Iterable
from ._compatibility.collections.abc import Iterator
from ._compatibility.collections.abc import Mapping

from ._utils import exhaustible
from ._utils import iterpeek
from ._utils import IterItems


class TypedIterator(Iterator):
    """An iterator that carries the collection type (*evaltype*) its
    items should ultimately be evaluated as (e.g. ``list`` or a
    Mapping subclass).
    """
    def __init__(self, iterable, evaltype):
        self._iterator = iter(iterable)
        self.evaltype = evaltype  # Collection type used by fetch().

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._iterator)

    def next(self):  # Python 2.x support.
        return self.__next__()

    def fetch(self):
        """Eagerly evaluate the remaining items as *evaltype*."""
        return self.evaltype(self._iterator)


NoneType = type(None)


def _normalize_lazy(obj):
    """Return an iterator for lazy evaluation.

    Recognized container types--squint Query/Result, pandas
    DataFrame/Series, numpy ndarray, and DBAPI2 cursor-like
    objects--are adapted to plain iterators or IterItems mappings.
    Anything unrecognized is returned unchanged. Third-party modules
    are only handled when already imported (checked via sys.modules)
    so none of them become hard dependencies.
    """
    if isinstance(obj, TypedIterator):
        if issubclass(obj.evaltype, Mapping):
            obj = IterItems(obj)
        return obj  # <- EXIT!

    # Separate Squint module.
    squint = sys.modules.get('squint', None)
    if squint:
        if isinstance(obj, squint.Query):
            obj = obj.execute()
            if issubclass(getattr(obj, 'evaltype', NoneType), Mapping):
                obj = IterItems(obj)
            return obj  # <- EXIT!

        if isinstance(obj, squint.Result):
            if issubclass(obj.evaltype, Mapping):
                obj = IterItems(obj)
            return obj  # <- EXIT!

    pandas = sys.modules.get('pandas', None)
    if pandas:
        if isinstance(obj, pandas.DataFrame):
            # Duplicate index labels would make mapping semantics ambiguous.
            if not obj.index.is_unique:
                msg = '{0} index contains duplicates, must be unique'
                raise ValueError(msg.format(obj.__class__.__name__))

            if isinstance(obj.index, pandas.RangeIndex):
                # DataFrame with RangeIndex is treated as an iterator.
                if len(obj.columns) == 1:
                    obj = (x[0] for x in obj.values)
                else:
                    obj = (tuple(x) for x in obj.values)
                return TypedIterator(obj, evaltype=list)  # <- EXIT!
            else:
                # DataFrame with another index type is treated as a mapping.
                if len(obj.columns) == 1:
                    gen = ((x[0], x[1]) for x in obj.itertuples())
                else:
                    gen = ((x[0], tuple(x[1:])) for x in obj.itertuples())
                return IterItems(gen)  # <- EXIT!
        elif isinstance(obj, pandas.Series):
            if not obj.index.is_unique:
                msg = '{0} index contains duplicates, must be unique'
                raise ValueError(msg.format(obj.__class__.__name__))

            if isinstance(obj.index, pandas.RangeIndex):
                # Series with RangeIndex is treated as an iterator.
                return TypedIterator(obj.values, evaltype=list)  # <- EXIT!
            else:
                # Series with another index type is treated as a mapping.
                # Prefer items()--Series.iteritems() was deprecated in
                # pandas 1.5 and removed in pandas 2.0. Keep iteritems()
                # as a fallback for very old pandas versions.
                if hasattr(obj, 'items'):
                    return IterItems(obj.items())  # <- EXIT!
                return IterItems(obj.iteritems())  # <- EXIT!

    numpy = sys.modules.get('numpy', None)
    if numpy and isinstance(obj, numpy.ndarray):
        # Two-dimensional array, recarray, or structured array.
        if obj.ndim == 2 or (obj.ndim == 1 and len(obj.dtype) > 1):
            obj = (tuple(x) for x in obj)
            return TypedIterator(obj, evaltype=list)  # <- EXIT!

        # One-dimensional array, recarray, or structured array.
        if obj.ndim == 1:
            if len(obj.dtype) == 1:        # Unpack single-valued recarray
                obj = (x[0] for x in obj)  # or structured array.
            else:
                obj = iter(obj)
            return TypedIterator(obj, evaltype=list)  # <- EXIT!

    # Check for cursor-like object (if obj has DBAPI2 cursor attributes).
    if all(hasattr(obj, n) for n in ('fetchone', 'execute',
                                     'rowcount', 'description')):
        if not isinstance(obj, Iterable):
            def cursor_to_gen(cursor):       # While most cursor objects are
                while True:                  # iterable, it is not required
                    row = cursor.fetchone()  # by the DBAPI2 specification.
                    if row is None:
                        break
                    yield row
            obj = cursor_to_gen(obj)

        first, obj = iterpeek(obj)
        if first and len(first) == 1:
            obj = iter(x[0] for x in obj)  # Unwrap single-value records.
        return obj  # <- EXIT!

    return obj


def _normalize_eager(obj, default_type=None):
    """Eagerly evaluate *obj* when possible. When *obj* is exhaustible,
    a *default_type* must be specified. When provided, *default_type*
    must be a collection type (a sized iterable container).

    Raises TypeError for an exhaustible *obj* with no usable
    *default_type*.
    """
    if isinstance(obj, TypedIterator):
        return obj.fetch()

    # Separate Squint module.
    squint = sys.modules.get('squint', None)
    if squint and isinstance(obj, squint.Result):
        return obj.fetch()

    if isinstance(obj, IterItems):
        return dict(obj)

    if isinstance(obj, Iterable) and exhaustible(obj):
        if isinstance(default_type, type) and issubclass(default_type, Collection):
            return default_type(obj)
        else:
            cls_name = obj.__class__.__name__
            msg = ("exhaustible type '{0}' cannot be eagerly evaluated "
                   "without specifying a 'default_type' collection")
            raise TypeError(msg.format(cls_name))

    return obj


def normalize(obj, lazy_evaluation=False, default_type=None):
    """Normalize *obj* for validation.

    If *lazy_evaluation* is true, return the lazily-adapted object;
    otherwise evaluate it eagerly, using *default_type* for
    exhaustible inputs (see _normalize_eager()).
    """
    obj = _normalize_lazy(obj)
    if lazy_evaluation:
        return obj
    return _normalize_eager(obj, default_type)