├── docs
├── _static
│ ├── .keep
│ ├── example.sqlite3
│ ├── data_prep_poll.png
│ ├── development_build.jpg
│ ├── reference_data_example.zip
│ ├── failure_message_example.zip
│ ├── example.csv
│ ├── tutorial
│ │ ├── estimated_totals.csv
│ │ ├── test_movies_df.py
│ │ ├── movies.csv
│ │ ├── test_movies_df_unit.py
│ │ ├── test_country_of_birth.py
│ │ ├── test_country_of_birth_unit.py
│ │ ├── modified_test_country_of_birth.py
│ │ ├── modified_test_country_of_birth_unit.py
│ │ ├── test_intro1.py
│ │ ├── test_intro2.py
│ │ ├── modified_country_of_birth.csv
│ │ ├── country_of_birth.csv
│ │ ├── test_intro1_unit.py
│ │ └── test_intro2_unit.py
│ ├── test_users.py
│ ├── test_users_unit.py
│ ├── excel_autoformat.csv
│ ├── theme_overrides.css
│ ├── test_validation.py
│ ├── mydata.csv
│ ├── users.csv
│ └── test_errors.py
├── _build
│ └── .gitignore
├── _templates
│ └── layout.html
├── discussion
│ ├── terminology.rst
│ ├── project-history.rst
│ ├── index.rst
│ ├── validate-vs-accept.rst
│ ├── organizing-tests.rst
│ └── data-preparation.rst
├── requirements.txt
├── how-to
│ ├── install.rst
│ ├── run-tests.rst
│ ├── index.rst
│ ├── negative-matches.rst
│ ├── reorder-acceptances.rst
│ ├── get-started.rst
│ ├── date-time-str.rst
│ ├── fuzzy-matching.rst
│ ├── sequences.rst
│ ├── phone-numbers.rst
│ ├── excel-auto-formatting.rst
│ └── customize-differences.rst
├── intro
│ └── index.rst
├── reference
│ ├── index.rst
│ └── unittest-support.rst
├── _ext
│ └── autodoc_classinstance.py
├── index.rst
└── tutorial
│ └── testing-pandas.rst
├── tests
├── __init__.py
├── sample_files
│ ├── sample_text_utf8.csv
│ ├── sample_excel1997.xls
│ ├── sample_excel2007.xlsx
│ ├── sample_dbase.dbf
│ ├── sample_text_iso88591.csv
│ ├── test_sources_excel.xlsx
│ └── sample_multiworksheet.xlsx
├── _io.py
├── past_api07_sources_base.py
├── past_api09.py
├── _contextlib.py
├── past_api07_sources_excel.py
├── test_past_subprocesses.py
├── past_api07_error.py
├── common.py
├── test_pandas_integration.py
├── past_api07_sources_pandas.py
├── test_runner.py
├── test_utils_misc.py
├── past_api00.py
├── past_api07_sources_sqlite.py
└── past_api09_load_csv.py
├── setup.cfg
├── datatest
├── _vendor
│ └── __init__.py
├── _compatibility
│ ├── __init__.py
│ ├── itertools.py
│ ├── abc.py
│ ├── statistics.py
│ ├── textwrap.py
│ ├── contextlib.py
│ ├── collections
│ │ └── abc.py
│ ├── builtins.py
│ ├── functools.py
│ └── decimal.py
├── __past__
│ ├── api_dev0.py
│ ├── api_dev1.py
│ ├── api_dev2.py
│ ├── api010.py
│ ├── squint
│ │ └── __init__.py
│ ├── __init__.py
│ ├── api07_error.py
│ ├── api09.py
│ ├── api00.py
│ ├── load_csv.py
│ └── api06.py
├── __main__.py
├── __init__.py
├── _excepthook.py
├── _working_directory.py
├── main.py
└── _normalize.py
├── MANIFEST.in
├── requirements-dev.txt
├── AUTHORS
├── .readthedocs.yml
├── LICENSE
├── .travis.yml
├── .gitignore
├── run-tests.sh
├── run-tests.bat
└── release-checklist.rst
/docs/_static/.keep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 |
--------------------------------------------------------------------------------
/datatest/_vendor/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/datatest/_compatibility/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/tests/sample_files/sample_text_utf8.csv:
--------------------------------------------------------------------------------
1 | col1,col2
2 | utf8,α
3 |
--------------------------------------------------------------------------------
/docs/_build/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 |
--------------------------------------------------------------------------------
/docs/_static/example.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/example.sqlite3
--------------------------------------------------------------------------------
/docs/_static/data_prep_poll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/data_prep_poll.png
--------------------------------------------------------------------------------
/datatest/__past__/api_dev0.py:
--------------------------------------------------------------------------------
1 | """alias for api00"""
2 | from __future__ import absolute_import
3 | from .api00 import *
4 |
--------------------------------------------------------------------------------
/datatest/__past__/api_dev1.py:
--------------------------------------------------------------------------------
1 | """alias for api06"""
2 | from __future__ import absolute_import
3 | from .api06 import *
4 |
--------------------------------------------------------------------------------
/datatest/__past__/api_dev2.py:
--------------------------------------------------------------------------------
1 | """alias for api07"""
2 | from __future__ import absolute_import
3 | from .api07 import *
4 |
--------------------------------------------------------------------------------
/docs/_static/development_build.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/development_build.jpg
--------------------------------------------------------------------------------
/docs/_static/reference_data_example.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/reference_data_example.zip
--------------------------------------------------------------------------------
/tests/sample_files/sample_excel1997.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_excel1997.xls
--------------------------------------------------------------------------------
/docs/_static/failure_message_example.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/docs/_static/failure_message_example.zip
--------------------------------------------------------------------------------
/tests/sample_files/sample_excel2007.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_excel2007.xlsx
--------------------------------------------------------------------------------
/docs/_static/example.csv:
--------------------------------------------------------------------------------
1 | "A","B","C"
2 | "x","foo",20
3 | "x","foo",30
4 | "y","foo",10
5 | "y","bar",20
6 | "z","bar",10
7 | "z","bar",10
8 |
--------------------------------------------------------------------------------
/tests/sample_files/sample_dbase.dbf:
--------------------------------------------------------------------------------
1 | a COL1 C COL2 N
dBASE1
--------------------------------------------------------------------------------
/tests/sample_files/sample_text_iso88591.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_text_iso88591.csv
--------------------------------------------------------------------------------
/tests/sample_files/test_sources_excel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/test_sources_excel.xlsx
--------------------------------------------------------------------------------
/tests/sample_files/sample_multiworksheet.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shawnbrown/datatest/HEAD/tests/sample_files/sample_multiworksheet.xlsx
--------------------------------------------------------------------------------
/datatest/__past__/api010.py:
--------------------------------------------------------------------------------
1 | """Backward compatibility for version 0.10 API."""
2 | from __future__ import absolute_import
3 |
4 | # This is a stub for future use.
5 |
6 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include AUTHORS
3 | include LICENSE
4 | include requirements.txt
5 | recursive-include datatest *.py
6 | recursive-include tests *.py
7 | recursive-include tests/sample_files *.*
8 |
--------------------------------------------------------------------------------
/docs/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {#
4 | {% block menu %}
5 | {{ super() }}
6 | Package Index
7 | {% endblock %}
8 | #}
9 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/estimated_totals.csv:
--------------------------------------------------------------------------------
1 | state/territory,population
2 | Australian Capital Territory,389785
3 | Jervis Bay Territory,388
4 | New South Wales,7507350
5 | Northern Territory,226412
6 | Queensland,4721503
7 | South Australia,1637325
8 | Tasmania,514245
9 | Victoria,5849330
10 | Western Australia,2451380
11 |
--------------------------------------------------------------------------------
/datatest/__past__/squint/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """squint: simple query interface for tabular data
3 |
4 | PYTEST_DONT_REWRITE
5 | """
6 | from __future__ import absolute_import
7 |
8 | from .query import BaseElement
9 | from .query import Select
10 | from .query import Query
11 | from .query import Result
12 |
--------------------------------------------------------------------------------
/docs/discussion/terminology.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. currentmodule:: datatest
4 |
5 | .. meta::
6 | :description: A discussion about the language and vocabulary used in datatest.
7 | :keywords: data, validation, quality, glossary, terms
8 |
9 |
10 | ####################
11 | Notes on Terminology
12 | ####################
13 |
14 |
--------------------------------------------------------------------------------
/docs/discussion/project-history.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. currentmodule:: datatest
4 |
5 | .. meta::
6 | :description: A brief discussion on the history and origins of datatest at the NCEC.
7 | :keywords: datatest, history, NCEC, National Committee for an Effective Congress
8 |
9 |
10 | ################
11 | Datatest History
12 | ################
13 |
14 |
--------------------------------------------------------------------------------
/datatest/_compatibility/itertools.py:
--------------------------------------------------------------------------------
"""compatibility layer for itertools (Python standard library)"""
from __future__ import absolute_import
from itertools import *

try:
    # Both spellings are new in Python 3 (and are always present
    # together there), so a single probe covers both names.
    filterfalse
    zip_longest
except NameError:
    # Python 2 exposes only the 'i'-prefixed spellings; alias them.
    filterfalse = ifilterfalse
    zip_longest = izip_longest
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # ==============================
2 | # Requirements for Read The Docs
3 | # ==============================
4 | #
5 | # The following requirements are additional dependencies that
6 | # https://readthedocs.io needs to install so it can properly
7 | # generate the documentation for datatest.
8 |
9 | sphinx>=2.1.0
10 | sphinx-tabs
11 | sphinx_rtd_theme>=0.3.1
12 |
13 |
--------------------------------------------------------------------------------
/datatest/_compatibility/abc.py:
--------------------------------------------------------------------------------
"""compatibility layer for abc (Python standard library)"""
from __future__ import absolute_import
from abc import *


try:
    # ABC is new in 3.4; its __slots__ attribute is new in 3.7. A
    # missing ABC raises NameError, a slot-less one AttributeError.
    ABC.__slots__
except (NameError, AttributeError):
    # Build an equivalent base class using syntax that works on
    # both Python 2 and Python 3.
    ABC = ABCMeta('ABC', (object,), {'__slots__': ()})
--------------------------------------------------------------------------------
/docs/how-to/install.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to Install Datatest
6 | :keywords: installing, datatest, python
7 |
8 |
9 | #######################
10 | How to Install Datatest
11 | #######################
12 |
13 | .. include:: ../../README.rst
14 | :start-after: start-inclusion-marker-install
15 | :end-before: end-inclusion-marker-install
16 |
--------------------------------------------------------------------------------
/datatest/__past__/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Backwards compatibility for phased-out features and behaviors.
3 |
4 | To use a feature that is no longer supported in the current version of
5 | datatest, use the following:
6 |
7 | from datatest.__past__ import api
8 |
9 | For example, importing 'api07' would provide backwards compatibility
10 | for the API as implemented in the 0.7 version of datatest.
11 | """
12 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # ========================
2 | # Development Dependencies
3 | # ========================
4 | #
5 | # These are not installation requirements!
6 | #
7 | # The following dependencies are only required for
8 | # testing, building, and documentation generation.
9 | #
10 | # pip install -r requirements-dev.txt
11 |
12 | dbfread
13 | ipython
14 | numpy
15 | pandas
16 | squint
17 | xlrd==1.2.0
18 | sphinx>=2.1.0
19 | sphinx-tabs
20 | sphinx_rtd_theme>=0.3.1
21 | twine
22 | wheel
23 |
24 |
--------------------------------------------------------------------------------
/tests/_io.py:
--------------------------------------------------------------------------------
"""compatibility layer for io (Python standard library)"""
from __future__ import absolute_import
from io import *
from sys import version_info as _version_info


if _version_info[:2] <= (2, 7):  # For version 2.7 and earlier.
    import StringIO as _StringIO

    # NOTE: The original code assigned ``StringIO = _StringIO.StringIO``
    # here, but that binding was dead code--it was immediately shadowed
    # by the class definition below, so it has been removed.
    class StringIO(_StringIO.StringIO):
        """A StringIO that coerces written values to unicode.

        Python 2's StringIO.StringIO mishandles mixed str/unicode
        writes; normalizing to unicode up front avoids that.
        """
        def write(self, s):
            # Coerce before delegating to the Python 2 implementation.
            return _StringIO.StringIO.write(self, unicode(s))
--------------------------------------------------------------------------------
/datatest/_compatibility/statistics.py:
--------------------------------------------------------------------------------
"""compatibility layer for statistics (Python standard library)"""
from __future__ import absolute_import
from __future__ import division

try:
    from statistics import *
except ImportError:
    # The 'statistics' module is new in Python 3.4; provide minimal
    # stand-ins for the names this package relies on.

    class StatisticsError(ValueError):
        pass


    def median(data):
        """Return the middle value of *data* (or the mean of the two
        middle values when the length is even). Raises StatisticsError
        for empty input, mirroring the standard-library behavior.
        """
        ordered = sorted(data)
        size = len(ordered)
        if not size:
            raise StatisticsError('no median for empty data')
        mid = size // 2
        if size % 2:
            return ordered[mid]
        return (ordered[mid - 1] + ordered[mid]) / 2
--------------------------------------------------------------------------------
/datatest/__main__.py:
--------------------------------------------------------------------------------
1 | """Main entry point"""
2 |
3 | import sys
4 | if sys.argv[0].endswith('__main__.py'):
5 | import os.path
6 | # We change sys.argv[0] to make help message more useful
7 | # use executable without path, unquoted
8 | # (it's just a hint anyway)
9 | # (if you have spaces in your executable you get what you deserve!)
10 | executable = os.path.basename(sys.executable)
11 | sys.argv[0] = executable + ' -m datatest'
12 | del os
13 |
14 | __unittest = True
15 | __datatest = True
16 |
17 |
18 | from .main import main, DataTestProgram
19 |
20 | main(module=None)
21 |
--------------------------------------------------------------------------------
/datatest/_compatibility/textwrap.py:
--------------------------------------------------------------------------------
"""compatibility layer for textwrap (Python standard library)"""
from __future__ import absolute_import
from textwrap import *


try:
    indent  # New in 3.3
except NameError:
    def indent(text, prefix, predicate=None):
        """Add *prefix* to selected lines of *text* (backport of the
        3.3+ function). By default, only lines that contain more than
        just whitespace are prefixed.
        """
        if predicate is None:
            def predicate(line):
                return line.strip()
        return ''.join(
            (prefix + line) if predicate(line) else line
            for line in text.splitlines(True)
        )
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Datatest was originally created at NCEC Services, LLC in 2014
2 | by Shawn Brown as 'dataaudit'. In 2015 the project was largely
3 | rewritten and renamed to 'datatest'.
4 |
5 | Work-for-hire Contributors:
6 |
7 | * Shawn Brown (development lead)
8 | *
9 |
10 | Personal Contributors:
11 |
12 | * Shawn Brown
13 | *
14 |
15 | A big thank you goes out to:
16 |
17 | Heather Blum-Pastor for numerous ideas and feedback.
18 |
19 | Brian Fraher, Bilen Estephanos, and Eric Hawkins who helped spec-out
20 | the initial API on our snowy train ride to New York City in February
21 | of 2014.
22 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/conf.py
11 |
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | # configuration: mkdocs.yml
15 |
16 | # Optionally build your docs in additional formats such as PDF and ePub
17 | formats: all
18 |
19 | # Optionally set the version of Python and requirements required to build your docs
20 | python:
21 | version: 3
22 | install:
23 | - requirements: docs/requirements.txt
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2014 - 2021 National Committee for an Effective Congress,
2 | NCEC Services LLC, and contributing authors
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use datatest except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 |
--------------------------------------------------------------------------------
/docs/_static/test_users.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from datatest import working_directory
3 | from datatest import Select
4 | from datatest import validate
5 |
6 |
7 | @pytest.fixture(scope='module')
8 | @working_directory(__file__)
9 | def users():
10 | return Select('users.csv')
11 |
12 |
13 | @pytest.mark.mandatory
14 | def test_columns(users):
15 | validate(users.fieldnames, {'user_id', 'active'})
16 |
17 |
18 | def test_user_id(users):
19 |
20 | def is_wellformed(x): # <- Helper function.
21 | return x[:-1].isdigit() and x[-1:].isupper()
22 |
23 | validate(users('user_id'), is_wellformed)
24 |
25 |
26 | def test_active(users):
27 | validate(users({'active'}), {'Y', 'N'})
28 |
--------------------------------------------------------------------------------
/docs/intro/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :description: Table of Contents for Introduction.
4 | :keywords:
5 | :title: Introduction
6 |
7 | .. sectionauthor:: Shawn Brown
8 |
9 |
10 | ############
11 | Introduction
12 | ############
13 |
14 | .. epigraph::
15 |
16 | *"...tidy datasets are all alike but every messy dataset is messy
17 | in its own way"*
18 | ---Hadley Wickham [#f1]_
19 |
20 |
21 | .. toctree::
22 | :maxdepth: 2
23 |
24 | tour-of-datatest
25 | Automated Testing
26 | Pipeline Validation
27 | Validating Pandas
28 |
29 |
30 | .. [#f1] Wickham, Hadley. "Tidy Data." Journal of Statistical Software 59,
31 | no. 10, August 2014.
32 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | matrix:
4 | include:
5 | - python: 3.10-dev
6 | - python: 3.9-dev
7 | - python: 3.8-dev
8 | - python: 3.7
9 | - python: 3.6
10 | - python: 3.5
11 | - python: 3.4
12 | - python: 3.3
13 | dist: trusty
14 | - python: 3.2
15 | dist: trusty
16 | # - python: 3.1 # not currently supported by Travis CI
17 | - python: 2.7
18 | - python: 2.6
19 | dist: trusty
20 | - python: pypy3
21 | - python: pypy
22 |
23 | install: true
24 | #install:
25 | # - pip install xlrd
26 | # - pip install pandas
27 |
28 | # command to run tests and check installation
29 | script:
30 | - python setup.py test
31 | - python -c 'import setuptools;print(setuptools.__version__)'
32 | - python setup.py install
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files.
2 | __pycache__/
3 | *.pyc
4 | *.pyo
5 | *.pyd
6 |
7 | # C extensions.
8 | *.so
9 |
10 | # Distribution / packaging.
11 | .Python
12 | env/
13 | bin/
14 | build/
15 | develop-eggs/
16 | dist/
17 | eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # Manifest built with MANIFEST.in
28 | MANIFEST
29 |
30 | # Installer logs.
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 |
34 | # Unit test / coverage reports.
35 | htmlcov/
36 | .tox/
37 | .coverage
38 | .cache
39 | nosetests.xml
40 | coverage.xml
41 |
42 | # Translations.
43 | *.mo
44 |
45 | # Sphinx documentation.
46 | docs/_build/
47 |
48 | # Environments
49 | .env
50 | .venv
51 | env/
52 | venv/
53 | ENV/
54 | env.bak/
55 | venv.bak/
56 |
--------------------------------------------------------------------------------
/docs/_static/test_users_unit.py:
--------------------------------------------------------------------------------
1 | from datatest import working_directory
2 | from datatest import Select
3 | from datatest import DataTestCase
4 | from datatest import mandatory
5 |
6 |
7 | def setUpModule():
8 | global users
9 | with working_directory(__file__):
10 | users = Select('users.csv')
11 |
12 |
13 | class TestUserData(DataTestCase):
14 |
15 | @mandatory
16 | def test_columns(self):
17 | self.assertValid(users.fieldnames, {'user_id', 'active'})
18 |
19 | def test_user_id(self):
20 |
21 | def is_wellformed(x): # <- Helper function.
22 | return x[:-1].isdigit() and x[-1:].isupper()
23 |
24 | self.assertValid(users('user_id'), is_wellformed)
25 |
26 | def test_active(self):
27 | self.assertValid(users({'active'}), {'Y', 'N'})
28 |
--------------------------------------------------------------------------------
/docs/discussion/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :description: Table of contents for discussion documentation.
4 | :keywords:
5 | :title: Discussion
6 |
7 | .. sectionauthor:: Shawn Brown
8 |
9 |
10 | ##########
11 | Discussion
12 | ##########
13 |
14 | .. epigraph::
15 |
16 | *"The right information cannot be extracted from the wrong data."*
17 | ---Russell Ackoff [#f1]_
18 |
19 |
20 | .. toctree::
21 | :maxdepth: 2
22 |
23 | Organizing Tests
24 | Tips and Tricks
25 | data-preparation
26 |
27 | ..
28 | OMIT UNFINISHED PAGES:
29 | validate-vs-accept
30 | terminology
31 | project-history
32 |
33 |
34 | .. [#f1] Ackoff, Russell L. "Ackoff's Best", New York: John Wiley & Sons, Inc.,
35 | 1999. p. 172.
36 |
--------------------------------------------------------------------------------
/docs/reference/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :description: Table of Contents for Reference.
4 | :keywords:
5 | :title: Reference
6 |
7 |
8 | ####################################
9 | Reference
10 | ####################################
11 |
12 | .. epigraph::
13 |
14 | *"A tool is best if it does the job required with a minimum of
15 | effort, with a minimum of complexity, and with a minimum of power."*
16 | ---Peter Drucker [#f1]_
17 |
18 |
19 | .. toctree::
20 | :maxdepth: 2
21 |
22 | Datatest Core
23 | Data Handling
24 | unittest-support
25 |
26 | See the :ref:`Package Index <genindex>` for a full list of classes
27 | and objects.
28 |
29 |
30 | .. [#f1] Drucker, Peter F. "Management: Tasks, Responsibilities, Practices",
31 | New York: Harper & Row, 1973. p. 224.
32 |
--------------------------------------------------------------------------------
/docs/discussion/validate-vs-accept.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. currentmodule:: datatest
4 |
5 | .. meta::
6 | :description: A discussion about when it's appropriate to assert
7 | data requirements and when it's appropriate to accept
8 | deviations.
9 | :keywords: data, validation, quality, acceptance
10 |
11 |
12 | ########################
13 | Validation vs Acceptance
14 | ########################
15 |
16 | ..
17 | validate adherence to a loose requirement
18 | or accept specified deviation
19 |
20 | what's the difference?
21 | does it matter?
22 |
23 | quicker to validate loose requirement
24 | than it is to generate a bunch of differences that must then be accepted
25 | but unless execution time is prohibitive, favor semantic accuracy over
26 | misleading-optimization
27 |
28 |
--------------------------------------------------------------------------------
/docs/_static/excel_autoformat.csv:
--------------------------------------------------------------------------------
1 | A,B
2 | 106,ABY-22
3 | 109,ACZ-31
4 | 116,AFA-34
5 | 129,AFV-02
6 | 184,AFY-16
7 | 191,AGF-30
8 | 200,AGK-06
9 | 204,AGW-29
10 | 244,AGZ-08
11 | 252,AHB-28
12 | 255,AIZ-04
13 | 256,ALE-49
14 | 284,AMR-41
15 | 292,AOJ-35
16 | 294,AOX-18
17 | 295,APR-10
18 | 298,AQV-25
19 | 314,ATF-21
20 | 325,AUP-48
21 | 333,AVV-32
22 | 342,AXB-44
23 | 361,AXP-47
24 | 385,APE-07
25 | 391,AZL-36
26 | 414,BAF-37
27 | 418,BES-24
28 | 429,BEW-17
29 | 430,BGO-39
30 | 442,BGW-42
31 | 454,BKE-45
32 | 461,BMO-46
33 | 511,BNT-03
34 | 569,BNW-05
35 | 591,BNX-27
36 | 622,BPD-12
37 | 635,BVD-26
38 | 691,BWP-38
39 | 692,CMO-40
40 | 703,CPX-14
41 | 725,CQO-09
42 | 746,CSA-11
43 | 792,CSD-15
44 | 810,CSN-13
45 | 819,CUT-19
46 | 836,CWK-43
47 | 874,CYL-23
48 | 887,DBB-01
49 | 895,DEC-20
50 | 906,DNZ-33
51 | 981,DVH-50
52 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_movies_df.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import pytest
4 | import pandas as pd
5 | import datatest as dt
6 |
7 |
8 | @pytest.fixture(scope='module')
9 | @dt.working_directory(__file__)
10 | def df():
11 | return pd.read_csv('movies.csv')
12 |
13 |
14 | @pytest.mark.mandatory
15 | def test_columns(df):
16 | dt.validate(
17 | df.columns,
18 | {'title', 'rating', 'year', 'runtime'},
19 | )
20 |
21 |
22 | def test_title(df):
23 | dt.validate.regex(df['title'], r'^[A-Z]')
24 |
25 |
26 | def test_rating(df):
27 | dt.validate.superset(
28 | df['rating'],
29 | {'G', 'PG', 'PG-13', 'R', 'NC-17', 'Not Rated'},
30 | )
31 |
32 |
33 | def test_year(df):
34 | dt.validate(df['year'], int)
35 |
36 |
37 | def test_runtime(df):
38 | dt.validate(df['runtime'], int)
39 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/movies.csv:
--------------------------------------------------------------------------------
1 | title,rating,year,runtime
2 | Almost Famous,R,2000,122
3 | American Pie,R,1999,95
4 | Back to the Future,PG,1985,116
5 | Blade Runner,R,1982,117
6 | Blood for Dracula,R,1974,106
7 | Blue Velvet,R,1986,120
8 | The Breakfast Club,R,1985,97
9 | Clueless,PG-13,1995,97
10 | Cool Hand Luke,GP,1967,127
11 | The Craft,R,1996,101
12 | Doctor Zhivago,PG-13,1965,197
13 | el Topo,Not Rated,1970,125
14 | Evil Dead,NC-17,1981,85
15 | Ghostbusters,PG,1984,105
16 | Grease,PG-13,1978,110
17 | Heathers,R,1988,103
18 | Labyrinth,PG,1986,101
19 | The Lost Boys,R,1987,97
20 | Mean Girls,PG-13,2004,97
21 | Millennium Actress,PG,2001,87
22 | My Neighbor Totoro,G,1988,86
23 | Napoleon Dynamite,PG,2004,96
24 | Pee-wee's Big Adventure,PG,1985,91
25 | Pretty in Pink,PG-13,1986,97
26 | The Princess Bride,PG,1987,98
27 | Psycho,R,1960,109
28 | Stand by Me,R,1986,89
29 | Super 8,PG-13,2011,112
30 | superbad,R,2007,113
31 | WarGames,PG,1983,114
32 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_movies_df_unit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import pandas as pd
4 | import datatest as dt
5 |
6 |
7 | def setUpModule():
8 | global df
9 | with dt.working_directory(__file__):
10 | df = pd.read_csv('movies.csv')
11 |
12 |
13 | class TestMovies(dt.DataTestCase):
14 | @dt.mandatory
15 | def test_columns(self):
16 | self.assertValid(
17 | df.columns,
18 | {'title', 'rating', 'year', 'runtime'},
19 | )
20 |
21 | def test_title(self):
22 | self.assertValidRegex(df['title'], r'^[A-Z]')
23 |
24 | def test_rating(self):
25 | self.assertValidSuperset(
26 | df['rating'],
27 | {'G', 'PG', 'PG-13', 'R', 'NC-17', 'Not Rated'},
28 | )
29 |
30 | def test_year(self):
31 | self.assertValid(df['year'], int)
32 |
33 | def test_runtime(self):
34 | self.assertValid(df['runtime'], int)
35 |
--------------------------------------------------------------------------------
/tests/past_api07_sources_base.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from . import _unittest as unittest
3 |
4 | from datatest.__past__.api07_sources import MinimalSource
5 | from .mixins import OtherTests
6 | from .mixins import CountTests
7 |
8 |
class TestBaseSource(OtherTests, unittest.TestCase):
    """Run the shared OtherTests suite against a MinimalSource."""

    fieldnames = ['label1', 'label2', 'value']
    testdata = [
        ['a', 'x', '17'],
        ['a', 'x', '13'],
        ['a', 'y', '20'],
        ['a', 'z', '15'],
        ['b', 'z', '5'],
        ['b', 'y', '40'],
        ['b', 'x', '25'],
    ]

    def setUp(self):
        # Fresh source per test so mutations cannot leak between tests.
        self.datasource = MinimalSource(self.testdata, self.fieldnames)
21 |
22 |
class TestDataSourceCount(CountTests, unittest.TestCase):
    def setUp(self):
        """Define self.datasource (base version uses MinimalSource)."""
        # NOTE(review): self.testdata and self.fieldnames are presumably
        # supplied by the CountTests mixin -- confirm in tests/mixins.py.
        self.datasource = MinimalSource(self.testdata, self.fieldnames)
27 |
--------------------------------------------------------------------------------
/tests/past_api09.py:
--------------------------------------------------------------------------------
1 | """Test API for 0.9.x compatibility."""
2 | from . import _unittest as unittest
3 | import datatest
4 | from datatest.__past__ import api09 # <- MONKEY PATCH!!!
5 |
6 | # IMPORT ADDITIONAL TESTS
7 | #from .past_api09_query import *
8 |
9 |
class TestSubsetAndSupersetMethods(unittest.TestCase):
    """Semantics were inverted in the following version (0.10.x)."""

    def test_subset(self):
        """Check old-style 0.9.x API validate.subset() behavior."""
        members = set(['A', 'B'])
        datatest.validate.subset(['A', 'B', 'C', 'D'], members)

    def test_superset(self):
        """Check old-style 0.9.x API validate.superset() behavior."""
        members = set(['A', 'B', 'C', 'D'])
        datatest.validate.superset(['A', 'B'], members)
24 |
25 |
if __name__ == '__main__':
    unittest.main()
else:
    # Importing `datatest.__past__.api09` above monkey-patches datatest
    # globally; refusing to run as an import keeps that patch from
    # leaking into sibling test modules in the same process.
    raise Exception('This test must be run directly or as a subprocess.')
30 |
--------------------------------------------------------------------------------
/datatest/__init__.py:
--------------------------------------------------------------------------------
1 | """Datatest: Test driven data-wrangling and data validation.
2 |
3 | PYTEST_DONT_REWRITE
4 | """
5 |
6 | from __future__ import absolute_import
7 |
8 | __version__ = '0.12.0.dev1'
9 |
10 | # Datatest Core API (__all__ property defined in submodules)
11 | from .validation import * # Validation error and functions.
12 | from .differences import * # Difference classes.
13 | from .acceptances import accepted
14 | from ._vendor.predicate import Predicate
15 |
16 | # Pandas extensions.
17 | from ._pandas_integration import register_accessors
18 |
19 | # Unittest-style API
20 | from .case import DataTestCase
21 | from .runner import mandatory
22 | from .runner import DataTestRunner
23 | from .main import DataTestProgram
24 | from .main import main
25 |
26 | # Data Handling API
27 | from ._working_directory import working_directory
28 | from ._vendor.repeatingcontainer import RepeatingContainer
29 |
30 | #############################################
31 | # Register traceback formatting handler.
32 | #############################################
33 | from . import _excepthook
34 | import sys as _sys
35 | _sys.excepthook = _excepthook.excepthook
36 |
--------------------------------------------------------------------------------
/docs/how-to/run-tests.rst:
--------------------------------------------------------------------------------
1 |
2 | .. py:currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to run tests.
6 | :keywords: datatest, run, tests, unittest, pytest
7 |
8 |
9 | ################
10 | How to Run Tests
11 | ################
12 |
13 | ======
14 | Pytest
15 | ======
16 |
17 | If you have a pytest style script named ``test_mydata.py``,
18 | you can run it by typing the following at the command line:
19 |
20 | .. code-block:: console
21 |
22 | pytest test_mydata.py
23 |
24 | You invoke pytest just as you would in any other circumstance---see
25 | pytest's standard |pytest-usage|_ for full details.
26 |
27 |
28 | ========
29 | Unittest
30 | ========
31 |
32 | If you have a unittest style script named ``test_mydata.py``,
33 | you can run it by typing the following at the command line:
34 |
35 | .. code-block:: console
36 |
37 | python -m datatest test_mydata.py
38 |
39 | Datatest includes a unittest-style test runner that facilitates
40 | incremental testing. It runs tests in declaration order (i.e.,
by line-number) and supports the :func:`@mandatory <datatest.mandatory>`
42 | decorator.
43 |
44 |
45 | ..
46 | SUBSTITUTIONS:
47 |
48 | .. |pytest-usage| replace:: Usage and Invocations
49 | .. _pytest-usage: https://docs.pytest.org/en/latest/usage.html
50 |
51 |
--------------------------------------------------------------------------------
/datatest/_compatibility/contextlib.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for contextlib (Python standard library)"""
2 | from __future__ import absolute_import
3 | from contextlib import *
4 | from . import functools
5 |
6 |
try:
    ContextDecorator # New in Python 3.2
except NameError:
    # Adapted from Python 3.6 standard library.
    class ContextDecorator(object):
        """A base class that lets a context manager also be used as a
        function decorator (backport for Python < 3.2).
        """
        def _recreate_cm(self): # The `_recreate_cm` method is a private
            return self # interface for _GeneratorContextManager.
                                # See issue #11647 for details.

        def __call__(self, func):
            @functools.wraps(func)
            def inner(*args, **kwds):
                # Re-enter a (possibly fresh) context manager around
                # every call to the decorated function.
                with self._recreate_cm():
                    return func(*args, **kwds)
            return inner
22 |
23 |
try:
    suppress # New in Python 3.4
except NameError:
    # Adapted from Python 3.6 standard library.
    class suppress(object):
        """Context manager to suppress specified exceptions."""
        def __init__(self, *exceptions):
            self._exceptions = exceptions

        def __enter__(self):
            pass

        def __exit__(self, exctype, excinst, exctb):
            # Returning True tells Python not to re-raise; swallow the
            # exception only when it matches a registered type.
            return exctype is not None and issubclass(exctype, self._exceptions)
38 |
--------------------------------------------------------------------------------
/docs/_static/theme_overrides.css:
--------------------------------------------------------------------------------
1 | /*
2 | Since themes can be loaded after this style sheet is applied, the
3 | declarations below should use the "!important" annotation so they
4 | will take precedence over corresponding declarations defined later.
5 | */
6 |
7 |
8 | /*
9 | In the sphinx_rtd_theme (version 0.4.2, as of this update), table
10 | cells do not wrap text by default. This can make for unnecessarily
11 | wide tables that scroll off the page. The following declarations
12 | allow lines to wrap when the width is 445px or greater.
13 |
14 | This solution is adapted from ideas discussed on the following
15 | issue:
16 |
17 | https://github.com/rtfd/sphinx_rtd_theme/issues/117
18 | */
@media screen and (min-width: 445px) {
    .wy-table-responsive table td {
        white-space: normal !important; /* allow cell text to wrap */
    }
    .wy-table-responsive {
        overflow: visible !important; /* no scrollbar once cells wrap */
    }
}


/*
  The sphinx_rtd_theme (as of version 0.5.0) does not include styles
  for "details" or "summary" elements.
*/
details {
    margin-bottom: 1em;
}

summary {
    margin-bottom: 1em;
    cursor: pointer; /* signal that the summary is clickable */
}

summary:hover {
    background: rgb(240, 240, 240); /* fallback if no "rgba" support */
    background-color: rgba(0, 0, 0, 0.0625);
}
46 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_country_of_birth.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 | from datatest import working_directory
4 | from datatest import Select
5 | from datatest import validate
6 | from datatest import accepted
7 | from datatest import Missing, Extra, Deviation, Invalid
8 |
9 |
10 | # Define fixtures.
11 |
@pytest.fixture(scope='module')
@working_directory(__file__)
def detail():
    """Selector for the detailed country-of-birth records."""
    select = Select('country_of_birth.csv')
    return select
16 |
17 |
@pytest.fixture(scope='module')
@working_directory(__file__)
def summary():
    """Selector for the estimated population totals."""
    select = Select('estimated_totals.csv')
    return select
22 |
23 |
24 | # Begin tests.
25 |
@pytest.mark.mandatory
def test_columns(detail, summary):
    """The detail fieldnames must match the summary fieldnames."""
    validate(detail.fieldnames, set(summary.fieldnames))
31 |
32 |
def test_state_labels(detail, summary):
    """Every state/territory label must also appear in the summary."""
    validate(detail({'state/territory'}), summary({'state/territory'}))
38 |
39 |
def test_population_format(detail):
    """Population values must be strings of decimal digits."""
    def integer_format(value):  # <- Helper function.
        return str(value).isdecimal()

    validate(detail({'population'}), integer_format)
47 |
48 |
def test_population_sums(detail, summary):
    """Detail populations must sum to the summary totals per state."""
    fields = {'state/territory': 'population'}
    validate(detail(fields).sum(), summary(fields).sum())
54 |
--------------------------------------------------------------------------------
/docs/_static/test_validation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import re
3 | import datatest
4 |
5 |
class TestExample(datatest.DataTestCase):
    """Demonstrate the requirement types accepted by assertValid()."""

    def test_membership_in_set(self):
        data = ['x', 'x', 'y', 'y', 'z', 'z']
        requirement = {'x', 'y', 'z'}  # <- set
        self.assertValid(data, requirement)

    def test_function_returns_true(self):
        data = ['X', 'X', 'Y', 'Y']
        def requirement(x):  # <- callable (helper function)
            return x.isupper()
        self.assertValid(data, requirement)

    def test_regex_matches(self):
        data = ['foo', 'foo', 'foo', 'bar', 'bar', 'bar']
        # Use a raw string so the regex escape `\w` is not interpreted as
        # a string escape sequence -- '^\w\w\w$' (non-raw) triggers a
        # DeprecationWarning and becomes a SyntaxError in future Python.
        # The pattern's byte value is unchanged.
        requirement = re.compile(r'^\w\w\w$')  # <- regex object
        self.assertValid(data, requirement)

    def test_equality(self):
        data = ['x', 'x', 'x']
        requirement = 'x'  # <- other (not container, callable, or regex)
        self.assertValid(data, requirement)

    def test_order(self):
        data = ['x', 'x', 'y', 'y', 'z', 'z']
        requirement = ['x', 'x', 'y', 'y', 'z', 'z']  # <- sequence
        self.assertValid(data, requirement)

    def test_mapping(self):
        data = {'x': 'foo', 'y': 'bar'}
        requirement = {'x': 'foo', 'y': 'bar'}  # <- mapping
        self.assertValid(data, requirement)
37 |
38 |
if __name__ == '__main__':
    datatest.main()  # <- Run with datatest's unittest-style runner.
41 |
--------------------------------------------------------------------------------
/docs/how-to/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :description: Table of Contents for How-to Guide.
4 | :keywords:
5 | :title: How-to Guide
6 |
7 | .. py:currentmodule:: datatest
8 | .. moduleauthor:: Shawn Brown
9 | .. sectionauthor:: Shawn Brown
10 |
11 |
12 | ############
13 | How-to Guide
14 | ############
15 |
16 | .. epigraph::
17 |
18 | *"Hell is other people's data."*
19 | ---Jim Harris [#f1]_
20 |
21 |
22 | .. toctree::
23 | :maxdepth: 1
24 |
25 | Install Datatest
26 | Get Started Testing
27 | Run Tests
28 | Column Names
29 | Customize Differences
30 | Data Types
31 | Date and Time Strings
32 | Date and Time Objects
33 | File Names
34 | Test File Properties
35 | Excel Auto-Formatting
36 | Mailing Addresses
37 | Fuzzy Matching
38 | NaN Values
39 | Negative Matches
40 | Outliers
41 | Phone Numbers
42 | Re-order Acceptances
43 | Sequences
44 |
45 |
46 | .. [#f1] Harris, Jim. "Hell is other people’s data", OCDQ (blog), August 06, 2010,
47 | Retrieved from http://www.ocdqblog.com/home/hell-is-other-peoples-data.html
48 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_country_of_birth_unit.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 | from datatest import working_directory
4 | from datatest import Select
5 | from datatest import DataTestCase
6 | from datatest import mandatory
7 | from datatest import Missing, Extra, Deviation, Invalid
8 |
9 |
10 | # Define fixtures.
11 |
def setUpModule():
    """Create module-level selectors shared by every test case."""
    global detail, summary

    with working_directory(__file__):  # resolve CSVs relative to this file
        detail = Select('country_of_birth.csv')
        summary = Select('estimated_totals.csv')
19 |
20 |
21 | # Begin tests.
22 |
class TestPopulation(DataTestCase):
    """Validate the country-of-birth details against the summary file."""

    @mandatory
    def test_columns(self):
        # Detail fieldnames must match the summary fieldnames.
        self.assertValid(detail.fieldnames, set(summary.fieldnames))

    def test_state_labels(self):
        # Every state/territory label must also appear in the summary.
        self.assertValid(
            detail({'state/territory'}),
            summary({'state/territory'}),
        )

    def test_population_format(self):
        # Population values must be strings of decimal digits.
        def integer_format(value):  # <- Helper function.
            return str(value).isdecimal()

        self.assertValid(detail({'population'}), integer_format)

    def test_population_sums(self):
        # Detail populations must sum to the summary totals per state.
        fields = {'state/territory': 'population'}
        self.assertValid(detail(fields).sum(), summary(fields).sum())
50 |
--------------------------------------------------------------------------------
/docs/_static/mydata.csv:
--------------------------------------------------------------------------------
1 | user_id,active
2 | 999,Y
3 | 1000,Y
4 | 1001,N
5 | 1002,N
6 | 1003,Y
7 | 1004,Y
8 | 1005,Y
9 | 1006,N
10 | 1007,Y
11 | 1008,Y
12 | 1009,N
13 | 1010,N
14 | 1011,Y
15 | 1012,Y
16 | 1013,Y
17 | 1014,Y
18 | 1015,Y
19 | 1016,Y
20 | 1017,Y
21 | 1018,Y
22 | 1019,Y
23 | 1020,Y
24 | 1021,N
25 | 1022,N
26 | 1023,Y
27 | 1024,N
28 | 1025,Y
29 | 1026,Y
30 | 1027,Y
31 | 1028,N
32 | 1029,N
33 | 1030,N
34 | 1031,N
35 | 1032,Y
36 | 1033,Y
37 | 1034,Y
38 | 1035,N
39 | 1036,Y
40 | 1037,Y
41 | 1038,Y
42 | 1039,Y
43 | 1040,N
44 | 1041,Y
45 | 1042,Y
46 | 1043,Y
47 | 1044,N
48 | 1045,N
49 | 1046,Y
50 | 1047,Y
51 | 1048,N
52 | 1049,N
53 | 1050,N
54 | 1051,N
55 | 1052,Y
56 | 1053,Y
57 | 1054,Y
58 | 1055,Y
59 | 1056,Y
60 | 1057,Y
61 | 1058,Y
62 | 1059,Y
63 | 1060,Y
64 | 1061,Y
65 | 1062,N
66 | 1063,N
67 | 1064,Y
68 | 1065,Y
69 | 1066,Y
70 | 1067,Y
71 | 1068,Y
72 | 1069,Y
73 | 1070,Y
74 | 1071,Y
75 | 1072,Y
76 | 1073,Y
77 | 1074,N
78 | 1075,Y
79 | 1076,Y
80 | 1077,N
81 | 1078,Y
82 | 1079,Y
83 | 1080,Y
84 | 1081,N
85 | 1082,Y
86 | 1083,Y
87 | 1084,N
88 | 1085,N
89 | 1086,Y
90 | 1087,Y
91 | 1088,Y
92 | 1089,Y
93 | 1090,Y
94 | 1091,N
95 | 1092,Y
96 | 1093,N
97 | 1094,N
98 | 1095,Y
99 | 1096,N
100 | 1097,Y
101 | 1098,Y
102 | 1099,N
103 | 1100,N
104 | 1101,Y
105 | 1102,Y
106 | 1103,Y
107 | 1104,Y
108 | 1105,Y
109 | 1106,N
110 | 1107,N
111 | 1108,Y
112 | 1109,Y
113 | 1110,Y
114 | 1111,N
115 | 1112,N
116 | 1113,Y
117 | 1114,Y
118 | 1115,Y
119 | 1116,Y
120 | 1117,Y
121 | 1118,N
122 |
--------------------------------------------------------------------------------
/datatest/_compatibility/collections/abc.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for collections.abc (Python standard library)"""
2 | from __future__ import absolute_import
3 | try:
4 | from collections.abc import * # New in 3.3
5 | except ImportError:
6 | # Previously, the collection ABCs were in the root namespace.
7 | from collections import (
8 | Container,
9 | Hashable,
10 | Iterable,
11 | Iterator,
12 | Sized,
13 | Callable,
14 | Sequence,
15 | MutableSequence,
16 | Set,
17 | MutableSet,
18 | Mapping,
19 | MutableMapping,
20 | MappingView,
21 | KeysView,
22 | ItemsView,
23 | ValuesView,
24 | )
25 |
26 |
try:
    Collection # New in 3.6
except NameError:
    # Adapted from Python 3.6 standard library.
    def _check_methods(C, *methods):
        # Walk C's MRO looking for each required method; a method set
        # to None anywhere in the MRO explicitly disables the protocol.
        mro = C.__mro__
        for method in methods:
            for B in mro:
                if method in B.__dict__:
                    if B.__dict__[method] is None:
                        return NotImplemented
                    break
            else:
                return NotImplemented
        return True


    # Adapted from Python 3.6 standard library.
    class Collection(Sized, Iterable, Container):
        __slots__ = ()

        @classmethod
        def __subclasshook__(cls, C):
            # Structural check: any class with __len__, __iter__ and
            # __contains__ counts as a virtual subclass of Collection.
            if cls is Collection:
                return _check_methods(C, '__len__', '__iter__', '__contains__')
52 |
--------------------------------------------------------------------------------
/tests/_contextlib.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for contextlib (Python standard library)"""
2 | from __future__ import absolute_import
3 | from contextlib import *
4 |
5 |
try:
    redirect_stderr # New in 3.5
except NameError:
    # Adapted from Python 3.5 Standard Library.
    import sys as _sys
    class _RedirectStream:
        # Name of the sys attribute to swap (set by subclasses).
        _stream = None

        def __init__(self, new_target):
            self._new_target = new_target
            # A stack is kept so the same instance can be re-entered.
            self._old_targets = []

        def __enter__(self):
            self._old_targets.append(getattr(_sys, self._stream))
            setattr(_sys, self._stream, self._new_target)
            return self._new_target

        def __exit__(self, exctype, excinst, exctb):
            # Restore whatever stream was active before __enter__.
            setattr(_sys, self._stream, self._old_targets.pop())

    class redirect_stderr(_RedirectStream):
        """Context manager for temporarily redirecting stderr to
        another file.
        """
        _stream = 'stderr'
31 |
32 |
try:
    redirect_stdout # New in 3.4
except NameError:
    # NOTE(review): this branch only runs on Python < 3.4, in which case
    # the `except NameError` branch above has already defined the
    # _RedirectStream base class used here.
    class redirect_stdout(_RedirectStream):
        """Context manager for temporarily redirecting stdout to
        another file.

        # How to send help() to stderr
        with redirect_stdout(sys.stderr):
            help(dir)

        # How to write help() to a file
        with open('help.txt', 'w') as f:
            with redirect_stdout(f):
                help(pow)
        """
        _stream = 'stdout'
50 |
--------------------------------------------------------------------------------
/datatest/__past__/api07_error.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pprint
3 |
4 |
class DataError(AssertionError):
    """Raised when :meth:`assertValid` finds differences between *data*
    and *requirement*.
    """
    def __init__(self, msg, differences, subject=None, required=None):
        """Initialize self, store *differences* for later reference.

        Raises ValueError when *differences* is empty or falsy.
        """
        if not differences:
            raise ValueError('Missing differences.')
        self._differences = differences
        self.msg = msg
        self.subject = str(subject)    # Subject data source.
        self.required = str(required)  # Required object or reference source.
        self._verbose = False  # <- Set by DataTestResult if verbose.

        # Idiom fix: __init__() previously *returned* the result of
        # AssertionError.__init__() (always None) -- harmless but
        # misleading, so the `return` was dropped.
        AssertionError.__init__(self, msg)

    @property
    def differences(self):
        """An iterable (list or dict) of differences."""
        return self._differences

    def __repr__(self):
        return self.__class__.__name__ + ': ' + self.__str__()

    def __str__(self):
        # Strip the enclosing brackets from the pretty-printed container
        # so the differences read as a plain, indented listing.
        diff = pprint.pformat(self.differences, width=1)
        if any([diff.startswith('{') and diff.endswith('}'),
                diff.startswith('[') and diff.endswith(']'),
                diff.startswith('(') and diff.endswith(')')]):
            diff = diff[1:-1]

        if self._verbose:
            msg_extras = '\n\nSUBJECT:\n{0}\nREQUIRED:\n{1}'
            msg_extras = msg_extras.format(self.subject, self.required)
        else:
            msg_extras = ''

        return '{0}:\n {1}{2}'.format(self.msg, diff, msg_extras)
43 |
--------------------------------------------------------------------------------
/docs/_static/users.csv:
--------------------------------------------------------------------------------
1 | USER_ID,ACTIVE
2 | 0999F,Y
3 | 1000C,Y
4 | 1001C,n
5 | 1002A,n
6 | 1003C,Y
7 | 1004E,Y
8 | 1005H,Y
9 | 1006E,n
10 | 1007H,Y
11 | 1008A,Y
12 | 1009F,n
13 | 1010D,n
14 | 1011H,Y
15 | 1012H,Y
16 | 1013E,Y
17 | 1014D,Y
18 | 1015C,Y
19 | 1016H,Y
20 | 1017G,Y
21 | 1018A,Y
22 | 1019H,Y
23 | 1020E,Y
24 | 1021H,n
25 | 1022A,n
26 | 1023B,Y
27 | 1024D,n
28 | 1025C,Y
29 | 1026B,Y
30 | 1027H,Y
31 | 1028B,n
32 | 1029A,n
33 | 1030H,n
34 | 1031A,n
35 | 1032G,y
36 | 1033H,y
37 | 1034F,y
38 | 1035F,n
39 | 1036E,y
40 | 1037E,y
41 | 1038G,y
42 | 1039G,y
43 | 1040A,n
44 | 1041A,Y
45 | 1042H,Y
46 | 1043B,Y
47 | 1044G,n
48 | 1045A,n
49 | 1046A,Y
50 | 1047H,Y
51 | 1048D,n
52 | 1049A,n
53 | 1050H,n
54 | 1051A,n
55 | 1052E,Y
56 | 1053A,Y
57 | 1054G,Y
58 | 1055C,Y
59 | 1056a,Y
60 | 1057F,Y
61 | 1058D,Y
62 | 1059H,Y
63 | 1060A,YES
64 | 1061D,YES
65 | 1062E,NO
66 | 1063C,NO
67 | 1064H,YES
68 | 1065A,YES
69 | 1066F,YES
70 | 1067A,YES
71 | 1068F,YES
72 | 1069D,YES
73 | 1070H,YES
74 | 1071E,YES
75 | 1072G,YES
76 | 1073B,YES
77 | 1074B,NO
78 | 1075B,Y
79 | 1076A,Y
80 | 1077A,n
81 | 1078H,Y
82 | 1079C,Y
83 | 1080F,Y
84 | 1081B,n
85 | 1082F,Y
86 | 1083F,Y
87 | 1084F,n
88 | 1085H,n
89 | 1086G,Y
90 | 1087C,Y
91 | 1088A,Y
92 | 1089A,Y
93 | 1090E,Y
94 | 1091B,n
95 | 1092C,Y
96 | 1093G,n
97 | 1094B,n
98 | 1095C,Y
99 | 1096A,n
100 | 1097E,Y
101 | 1098C,Y
102 | 1099b,n
103 | 1100G,n
104 | 1101B,Y
105 | 1102C,Y
106 | 1103A,Y
107 | 1104H,Y
108 | 1105H,Y
109 | 1106A,n
110 | 1107E,n
111 | 1108E,Y
112 | 1109G,Y
113 | 1110B,Y
114 | 1111F,n
115 | 1112D,n
116 | 1113B,Y
117 | 1114H,Y
118 | 1115A,Y
119 | 1116B,Y
120 | 1117B,Y
121 | 1118D,n
122 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/modified_test_country_of_birth.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 | from datatest import working_directory
4 | from datatest import Select
5 | from datatest import validate
6 | from datatest import accepted
7 | from datatest import Missing, Extra, Deviation, Invalid
8 |
9 |
10 | # Define fixtures.
11 |
@pytest.fixture(scope='module')
@working_directory(__file__)
def detail():
    """Selector for the detailed country-of-birth records."""
    select = Select('country_of_birth.csv')
    return select
16 |
17 |
@pytest.fixture(scope='module')
@working_directory(__file__)
def summary():
    """Selector for the estimated population totals."""
    select = Select('estimated_totals.csv')
    return select
22 |
23 |
24 | # Begin tests.
25 |
@pytest.mark.mandatory
def test_columns(detail, summary):
    """Fieldnames must match the summary; extra columns are accepted."""
    with accepted(Extra):
        validate(detail.fieldnames, set(summary.fieldnames))
32 |
33 |
def test_state_labels(detail, summary):
    """Labels must match; Jervis Bay Territory is known to be absent."""
    data = detail({'state/territory'})
    requirement = summary({'state/territory'})

    with accepted([Missing('Jervis Bay Territory')]):
        validate(data, requirement)
44 |
45 |
def test_population_format(detail):
    """Population values must be strings of decimal digits."""
    def integer_format(value):  # <- Helper function.
        return str(value).isdecimal()

    validate(detail({'population'}), integer_format)
53 |
54 |
def test_population_sums(detail, summary):
    """Sums must match within 3%, except the known-omitted territory."""
    fields = {'state/territory': 'population'}
    data = detail(fields).sum()
    requirement = summary(fields).sum()

    omitted_territory = accepted({'Jervis Bay Territory': Missing(388)})
    with accepted.percent(0.03) | omitted_territory:
        validate(data, requirement)
65 |
--------------------------------------------------------------------------------
/docs/how-to/negative-matches.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to validate negative matches.
6 | :keywords: datatest, negative match
7 |
8 |
9 | ################################
10 | How to Validate Negative Matches
11 | ################################
12 |
13 | Sometimes you want to check that data is **not** equal to a specific
14 | value. There are a few different ways to perform this type of negative
15 | matching.
16 |
17 |
18 | Helper Function
19 | ===============
20 |
21 | One obvious way to check for a negative match is to define a helper
22 | function that checks for ``!=`` to a given value:
23 |
24 | .. code-block:: python
25 | :linenos:
26 |
27 | from datatest import validate
28 |
29 | data = [...]
30 |
31 | def not_bar(x):
32 | return x != 'bar'
33 |
34 | validate(data, not_bar)
35 |
36 |
37 | Inverted Predicate
38 | ==================
39 |
40 | Datatest provides a :class:`Predicate` class for handling different
41 | kinds of matching. You can invert a Predicate's behavior using the
42 | inversion operator, ``~``:
43 |
44 | .. code-block:: python
45 | :emphasize-lines: 4
46 | :linenos:
47 |
48 | from datatest import validate, Predicate
49 |
50 | data = [...]
51 | validate(data, ~Predicate('bar'))
52 |
53 |
54 | Functional Style
55 | ================
56 |
57 | If you are accustomed to programming in a functional style, you
58 | could perform a negative match using :func:`functools.partial` and
59 | :func:`operator.ne`:
60 |
61 | .. code-block:: python
62 | :emphasize-lines: 6
63 | :linenos:
64 |
65 | from functools import partial
66 | from operator import ne
67 | from datatest import validate
68 |
69 | data = [...]
70 | validate(data, partial(ne, 'bar'))
71 |
72 |
--------------------------------------------------------------------------------
/run-tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #=======================================================================
3 | # FILE: run-tests.sh
4 | # DESCRIPTION: Runs test suite under all supported versions of Python
5 | # and displays failures when encountered.
6 | #=======================================================================
7 |
8 | #-----------------------------------------------------------------------
9 | # Define function (takes command to run as a single argument).
10 | #-----------------------------------------------------------------------
# Run a single command (passed as one argument), echoing a banner to
# stderr first.  Aborts the whole script with the command's own exit
# status if the command fails.
run_command ()
{
    echo "" >&2
    echo "======================================================================" >&2
    echo "$1" >&2
    echo "======================================================================" >&2
    $1 # <- Run command.
    local status=$?  # <- Capture exit status IMMEDIATELY; any later
                     #    command (echo, test, ...) overwrites $?.
    if [ $status -ne 0 ]
    then
        echo "" >&2
        echo "Failed Command: $1" >&2
        echo "" >&2
        # BUG FIX: this used to be `exit $?`, which exited with the
        # status of the preceding echo (always 0), so failures were
        # reported on stderr but the script still exited successfully.
        exit $status # <- EXIT!
    fi
}
26 |
27 | #-----------------------------------------------------------------------
28 | # Run test suite in all supported versions of Python.
29 | #-----------------------------------------------------------------------
30 | run_command "python3.9 -B -m unittest $*"
31 | run_command "python3.8 -B -m unittest $*"
32 | run_command "python3.7 -B -m unittest $*"
33 | run_command "python3.6 -B -m unittest $*"
34 | run_command "python3.5 -B -m unittest $*"
35 | run_command "python3.4 -B -m unittest $*"
36 | #run_command "python3.3 -B -m unittest $*"
37 | #run_command "python3.2 -B -m unittest $*"
38 | #run_command "python3.1 -B tests/discover.py $*"
39 | run_command "python2.7 -B -m unittest discover $*"
40 | run_command "python2.6 -B tests/discover.py $*"
41 |
42 | echo "" >&2
43 | echo "All commands successful." >&2
44 |
--------------------------------------------------------------------------------
/datatest/_excepthook.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | from .validation import ValidationError
4 |
5 |
# Remember the hook in place at import time so exceptions other than
# ValidationError are still handled by the previously installed hook
# (falling back to the interpreter's original hook if none is set).
if sys.excepthook:
    existing_excepthook = sys.excepthook
else:
    existing_excepthook = sys.__excepthook__
10 |
11 |
12 | def _next_is_internal(tb):
13 | """Return True if the next traceback refers to an internal part of
14 | datatest.
15 | """
16 | tb_next = tb.tb_next
17 | if not tb_next:
18 | return False
19 | return (tb_next.tb_frame.f_globals.get('__datatest', False)
20 | or tb_next.tb_frame.f_globals.get('__unittest', False))
21 |
22 |
def excepthook(err_type, err_value, err_traceback):
    """Hide calls internal to datatest for ValidationError instances
    and print traceback and exception to sys.stderr.
    """
    if not issubclass(err_type, ValidationError):
        # Not a datatest error--defer to the previously installed hook.
        return existing_excepthook(err_type, err_value, err_traceback)

    try:
        # Truncate the traceback chain at the first frame whose
        # successor is internal to datatest (or unittest).
        tb = err_traceback
        while tb:
            if _next_is_internal(tb):
                tb.tb_next = None # <- Only settable in 3.7 and newer.
                break
            tb = tb.tb_next

        existing_excepthook(err_type, err_value, err_traceback)

    except (AttributeError, TypeError):
        # In older versions of Python, "tb_next" is a read-only attribute.
        # Trying to set "tb_next" in versions 3.0 through 3.6 will raise an
        # AttributeError whereas versions 2.7 and older will raise a TypeError.
        # Fall back to counting the frames to keep, then print with an
        # explicit `limit` so the same internal frames are omitted
        # without mutating the traceback objects.
        limit = 1
        tb = err_traceback
        while tb:
            if _next_is_internal(tb):
                break
            limit += 1
            tb = tb.tb_next

        import traceback
        traceback.print_exception(err_type, err_value, err_traceback, limit)
55 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/modified_test_country_of_birth_unit.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 | from datatest import working_directory
4 | from datatest import Select
5 | from datatest import DataTestCase
6 | from datatest import mandatory
7 | from datatest import Missing, Extra, Deviation, Invalid
8 |
9 |
10 | # Define fixtures.
11 |
def setUpModule():
    """Create module-level selectors shared by every test case."""
    global detail, summary

    with working_directory(__file__):  # resolve CSVs relative to this file
        detail = Select('country_of_birth.csv')
        summary = Select('estimated_totals.csv')
19 |
20 |
21 | # Begin tests.
22 |
class TestPopulation(DataTestCase):
    """Validate details against the summary, accepting known issues."""

    @mandatory
    def test_columns(self):
        # Fieldnames must match; extra columns in the detail are okay.
        with self.accepted(Extra):
            self.assertValid(detail.fieldnames, set(summary.fieldnames))

    def test_state_labels(self):
        # Labels must match; Jervis Bay Territory is known to be absent.
        data = detail({'state/territory'})
        requirement = summary({'state/territory'})

        with self.accepted([Missing('Jervis Bay Territory')]):
            self.assertValid(data, requirement)

    def test_population_format(self):
        # Population values must be strings of decimal digits.
        def integer_format(value):  # <- Helper function.
            return str(value).isdecimal()

        self.assertValid(detail({'population'}), integer_format)

    def test_population_sums(self):
        # Sums must match within 3%, except the known-omitted territory.
        fields = {'state/territory': 'population'}
        data = detail(fields).sum()
        requirement = summary(fields).sum()

        omitted = self.accepted({'Jervis Bay Territory': Missing(388)})
        with self.acceptedPercent(0.03) | omitted:
            self.assertValid(data, requirement)
61 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_intro1.py:
--------------------------------------------------------------------------------
1 | """Example tests using pytest-style conventions."""
2 |
3 | import re
4 | from datatest import validate
5 |
6 |
def test_using_set():
    """Check for set membership."""
    members = {'A', 'B'}
    validate(['A', 'B', 'A'], members)
14 |
15 |
def test_using_function():
    """Check that function returns True."""
    def is_even(number):
        return number % 2 == 0

    validate([2, 4, 6, 8], is_even)
24 |
25 |
def test_using_type():
    """Check that values are of the given type."""
    values = [0.0, 1.0, 2.0]
    validate(values, float)
31 |
32 |
def test_using_regex():
    """Check that values match the given pattern."""
    pattern = re.compile('[bc]ake')
    validate(['bake', 'cake', 'bake'], pattern)
40 |
41 |
42 | def test_using_string():
43 | """Check that values equal the given string."""
44 | data = ['foo', 'foo', 'foo']
45 |
46 | validate(data, 'foo')
47 |
48 |
49 | def test_using_tuple():
50 | """Check that tuples of values satisfy corresponding tuple of
51 | requirements.
52 | """
53 | data = [('A', 0.0), ('A', 1.0), ('A', 2.0)]
54 |
55 | requirement = ('A', float)
56 |
57 | validate(data, requirement)
58 |
59 |
60 | def test_using_dict():
61 | """Check that values satisfy requirements of matching keys."""
62 | data = {
63 | 'A': 100,
64 | 'B': 200,
65 | 'C': 300,
66 | }
67 | requirement = {
68 | 'A': 100,
69 | 'B': 200,
70 | 'C': 300,
71 | }
72 | validate(data, requirement)
73 |
74 |
75 | def test_using_list():
76 | """Check that the order of values match the required sequence."""
77 | data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
78 |
79 | requirement = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
80 |
81 | validate(data, requirement)
82 |
--------------------------------------------------------------------------------
/docs/discussion/organizing-tests.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: A discussion on organizing a data test suite.
6 | :keywords: data, testing, organizing, incremental, validation
7 |
8 |
9 | #######################
10 | Organizing a Test Suite
11 | #######################
12 |
13 | Unlike unit testing of software, it's oftentimes not possible to check
14 | data properties as independent "units" in isolation. Later tests often
15 | depend on the success of earlier ones. For example, it's not useful
16 | to try to check the datatype of an "account_id" column if there's
17 | no column of that name. And it might not be useful to sum the values
18 | in an "accounts_payable" column when the associated account IDs
19 | contain invalid datatypes.
20 |
21 | Typically, data tests should be run sequentially where broader, general
22 | features are tested first and specific details are tested later (after
23 | their prerequisite tests have passed). This approach is called "top-down,
24 | incremental testing". You can use the following list as a rough guide
25 | of which features to check before others.
26 |
27 |
28 | Order to Check Features
29 | -----------------------
30 |
31 | 1. data is accessible (by loading a file or connecting to a data source
32 | via a fixture)
33 | 2. names of tables or worksheets (if applicable)
34 | 3. names of columns
35 | 4. categorical columns: controlled vocabulary, set membership, etc.
36 | 5. foreign-keys (if applicable)
37 | 6. well-formedness of text values: date formats, phone numbers, etc.
38 | 7. datatypes: int, float, datetime, etc.
39 | 8. constraints: uniqueness, minimum and maximum values, etc.
40 | 9. accuracy of quantitative columns: compare sums, counts, or averages
41 | against known-good values
42 | 10. internal consistency, cross-column comparisons, etc.
43 |
44 |
45 | ..
46 | updating for errors discovered later
47 | don't just fix the data error and move on
48 | instead, devise a test that fails, then fix
49 | the data
50 |
51 |
--------------------------------------------------------------------------------
/docs/_static/test_errors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import re
3 | import datatest
4 |
5 |
class TestExample(datatest.DataTestCase):
    """Examples that deliberately fail in order to demonstrate
    datatest's difference reporting (Missing, Extra, Invalid,
    Deviation).  The invalid data values here are intentional.
    """
    def test_membership_in_set(self):
        data = ['x', 'x2', 'y', 'y', 'z', 'z']
        required_elements = {'x', 'y', 'z'}
        self.assertValid(data, required_elements)

    def test_function_returns_true(self):
        data = ['X', 'X', 'Y', 'y']
        def uppercase(x):
            return x.isupper()
        self.assertValid(data, uppercase)

    def test_regex_matches(self):
        data = ['foo', 'foo', 'foo', 'bar', 'bar', 'xx']
        # Raw string: '\w' in a plain string literal is an invalid
        # escape sequence (DeprecationWarning since Python 3.6).
        three_letters = re.compile(r'^\w\w\w$')
        self.assertValid(data, three_letters)

    def test_equality(self):
        data = ['x', 'x', 'Y']
        other_value = 'x'
        self.assertValid(data, other_value)

    def test_order(self):
        data = ['x', 'X', 'y', 'y', 'z', 'z']
        my_sequence = ['x', 'x', 'y', 'y', 'z', 'z']
        self.assertValid(data, my_sequence)

    def test_mapping1(self):
        data = {
            'x': 'foo',
            'y': 'BAZ',
        }
        required_values = {
            'x': 'foo',
            'y': 'bar',
        }
        self.assertValid(data, required_values)

    def test_mapping2(self):
        data = {
            'x': 11,
            'y': 13,
        }
        required_values = {
            'x': 10,
            'y': 15,
        }
        self.assertValid(data, required_values)

    def test_mapping3(self):
        data = {
            'x': 10,
            'y': 15,
            'z': 3000,
        }
        required_values = {
            'x': 10,
            'y': 15,
            'z': 20,
        }
        self.assertValid(data, required_values)


if __name__ == '__main__':
    datatest.main()
71 |
--------------------------------------------------------------------------------
/docs/how-to/reorder-acceptances.rst:
--------------------------------------------------------------------------------
1 |
2 | .. py:currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to re-order acceptances.
6 | :keywords: datatest, order of operations, acceptance, order
7 |
8 |
9 | ###########################
10 | How to Re-Order Acceptances
11 | ###########################
12 |
13 | Individual acceptances can be combined together to create new acceptances
14 | with narrower or broader criteria (see :ref:`composability-docs`).
15 | When acceptances are combined, their criteria are applied in an order
16 | determined by their scope. Element-wise criteria are applied first,
17 | group-wise criteria are applied second, and whole-error criteria are
18 | applied last (see :ref:`order-of-operations-docs`).
19 |
20 |
21 | Implicit Ordering
22 | -----------------
23 |
24 | In this first example, we have a combined acceptance made from a
25 | whole-error acceptance, :func:`accepted.count`, and a group-wise
26 | acceptance, :func:`accepted([...]) <accepted>`:
27 |
28 | .. code-block:: python
29 | :linenos:
30 | :lineno-start: 21
31 |
32 | with accepted.count(4) | accepted([Missing('A'), Missing('B')]):
33 | ...
34 |
35 | Since the :ref:`order-of-operations-docs` specifies that whole-error
36 | acceptances are applied *after* group-wise acceptances, the
37 | ``accepted.count(4)`` criteria is applied last even though it's
38 | defined first.
39 |
40 |
41 | Explicit Ordering
42 | -----------------
43 |
44 | If you want to control this order explicitly, you can use nested
45 | ``with`` statements to change the default behavior:
46 |
47 | .. code-block:: python
48 | :linenos:
49 | :lineno-start: 21
50 |
51 | with accepted([Missing('A'), Missing('B')]):
52 | with accepted.count(4):
53 | ...
54 |
55 | Using nested ``with`` statements, the inner-most block is applied
56 | first and outer blocks are applied in order until the outer-most
57 | block is applied last. In this example, the ``accepted.count(4)``
58 | is applied first because it's declared in the inner-most block.
59 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_intro2.py:
--------------------------------------------------------------------------------
1 | """Example of failing tests using pytest-style conventions."""
2 |
3 | import re
4 | from datatest import validate
5 | from datatest import accepted
6 |
7 |
def test_using_set():
    """Set requirement: each element must be a member of the set.
    ('C' and 'D' are intentionally extra, so this test fails.)
    """
    data = ['A', 'B', 'C', 'D']
    validate(data, {'A', 'B'})


def test_using_function():
    """Function requirement: the function must return True for each
    element.  (9 is intentionally odd, so this test fails.)
    """
    data = [2, 4, 6, 9]

    def is_even(x):
        return x % 2 == 0

    validate(data, is_even)


def test_using_type():
    """Type requirement: each element must be a float.  (The final
    int is intentional, so this test fails.)
    """
    data = [0.0, 1.0, 2]
    validate(data, float)


def test_using_regex():
    """Regex requirement: each element must match the pattern.
    ('fake' intentionally does not, so this test fails.)
    """
    data = ['bake', 'cake', 'fake']
    validate(data, re.compile('[bc]ake'))


def test_using_string():
    """String requirement: each element must equal the string.
    ('bar' intentionally differs, so this test fails.)
    """
    data = ['foo', 'foo', 'bar']
    validate(data, 'foo')


def test_using_tuple():
    """Tuple requirement: positions are checked pairwise.  (Two of
    the tuples intentionally violate it, so this test fails.)
    """
    data = [('A', 1.0), ('A', 2), ('B', 3.0)]
    validate(data, ('A', float))


def test_using_dict():
    """Dict requirement: values checked under matching keys.  ('C'
    and 'D' intentionally deviate, so this test fails.)
    """
    data = {
        'A': 100,
        'B': 200,
        'C': 299,
        'D': 405,
    }
    requirement = {
        'A': 100,
        'B': 200,
        'C': 300,
        'D': 400,
    }
    validate(data, requirement)


def test_using_list():
    """List requirement: values must appear in the required order.
    (Several values intentionally differ, so this test fails.)
    """
    data = ['A', 'D', 'XXX', 'YYY', 'E', 'ZZZ', 'G']
    validate(data, list('ABCDEFG'))
85 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/modified_country_of_birth.csv:
--------------------------------------------------------------------------------
1 | state/territory,country_of_birth,population
2 | Australian Capital Territory,Australia,270033
3 | Australian Capital Territory,China,11351
4 | Australian Capital Territory,England,12757
5 | Australian Capital Territory,India,10414
6 | Australian Capital Territory,New Zealand,4734
7 | Australian Capital Territory,other/unknown,84310
8 | Australian Capital Territory,Philippines,3798
9 | New South Wales,Australia,4899090
10 | New South Wales,China,234508
11 | New South Wales,England,226564
12 | New South Wales,India,143459
13 | New South Wales,New Zealand,117136
14 | New South Wales,other/unknown,1772722
15 | New South Wales,Philippines,86749
16 | Northern Territory,Australia,157531
17 | Northern Territory,England,5583
18 | Northern Territory,Greece,1268
19 | Northern Territory,India,3598
20 | Northern Territory,New Zealand,4636
21 | Northern Territory,other/unknown,50303
22 | Northern Territory,Philippines,5914
23 | Queensland,Australia,3343657
24 | Queensland,China,47114
25 | Queensland,England,180775
26 | Queensland,India,49145
27 | Queensland,New Zealand,201206
28 | Queensland,other/unknown,841165
29 | Queensland,South Africa,40131
30 | South Australia,Australia,1192546
31 | South Australia,China,24610
32 | South Australia,England,97392
33 | South Australia,India,27594
34 | South Australia,Italy,18544
35 | South Australia,other/unknown,301630
36 | South Australia,Vietnam,14337
37 | Tasmania,Australia,411490
38 | Tasmania,China,3036
39 | Tasmania,England,18776
40 | Tasmania,Netherlands,2193
41 | Tasmania,New Zealand,4977
42 | Tasmania,other/unknown,67210
43 | Tasmania,Scotland,2283
44 | Victoria,Australia,3845493
45 | Victoria,China,160652
46 | Victoria,England,171443
47 | Victoria,India,169802
48 | Victoria,New Zealand,93253
49 | Victoria,other/unknown,1405194
50 | Victoria,Vietnam,80787
51 | Western Australia,Australia,1492842
52 | Western Australia,England,194163
53 | Western Australia,India,49385
54 | Western Australia,New Zealand,79221
55 | Western Australia,other/unknown,586956
56 | Western Australia,Philippines,30835
57 | Western Australia,South Africa,41008
58 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/country_of_birth.csv:
--------------------------------------------------------------------------------
1 | state/territory,country_of_birth,pop
2 | Australian Capital Territory,Australia,270033
3 | Australian Capital Territory,China,11351
4 | Australian Capital Territory,England,12757
5 | Australian Capital Territory,India,10414
6 | Australian Capital Territory,New Zealand,4734
7 | Australian Capital Territory,other/unknown,84310
8 | Australian Capital Territory,Philippines,3798
9 | New South Wales,Australia,4899090
10 | New South Wales,China,234508
11 | New South Wales,England,226564
12 | New South Wales,India,143459
13 | New South Wales,New Zealand,117136
14 | New South Wales,other/unknown,1772722
15 | New South Wales,Philippines,86749
16 | Northern Territory,Australia,157531
17 | Northern Territory,England,5583
18 | Northern Territory,Greece,1268
19 | Northern Territory,India,3598
20 | Northern Territory,New Zealand,4636
21 | Northern Territory,other/unknown,50303
22 | Northern Territory,Philippines,5914
23 | Queensland,Australia,3343657
24 | Queensland,China,47114
25 | Queensland,England,180775
26 | Queensland,India,49145
27 | Queensland,New Zealand,201206
28 | Queensland,other/unknown,841165
29 | Queensland,South Africa,40131
30 | South Australia,Australia,1192546
31 | South Australia,China,24610
32 | South Australia,England,"England,97392"
33 | South Australia,India,27594
34 | South Australia,Italy,18544
35 | South Australia,other/unknown,301630
36 | South Australia,Vietnam,14337
37 | Tasmania,Australia,411490
38 | Tasmania,China,3036
39 | Tasmania,England,18776
40 | Tasmania,Netherlands,2193
41 | Tasmania,New Zealand,4977
42 | Tasmania,other/unknown,67210
43 | Tasmania,Scotland,2283
44 | Tasmania,SUBTOTAL,509965
45 | Victoria,Australia,3845493
46 | Victoria,China,160652
47 | Victoria,England,171443
48 | Victoria,India,169802
49 | Victoria,New Zealand,93253
50 | Victoria,other/unknown,1405194
51 | Victoria,Vietnam,80787
52 | Western Australia,Australia,1492842
53 | Western Australia,England,194163
54 | Western Australia,India,49385
55 | Western Australia,New Zealand,79221
56 | Western Australia,other/unknown,586956
57 | Western Australia,Philippines,30835
58 | Western Australia,South Africa,41008
59 |
--------------------------------------------------------------------------------
/run-tests.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | REM ********************************************************************
3 | REM File: run-tests.bat
4 | REM Description: Runs test suite under all supported versions of Python
5 | REM and displays failures when encountered.
6 | REM ********************************************************************
7 |
8 | GOTO:mainProgram
9 |
10 | REM ********************************************************************
11 | REM Define function (takes command to run as a single argument).
12 | REM ********************************************************************
13 | :runCommand
14 | SETLOCAL & IF %GLOBAL_ERRORLEVEL% NEQ 0 ENDLOCAL & GOTO:EOF
15 | ECHO.
16 | ECHO ======================================================================
17 | ECHO %~1
18 | ECHO ======================================================================
19 | CALL %~fs1
20 | IF %ERRORLEVEL% NEQ 0 (
21 | ECHO.
22 | ECHO Failed Command: %~1
23 | )
24 | ENDLOCAL & SET GLOBAL_ERRORLEVEL=%ERRORLEVEL%
25 | GOTO:EOF
26 |
27 |
28 | REM ********************************************************************
29 | REM Run test suite in all supported versions of Python.
30 | REM ********************************************************************
31 | :mainProgram
32 |
33 | SET GLOBAL_ERRORLEVEL=0
34 |
35 | CALL :runCommand "C:\Program Files\Python37\python.exe -B -m unittest %*"
36 | CALL :runCommand "C:\Program Files\Python 3.6\python.exe -B -m unittest %*"
37 | CALL :runCommand "C:\Program Files\Python 3.5\python.exe -B -m unittest %*"
38 | CALL :runCommand "C:\Python34\python.exe -B -m unittest %*"
39 | CALL :runCommand "C:\Python33\python.exe -B -m unittest %*"
40 | CALL :runCommand "C:\Python32\python.exe -B -m unittest %*"
41 | CALL :runCommand "C:\Python31\python.exe -B tests/discover.py %*"
42 | CALL :runCommand "C:\Python27\python.exe -B -m unittest discover %*"
43 | CALL :runCommand "C:\Python26\python.exe -B tests/discover.py %*"
44 |
45 | IF %GLOBAL_ERRORLEVEL% EQU 0 (
46 | ECHO.
47 | ECHO All commands successful.
48 | )
49 |
--------------------------------------------------------------------------------
/tests/past_api07_sources_excel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | import os
4 | from . import _unittest as unittest
5 | from .mixins import OtherTests
6 | from .mixins import CountTests
7 |
8 | try:
9 | import xlrd
10 | except ImportError:
11 | xlrd = None
12 |
13 | from datatest.__past__.api07_sources import ExcelSource
14 |
15 | workbook_path = os.path.join(
16 | os.path.dirname(__file__),
17 | 'sample_files',
18 | 'test_sources_excel.xlsx',
19 | )
20 |
21 |
@unittest.skipUnless(xlrd, 'requires xlrd')
class TestExcelSource(OtherTests, unittest.TestCase):
    """Run the shared OtherTests mixin against an ExcelSource."""
    def setUp(self):
        # Reading the module-level *workbook_path* needs no `global`
        # declaration (removed as unnecessary).
        self.datasource = ExcelSource(workbook_path)  # <- Defaults to "Sheet 1"
27 |
28 |
@unittest.skipUnless(xlrd, 'requires xlrd')
class TestExcelSourceCount(unittest.TestCase):
    #class TestExcelSourceCount(CountTests, unittest.TestCase):
    """Check `count` behavior against the 'count_data' worksheet."""
    def setUp(self):
        # Reading the module-level *workbook_path* needs no `global`
        # declaration (removed as unnecessary).
        self.datasource = ExcelSource(workbook_path, 'count_data')

    def test_count(self):
        """Count grouped by zero, one, and two columns, and with a
        keyword filter.
        """
        count = self.datasource.count

        self.assertEqual(9, count('label1'))

        expected = {'a': 4, 'b': 5}
        result = count('label1', ['label1'])
        self.assertEqual(expected, result)

        expected = {'a': 3, 'b': 3}  # Counts only truthy values (not '' or None).
        result = count('label2', ['label1'])
        self.assertEqual(expected, result)

        expected = {
            ('a', 'x'): 2,
            ('a', 'y'): 1,
            ('a', ''): 1,
            ('b', 'z'): 1,
            ('b', 'y'): 1,
            ('b', 'x'): 1,
            #('b', None): 1, # <- None value has no equivalent in XLSX file.
            #('b', ''): 1,
            ('b', ''): 2,
        }
        result = count('label1', ['label1', 'label2'])
        self.assertEqual(expected, result)

        expected = {'x': 2, 'y': 1, '': 1}
        result = count('label1', 'label2', label1='a')
        self.assertEqual(expected, result)
66 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_intro1_unit.py:
--------------------------------------------------------------------------------
1 | """Example tests using unittest-style conventions."""
2 |
3 | import re
4 | import datatest
5 |
6 |
class ExampleTests(datatest.DataTestCase):
    """Passing examples of each requirement type (unittest-style)."""

    def test_using_set(self):
        """Set requirement: each element must be a member of the set."""
        data = ['A', 'B', 'A']
        self.assertValid(data, {'A', 'B'})

    def test_using_function(self):
        """Function requirement: the function must return True for
        each element.
        """
        data = [2, 4, 6, 8]

        def is_even(x):
            return x % 2 == 0

        self.assertValid(data, is_even)

    def test_using_type(self):
        """Type requirement: each element must be a float."""
        data = [0.0, 1.0, 2.0]
        self.assertValid(data, float)

    def test_using_regex(self):
        """Regex requirement: each element must match the pattern."""
        data = ['bake', 'cake', 'bake']
        self.assertValid(data, re.compile('[bc]ake'))

    def test_using_string(self):
        """String requirement: each element must equal the string."""
        data = ['foo', 'foo', 'foo']
        self.assertValid(data, 'foo')

    def test_using_tuple(self):
        """Tuple requirement: each position in a data tuple is checked
        against the requirement at the same position.
        """
        data = [('A', 0.0), ('A', 1.0), ('A', 2.0)]
        self.assertValid(data, ('A', float))

    def test_using_dict(self):
        """Dict requirement: each value is checked against the
        requirement under the matching key.
        """
        data = {'A': 100, 'B': 200, 'C': 300}
        self.assertValid(data, {'A': 100, 'B': 200, 'C': 300})

    def test_using_list(self):
        """List requirement: values must appear in the required order."""
        data = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
        self.assertValid(data, list('ABCDEFG'))


if __name__ == '__main__':
    datatest.main()
81 |
--------------------------------------------------------------------------------
/docs/_static/tutorial/test_intro2_unit.py:
--------------------------------------------------------------------------------
1 | """Example of failing tests using unittest-style conventions."""
2 |
3 | import re
4 | import datatest
5 |
6 |
class ExampleTests(datatest.DataTestCase):
    """Failing examples of each requirement type (unittest-style).
    The invalid data values are intentional.
    """

    def test_using_set(self):
        """Set requirement: fails on the extra 'C' and 'D'."""
        data = ['A', 'B', 'C', 'D']
        self.assertValid(data, {'A', 'B'})

    def test_using_function(self):
        """Function requirement: fails on the odd 9."""
        data = [2, 4, 6, 9]

        def is_even(x):
            return x % 2 == 0

        self.assertValid(data, is_even)

    def test_using_type(self):
        """Type requirement: fails on the trailing int."""
        data = [0.0, 1.0, 2]
        self.assertValid(data, float)

    def test_using_regex(self):
        """Regex requirement: fails on 'fake'."""
        data = ['bake', 'cake', 'fake']
        self.assertValid(data, re.compile('[bc]ake'))

    def test_using_string(self):
        """String requirement: fails on 'bar'."""
        data = ['foo', 'foo', 'bar']
        self.assertValid(data, 'foo')

    def test_using_tuple(self):
        """Tuple requirement: fails on ('A', 2) and ('B', 3.0)."""
        data = [('A', 1.0), ('A', 2), ('B', 3.0)]
        self.assertValid(data, ('A', float))

    def test_using_dict(self):
        """Dict requirement: fails on the deviations at 'C' and 'D'."""
        data = {
            'A': 100,
            'B': 200,
            'C': 299,
            'D': 405,
        }
        requirement = {
            'A': 100,
            'B': 200,
            'C': 300,
            'D': 400,
        }
        self.assertValid(data, requirement)

    def test_using_list(self):
        """List requirement: fails where the sequence diverges."""
        data = ['A', 'D', 'XXX', 'YYY', 'E', 'ZZZ', 'G']
        self.assertValid(data, list('ABCDEFG'))


if __name__ == '__main__':
    datatest.main()
82 |
--------------------------------------------------------------------------------
/tests/test_past_subprocesses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Test backwards compatibility modules using separate subprocesses."""
3 | import subprocess
4 | import sys
5 |
6 | from datatest._compatibility import textwrap
7 | from . import _unittest as unittest
8 | from .common import ignore_deprecations
9 |
10 |
@ignore_deprecations
class TestBackwardsCompatibility(unittest.TestCase):
    """Exercise each backward-compatibility API module in a separate
    child process so one API cannot contaminate another.
    """
    def assertSubprocess(self, module):
        """Run given *module* in separate process--fails if return code
        indicates an error.
        """
        command = [sys.executable, '-B', '-O', '-m', module]
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out_bytes, err_bytes = proc.communicate()  # Also closes the pipe objects.

        if proc.returncode == 0:
            return  # Zero return code: command succeeded.

        # Build a readable, indented failure message from all output.
        text = (out_bytes + err_bytes).decode('utf-8')
        wrapped = '\n'.join(textwrap.wrap(text, width=70))
        indented = textwrap.indent(wrapped, '    ')

        self.fail('\n'.join([
            'Subprocess failed:',
            indented,
            '',
            'To run this test directly, use the following command:',
            ' '.join(command),
        ]))

    def test_api00(self):
        """Test compatibility with pre-release alpha API."""
        self.assertSubprocess('tests.past_api00')

    def test_api06(self):
        """Test compatibility with first development-release API."""
        self.assertSubprocess('tests.past_api06')

    def test_api07(self):
        """Test compatibility with second development-release API."""
        self.assertSubprocess('tests.past_api07')

    def test_api08(self):
        """Test compatibility with version 0.8 API."""
        self.assertSubprocess('tests.past_api08')

    def test_api09(self):
        """Test compatibility with version 0.9 API."""
        self.assertSubprocess('tests.past_api09')


if __name__ == '__main__':
    unittest.main()
61 |
--------------------------------------------------------------------------------
/tests/past_api07_error.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from . import _unittest as unittest
3 |
4 | from datatest.__past__.api07_diffs import xMissing
5 | from datatest.__past__.api07_error import DataError
6 |
7 |
class TestDataError(unittest.TestCase):
    """Behavior of the deprecated api07 DataError exception."""

    def test_subclass(self):
        # Must stay catchable as AssertionError for old callers.
        self.assertTrue(issubclass(DataError, AssertionError))

    def test_instantiation(self):
        # Accepts a bare difference, a list, and dicts keyed by
        # strings or tuples.
        for differences in (xMissing('foo'),
                            [xMissing('foo')],
                            {'foo': xMissing('bar')},
                            {('foo', 'bar'): xMissing('baz')}):
            DataError('column names', differences)

        with self.assertRaises(ValueError, msg='Empty error should raise exception.'):
            DataError(msg='', differences={})

    def test_repr(self):
        expected = "DataError: different columns:\n xMissing('foo')"
        # A bare difference and a one-item list must render identically.
        self.assertEqual(repr(DataError('different columns', [xMissing('foo')])), expected)
        self.assertEqual(repr(DataError('different columns', xMissing('foo'))), expected)

        # Lists are pretty-printed one difference per line.
        error = DataError('different columns', [xMissing('foo'), xMissing('bar')])
        expected = ("DataError: different columns:\n"
                    " xMissing('foo'),\n"
                    " xMissing('bar')")
        self.assertEqual(repr(error), expected)

        # Dict differences render as 'KEY': difference.
        error = DataError('different columns', {'FOO': xMissing('bar')})
        expected = ("DataError: different columns:\n"
                    " 'FOO': xMissing('bar')")
        self.assertEqual(repr(error), expected)

    def test_verbose_repr(self):
        error = DataError('different columns', [xMissing('foo')],
                          'subject-data-source', 'reference-data-source')
        error._verbose = True  # <- Set verbose flag, here!

        expected = ("DataError: different columns:\n"
                    " xMissing('foo')\n"
                    "\n"
                    "SUBJECT:\n"
                    "subject-data-source\n"
                    "REQUIRED:\n"
                    "reference-data-source")
        self.assertEqual(repr(error), expected)
58 |
--------------------------------------------------------------------------------
/docs/how-to/get-started.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to get started.
6 | :keywords: datatest, example, getting started
7 |
8 |
9 | ###############################
10 | How to Get Started With Testing
11 | ###############################
12 |
13 | Once you have reviewed the tutorials and have a basic understanding
14 | of datatest, you should be ready to start testing your own data.
15 |
16 |
17 | =========================================
18 | 1. Create a File and Add Some Sample Code
19 | =========================================
20 |
21 | A simple way to get started is to create a **.py** file in the same folder
22 | as the data you want to test. It's a good idea to follow established testing
23 | conventions and make sure your filename starts with "**test\_**".
24 |
25 | Then, copy one of following the **pytest** or **unittest** code samples
26 | to use as a template for writing your own tests:
27 |
28 | .. raw:: html
29 |
30 |     <details>
31 |     <summary>Pytest Samples</summary>
32 |
33 | .. include:: ../intro/automated-testing.rst
34 |     :start-after: start-inclusion-marker-pytestsamples
35 |     :end-before: end-inclusion-marker-pytestsamples
36 |
37 | .. raw:: html
38 |
39 |     </details>
40 |
41 |
42 | .. raw:: html
43 |
44 |     <details>
45 |     <summary>Unittest Samples</summary>
46 |
47 | .. include:: ../intro/automated-testing.rst
48 |     :start-after: start-inclusion-marker-unittestsamples
49 |     :end-before: end-inclusion-marker-unittestsamples
50 |
51 | .. raw:: html
52 |
53 |     </details>
54 |
55 |
56 | ==========================================
57 | 2. Adapt the Sample Code to Suit Your Data
58 | ==========================================
59 |
60 | After copying the sample code into your own file, begin adapting
61 | it to suit your data:
62 |
63 | 1. Change the fixture to use your data (instead of "example.csv").
64 | 2. Update the set in ``test_column_names()`` to require the names your
65 | data should contain (instead of "A", "B", and "C").
66 | 3. Rename ``test_a()`` and change it to check values in one of the
67 | columns in your data.
68 | 4. Add more tests appropriate for your own data requirements.
69 |
70 |
71 | ===================================
72 | 3. Refactor Your Tests as They Grow
73 | ===================================
74 |
75 | As your tests grow, look to structure them into related groups. Start
76 | by creating separate classes to contain groups of related test cases.
77 | And as you develop more and more classes, create separate modules to
78 | hold groups of related classes. If you are using ``pytest``, move your
79 | fixtures into a ``conftest.py`` file.
80 |
--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import glob
3 | import os
4 | import shutil
5 | import sys
6 | import tempfile
7 | import warnings
8 | from functools import wraps
9 |
10 | from . import _io as io
11 | from . import _unittest as unittest
12 |
13 |
class MkdtempTestCase(unittest.TestCase):
    """TestCase that runs each test inside a temporary directory.

    Class-level setup creates the temp directory once; each test runs
    with cwd changed to it, and per-test teardown deletes anything the
    test created before restoring the original cwd.
    """
    @classmethod
    def setUpClass(cls):
        cls._orig_dir = os.getcwd()
        # mkdtemp: the directory persists until we remove it ourselves.
        cls._temp_dir = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        os.rmdir(cls._temp_dir)  # Succeeds only if tearDown emptied it.

    def setUp(self):
        os.chdir(self._temp_dir)

    def tearDown(self):
        # Remove every entry a test left behind, then restore cwd.
        for entry in glob.glob(os.path.join(self._temp_dir, '*')):
            remover = shutil.rmtree if os.path.isdir(entry) else os.remove
            remover(entry)
        os.chdir(self._orig_dir)
36 |
37 |
def ignore_deprecations(obj):
    """A class and function decorator to ignore DeprecationWarnings.

    Given a function, returns a wrapped version that suppresses
    DeprecationWarning while it runs.  Given a class, wraps every
    callable defined directly on the class in place and returns the
    same class object.
    """
    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwds):
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                return func(*args, **kwds)
        return wrapper

    if not isinstance(obj, type):
        return decorate(obj)  # Plain function or method.

    # Class: decorate each callable attribute defined on the class itself.
    for name, attr in obj.__dict__.items():
        if callable(attr):
            setattr(obj, name, decorate(attr))
    return obj
58 |
59 |
try:
    unittest.TestCase.setUpClass  # New in 2.7
except AttributeError:
    # Pre-2.7 unittest has no class-level fixtures, so redefine the
    # class to run setUpClass/tearDownClass around every single test.
    _MkdtempTestCase = MkdtempTestCase
    class MkdtempTestCase(_MkdtempTestCase):
        def setUp(self):
            # `.__func__` unwraps the classmethod so the instance can
            # stand in for the class argument.
            self.setUpClass.__func__(self)
            _MkdtempTestCase.setUp(self)

        def tearDown(self):
            _MkdtempTestCase.tearDown(self)
            self.tearDownClass.__func__(self)
72 |
73 |
def make_csv_file(fieldnames, datarows):
    """Helper function to make CSV file-like object using *fieldnames*
    (a list of field names) and *datarows* (a list of lists containing
    the row values).
    """
    lines = [','.join(fieldnames)]  # Header row first.
    lines.extend(','.join(str(cell) for cell in row) for row in datarows)
    return io.StringIO('\n'.join(lines))
86 |
--------------------------------------------------------------------------------
/tests/test_pandas_integration.py:
--------------------------------------------------------------------------------
1 | """Tests for Pandas accessor extensions."""
2 | from . import _unittest as unittest
3 |
4 | try:
5 | import pandas
6 | except ImportError:
7 | pandas = None
8 |
9 | from datatest import Invalid
10 | from datatest import ValidationError
11 | from datatest import register_accessors
12 |
13 |
@unittest.skipUnless(pandas, 'requires pandas')
class TestAccessorExtensions(unittest.TestCase):
    """Test Pandas accessors."""

    @staticmethod
    def _is_odd(x):
        # Predicate used by the failure-path tests below.
        return x % 2 == 1

    def setUp(self):  # Change to `setUpClass` when dropping
        register_accessors()  # support for Python 2.6 and 3.1.
        self.df = pandas.DataFrame(
            data=[(1, 'x'), (2, 'y'), (3, 'z')],
            columns=['A', 'B'],
        )

    def _make_multiindex(self):
        # Small two-level index shared by the MultiIndex tests.
        return pandas.MultiIndex.from_arrays(
            [[1, 1, 2], ['A', 'B', 'C']],
            names=('number', 'letter'),
        )

    def test_dataframe_success(self):
        self.df.validate((int, str))  # Should pass without error.

    def test_dataframe_failure(self):
        with self.assertRaises(ValidationError) as caught:
            self.df.validate((self._is_odd, str))

        self.assertEqual(caught.exception.differences, [Invalid((2, 'y'))])

    def test_series_success(self):
        # Should pass without error on success.
        self.df.columns.validate.order(['A', 'B'])  # Columns are a Series
        self.df['A'].validate(int)  # A selected Series of column values.

    def test_series_failure(self):
        with self.assertRaises(ValidationError) as caught:
            self.df['A'].validate(self._is_odd)

        self.assertEqual(caught.exception.differences, [Invalid(2)])

    def test_index_success(self):
        self.df.index.validate(int)  # Should pass without error.

    def test_index_failure(self):
        with self.assertRaises(ValidationError) as caught:
            self.df.index.validate(self._is_odd)

        self.assertEqual(
            caught.exception.differences, [Invalid(0), Invalid(2)])

    def test_multiindex_success(self):
        self._make_multiindex().validate((int, str))  # Should pass.

    def test_multiindex_failure(self):
        with self.assertRaises(ValidationError) as caught:
            self._make_multiindex().validate((self._is_odd, str))

        self.assertEqual(caught.exception.differences, [Invalid((2, 'C'))])
80 |
--------------------------------------------------------------------------------
/docs/_ext/autodoc_classinstance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """Sphinx Extension: autodoc_classinstance (written by Shawn Brown)."""
4 | from sphinx.domains.python import PyClasslike
5 | from sphinx.ext.autodoc import ClassDocumenter
6 | from sphinx.ext.autodoc import MethodDocumenter
7 | from sphinx.util import inspect
8 |
9 |
class PyClassInstance(PyClasslike):
    """
    Description of a class-instance object.

    Same rendering as the standard class-like directive except that
    the signature is not prefixed (see get_signature_prefix below).
    """
    def get_signature_prefix(self, sig):
        return '' # Omit "class" prefix for instances.
16 |
17 |
class ClassInstanceDocumenter(ClassDocumenter):
    """
    Specialized Documenter subclass for class instances.

    Documents an instance object with class-style markup, taking its
    signature from the instance's ``__call__`` method when it has one.
    """
    objtype = 'classinstance'

    @classmethod
    def can_document_member(cls, member, membername, isattr, parent):
        # Handle anything that is NOT a class (i.e., instance objects).
        return not isinstance(member, type)

    def import_object(self):
        ret = super().import_object()
        self.doc_as_attr = False # never document as a data/attribute
        return ret

    def format_args(self):
        # for instances, the relevant signature is the __call__ method's
        callmeth = self.get_attr(self.object, '__call__', None)
        if callmeth:
            # NOTE(review): sphinx.util.inspect.Signature with the
            # bound_method/has_retval arguments is an older Sphinx
            # API--confirm it exists on the pinned Sphinx version.
            sig = inspect.Signature(callmeth, bound_method=True, has_retval=True)
            return sig.format_args()
        return None
40 |
41 |
class AlternateMethodDocumenter(MethodDocumenter):
    """
    Alternative documenter for methods of classes and class instances.

    Behaves exactly like MethodDocumenter when the parent is a class.
    When the parent is an instance, it bypasses MethodDocumenter's own
    header logic and re-adds only the ``:async:`` tag itself.
    """
    def add_directive_header(self, sig):
        if isinstance(self.parent, type):
            # If parent is a class definition, then add header as normal.
            super(AlternateMethodDocumenter, self).add_directive_header(sig)
        else:
            # When parent is an instance, then add a special header
            # (calls superclass' superclass method).
            super(MethodDocumenter, self).add_directive_header(sig)

            # Tag async methods but do not tag abstract, class, or
            # static methods.
            # Look the attribute up on the instance's class so wrapped
            # descriptors are seen; fall back to the bound object.
            parentclass = self.parent.__class__
            obj = parentclass.__dict__.get(self.object_name, self.object)
            if inspect.iscoroutinefunction(obj):
                sourcename = self.get_sourcename()
                self.add_line(' :async:', sourcename)
62 |
63 |
def setup(app):
    """Register the directive and documenters with the Sphinx *app*."""
    for directive_name in ('classinstance', 'py:classinstance'):
        app.add_directive(directive_name, PyClassInstance)
    app.add_autodocumenter(ClassInstanceDocumenter)

    # If sphinx.ext.autosummary is used, it overrides the existing
    # autodocumenters on the 'builder-inited' event. Registering
    # AlternateMethodDocumenter on the later 'env-before-read-docs'
    # event guarantees it survives that override.
    def register_method_documenter(app, env, docnames):
        app.add_autodocumenter(AlternateMethodDocumenter)
    app.connect('env-before-read-docs', register_method_documenter)
76 |
--------------------------------------------------------------------------------
/datatest/__past__/api09.py:
--------------------------------------------------------------------------------
1 | """Backward compatibility for version 0.9 API."""
2 | from __future__ import absolute_import
3 |
4 | import datatest
5 | from datatest._compatibility.collections.abc import Mapping
6 | from datatest._compatibility.collections.abc import Set
7 | from datatest._normalize import normalize
8 | from datatest._utils import IterItems
9 |
10 |
class RequiredSubset_090(datatest.requirements.GroupRequirement):
    """Implements inverted subset behavior from 0.9.x API."""
    def __init__(self, requirement):
        # Coerce arbitrary iterables to a set for O(1) membership.
        if not isinstance(requirement, Set):
            requirement = set(requirement)
        self._set = requirement

    def check_group(self, group):
        """Return ``(differences, description)`` for *group*: one
        Missing difference for each required element that does not
        appear in *group*.
        """
        missing = self._set.copy()
        for element in group:
            if not missing:
                break  # Short-circuit once every element is found.
            missing.discard(element)

        # Fix: `Missing` is not imported at module level--qualify it
        # through the datatest namespace to avoid a NameError when
        # the generator is consumed.
        differences = (datatest.Missing(element) for element in missing)
        description = 'must contain all elements of given requirement'
        return differences, description
28 |
29 |
class RequiredSuperset_090(datatest.requirements.GroupRequirement):
    """Implements inverted superset behavior from 0.9.x API."""

    def __init__(self, requirement):
        # Coerce arbitrary iterables to a set for O(1) membership.
        if not isinstance(requirement, Set):
            requirement = set(requirement)
        self._set = requirement

    def check_group(self, group):
        """Return ``(differences, description)`` for *group*: one
        Extra difference for each element of *group* that is not in
        the required superset.
        """
        superset = self._set
        extras = set()
        for element in group:
            if element not in superset:
                extras.add(element)

        # Fix: `Extra` is not imported at module level--qualify it
        # through the datatest namespace to avoid a NameError when
        # the generator is consumed.
        differences = (datatest.Extra(element) for element in extras)
        description = 'may only contain elements of given requirement'
        return differences, description
48 |
49 |
50 |
class ValidateType(datatest.validation.ValidateType):
    def subset(self, data, requirement, msg=None):
        """Implements API 0.9.x subset behavior."""
        __tracebackhide__ = datatest.validation._pytest_tracebackhide

        normalized = normalize(
            requirement, lazy_evaluation=False, default_type=set)

        # Mappings get one RequiredSubset_090 per key; everything
        # else is wrapped directly.
        if isinstance(normalized, (Mapping, IterItems)):
            required = datatest.requirements.RequiredMapping(
                normalized, RequiredSubset_090)
        else:
            required = RequiredSubset_090(normalized)

        self(data, required, msg=msg)

    def superset(self, data, requirement, msg=None):
        """Implements API 0.9.x superset behavior."""
        __tracebackhide__ = datatest.validation._pytest_tracebackhide

        normalized = normalize(
            requirement, lazy_evaluation=False, default_type=set)

        # Mappings get one RequiredSuperset_090 per key; everything
        # else is wrapped directly.
        if isinstance(normalized, (Mapping, IterItems)):
            required = datatest.requirements.RequiredMapping(
                normalized, RequiredSuperset_090)
        else:
            required = RequiredSuperset_090(normalized)

        self(data, required, msg=msg)


datatest.validate = ValidateType()
82 |
--------------------------------------------------------------------------------
/datatest/__past__/api00.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Backwards compatibility for version 0.6.0.dev0 API."""
3 | from __future__ import absolute_import
4 | import datatest
5 | from datatest.__past__ import api08
6 | from datatest.__past__ import api07
7 | from datatest.__past__ import api06
8 | from datatest import DataTestCase
9 |
# Map the 0.6.0.dev0 names onto their later equivalents so old test
# suites keep working unchanged.
datatest.DataAssertionError = datatest.__past__.api07_error.DataError

# Acceptances.
DataTestCase.allowSpecified = DataTestCase.allowOnly
DataTestCase.allowUnspecified = DataTestCase.allowAny
DataTestCase.allowDeviationPercent = DataTestCase.allowPercentDeviation

# Assertions.
from .api06 import _assertDataCount
DataTestCase.assertValueCount = _assertDataCount

DataTestCase.assertColumnSet = DataTestCase.assertSubjectColumns
DataTestCase.assertValueSet = DataTestCase.assertSubjectSet
DataTestCase.assertValueSum = DataTestCase.assertSubjectSum
DataTestCase.assertValueRegex = DataTestCase.assertSubjectRegex
DataTestCase.assertValueNotRegex = DataTestCase.assertSubjectNotRegex
26 |
27 |
def _assertColumnSubset(self, ref=None, msg=None):
    """Test that the set of subject columns is a subset of reference
    columns. If *ref* is provided, it is used in place of the set
    from ``referenceData``.
    """
    try:
        # First try the strict set comparison.
        self.assertColumnSet(ref, msg)
    except datatest.DataAssertionError:
        # A subset may legitimately lack columns, so re-run the
        # comparison while allowing Missing differences.
        with self.allowMissing():
            self.assertColumnSet(ref, msg)

DataTestCase.assertColumnSubset = _assertColumnSubset
40 |
41 |
def _assertColumnSuperset(self, ref=None, msg=None):
    """Test that the set of subject columns is a superset of reference
    columns. If *ref* is provided, it is used in place of the set
    from ``referenceData``.
    """
    try:
        # First try the strict set comparison.
        self.assertColumnSet(ref, msg)
    except datatest.DataAssertionError:
        # A superset may legitimately have extra columns, so re-run
        # the comparison while allowing Extra differences.
        with self.allowExtra():
            self.assertColumnSet(ref, msg)

DataTestCase.assertColumnSuperset = _assertColumnSuperset
54 |
55 |
def _assertValueSubset(self, column, ref=None, msg=None, **filter_by):
    """Test that the set of subject values is a subset of reference
    values for the given *column*. If *ref* is provided, it is used
    in place of the set from ``referenceData``.
    """
    try:
        # First try the strict set comparison.
        self.assertValueSet(column, ref, msg, **filter_by)
    except datatest.DataAssertionError:
        # A subset may legitimately lack values, so re-run the
        # comparison while allowing Missing differences.
        with self.allowMissing():
            self.assertValueSet(column, ref, msg, **filter_by)

DataTestCase.assertValueSubset = _assertValueSubset
68 |
69 |
def _assertValueSuperset(self, column, ref=None, msg=None, **filter_by):
    """Test that the set of subject values is a superset of reference
    values for the given *column*. If *ref* is provided, it is used
    in place of the set from ``referenceData``.
    """
    try:
        # First try the strict set comparison.
        self.assertValueSet(column, ref, msg, **filter_by)
    except datatest.DataAssertionError:
        # A superset may legitimately have extra values, so re-run
        # the comparison while allowing Extra differences.
        with self.allowExtra():
            self.assertValueSet(column, ref, msg, **filter_by)

DataTestCase.assertValueSuperset = _assertValueSuperset
82 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | :tocdepth: 2
2 |
3 | .. meta::
4 | :description: Datatest introduction and table of contents.
5 | :keywords: data cleaning, data quality, etl testing, data validation, data testing, data preparation, python, datatest
6 | :title: Datatest: Test driven data-wrangling and data validation.
7 |
8 | .. module:: datatest
9 | :synopsis: Test driven data-wrangling and data validation.
10 | .. moduleauthor:: Shawn Brown
11 | .. sectionauthor:: Shawn Brown
12 |
13 |
14 | ########################################################
15 | Datatest: Test driven data-wrangling and data validation
16 | ########################################################
17 |
18 |
19 | .. include:: ../README.rst
20 | :start-after: start-inclusion-marker-badge-substitutions
21 | :end-before: end-inclusion-marker-badge-substitutions
22 |
23 | |licensebadge| |pythonbadge| |requiresbadge| |releasebadge| |repobadge|
24 |
25 |
26 | Datatest helps to speed up and formalize data-wrangling and data
27 | validation tasks. It was designed to work with poorly formatted
28 | data by detecting and describing validation failures.
29 |
30 | * |Validate| the format, type, set membership, and more from a variety of data
31 | sources including pandas ``DataFrames`` and ``Series``, NumPy ``ndarrays``,
32 | built-in data structures, etc.
33 | * Smart |comparison behavior| applies the appropriate validation method for
34 | a given data requirement.
35 | * Automatic |data handling| manages the validation of single elements,
36 | sequences, sets, dictionaries, and other containers of elements.
37 | * |Difference objects| characterize the discrepancies and deviations
38 | between a dataset and its requirements.
39 | * |Acceptance managers| distinguish between ideal criteria and acceptable
40 | differences.
41 |
42 | .. |Validate| replace:: :ref:`Validate `
43 | .. |comparison behavior| replace:: :ref:`comparison behavior `
44 | .. |data handling| replace:: :ref:`data handling `
45 | .. |Difference objects| replace:: :ref:`Difference objects `
46 | .. |Acceptance managers| replace:: :ref:`Acceptance managers `
47 |
48 |
49 | **Test driven data-wrangling** is a process for taking data from a source
50 | of unverified quality or format and producing a verified, well-formatted
51 | dataset. It repurposes software testing practices for data preparation
52 | and quality assurance projects. **Pipeline validation** monitors the status
53 | and quality of data as it passes through a pipeline and identifies *where*
54 | in a pipeline an error occurs.
55 |
56 | See the project `README `_ file for
57 | full details regarding supported versions, backward compatibility, and
58 | more.
59 |
60 |
61 | =================
62 | Table of Contents
63 | =================
64 |
65 | .. toctree::
66 | :caption: Documentation
67 | :hidden:
68 |
   Home <self>
70 |
71 |
72 | .. toctree::
73 | :maxdepth: 2
74 |
75 | intro/index
76 | how-to/index
77 | reference/index
78 | discussion/index
79 |
80 | ..
81 | OMIT UNFINISHED PAGES:
82 | tutorial/index
83 |
84 |
--------------------------------------------------------------------------------
/datatest/_working_directory.py:
--------------------------------------------------------------------------------
1 | """working_directory context manager."""
2 |
3 | import os
4 | from ._compatibility import contextlib
5 |
6 |
class working_directory(contextlib.ContextDecorator):
    """A context manager to temporarily set the working directory
    to a given *path*. If *path* specifies a file, the file's
    directory is used. When exiting the with-block, the working
    directory is automatically changed back to its previous
    location.

    **Context Manager:**

    You can use Python's :py:obj:`__file__` constant to load data
    relative to a file's current directory:

    .. code-block:: python
        :emphasize-lines: 4

        from datatest import working_directory
        import pandas as pd

        with working_directory(__file__):
            my_df = pd.read_csv('myfile.csv')

    **Decorator:**

    This context manager can also be used as a decorator:

    .. code-block:: python
        :emphasize-lines: 4

        from datatest import working_directory
        import pandas as pd

        @working_directory(__file__)
        def my_df():
            return pd.read_csv('myfile.csv')

    **Explicit Control:**

    In some cases, you may want to forgo the use of a context manager
    or decorator. You can explicitly control directory switching with
    the ``change()`` and ``revert()`` methods:

    .. code-block:: python
        :emphasize-lines: 4,8

        from datatest import working_directory

        work_dir = working_directory(__file__)
        work_dir.change()

        ...

        work_dir.revert()
    """
    def __init__(self, path):
        if os.path.isfile(path):
            path = os.path.dirname(path)
        self._working_dir = os.path.abspath(path)
        self._original_dir = None  # Assigned on __enter__(), not before.

    def __enter__(self):
        if self._original_dir:
            # A non-None original dir means we are already inside the
            # context--re-entering would lose the true original path.
            msg = 'cannot reenter {0}, already entered from {1!r}'.format(
                self.__class__.__name__,
                self._original_dir,
            )
            raise RuntimeError(msg)

        self._original_dir = os.path.abspath(os.getcwd())
        os.chdir(self._working_dir)
        # Fix: return the manager itself so `with working_directory(p)
        # as wd:` binds the manager instead of None (backward
        # compatible--plain `with` statements are unaffected).
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self._original_dir:
            os.chdir(self._original_dir)
        self._original_dir = None  # Marks the context as exited.

    def change(self):
        """Change to the defined working directory (enter the context).

        While operating in a working directory context, you cannot
        enter it again. Calling ``change()`` a second time will raise
        a :py:class:`RuntimeError`---you must first call ``revert()``.
        """
        self.__enter__()

    def revert(self):
        """Revert to the original working directory (exit the context).

        If no context has been entered, calling ``revert()`` will do
        nothing and pass without error.
        """
        self.__exit__(None, None, None)
98 |
--------------------------------------------------------------------------------
/docs/discussion/data-preparation.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :description: A discussion about the need for a structured approach
4 | to data preparation and data-wrangling.
5 | :keywords: data preparation, test driven, data-wrangling, structured,
6 | data science
7 |
8 |
9 | ################
10 | Data Preparation
11 | ################
12 |
13 | In the practice of data science, data preparation is a huge part of
14 | the job. Practitioners often spend 50 to 80 percent of their time
15 | wrangling data [#f1]_ [#f2]_ [#f3]_ [#f4]_. This critically important
16 | phase is time-consuming, unglamorous, and often poorly structured.
17 |
18 | The :mod:`datatest` package was created to support test driven
19 | data-wrangling and provide a disciplined approach to an otherwise
20 | messy process.
21 |
22 | A datatest suite can facilitate quick edit-test cycles to help guide
23 | the selection, cleaning, integration, and formatting of data. Data tests
24 | can also help to automate check-lists, measure progress, and promote
25 | best practices.
26 |
27 |
28 | **************************
29 | Test Driven Data-Wrangling
30 | **************************
31 |
32 | When data is messy, poorly structured, or uses an incompatible format,
33 | it's oftentimes not possible to prepare it using an automated process.
There are a multitude of ways for messy data to confound a processing
35 | system or schema. Dealing with data like this requires a data-wrangling
36 | approach where users are actively involved with making decisions and
37 | judgment calls about cleaning and formatting the data.
38 |
39 | A well-structured suite of data tests can serve as a template to guide
40 | the data-wrangling process. Using a quick edit-test cycle, users can:
41 |
42 | 1. focus on a failing test
2. make a change to the data or the test
44 | 3. re-run the suite to check that the test now passes
45 | 4. then, move on to the next failing test
46 |
47 | The work of cleaning and formatting data takes place outside of the
datatest package itself. Users can work with the tools they find
49 | the most productive (Excel, `pandas `_, R,
50 | sed, etc.).
51 |
52 |
53 | .. rubric:: Footnotes
54 |
55 | .. [#f1] "Data scientists, according to interviews and expert estimates, spend
56 | from 50 percent to 80 percent of their time mired in this more mundane
57 | labor of collecting and preparing unruly digital data..." Steve Lohraug
58 | in *For Big-Data Scientists, 'Janitor Work' Is Key Hurdle to Insights*.
59 | Retrieved from http://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html
60 |
61 | .. [#f2] "This [data preparation step] has historically taken the largest part
62 | of the overall time in the data mining solution process, which in some
63 | cases can approach 80% of the time." *Dynamic Warehousing: Data Mining
64 | Made Easy* (p. 19)
65 |
66 | .. [#f3] Online poll of data mining practitioners: `See image <../_static/data_prep_poll.png>`_,
67 | *Data preparation (Oct 2003)*.
68 | Retrieved from http://www.kdnuggets.com/polls/2003/data_preparation.htm
69 | [While this poll is quite old, the situation has not changed
70 | drastically.]
71 |
72 | .. [#f4] "As much as 80% of KDD is about preparing data, and the remaining 20%
73 | is about mining." *Data Mining for Design and Manufacturing* (p. 44)
74 |
--------------------------------------------------------------------------------
/datatest/__past__/load_csv.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import warnings
3 | from .._utils import exhaustible
4 | from .._utils import seekable
5 | from .._utils import file_types
6 | from .get_reader import get_reader
7 | from .temptable import load_data
8 | from .temptable import savepoint
9 |
10 |
11 | preferred_encoding = 'utf-8'
12 | fallback_encoding = ['latin-1']
13 |
14 |
def load_csv(cursor, table, csvfile, encoding=None, **kwds):
    """Load *csvfile* and insert data into *table*.

    If *encoding* is given, decode with it and let any decode error
    propagate (no fallback recovery). If omitted, first try the
    module-level *preferred_encoding*; on failure, try each entry in
    *fallback_encoding* in turn, warning when a fallback succeeds.
    Every attempt runs inside a savepoint so a failed load leaves
    *table* unchanged.
    """
    # NOTE: preferred_encoding/fallback_encoding are only *read* here,
    # so no `global` declarations are needed.
    default = kwds.get('restval', '')  # Used for default column value.

    if encoding:
        # When an encoding is specified, use it to load *csvfile* or
        # fail if there are errors (no fallback recovery):
        with savepoint(cursor):
            reader = get_reader.from_csv(csvfile, encoding, **kwds)
            load_data(cursor, table, reader, default=default)

        return  # <- EXIT!

    # When the encoding is unspecified, try to load *csvfile* using the
    # preferred encoding and failing that, try the fallback encodings:

    if isinstance(csvfile, file_types) and seekable(csvfile):
        position = csvfile.tell()  # Get current position if
    else:                          # csvfile is file-like and
        position = None            # supports random access.

    try:
        with savepoint(cursor):
            reader = get_reader.from_csv(csvfile, preferred_encoding, **kwds)
            load_data(cursor, table, reader, default=default)

        return  # <- EXIT!

    except UnicodeDecodeError as orig_error:
        # Fix: this whole recovery section must live INSIDE the except
        # clause. Python 3 deletes the `as orig_error` binding when the
        # clause exits, so the previously dedented code raised a
        # NameError instead of performing fallback recovery.
        if exhaustible(csvfile) and position is None:
            # Non-seekable stream already partially consumed--cannot
            # rewind to retry, so fail with an explanatory message.
            encoding, object_, start, end, reason = orig_error.args  # Unpack args.
            reason = (
                '{0}: unable to load {1!r}, cannot attempt fallback with '
                '{2!r} type: must specify an appropriate text encoding'
            ).format(reason, csvfile, csvfile.__class__.__name__)
            raise UnicodeDecodeError(encoding, object_, start, end, reason)

        if isinstance(fallback_encoding, list):
            fallback_list = fallback_encoding
        else:
            fallback_list = [fallback_encoding]

        for fallback in fallback_list:
            if position is not None:
                csvfile.seek(position)  # Rewind before each retry.

            try:
                with savepoint(cursor):
                    reader = get_reader.from_csv(csvfile, fallback, **kwds)
                    load_data(cursor, table, reader, default=default)

                msg = (
                    '{0}: loaded {1!r} using fallback {2!r}: specify an '
                    'appropriate text encoding to assure correct operation'
                ).format(orig_error, csvfile, fallback)
                warnings.warn(msg)

                return  # <- EXIT!

            except UnicodeDecodeError:
                pass  # Try the next fallback encoding.

        # Note: DO NOT refactor this section using a for-else. I swear...
        encoding, object_, start, end, reason = orig_error.args  # Unpack args.
        reason = (
            '{0}: unable to load {1!r}, fallback recovery unsuccessful: '
            'must specify an appropriate text encoding'
        ).format(reason, csvfile)
        raise UnicodeDecodeError(encoding, object_, start, end, reason)
87 |
--------------------------------------------------------------------------------
/tests/past_api07_sources_pandas.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from . import _unittest as unittest
3 | from .mixins import CountTests
4 | from .mixins import OtherTests
5 |
6 | from datatest.__past__.api07_sources import PandasSource
7 | from datatest.__past__.api07_sources import _version_info
8 |
9 |
10 | ########################################################################
11 | # Test version parsing and import ``pandas`` if available.
12 | ########################################################################
class TestVersionInfo(unittest.TestCase):
    def test_public_version(self):
        # A plain public release string becomes a tuple of ints.
        self.assertEqual(_version_info('0.19.2'), (0, 19, 2))

    def test_local_version(self):
        """Version items after a "+" are considered "local" version
        identifiers (see PEP 440).
        """
        result = _version_info('0.19.2+0.g825876c.dirty')
        self.assertEqual(result, (0, 19, 2, 0, 'g825876c', 'dirty'))
26 |
27 |
try:
    import pandas
    # Minimum supported versions: pandas 0.13.0 with numpy 1.7.1.
    if (_version_info(pandas) < (0, 13, 0)
            or _version_info(pandas.np) < (1, 7, 1)):
        raise ImportError
except (ImportError, AttributeError):
    # Fix: `pandas.np` was removed in newer pandas releases and raises
    # AttributeError (not ImportError); treat that the same as pandas
    # being unavailable so the skip decorators below take effect
    # instead of an import-time crash.
    pandas = None
34 | pandas = None
35 |
36 |
37 | ########################################################################
38 | # Test with DataFrame with no specified index (using default indexing).
39 | ########################################################################
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSource(OtherTests, unittest.TestCase):
    def setUp(self):
        # DataFrame with the default range index.
        frame = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        self.datasource = PandasSource(frame)
45 |
46 |
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceCount(CountTests, unittest.TestCase):
    def setUp(self):
        # DataFrame with the default range index.
        frame = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        self.datasource = PandasSource(frame)
52 |
53 |
54 | ########################################################################
55 | # Test with DataFrame that has a specified index.
56 | ########################################################################
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceWithIndex(OtherTests, unittest.TestCase):
    def setUp(self):
        frame = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        # Use a compound index rather than the default range index.
        self.datasource = PandasSource(frame.set_index(['label1', 'label2']))
63 |
64 |
@unittest.skipUnless(pandas, 'requires pandas 0.13 or newer')
class TestPandasSourceWithIndexCount(CountTests, unittest.TestCase):
    # Count behavior for a DataFrame using a compound (two-column) index.
    def setUp(self):
        df = pandas.DataFrame(self.testdata, columns=self.fieldnames)
        df = df.set_index(['label1', 'label2']) # <- Specify index!
        self.datasource = PandasSource(df)

    def test_compound_keys(self):
        # Expected counts keyed by (label1, label2) tuples.
        expected = {
            ('a', 'x'): 2,
            ('a', 'y'): 1,
            ('a', ''): 1,
            ('b', 'z'): 1,
            ('b', 'y'): 1,
            ('b', 'x'): 1,
            #('b', None): 1,
            # NOTE(review): `pandas.np` was removed in newer pandas
            # releases--confirm this attribute still resolves on the
            # pinned pandas version. The comparison below also relies
            # on the result using this same nan object as its key.
            ('b', pandas.np.nan): 1, # <- Returns nan instead of None (and that's OK!).
            ('b', ''): 1,
        }
        result = self.datasource.count('label1', ['label1', 'label2'])
        self.assertEqual(expected, result)
86 |
--------------------------------------------------------------------------------
/docs/how-to/date-time-str.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to validate date formats.
6 | :keywords: datatest, date format, validate, validation
7 |
8 |
9 | #####################################
10 | How to Validate Date and Time Strings
11 | #####################################
12 |
13 | To validate date and time formats, we can define a helper function that
14 | uses `strftime codes`_ to check for matching strings.
15 |
16 | In the following example, we use the code ``%Y-%m-%d`` to check for
17 | dates that match the pattern YYYY-MM-DD:
18 |
19 | .. code-block:: python
20 | :emphasize-lines: 17
21 | :linenos:
22 |
23 | from datetime import datetime
24 | from datatest import validate
25 |
26 |
27 | def strftime_format(format):
28 | def func(value):
29 | try:
30 | datetime.strptime(value, format)
31 | except ValueError:
32 | return False
33 | return True
34 | func.__doc__ = f'should use date format {format}'
35 | return func
36 |
37 |
38 | data = ['2020-02-29', '03-17-2021', '2021-02-29', '2021-04-01']
39 | validate(data, strftime_format('%Y-%m-%d'))
40 |
41 |
42 | Date strings that don't match the required format are flagged as
43 | :class:`Invalid`:
44 |
45 | .. code-block:: none
46 |
47 | Traceback (most recent call last):
     File "example.py", line 17, in <module>
49 | validate(data, strftime_format('%Y-%m-%d'))
50 | datatest.ValidationError: should use date format %Y-%m-%d (2 differences): [
51 | Invalid('03-17-2021'),
52 | Invalid('2021-02-29'),
53 | ]
54 |
55 | Above, the date ``03-17-2021`` is invalid because it's not well-formed
56 | and ``2021-02-29`` is invalid because 2021 is not a leap-year so the last
57 | day of February is the 28th---there is no February 29th in that calendar
58 | year.
59 |
60 |
61 | Strftime Codes for Common Formats
62 | =================================
63 |
64 | You can use the following **format codes** with the function
65 | defined earlier to validate many common date and time formats
66 | (e.g., ``strftime_format('%d %B %Y')``):
67 |
68 | ======================== ========================= ========================
69 | format codes description example
70 | ======================== ========================= ========================
71 | ``%Y-%m-%d`` YYYY-MM-DD 2021-03-17
72 | ``%m/%d/%Y`` MM/DD/YYYY 3/17/2021
73 | ``%d/%m/%Y`` DD/MM/YYYY 17/03/2021
74 | ``%d.%m.%Y`` DD.MM.YYYY 17.03.2021
75 | ``%d %B %Y`` DD Month YYYY 17 March 2021
76 | ``%b %d, %Y`` Mnth DD, YYYY Mar 17, 2021
77 | ``%a %b %d %H:%M:%S %Y`` WkDay Mnth DD H:M:S YYYY Wed Mar 17 19:42:50 2021
78 | ``%I:%M %p`` 12-hour time 7:42 PM [1]_
79 | ``%H:%M:%S`` 24-hour time with seconds 19:42:50
80 | ======================== ========================= ========================
81 |
82 | In Python's :py:mod:`datetime` module, see `strftime() and strptime() Format Codes`_
83 | for all supported codes.
84 |
85 | .. _`strftime codes`: https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes
86 | .. _`strftime() and strptime() Format Codes`: https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes
87 |
88 |
89 | .. rubric:: Footnotes
90 |
91 | .. [1] The code ``%p`` expects the system locale's equivalent of AM or PM.
92 | For example, the locale ``en_US`` uses "AM" and "PM" while the locale
93 | ``de_DE`` uses "am" and "pm".
94 |
--------------------------------------------------------------------------------
/tests/test_runner.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | from . import _unittest as unittest
4 | from datatest import DataTestCase
5 | from datatest import ValidationError
6 | from datatest import Missing
7 |
8 | from datatest.runner import DataTestResult
9 | from datatest.runner import mandatory
10 | from datatest.runner import _sort_key
11 |
12 |
class TestDataTestResult(unittest.TestCase):
    """Tests for DataTestResult's mandatory-test handling."""

    def test_is_mandatory(self):
        result = DataTestResult()

        class _TestClass(DataTestCase):  # Dummy class.
            def test_method(_self):
                pass

            def runTest(_self):
                pass

        # A plain test case is not mandatory.
        self.assertFalse(result._is_mandatory(_TestClass()))

        # An instance flagged with __datatest_mandatory__ is mandatory.
        flagged_case = _TestClass()
        flagged_case.__datatest_mandatory__ = True
        self.assertTrue(result._is_mandatory(flagged_case))

        # Mandatory method.
        #TODO!!!: Need to make this test.

        # Objects that are not test cases are never mandatory.
        self.assertFalse(result._is_mandatory(object()))

    def test_add_mandatory_message(self):
        result = DataTestResult()

        original_tuple = (
            ValidationError,
            ValidationError([Missing('x')], 'example failure'),
            '',
        )

        _, error, _ = result._add_mandatory_message(original_tuple)
        self.assertRegex(str(error), 'mandatory test failed, stopping early')
50 |
51 |
class TestOrdering(unittest.TestCase):
    def test_sort_key(self):
        # NOTE: The "+N" comments inside SampleCase record each line's
        # offset from test_reference()--the assertions below depend on
        # these exact offsets, so do not add or remove lines inside the
        # class body.

        # Define and instantiate sample case.
        class SampleCase(unittest.TestCase):
            def test_reference(self):  # <- This line number used as reference.
                pass                                     # +1
                                                         # +2
            @unittest.skip('Testing skip behavior.')     # +3 (first check)
            def test_skipped(self):                      # +4
                pass                                     # +5
                                                         # +6
            @mandatory                                   # +7 (second check)
            def test_mandatory(self):                    # +8
                pass                                     # +9

        # Get line number of undecorated method--this is used as a
        # reference point from which to determine the required line
        # numbers for the decorated methods.
        reference_case = SampleCase('test_reference')
        _, reference_line_no = _sort_key(reference_case)

        # Starting in Python 3.3, the @functools.wraps() decorator
        # added a greatly needed `__wrapped__` attribute that points
        # to the original wrapped object. After @unittest.skip() is
        # applied, this attribute is needed to get the line number
        # of the original object (instead of the line number of the
        # decorator).
        if sys.version_info >= (3, 3):
            # Test line number of skipped method.
            skipped_case = SampleCase('test_skipped')
            skipped_line_no = reference_line_no + 3
            _, line_no = _sort_key(skipped_case)
            self.assertEqual(skipped_line_no, line_no)

            # Test line number of mandatory method.
            mandatory_case = SampleCase('test_mandatory')
            mandatory_line_no = reference_line_no + 7
            _, line_no = _sort_key(mandatory_case)
            self.assertEqual(mandatory_line_no, line_no)
91 |
--------------------------------------------------------------------------------
/datatest/_compatibility/builtins.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for built-in functions"""
2 | from __future__ import absolute_import
3 |
4 |
try:
    chr = unichr  # Python 2: rebind chr to unichr for full Unicode range.
except NameError:
    pass  # Python 3: unichr is gone; built-in chr already handles Unicode.


from io import open as _open
if open == _open:  # Starting in 3.1
    open = open  # <- Declare in local namespace.
else:
    open = _open  # Python 2: io.open supports encoding/newline arguments.


try:
    callable = callable  # Removed from 3.0 and 3.1, added back in 3.2.
except NameError:
    def callable(obj):
        # Re-implementation for 3.0/3.1: an object is callable when any
        # class in its MRO defines __call__.
        parent_types = type(obj).__mro__
        return any('__call__' in typ.__dict__ for typ in parent_types)


try:
    property.__isabstractmethod__  # New in 3.3.
    property = property
except AttributeError:
    # Older versions: subclass property so that it copies the getter's
    # __isabstractmethod__ flag (lets @property compose with abstract
    # methods).
    _property = property
    class property(_property):
        def __init__(self, fget=None, fset=None, fdel=None, doc=None):
            super(property, self).__init__(fget, fset, fdel, doc)
            self.__isabstractmethod__ = getattr(
                fget, '__isabstractmethod__', False,
            )


# In the move to Python 3.0, map, filter, zip were replaced with their
# iterable equivalents from the itertools module.
try:
    map.__iter__    # Raises AttributeError on Python 2 where these
    filter.__iter__ # built-ins return lists instead of iterators.
    zip.__iter__
    map = map
    filter = filter
    zip = zip
except AttributeError:
    from itertools import imap as map
    from itertools import ifilter as filter
    from itertools import izip as zip
52 |
53 |
try:
    max([0, 1], default=None)  # The default keyword for max()
    min([0, 1], default=None)  # and min() is new in 3.4.
    max = max
    min = min
except TypeError:
    from itertools import chain as _chain

    _max = max
    def max(*iterable, **kwds):
        """
        max(iterable, *[, default, key])
        max(arg1, arg2, *args, *[, key])

        Backport wrapper adding the *default* keyword (returned when
        the iterable is empty) for versions before 3.4.
        """
        # Reject keywords the built-in signature does not define.
        allowed_kwds = ('default', 'key')
        for key in kwds:
            if key not in allowed_kwds:
                msg = "'{0}' is an invalid keyword argument for this function"
                raise TypeError(msg.format(key))

        # One positional argument is the iterable itself; several
        # positional arguments are compared directly.
        if len(iterable) == 1:
            iterable = iterable[0]

        try:
            first_item = next(iter(iterable))
            if iter(iterable) is iterable:
                # Exhaustible iterator--re-attach the consumed item.
                iterable = _chain([first_item], iterable)
        except StopIteration:
            # Empty input: honor *default* if given, else mirror the
            # built-in error message.
            if 'default' not in kwds:
                raise ValueError('max() arg is an empty sequence')
            return kwds['default']

        if 'key' in kwds:
            return _max(iterable, key=kwds['key'])
        return _max(iterable)

    _min = min
    def min(*iterable, **kwds):
        """
        min(iterable, *[, default, key])
        min(arg1, arg2, *args, *[, key])

        Backport wrapper adding the *default* keyword (returned when
        the iterable is empty) for versions before 3.4.
        """
        # Reject keywords the built-in signature does not define.
        allowed_kwds = ('default', 'key')
        for key in kwds:
            if key not in allowed_kwds:
                msg = "'{0}' is an invalid keyword argument for this function"
                raise TypeError(msg.format(key))

        # One positional argument is the iterable itself; several
        # positional arguments are compared directly.
        if len(iterable) == 1:
            iterable = iterable[0]

        try:
            first_item = next(iter(iterable))
            if iter(iterable) is iterable:
                # Exhaustible iterator--re-attach the consumed item.
                iterable = _chain([first_item], iterable)
        except StopIteration:
            # Empty input: honor *default* if given, else mirror the
            # built-in error message.
            if 'default' not in kwds:
                raise ValueError('min() arg is an empty sequence')
            return kwds['default']

        if 'key' in kwds:
            return _min(iterable, key=kwds['key'])
        return _min(iterable)
117 |
--------------------------------------------------------------------------------
/datatest/main.py:
--------------------------------------------------------------------------------
1 | """Datatest main program"""
2 |
3 | import sys as _sys
4 | from unittest import TestProgram as _TestProgram
5 | from unittest import defaultTestLoader as _defaultTestLoader
6 | try:
7 | from unittest.signals import installHandler
8 | except ImportError:
9 | installHandler = None
10 |
11 | from datatest import DataTestRunner
12 |
13 | __unittest = True
14 | __datatest = True
15 |
16 |
class DataTestProgram(_TestProgram):
    """Command-line test program that runs discovered tests with a
    DataTestRunner (see also unittest.TestProgram).
    """
    def __init__(self, module='__main__', defaultTest=None, argv=None,
                 testRunner=DataTestRunner, testLoader=_defaultTestLoader,
                 exit=True, verbosity=1, failfast=None, catchbreak=None,
                 buffer=None, ignore=False):
        # Record the datatest-specific flag, then defer everything
        # else to the standard-library initializer.
        self.ignore = ignore
        _TestProgram.__init__(self,
                              module=module,
                              defaultTest=defaultTest,
                              argv=argv,
                              testRunner=testRunner,
                              testLoader=testLoader,
                              exit=exit,
                              verbosity=verbosity,
                              failfast=failfast,
                              catchbreak=catchbreak,
                              buffer=buffer)

    def runTests(self):
        """Instantiate the runner if necessary, run the loaded tests,
        and--when self.exit is set--exit with a status reflecting the
        result.
        """
        try:
            if self.catchbreak and installHandler:
                installHandler()
        except AttributeError:
            pass  # Object may lack a catchbreak attribute.

        if self.testRunner is None:
            self.testRunner = DataTestRunner

        if isinstance(self.testRunner, type):
            # Collect keyword arguments from whichever of these
            # attributes the program instance actually has.
            candidate_names = ('verbosity', 'failfast', 'buffer',
                               'warnings', 'ignore')
            kwds = dict((name, getattr(self, name))
                            for name in candidate_names
                                if hasattr(self, name))
            try:
                runner = self.testRunner(**kwds)
            except TypeError:
                # Runner class may not accept a 'warnings' argument--
                # drop it (if present) and try once more.
                kwds.pop('warnings', None)
                runner = self.testRunner(**kwds)
        else:
            runner = self.testRunner  # Assumed to be a TestRunner instance.

        self.result = runner.run(self.test)
        if self.exit:
            _sys.exit(not self.result.wasSuccessful())
62 |
63 |
if _sys.version_info[:2] == (3, 1):  # Patch methods for Python 3.1.
    # Replace __init__ with a reduced signature--presumably 3.1's
    # TestProgram does not accept the newer keyword arguments
    # (verbosity, failfast, catchbreak, buffer); confirm against the
    # 3.1 standard library if this branch is ever revisited.
    def __init__(self, module='__main__', defaultTest=None, argv=None,
                 testRunner=DataTestRunner, testLoader=_defaultTestLoader,
                 exit=True, ignore=False):
        self.ignore = ignore
        _TestProgram.__init__(self,
                              module=module,
                              defaultTest=defaultTest,
                              argv=argv,
                              testRunner=testRunner,
                              testLoader=testLoader,
                              exit=exit)
    DataTestProgram.__init__ = __init__

elif _sys.version_info[:2] == (2, 6):  # Patch __init__() for Python 2.6.
    def __init__(self, module='__main__', defaultTest=None, argv=None,
                 testRunner=DataTestRunner, testLoader=_defaultTestLoader,
                 exit=True, ignore=False):
        self.exit = exit  # <- 2.6 does not handle exit argument.
        self.ignore = ignore
        _TestProgram.__init__(self,
                              module=module,
                              defaultTest=defaultTest,
                              argv=argv,
                              testRunner=testRunner,
                              testLoader=testLoader)
    DataTestProgram.__init__ = __init__


main = DataTestProgram  # Entry point alias (mirrors unittest.main).
94 |
--------------------------------------------------------------------------------
/docs/how-to/fuzzy-matching.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to assert fuzzy matches.
6 | :keywords: approximate string, fuzzy matching, testing, datatest
7 |
8 |
9 | #############################
10 | How to Validate Fuzzy Matches
11 | #############################
12 |
13 | When comparing strings of text, it can sometimes be useful
14 | to check that values are similar instead of asserting that
15 | they are exactly the same. Datatest provides options for
16 | *approximate string matching* (also called "fuzzy
17 | matching").
18 |
19 | When checking mappings or sequences of values, you can accept
20 | approximate matches with the :meth:`accepted.fuzzy` acceptance:
21 |
22 | .. tabs::
23 |
24 | .. tab:: Using Acceptance
25 |
26 | .. code-block:: python
27 | :emphasize-lines: 19
28 | :linenos:
29 |
30 | from datatest import validate, accepted
31 |
32 | linked_record = {
33 | 'id165': 'Saint Louis',
34 | 'id382': 'Raliegh',
35 | 'id592': 'Austin',
36 | 'id720': 'Cincinatti',
37 | 'id826': 'Philadelphia',
38 | }
39 |
40 | master_record = {
41 | 'id165': 'St. Louis',
42 | 'id382': 'Raleigh',
43 | 'id592': 'Austin',
44 | 'id720': 'Cincinnati',
45 | 'id826': 'Philadelphia',
46 | }
47 |
48 | with accepted.fuzzy(cutoff=0.6):
49 | validate(linked_record, master_record)
50 |
51 | .. tab:: No Acceptance
52 |
53 | .. code-block:: python
54 | :linenos:
55 |
56 | from datatest import validate
57 |
58 | linked_record = {
59 | 'id165': 'Saint Louis',
60 | 'id382': 'Raliegh',
61 | 'id592': 'Austin',
62 | 'id720': 'Cincinatti',
63 | 'id826': 'Philadelphia',
64 | }
65 |
66 | master_record = {
67 | 'id165': 'St. Louis',
68 | 'id382': 'Raleigh',
69 | 'id592': 'Austin',
70 | 'id720': 'Cincinnati',
71 | 'id826': 'Philadelphia',
72 | }
73 |
74 | validate(linked_record, master_record)
75 |
76 |
77 | .. code-block:: none
78 | :emphasize-lines: 5-7
79 |
80 | Traceback (most recent call last):
81 | File "example.py", line 19, in
82 | validate(linked_record, master_record)
83 | datatest.ValidationError: does not satisfy mapping requirements (3 differences): {
84 | 'id165': Invalid('Saint Louis', expected='St. Louis'),
85 | 'id382': Invalid('Raliegh', expected='Raleigh'),
86 | 'id720': Invalid('Cincinatti', expected='Cincinnati'),
87 | }
88 |
89 |
90 | If variation is an inherent, natural feature of the data and
91 | does not necessarily represent a defect, it may be appropriate
92 | to use :meth:`validate.fuzzy` instead of the acceptance shown
93 | previously:
94 |
95 | .. code-block:: python
96 | :emphasize-lines: 19
97 | :linenos:
98 |
99 | from datatest import validate
100 |
101 | linked_record = {
102 | 'id165': 'Saint Louis',
103 | 'id382': 'Raliegh',
104 | 'id592': 'Austin',
105 | 'id720': 'Cincinatti',
106 | 'id826': 'Philadelphia',
107 | }
108 |
109 | master_record = {
110 | 'id165': 'St. Louis',
111 | 'id382': 'Raleigh',
112 | 'id592': 'Austin',
113 | 'id720': 'Cincinnati',
114 | 'id826': 'Philadelphia',
115 | }
116 |
117 | validate.fuzzy(linked_record, master_record, cutoff=0.6)
118 |
119 |
120 | That said, it's probably more appropriate to use an acceptance
121 | for this specific example.
122 |
123 |
--------------------------------------------------------------------------------
/datatest/_compatibility/functools.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for functools (Python standard library)"""
2 | from __future__ import absolute_import
3 | from functools import *
4 | from sys import version_info as _version_info
5 |
6 |
if _version_info[:2] <= (2, 7):  # For version 2.7 and earlier.

    def update_wrapper(wrapper,
                       wrapped,
                       assigned=WRAPPER_ASSIGNMENTS,
                       updated=WRAPPER_UPDATES):
        """Patched update_wrapper(): tolerates attributes missing from
        *wrapped* and records the original object on ``__wrapped__``.
        """
        for attr in assigned:
            try:                               # <- This try/except
                value = getattr(wrapped, attr) # fixes issue #3445
            except AttributeError:             # in Python 2.7 and
                pass                           # earlier.
            else:
                setattr(wrapper, attr, value)
        for attr in updated:
            getattr(wrapper, attr).update(getattr(wrapped, attr, {}))
        wrapper.__wrapped__ = wrapped  # Points back to the original object.
        return wrapper


    def wraps(wrapped,
              assigned=WRAPPER_ASSIGNMENTS,
              updated=WRAPPER_UPDATES):
        """Decorator factory applying the patched update_wrapper()."""
        return partial(update_wrapper,  # <- Patched update_wrapper().
                       wrapped=wrapped,
                       assigned=assigned,
                       updated=updated)
33 |
34 |
try:
    partialmethod  # New in version 3.4.
except NameError:
    # Adapted from the Python 3.6 Standard Library.
    class partialmethod(object):
        """Method descriptor with partial application of positional
        and keyword arguments (backport of functools.partialmethod).
        """
        def __init__(self, func, *args, **keywords):
            if not callable(func) and not hasattr(func, "__get__"):
                raise TypeError("{!r} is not callable or a descriptor"
                                .format(func))

            # Collapse nested partialmethods: store the innermost
            # function directly with the accumulated args/keywords.
            if isinstance(func, partialmethod):
                self.func = func.func
                self.args = func.args + args
                self.keywords = func.keywords.copy()
                self.keywords.update(keywords)
            else:
                self.func = func
                self.args = args
                self.keywords = keywords

        def __repr__(self):
            args = ", ".join(map(repr, self.args))
            keywords = ", ".join("{}={!r}".format(k, v)
                                    for k, v in self.keywords.items())
            format_string = "{module}.{cls}({func}, {args}, {keywords})"
            return format_string.format(module=self.__class__.__module__,
                                        cls=self.__class__.__qualname__,
                                        func=self.func,
                                        args=args,
                                        keywords=keywords)

        def _make_unbound_method(self):
            # Fallback used when self.func has no __get__ (e.g. a
            # plain callable): build a method that injects the bound
            # object plus the stored args/keywords on each call.
            def _method(*args, **keywords):
                call_keywords = self.keywords.copy()
                call_keywords.update(keywords)
                #cls_or_self, *rest = args
                cls_or_self, rest = args[0], args[1:]
                call_args = (cls_or_self,) + self.args + tuple(rest)
                return self.func(*call_args, **call_keywords)
            _method.__isabstractmethod__ = self.__isabstractmethod__
            _method._partialmethod = self
            return _method

        def __get__(self, obj, cls):
            # Descriptor protocol: bind the underlying function (when
            # possible) and wrap it in a partial with the stored args.
            get = getattr(self.func, "__get__", None)
            result = None
            if get is not None:
                new_func = get(obj, cls)
                if new_func is not self.func:
                    result = partial(new_func, *self.args, **self.keywords)
                    try:
                        result.__self__ = new_func.__self__
                    except AttributeError:
                        pass
            if result is None:
                result = self._make_unbound_method().__get__(obj, cls)
            return result

        @property
        def __isabstractmethod__(self):
            # Mirror the abstractness of the wrapped function.
            return getattr(self.func, "__isabstractmethod__", False)
96 |
--------------------------------------------------------------------------------
/release-checklist.rst:
--------------------------------------------------------------------------------
1 |
2 | Release Checklist
3 | =================
4 |
5 | #. Make sure correct version number is set in the following files
6 | (remove the ".devN" suffix):
7 |
8 | * ``datatest/__init__.py``
9 | * ``docs/conf.py``
10 |
11 | #. Make sure the *description* argument in ``setup.py`` matches the project
12 | description on GitHub (in the "About" section).
13 |
14 | #. In the call to ``setup()``, check the versions defined by the
15 | *python_requires* argument (see the "Version specifiers" section of
16 | PEP-440 for details).
17 |
18 | #. In the call to ``setup()``, check the trove classifiers in the
19 | *classifiers* argument (see https://pypi.org/classifiers/ for values).
20 |
21 | #. Check that *packages* argument of ``setup()`` is correct. Check that the
22 | value matches what ``setuptools.find_packages()`` returns:
23 |
24 | .. code-block:: python
25 |
26 | >>> import setuptools
27 | >>> sorted(setuptools.find_packages('.', exclude=['tests']))
28 |
29 | Defining this list explicitly (rather than using ``find_packages()``
30 | directly in ``setup.py`` file) is needed when installing on systems
31 | where ``setuptools`` is not available.
32 |
33 | #. Make sure ``__past__`` sub-package includes a stub module for the
34 | current API version.
35 |
36 | #. Update ``README.rst`` (including "Backward Compatibility" section).
37 |
#. Make final edits to ``CHANGELOG`` (double-check release date and version).
39 |
40 | #. Commit and push final changes to upstream repository:
41 |
42 | Prepare version info, README, and CHANGELOG for version N.N.N release.
43 |
44 | #. Perform final checks to make sure there are no CI test failures.
45 |
46 | #. Make sure the packaging tools are up-to-date:
47 |
48 | .. code-block:: console
49 |
50 | pip install -U twine wheel setuptools check-manifest
51 |
52 | #. Check the manifest against the project's root folder:
53 |
54 | .. code-block:: console
55 |
56 | check-manifest .
57 |
58 | #. Remove all existing files in the ``dist/`` folder.
59 |
60 | #. Build new distributions:
61 |
62 | .. code-block:: console
63 |
64 | python setup.py sdist bdist_wheel
65 |
66 | #. Upload distributions to TestPyPI:
67 |
68 | .. code-block:: console
69 |
70 | twine upload --repository testpypi dist/*
71 |
72 | #. View the package's web page on TestPyPI and verify that the information
73 | is correct for the "Project links" and "Meta" sections:
74 |
75 | * https://test.pypi.org/project/datatest
76 |
77 | If you are testing a pre-release version, make sure to use the URL returned
78 | by twine in the previous step (the default URL shows the latest *stable*
79 | version).
80 |
81 | #. Test the installation process from TestPyPI:
82 |
83 | .. code-block:: console
84 |
85 | python -m pip install --index-url https://test.pypi.org/simple/ datatest
86 |
87 | If you're testing a pre-release version, make sure to use the "pip install"
88 | command listed at the top of the project's TestPyPI page.
89 |
90 | #. Upload source and wheel distributions to PyPI:
91 |
92 | .. code-block:: console
93 |
94 | twine upload dist/*
95 |
96 | #. Double check PyPI project page and test installation from PyPI:
97 |
98 | .. code-block:: console
99 |
100 | python -m pip install datatest
101 |
102 | #. Add version tag to upstream repository (also used by readthedocs.org).
103 |
104 | #. Iterate the version number in the development repository to the next
105 | anticipated release and add a "dev" suffix (e.g., N.N.N.dev1). This
106 | version number should conform to the "Version scheme" section of PEP-440.
107 | Make sure these changes are reflected in the following files:
108 |
109 | * ``datatest/__init__.py``
110 | * ``docs/conf.py``
111 |
112 | Commit these changes with a comment like the one below:
113 |
114 | Iterate version number to the next anticipated release.
115 |
116 | This is done so that installations made directly from the development
117 | repository and the "latest" docs are not confused with the just-published
118 | "stable" versions.
119 |
120 | #. Make sure the documentation reflects the new versions:
121 |
122 | * https://datatest.readthedocs.io/ (stable)
123 | * https://datatest.readthedocs.io/en/latest/ (latest)
124 |
125 | If the documentation was not automatically updated, you may need to
126 | login to https://readthedocs.org/ and start the build process manually.
127 |
128 | #. Publish update announcement to relevant mailing lists:
129 |
130 | * python-announce-list@python.org
131 | * testing-in-python@lists.idyll.org
132 |
--------------------------------------------------------------------------------
/tests/test_utils_misc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from datetime import timedelta
4 | from . import _unittest as unittest
5 | from datatest import _utils
6 | from datatest._utils import IterItems
7 | from datatest._utils import pretty_timedelta_repr
8 |
9 |
class TestIterItems(unittest.TestCase):
    """Behavioral tests for the IterItems wrapper."""

    def test_type_error(self):
        regex = "expected iterable or mapping, got 'int'"
        with self.assertRaisesRegex(TypeError, regex):
            IterItems(123)

    def test_non_exhaustible(self):
        pairs = [('a', 1), ('b', 2)]  # <- Non-exhaustible input.

        wrapped = IterItems(pairs)
        self.assertIs(iter(wrapped), iter(wrapped), msg='exhaustible output')
        self.assertEqual(list(wrapped), pairs)
        self.assertEqual(list(wrapped), [], msg='already exhausted')

    def test_exhaustible(self):
        pairs_iter = iter([('a', 1), ('b', 2)])  # <- Exhaustible iterator.

        wrapped = IterItems(pairs_iter)
        self.assertIs(iter(wrapped), iter(wrapped))
        self.assertEqual(list(wrapped), [('a', 1), ('b', 2)])
        self.assertEqual(list(wrapped), [], msg='already exhausted')

    def test_dict(self):
        wrapped = IterItems({'a': 1, 'b': 2})
        self.assertEqual(set(wrapped), set([('a', 1), ('b', 2)]))
        self.assertEqual(set(wrapped), set(), msg='already exhausted')

    def test_dictitems(self):
        dic = {'a': 1}

        if hasattr(dic, 'iteritems'):  # <- Python 2
            wrapped = IterItems(dic.iteritems())
            self.assertEqual(list(wrapped), [('a', 1)])
            self.assertEqual(list(wrapped), [], msg='already exhausted')

        wrapped = IterItems(dic.items())
        self.assertEqual(list(wrapped), [('a', 1)])
        self.assertEqual(list(wrapped), [], msg='already exhausted')

    def test_empty_iterable(self):
        wrapped = IterItems(iter([]))
        self.assertEqual(list(wrapped), [])

    def test_repr(self):
        wrapped = IterItems([1, 2])
        iter_word = repr(iter([])).partition(' ')[0]
        expected_start = 'IterItems({0}'.format(iter_word)
        self.assertTrue(repr(wrapped).startswith(expected_start))

        generator = (x for x in [1, 2])
        wrapped = IterItems(generator)
        self.assertEqual(repr(wrapped), 'IterItems({0!r})'.format(generator))

    def test_subclasshook(self):
        self.assertIsInstance(IterItems(iter([])), IterItems)

        try:
            dict_items = dict([]).iteritems()  # <- For Python 2
        except AttributeError:
            dict_items = dict([]).items()  # <- For Python 3
        self.assertIsInstance(dict_items, IterItems)

        self.assertIsInstance(enumerate([]), IterItems)

    def test_virtual_subclass(self):
        class OtherClass(object):
            pass

        IterItems.register(OtherClass)  # <- Register virtual subclass.
        self.assertIsInstance(OtherClass(), IterItems)
93 |
94 |
class TestMakeSentinel(unittest.TestCase):
    """Tests for the _make_token() helper."""

    def test_basic(self):
        token = _utils._make_token('TheName', '', 'The docstring.')
        self.assertEqual(token.__class__.__name__, 'TheName')
        self.assertEqual(repr(token), '')
        self.assertEqual(token.__doc__, 'The docstring.')
        self.assertTrue(bool(token))  # Tokens are truthy by default.

    def test_falsy(self):
        token = _utils._make_token('TheName', '', 'The docstring.',
                                   truthy=False)
        self.assertFalse(bool(token))
110 |
111 |
class TestPrettyTimedeltaRepr(unittest.TestCase):
    """Tests for pretty_timedelta_repr()."""

    def test_already_normalized_units(self):
        # Components already match timedelta's internal normalization.
        delta = timedelta(days=6, seconds=27, microseconds=100)
        self.assertEqual(
            pretty_timedelta_repr(delta),
            'timedelta(days=+6, seconds=+27, microseconds=+100)',
        )

    def test_negative_delta(self):
        # The built-in repr for this value is the less readable
        # "timedelta(days=-1, seconds=86398)".
        delta = timedelta(seconds=-2)
        self.assertEqual(pretty_timedelta_repr(delta), 'timedelta(seconds=-2)')
127 |
--------------------------------------------------------------------------------
/datatest/__past__/api06.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Backwards compatibility for version 0.6.0.dev1 API."""
3 | from __future__ import absolute_import
4 | import inspect
5 | import datatest
6 | from datatest.__past__ import api08
7 | from datatest.__past__ import api07
8 | from datatest._compatibility import itertools
9 | from datatest import DataTestCase
10 |
11 |
# Re-expose renamed DataTestCase members under their 0.6.0.dev1 names:
# "subjectData"/"referenceData" are property aliases of the newer
# "subject"/"reference" properties.
DataTestCase.subjectData = property(fget=DataTestCase.subject.fget,
                                    fset=DataTestCase.subject.fset)
DataTestCase.referenceData = property(fget=DataTestCase.reference.fget,
                                      fset=DataTestCase.reference.fset)
# The assertData* methods were renamed assertSubject*--alias them back.
DataTestCase.assertDataColumns = DataTestCase.assertSubjectColumns
DataTestCase.assertDataSet = DataTestCase.assertSubjectSet
DataTestCase.assertDataSum = DataTestCase.assertSubjectSum
DataTestCase.assertDataRegex = DataTestCase.assertSubjectRegex
DataTestCase.assertDataNotRegex = DataTestCase.assertSubjectNotRegex
# DataAssertionError was renamed--point it at the old DataError class.
datatest.DataAssertionError = datatest.__past__.api07_error.DataError
22 |
23 |
_wrapped_find_data_source = DataTestCase._find_data_source
@staticmethod
def _find_data_source(name):
    """Resolve *name* by searching the calling stack's module-level
    globals for either the new name or its 0.6.0.dev1 alias, falling
    back to the wrapped implementation for any other name.
    """
    if name in ('subject', 'subjectData'):
        aliases = ('subject', 'subjectData')
    elif name in ('reference', 'referenceData'):
        aliases = ('reference', 'referenceData')
    else:
        return _wrapped_find_data_source(name)

    frame_records = inspect.stack()
    frame_records.pop()  # Skip record of current frame.
    for record in frame_records:
        frame_globals = record[0].f_globals
        for alias in aliases:
            if alias in frame_globals:
                return frame_globals[alias]  # <- EXIT!
    raise NameError('cannot find {0!r}'.format(name))
DataTestCase._find_data_source = _find_data_source
49 |
50 |
def _normalize_required(self, required, method, *args, **kwds):
    """Normalize *required* for comparison: fall back to the old
    ``referenceData`` property when *required* is omitted, and call
    *method* on data-source objects to obtain comparable data.
    """
    if required is None:  # <- Identity check (was ``== None``, which can
                          #    invoke arbitrary __eq__ behavior on
                          #    *required* and give a wrong answer).
        required = self.referenceData  # <- OLD NAME!
    if isinstance(required, datatest.BaseSource):
        fn = getattr(required, method)
        required = fn(*args, **kwds)
    return required
DataTestCase._normalize_required = _normalize_required
59 |
60 |
# This method was removed entirely.
def _assertDataCount(self, column, keys, required=None, msg=None, **kwds_filter):
    """Old-API assertion: row counts must equal the sums of *column*."""
    counted = self.subject.count(column, keys, **kwds_filter)
    required = self._normalize_required(required, 'sum', column, keys, **kwds_filter)
    if not msg:
        msg = 'row counts different than {0!r} sums'.format(column)
    self.assertEqual(counted, required, msg)
DataTestCase.assertDataCount = _assertDataCount
68 |
69 |
# Function signature and behavior was changed.
def _allowAny(self, number=None, msg=None, **kwds_filter):
    """Old-API allowance: allow any difference, optionally capped at
    *number* occurrences.
    """
    if not number:
        return datatest.allow_any(msg, **kwds_filter)
    return datatest.allow_limit(number, msg, **kwds_filter)
DataTestCase.allowAny = _allowAny
76 |
77 |
# Function signature and behavior was changed.
def _allowMissing(self, number=None, msg=None):
    """Old-API allowance: allow Missing differences, optionally capped
    at *number* occurrences (all differences returned if exceeded).
    """
    def function(iterable):
        primary, backup = itertools.tee(iterable)
        rejected = []
        matched = 0
        for element in primary:
            if isinstance(element, datatest.Missing):
                matched += 1
                if number and matched > number:
                    return backup  # <- EXIT! Exceeds limit, return all.
            else:
                rejected.append(element)
        return rejected
    return datatest.allow_iter(function, msg)
DataTestCase.allowMissing = _allowMissing
94 |
95 |
# Function signature and behavior was changed.
def _allowExtra(self, number=None, msg=None):
    """Old-API allowance: allow Extra differences, optionally capped
    at *number* occurrences (all differences returned if exceeded).
    """
    def function(iterable):
        primary, backup = itertools.tee(iterable)
        rejected = []
        matched = 0
        for element in primary:
            if isinstance(element, datatest.Extra):
                matched += 1
                if number and matched > number:
                    return backup  # <- EXIT! Exceeds limit, return all.
            else:
                rejected.append(element)
        return rejected
    return datatest.allow_iter(function, msg)
DataTestCase.allowExtra = _allowExtra
112 |
--------------------------------------------------------------------------------
/docs/how-to/sequences.rst:
--------------------------------------------------------------------------------
1 |
2 | .. py:currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to validate sequences.
6 | :keywords: datatest, sequences, order
7 |
8 |
9 | #########################
10 | How to Validate Sequences
11 | #########################
12 |
13 |
14 | Index Position
15 | ==============
16 |
17 | To check for a specific sequence, you can pass a list [1]_ as the
18 | *requirement* argument:
19 |
20 | .. code-block:: python
21 | :emphasize-lines: 4
22 | :linenos:
23 |
24 | from datatest import validate
25 |
26 | data = ['A', 'B', 'X', 'C', 'D']
27 | requirement = ['A', 'B', 'C', 'D'] # <- a list
28 | validate(data, requirement)
29 |
30 |
31 | Elements in the *data* and *requirement* lists are compared by
32 | sequence position. The items at index position 0 are compared to
33 | each other, then items at index position 1 are compared to each
34 | other, and so on:
35 |
36 | .. math::
37 |
38 | \begin{array}{cccc}
39 | \hline
40 | \textbf{index} & \textbf{data} & \textbf{requirement} & \textbf{result} \\
41 | \hline
42 | 0 & \textbf{A} & \textbf{A} & \textrm{matches} \\
43 | 1 & \textbf{B} & \textbf{B} & \textrm{matches} \\
44 | 2 & \textbf{X} & \textbf{C} & \textrm{doesn't match} \\
45 | 3 & \textbf{C} & \textbf{D} & \textrm{doesn't match} \\
46 | 4 & \textbf{D} & no\;value & \textrm{doesn't match} \\
47 | \hline
48 | \end{array}
49 |
50 |
51 | In this example, there are three differences:
52 |
53 | .. code-block:: none
54 |
55 | ValidationError: does not match required sequence (3 differences): [
56 | Invalid('X', expected='C'),
57 | Invalid('C', expected='D'),
58 | Extra('D'),
59 | ]
60 |
61 |
62 | Using enumerate()
63 | -----------------
64 |
65 | While the previous example works well for short lists, the error
66 | does not describe **where** in your sequence the differences occur.
67 | To get the index positions associated with any differences, you
68 | can :py:func:`enumerate` your *data* and *requirement* objects:
69 |
70 | .. code-block:: python
71 | :emphasize-lines: 5
72 | :linenos:
73 |
74 | from datatest import validate
75 |
76 | data = ['A', 'B', 'X', 'C', 'D']
77 | requirement = ['A', 'B', 'C', 'D']
78 | validate(enumerate(data), enumerate(requirement))
79 |
80 |
81 | A required **enumerate object** is treated as a mapping. The keys
82 | for any differences will correspond to their index positions:
83 |
84 | .. code-block:: none
85 |
86 | ValidationError: does not satisfy mapping requirements (3 differences): {
87 | 2: Invalid('X', expected='C'),
88 | 3: Invalid('C', expected='D'),
89 | 4: Extra('D'),
90 | }
91 |
92 |
93 | Relative Order
94 | ==============
95 |
96 | When comparing elements by sequence position, one mis-alignment can
97 | create differences for all following elements. If this behavior is
not desirable, you may want to check for *relative order* instead.
99 |
100 | If you want to check the relative order of elements rather than
101 | their index positions, you can use :meth:`validate.order`:
102 |
103 | .. code-block:: python
104 | :emphasize-lines: 5
105 | :linenos:
106 |
107 | from datatest import validate
108 |
109 | data = ['A', 'B', 'X', 'C', 'D']
110 | requirement = ['A', 'B', 'C', 'D']
111 | validate.order(data, requirement)
112 |
113 |
114 | When checking for relative order, this method tries to align
115 | elements into contiguous matching subsequences. This reduces
116 | the number of non-matches:
117 |
118 | .. math::
119 |
120 | \begin{array}{cccc}
121 | \hline
122 | \textbf{index} & \textbf{data} & \textbf{requirement} & \textbf{result} \\
123 | \hline
124 | 0 & \textbf{A} & \textbf{A} & \textrm{matches} \\
125 | 1 & \textbf{B} & \textbf{B} & \textrm{matches} \\
126 | 2 & \textbf{X} & no\;value & \textrm{doesn't match} \\
127 | 3 & \textbf{C} & \textbf{C} & \textrm{matches} \\
128 | 4 & \textbf{D} & \textbf{D} & \textrm{matches} \\
129 | \hline
130 | \end{array}
131 |
132 | Differences are reported as two-tuples containing the index (in *data*)
133 | where the difference occurs and the non-matching value. In the earlier
134 | examples, we saw that validating by index position produced three
135 | differences. But in this example, validating the same sequences by
136 | relative order produces only one difference:
137 |
138 | .. code-block:: none
139 |
140 | ValidationError: does not match required order (1 difference): [
141 | Extra((2, 'X')),
142 | ]
143 |
144 |
145 | .. rubric:: Footnotes
146 |
147 | .. [1] The validate() function will check *data* by index position when the
148 | *requirement* is any iterable object other than a set, mapping, tuple
149 | or string. See the :ref:`Sequence Validation `
150 | section of the :func:`validate` documentation for full details.
151 |
--------------------------------------------------------------------------------
/tests/past_api00.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Test backwards compatibility with pre-release API.
3 |
4 | .. note:: Because this sub-module works by monkey-patching the global
5 | ``datatest`` package, these tests should be run in a separate
6 | process.
7 | """
8 | from . import _unittest as unittest
9 |
10 | import datatest
11 | from datatest.__past__ import api00 # <- MONKEY PATCH!!!
12 |
13 | DataTestCase = datatest.DataTestCase
14 | from datatest.__past__.api07_error import DataError
15 | from datatest.__past__.api07_sources import MinimalSource
16 |
17 |
class TestAttributes(unittest.TestCase):
    """The api00 monkey-patch should restore every pre-release name."""

    def test_api_dev0(self):
        # Error class.
        self.assertTrue(hasattr(datatest, 'DataAssertionError'))

        expected_names = [
            # Data source properties.
            'subjectData',
            'referenceData',

            # Acceptance context managers.
            'allowSpecified',
            'allowUnspecified',
            'allowDeviationPercent',

            # Assert methods.
            'assertColumnSet',
            'assertColumnSubset',
            'assertColumnSuperset',
            'assertValueSet',
            'assertValueSubset',
            'assertValueSuperset',
            'assertValueSum',
            'assertValueCount',
            'assertValueRegex',
            'assertValueNotRegex',
        ]
        for name in expected_names:
            self.assertTrue(hasattr(datatest.DataTestCase, name))
43 |
44 |
class TestColumnSubset(datatest.DataTestCase):
    """assertColumnSubset() should pass when the subject's columns are
    a subset of the reference columns and raise DataError otherwise.
    """
    def setUp(self):
        self.subjectData = MinimalSource(data=[['a', '65'], ['b', '70']],
                                         fieldnames=['label1', 'value'])

    def test_is_same(self):
        self.assertColumnSubset(ref=['label1', 'value'])  # Should pass without error.

    def test_is_subset(self):
        self.assertColumnSubset(ref=['label1', 'label2', 'value'])  # Should pass without error.

    def test_is_superset(self):
        # Backslashes are doubled so the pattern matches a literal "("
        # without relying on the invalid escape sequence "\(" (which is
        # a SyntaxWarning since Python 3.12). The runtime regex string
        # is unchanged.
        regex = "different column names:\n xExtra\\(u?'value'\\)"
        with self.assertRaisesRegex(DataError, regex):
            self.assertColumnSubset(ref=['label1'])
61 |
class TestColumnSuperset(datatest.DataTestCase):
    """assertColumnSuperset() should pass when the subject's columns
    are a superset of the reference columns and raise DataError
    otherwise.
    """
    def setUp(self):
        self.subjectData = MinimalSource(data=[['a', '65'], ['b', '70']],
                                         fieldnames=['label1', 'value'])

    def test_is_same(self):
        self.assertColumnSuperset(ref=['label1', 'value'])  # Should pass without error.

    def test_is_superset(self):
        self.assertColumnSuperset(ref=['label1'])  # Should pass without error.

    def test_is_subset(self):
        # Backslashes doubled to avoid the invalid escape sequence "\("
        # (SyntaxWarning since Python 3.12); runtime string unchanged.
        regex = "different column names:\n xMissing\\(u?'label2'\\)"
        with self.assertRaisesRegex(DataError, regex):
            self.assertColumnSuperset(ref=['label1', 'label2', 'value'])
77 |
78 |
class TestValueSubset(DataTestCase):
    """assertValueSubset() should pass when the subject's values are a
    subset of the reference values and raise DataError otherwise.
    """
    def setUp(self):
        self.subjectData = MinimalSource(data=[['a'], ['b'], ['c']],
                                         fieldnames=['label'])

    def test_is_same(self):
        self.assertValueSubset('label', ref=['a', 'b', 'c'])  # Should pass without error.

    def test_is_subset(self):
        self.assertValueSubset('label', ref=['a', 'b', 'c', 'd'])  # Should pass without error.

    def test_is_superset(self):
        # Backslashes doubled to avoid the invalid escape sequence "\("
        # (SyntaxWarning since Python 3.12); runtime string unchanged.
        regex = "different 'label' values:\n xExtra\\(u?'c'\\)"
        with self.assertRaisesRegex(DataError, regex):
            self.assertValueSubset('label', ref=['a', 'b'])
94 |
95 |
class TestValueSuperset(DataTestCase):
    """assertValueSuperset() should pass when the subject's values are
    a superset of the reference values and raise DataError otherwise.
    """
    def setUp(self):
        self.subjectData = MinimalSource(data=[['a'], ['b'], ['c']],
                                         fieldnames=['label'])

    def test_is_same(self):
        self.assertValueSuperset('label', ref=['a', 'b', 'c'])  # Should pass without error.

    def test_is_superset(self):
        self.assertValueSuperset('label', ref=['a', 'b'])  # Should pass without error.

    def test_is_subset(self):
        # Backslashes doubled to avoid the invalid escape sequence "\("
        # (SyntaxWarning since Python 3.12); runtime string unchanged.
        regex = "different 'label' values:\n xMissing\\(u?'d'\\)"
        with self.assertRaisesRegex(DataError, regex):
            self.assertValueSuperset('label', ref=['a', 'b', 'c', 'd'])
111 |
112 |
# Refuse to run when imported: the monkey-patching performed above
# mutates the global datatest package, so this module must execute in
# its own process (see the module docstring).
if __name__ != '__main__':
    raise Exception('This test must be run directly or as a subprocess.')

unittest.main()
117 |
--------------------------------------------------------------------------------
/docs/reference/unittest-support.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: datatest API for unittest-style testing
6 | :keywords: datatest, unittest, data-wrangling
7 |
8 |
9 | ################
10 | Unittest Support
11 | ################
12 |
13 | Datatest can be used together with the :mod:`unittest` package
14 | from the Python Standard Library. For a quick introduction, see:
15 |
16 | * :ref:`Automated Data Testing: Unittest `
17 | * :ref:`Unittest Samples `
18 |
19 |
20 | .. _datatestcase-docs:
21 |
22 | ************
23 | DataTestCase
24 | ************
25 |
26 | .. autoclass:: DataTestCase
27 |
28 | **VALIDATION METHODS**
29 |
30 | The assertion methods wrap :func:`validate` and its methods:
31 |
32 | .. code-block:: python
33 | :emphasize-lines: 7
34 |
35 | from datatest import DataTestCase
36 |
37 | class MyTest(DataTestCase):
38 | def test_mydata(self):
39 | data = ...
40 | requirement = ...
41 | self.assertValid(data, requirement)
42 |
43 | .. automethod:: assertValid
44 |
45 | .. automethod:: assertValidPredicate
46 |
47 | .. automethod:: assertValidRegex
48 |
49 | .. automethod:: assertValidApprox
50 |
51 | .. automethod:: assertValidFuzzy
52 |
53 | .. automethod:: assertValidInterval
54 |
55 | .. automethod:: assertValidSet
56 |
57 | .. automethod:: assertValidSubset
58 |
59 | .. automethod:: assertValidSuperset
60 |
61 | .. automethod:: assertValidUnique
62 |
63 | .. automethod:: assertValidOrder
64 |
65 | **ACCEPTANCE METHODS**
66 |
67 | The acceptance methods wrap :func:`accepted` and its methods:
68 |
69 | .. code-block:: python
70 | :emphasize-lines: 7
71 |
72 | from datatest import DataTestCase
73 |
74 | class MyTest(DataTestCase):
75 | def test_mydata(self):
76 | data = ...
77 | requirement = ...
78 | with self.accepted(Missing):
79 | self.assertValid(data, requirement)
80 |
81 | .. automethod:: accepted
82 |
83 | .. automethod:: acceptedKeys
84 |
85 | .. automethod:: acceptedArgs
86 |
87 | .. method:: acceptedTolerance(tolerance, /, msg=None)
88 | acceptedTolerance(lower, upper, msg=None)
89 |
90 | Wrapper for :meth:`accepted.tolerance`.
91 |
92 | .. method:: acceptedPercent(tolerance, /, msg=None)
93 | acceptedPercent(lower, upper, msg=None)
94 |
95 | Wrapper for :meth:`accepted.percent`.
96 |
97 | .. automethod:: acceptedFuzzy
98 |
99 | .. automethod:: acceptedCount
100 |
101 |
102 | .. _unittest-style-invocation:
103 |
104 | **********************
105 | Command-Line Interface
106 | **********************
107 |
108 | The datatest module can be used from the command line just like
109 | unittest. To run the program with `test discovery
<https://docs.python.org/3/library/unittest.html#test-discovery>`_
111 | use the following command::
112 |
113 | python -m datatest
114 |
115 | Run tests from specific modules, classes, or individual methods with::
116 |
117 | python -m datatest test_module1 test_module2
118 | python -m datatest test_module.TestClass
119 | python -m datatest test_module.TestClass.test_method
120 |
121 | The syntax and command-line options (``-f``, ``-v``, etc.) are the
122 | same as unittest---see unittest's `command-line documentation
<https://docs.python.org/3/library/unittest.html#command-line-interface>`_
124 | for full details.
125 |
126 | .. note::
127 |
128 | Tests are ordered by **file name** and then by **line number**
129 | (within each file) when running datatest from the command-line.
130 |
131 | ..
132 | Unlike strict unit testing, data preparation tests are often
dependent on one another---this strict order-by-line-number
134 | behavior lets users design test suites appropriately.
135 | For example, asserting the population of a city will always
136 | fail when the 'city' column is missing. So it's appropriate
137 | to validate column names *before* validating the contents of
138 | each column.
139 |
140 |
141 | *******************
142 | Test Runner Program
143 | *******************
144 |
145 | .. py:decorator:: mandatory
146 |
147 | A decorator to mark whole test cases or individual methods as
148 | mandatory. If a mandatory test fails, DataTestRunner will stop
149 | immediately (this is similar to the ``--failfast`` command line
150 | argument behavior)::
151 |
152 | @datatest.mandatory
153 | class TestFileFormat(datatest.DataTestCase):
154 | def test_columns(self):
155 | ...
156 |
157 | .. autoclass:: DataTestRunner
158 | :members:
159 | :inherited-members:
160 |
161 | .. autoclass:: DataTestProgram(module='__main__', defaultTest=None, argv=None, testRunner=datatest.DataTestRunner, testLoader=unittest.TestLoader, exit=True, verbosity=1, failfast=None, catchbreak=None, buffer=None, warnings=None)
162 | :members:
163 | :inherited-members:
164 |
165 | |
166 |
167 | .. autoclass:: main
168 | :members:
169 | :inherited-members:
170 |
--------------------------------------------------------------------------------
/docs/how-to/phone-numbers.rst:
--------------------------------------------------------------------------------
1 |
2 | .. py:currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to assert telephone number formats.
6 | :keywords: datatest, phone format, validate phone number
7 |
8 |
9 | #############################
10 | How to Validate Phone Numbers
11 | #############################
12 |
13 | To check that phone numbers are well-formed, you can use a regular
14 | expression.
15 |
16 |
17 | USA and Canada
18 | ==============
19 |
20 |
21 | .. code-block:: python
22 |
23 | from datatest import validate
24 |
25 | pattern = r'^\(\d{3}\)[ ]\d{3}-\d{4}$'
26 |
27 | data = [
28 | '(914) 232-9901',
29 | '(914) 737-9938',
30 | '(213) 888-7636',
31 | '(202) 965-2900',
32 | '(858) 651-5050',
33 | ]
34 |
35 | validate.regex(data, pattern, msg='must use phone number format')
36 |
37 |
For other common US and Canadian formats, you can use the following
regex patterns:
40 |
41 | .. table::
42 | :widths: auto
43 |
44 | +-------------------------------+-------------------+
45 | | pattern | examples |
46 | +===============================+===================+
47 | | ``^\(\d{3}\)[ ]\d{3}-\d{4}$`` | \(914) 232-9901 |
48 | +-------------------------------+-------------------+
49 | | ``^\d{3}-\d{3}-\d{4}$`` | 914-232-9901 |
50 | +-------------------------------+-------------------+
51 | | ``^\+?1-\d{3}-\d{3}-\d{4}$`` | 1-914-232-9901 |
52 | | +-------------------+
53 | | | +1-914-232-9901 |
54 | +-------------------------------+-------------------+
55 |
56 |
57 | ..
58 | THESE PHONE NUMBER PATTERNS ARE INCOMPLETE
59 |
60 | China
61 | =====
62 |
63 | .. code-block:: python
64 |
65 | from datatest import validate
66 |
67 | pattern = r'^\d{3}[ ]\d{3,4}[ ]\d{4}$'
68 |
69 | data = [
70 | '074 7284 5586',
71 | '400 669 5539',
72 | ]
73 |
74 | validate.regex(data, pattern, msg='must use phone number format')
75 |
76 |
77 | For common variants, you can use the following patterns:
78 |
79 | .. table::
80 | :widths: auto
81 |
82 | +--------------------------------------+-------------------+
83 | | ``^\d{3}[ ]\d{3,4}[ ]\d{4}$`` | 074 7284 5586 |
84 | | +-------------------+
85 | | | 400 669 5539 |
86 | +--------------------------------------+-------------------+
87 | | ``^\+86[ ]\d{3}[ ]\d{3,4}[ ]\d{4}$`` | +86 074 7284 5586 |
88 | | +-------------------+
89 | | | +86 400 669 5539 |
90 | +--------------------------------------+-------------------+
91 |
92 |
93 | India
94 | =====
95 |
96 | .. code-block:: python
97 |
98 | import re
99 | from datatest import validate
100 |
101 |
102 | indian_phone_format = re.compile(r'''^
103 | (\+91[ ])? # Optional international code.
104 | (\(0\))? # Optional trunk prefix.
105 | # 10 digit codes with area & number splits.
106 | (
107 | \d{10} # xxxxxxxxxx
108 | | \d{5}[ ]\d{5} # xxxxx xxxxx
109 | | \d{4}[ ]\d{6} # xxxx xxxxxx
110 | | \d{3}[ ]\d{7} # xxx xxxxxxx
111 | | \d{2}[ ]\d{8} # xx xxxxxxxx
112 | )
113 | $''', re.VERBOSE)
114 |
115 | data = [
116 | '+91 (0)99999 99999',
117 | '+91 99999 99999',
118 | '9999999999',
119 | '99999 99999',
120 | '9999 999999',
121 | '999 9999999',
122 | '99 99999999',
123 | ]
124 |
125 | validate(data, indian_phone_format, msg='must use phone number format')
126 |
127 |
128 | United Kingdom
129 | ==============
130 |
131 | .. code-block:: python
132 |
133 | import re
134 | from datatest import validate
135 |
136 |
137 | uk_phone_format = re.compile(r'''^(
138 | # 10 digit NSNs (leading zero doesn't count)
139 | \(01\d{2}[ ]\d{2}\d\)[ ]\d{2}[ ]\d{3} # (01xx xx) xx xxx
140 | | \(01\d{3}\)[ ]\d{3}[ ]\d{3} # (01xxx) xxx xxx
141 | | \(01\d{2}\)[ ]\d{3}[ ]\d{4} # (01xx) xxx xxxx
142 | | \(02\d\)[ ]\d{4}[ ]\d{4} # (02x) xxxx xxxx
143 | | 0\d{3}[ ]\d{3}[ ]\d{4} # 0xxx xxx xxxx
144 | | 0\d{2}[ ]\d{4}[ ]\d{4} # 0xx xxxx xxxx
145 | | 07\d{3}[ ]\d{3}[ ]\d{3} # 07xxx xxx xxx
146 |
147 | # 9 digit NSNs
148 | | \(0169[ ]77\)[ ]\d{4} # (0169 77) xxxx
149 | | \(01\d{3}\)[ ]\d{2}[ ]\d{3} # (01xxx) xx xxx
150 | | 0500[ ]\d{3}[ ]\d{3} # 0500 xxx xxx
151 | | 0800[ ]\d{3}[ ]\d{3} # 0800 xxx xxx
152 | )$''', re.VERBOSE)
153 |
154 | data = [
155 | '(01257) 421 282',
156 | '(01736) 759 307',
157 | '(0169 77) 3452',
158 | '0116 319 5885',
159 | '0191 384 6777',
160 | '020 8399 0617',
161 | ]
162 |
163 | validate(data, uk_phone_format, msg='must use phone number format')
164 |
165 |
166 | ..
167 | TO ADD:
168 | Germany
169 | Japan
170 | France
171 |
172 |
--------------------------------------------------------------------------------
/docs/how-to/excel-auto-formatting.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to prevent Excel from converting values.
6 | :keywords: datatest, excel, date conversion, scientific notation, leading zeros
7 |
8 |
9 | #######################################
10 | How to Avoid Excel Automatic Formatting
11 | #######################################
12 |
13 | When MS Excel opens CSV files (and many other tabular formats),
14 | its default behavior will reformat certain values as dates,
15 | strip leading zeros, convert long numbers into scientific
16 | notation, and more. There are many cases where these kinds
17 | of changes actually corrupt your data.
18 |
19 | It is possible to control Excel's formatting behavior using its
20 | *Text Import Wizard*. But as long as other users can open and
21 | re-save your CSV files, there may be no good way to guarantee that
22 | someone else won't inadvertently corrupt your data with Excel's
23 | default auto-format behavior. In a situation like this, you can
24 | mitigate problems by avoiding values that Excel likes to auto-format.
25 |
26 | Using the :class:`Predicate` object below, you can check that values
27 | are "Excel safe" and receive a list of differences when values are
28 | vulnerable to inadvertent auto-formatting:
29 |
30 | .. code-block:: python
31 | :emphasize-lines: 44
32 | :linenos:
33 |
34 | import re
35 | from datatest import validate, Predicate
36 |
37 |
38 | # Predicate to check that elements are not subject
39 | # to Excel auto-formatting.
40 | excel_safe = ~Predicate(re.compile(r'''^(
41 | # Date format character combinations.
42 | \d{1,2}-(?:\d{1,2}|\d{4})
43 | | (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ \-]\d{1,2}
44 | | [01]?[0-9]-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)
45 |
46 | # Time conversions.
47 | | [01]?[0-9][ ]?(AM?|PM?) # Twelve-hour clock.
48 | | \d?\d[ ]*: # HH (hours).
49 | | \d?\d[ ]*(:[ ]*\d\d?){1,2} # HH:MM and HH:MM:SS
50 |
51 | # Numeric conversions.
52 | | 0\d+\.?\d* # Number with leading zeros.
53 | | \d*\.\d*0 # Decimal point with trailing zeros.
54 | | \d*\. # Trailing decimal point.
55 | | \d.?\d*E[+-]?\d+ # Scientific notation.
56 | | \d{16,} # Numbers of 16+ digits get approximated.
57 |
58 | # Whitespace normalization.
59 | | \s.* # Leading whitespace.
60 | | .*\s # Trailing whitespace.
61 | | .*\s\s.* # Irregular whitespace (new in Office 365).
62 |
63 | # Other conversions
64 | | =.+ # Spreadsheet formula.
65 |
66 | )$''', re.VERBOSE | re.IGNORECASE), name='excel_safe')
67 |
68 |
69 | data = [
70 | 'AOX-18',
71 | 'APR-23',
72 | 'DBB-01',
73 | 'DEC-20',
74 | 'DNZ-33',
75 | 'DVH-50',
76 | ]
77 | validate(data, excel_safe)
78 |
79 | In the example above, we use ``excel_safe`` as our *requirement*.
80 | The validation fails because our *data* contains two codes that
81 | Excel would auto-convert into date types:
82 |
83 | .. code-block:: none
84 |
85 | ValidationError: does not satisfy excel_safe() (2 differences): [
86 | Invalid('APR-23'),
87 | Invalid('DEC-20'),
88 | ]
89 |
90 |
91 | Fixing the Data
92 | ---------------
93 |
94 | To address the failure, we need to change the values in *data* so
95 | they are no longer subject to Excel's auto-formatting behavior.
96 | There are a few ways to do this.
97 |
98 | We can prefix the failing values with apostrophes (``'APR-23``
99 | and ``'DEC-20``). This causes Excel to treat them as text instead
100 | of dates or numbers:
101 |
102 | .. code-block:: python
103 | :emphasize-lines: 5,7
104 | :linenos:
105 | :lineno-start: 34
106 |
107 | ...
108 |
109 | data = [
110 | "AOX-18",
111 | "'APR-23",
112 | "DBB-01",
113 | "'DEC-20",
114 | "DNZ-33",
115 | "DVH-50",
116 | ]
117 | validate(data, excel_safe)
118 |
119 |
Another approach would be to change the formatting for all of
121 | the values. Below, the hyphens in *data* have been replaced with
122 | underscores (``_``):
123 |
124 | .. code-block:: python
125 | :emphasize-lines: 4-9
126 | :linenos:
127 | :lineno-start: 34
128 |
129 | ...
130 |
131 | data = [
132 | 'AOX_18',
133 | 'APR_23',
134 | 'DBB_01',
135 | 'DEC_20',
136 | 'DNZ_33',
137 | 'DVH_50',
138 | ]
139 | validate(data, excel_safe)
140 |
141 |
142 | After making the needed changes, the validation will now pass without
143 | error.
144 |
145 |
146 | .. caution::
147 |
148 | The ``excel_safe`` predicate implements a blacklist approach
149 | to detect values that Excel will automatically convert. It is
150 | not guaranteed to catch everything and future versions of Excel
151 | could introduce new behaviors. If you discover auto-formatted
152 | values that are not handled by this helper function (or if you
153 | have an idea regarding a workable whitelist approach), please
154 | `file an issue`_ and we will try to improve it.
155 |
156 |
157 | .. _`file an issue`: https://github.com/shawnbrown/datatest/issues
158 |
--------------------------------------------------------------------------------
/tests/past_api07_sources_sqlite.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sqlite3
3 | from . import _unittest as unittest
4 |
5 | from .mixins import CountTests
6 | from .mixins import OtherTests
7 |
8 | from datatest.__past__.api07_sources import SqliteSource
9 |
10 |
class TestSqliteSourceCount(CountTests, unittest.TestCase):
    def setUp(self):
        """Load self.testdata into an in-memory SQLite table and wrap
        it in a SqliteSource.
        """
        connection = sqlite3.connect(':memory:')
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE testtable (label1, label2, value)")
        for row in self.testdata:
            cursor.execute("INSERT INTO testtable VALUES (?, ?, ?)", row)
        connection.commit()
        self.datasource = SqliteSource(connection, 'testtable')
22 |
23 |
class TestSqliteSource(OtherTests, unittest.TestCase):
    def setUp(self):
        """Load self.testdata into an in-memory SQLite table and wrap
        it in a SqliteSource.
        """
        connection = sqlite3.connect(':memory:')
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE testtable (label1, label2, value)")
        for row in self.testdata:
            cursor.execute("INSERT INTO testtable VALUES (?, ?, ?)", row)
        connection.commit()
        self.datasource = SqliteSource(connection, 'testtable')

    def test_where_clause(self):
        """_build_where_clause() turns keyword args into a SQL WHERE
        clause string and a list of bound parameters.
        """
        # No keyword arguments at all.
        clause, params = SqliteSource._build_where_clause()
        self.assertEqual('', clause)
        self.assertEqual([], params)

        # One equality condition (label1 equals 'a').
        clause, params = SqliteSource._build_where_clause(label1='a')
        self.assertEqual('label1=?', clause)
        self.assertEqual(['a'], params)

        # Two conditions joined with AND (label1 equals 'a' AND label2 equals 'x').
        clause, params = SqliteSource._build_where_clause(label1='a', label2='x')
        self.assertEqual('label1=? AND label2=?', clause)
        self.assertEqual(['a', 'x'], params)

        # A tuple value becomes an IN membership test ('a' or 'b').
        clause, params = SqliteSource._build_where_clause(label1=('a', 'b'))
        self.assertEqual('label1 IN (?, ?)', clause)
        self.assertEqual(['a', 'b'], params)

        # Mixed: membership test plus a simple equality.
        clause, params = SqliteSource._build_where_clause(label1=('a', 'b'), label2='x')
        self.assertEqual('label1 IN (?, ?) AND label2=?', clause)
        self.assertEqual(['a', 'b', 'x'], params)

    def test_normalize_column(self):
        """Column names are double-quoted; embedded quotes are doubled."""
        self.assertEqual('"foo"', SqliteSource._normalize_column('foo'))
        self.assertEqual('"foo bar"', SqliteSource._normalize_column('foo bar'))
        self.assertEqual('"foo ""bar"" baz"',
                         SqliteSource._normalize_column('foo "bar" baz'))

    def test_from_records(self):
        """Test from_records method (wrapper for TemporarySqliteTable class)."""
        fieldnames = ['foo', 'bar', 'baz']

        # Records given as tuples.
        tuple_records = [
            ('a', 'x', '1'),
            ('b', 'y', '2'),
            ('c', 'z', '3'),
        ]
        source = SqliteSource.from_records(tuple_records, fieldnames)
        expected = [
            {'foo': 'a', 'bar': 'x', 'baz': '1'},
            {'foo': 'b', 'bar': 'y', 'baz': '2'},
            {'foo': 'c', 'bar': 'z', 'baz': '3'},
        ]
        self.assertEqual(expected, list(source))

        # Records given as dicts.
        dict_records = [
            {'foo': 'a', 'bar': 'x', 'baz': '1'},
            {'foo': 'b', 'bar': 'y', 'baz': '2'},
            {'foo': 'c', 'bar': 'z', 'baz': '3'},
        ]
        source = SqliteSource.from_records(dict_records, fieldnames)
        self.assertEqual(dict_records, list(source))

        # Dict records with the *columns* argument omitted.
        source = SqliteSource.from_records(dict_records)
        self.assertEqual(dict_records, list(source))

    def test_create_index(self):
        cursor = self.datasource._connection.cursor()

        def index_names():
            # Return the sorted names of all indexes on 'testtable'
            # (name is the second column of PRAGMA index_list output).
            cursor.execute("PRAGMA INDEX_LIST('testtable')")
            return sorted(row[1] for row in cursor.fetchall())

        # There should be no indexes initially.
        self.assertEqual([], index_names())

        # Add single-column index.
        self.datasource.create_index('label1')  # <- CREATE INDEX!
        self.assertEqual(['idx_testtable_label1'], index_names())

        # Add multi-column index.
        self.datasource.create_index('label2', 'value')  # <- CREATE INDEX!
        self.assertEqual(['idx_testtable_label1', 'idx_testtable_label2_value'],
                         index_names())

        # Duplicate of first, single-column index should have no effect.
        self.datasource.create_index('label1')  # <- CREATE INDEX!
        self.assertEqual(['idx_testtable_label1', 'idx_testtable_label2_value'],
                         index_names())
133 |
--------------------------------------------------------------------------------
/datatest/_compatibility/decimal.py:
--------------------------------------------------------------------------------
1 | """compatibility layer for decimal (Python standard library)"""
2 | from __future__ import absolute_import
3 | from decimal import *
4 |
5 |
# Decimal.from_float() is new in Python 2.7; on older versions, backport
# the implementation from the 2.7 standard library.
try:
    Decimal.from_float # New in 2.7
except AttributeError:
    import math as _math
    # Bug fix: _dec_from_triple is an underscore-prefixed (private) name,
    # so the module-level `from decimal import *` does not bring it into
    # scope--without this explicit import, _from_float() raised NameError
    # on the very versions this backport targets.
    from decimal import _dec_from_triple

    def _bit_length(integer):
        """Return the number of bits needed to represent *integer*
        (int.bit_length() is itself new in Python 2.7).
        """
        s = bin(integer) # binary representation: bin(-37) --> '-0b100101'
        s = s.lstrip('-0b') # remove leading zeros and minus sign
        return len(s) # len('100101') --> 6

    @classmethod
    def _from_float(cls, f):
        """Convert a float (or int) to a Decimal exactly. Adapted from
        the Python 2.7 standard library.
        """
        if isinstance(f, int): # handle integer inputs
            return cls(f)
        if not isinstance(f, float):
            raise TypeError("argument must be int or float.")
        if _math.isinf(f) or _math.isnan(f):
            return cls(repr(f))
        if _math.copysign(1.0, f) == 1.0:
            sign = 0
        else:
            sign = 1
        n, d = abs(f).as_integer_ratio()
        #k = d.bit_length() - 1
        k = _bit_length(d) - 1
        # f == n / 2**k exactly, and n / 2**k == (n * 5**k) / 10**k,
        # which _dec_from_triple expresses as digits and an exponent.
        result = _dec_from_triple(sign, str(n*5**k), -k)
        if cls is Decimal:
            return result
        else:
            return cls(result)

    Decimal.from_float = _from_float
38 |
39 |
40 | if Decimal('1.0') != 1.0: # Changed in Python 3.2
41 |
42 | import numbers as _numbers
43 | from decimal import _dec_from_triple
44 |
45 |
46 | class FloatOperation(DecimalException, TypeError):
47 | """Enable stricter semantics for mixing floats and Decimals."""
48 | pass
49 |
50 |
51 | # Adapted from Python 3.1 standard library.
52 | _context_init_orig = Context.__init__
53 | def _context_init_new(self, prec=None, rounding=None,
54 | traps=None, flags=None,
55 | Emin=None, Emax=None,
56 | capitals=None, _clamp=0,
57 | _ignored_flags=None):
58 |
59 | # Call original __init__.
60 | _context_init_orig(self, prec=prec, rounding=rounding, traps=traps,
61 | flags=flags, Emin=Emin, Emax=Emax, capitals=capitals,
62 | _clamp=_clamp, _ignored_flags=_ignored_flags)
63 |
64 | # Add FloatOperation to `traps` dict.
65 | self.traps[FloatOperation] = 0
66 |
67 | Context.__init__ = _context_init_new
68 |
69 |
70 | # Adapted from Python 3.4 standard library.
71 | def _convert_for_comparison(self, other, equality_op=False):
72 | if isinstance(other, Decimal):
73 | return self, other
74 | if isinstance(other, _numbers.Rational):
75 | if not self._is_special:
76 | self = _dec_from_triple(self._sign,
77 | str(int(self._int) * other.denominator),
78 | self._exp)
79 | return self, Decimal(other.numerator)
80 | if equality_op and isinstance(other, _numbers.Complex) and other.imag == 0:
81 | other = other.real
82 | if isinstance(other, float):
83 | context = getcontext()
84 | if equality_op:
85 | context.flags[FloatOperation] = 1
86 | else:
87 | context._raise_error(FloatOperation,
88 | "strict semantics for mixing floats and Decimals are enabled")
89 | return self, Decimal.from_float(other)
90 | return NotImplemented, NotImplemented
91 |
92 | def _eq(self, other, context=None):
93 | self, other = _convert_for_comparison(self, other, equality_op=True)
94 | if other is NotImplemented:
95 | return other
96 | if self._check_nans(other, context):
97 | return False
98 | return self._cmp(other) == 0
99 | Decimal.__eq__ = _eq
100 |
101 | def _ne(self, other, context=None):
102 | self, other = _convert_for_comparison(self, other, equality_op=True)
103 | if other is NotImplemented:
104 | return other
105 | if self._check_nans(other, context):
106 | return True
107 | return self._cmp(other) != 0
108 | Decimal.__ne__ = _ne
109 |
110 | def _lt(self, other, context=None):
111 | self, other = _convert_for_comparison(self, other)
112 | if other is NotImplemented:
113 | return other
114 | ans = self._compare_check_nans(other, context)
115 | if ans:
116 | return False
117 | return self._cmp(other) < 0
118 | Decimal.__lt__ = _lt
119 |
120 | def _le(self, other, context=None):
121 | self, other = _convert_for_comparison(self, other)
122 | if other is NotImplemented:
123 | return other
124 | ans = self._compare_check_nans(other, context)
125 | if ans:
126 | return False
127 | return self._cmp(other) <= 0
128 | Decimal.__le__ = _le
129 |
def _gt(self, other, context=None):
    """Greater-than (>) for Decimal, supporting mixed-type operands."""
    a, b = _convert_for_comparison(self, other)
    if b is NotImplemented:
        return b
    if a._compare_check_nans(b, context):
        return False  # Ordered comparisons involving NaN are False.
    return a._cmp(b) > 0
Decimal.__gt__ = _gt
139 |
def _ge(self, other, context=None):
    """Greater-than-or-equal (>=) for Decimal, supporting mixed types."""
    a, b = _convert_for_comparison(self, other)
    if b is NotImplemented:
        return b
    if a._compare_check_nans(b, context):
        return False  # Ordered comparisons involving NaN are False.
    return a._cmp(b) >= 0
Decimal.__ge__ = _ge
149 |
--------------------------------------------------------------------------------
/docs/how-to/customize-differences.rst:
--------------------------------------------------------------------------------
1 |
2 | .. currentmodule:: datatest
3 |
4 | .. meta::
5 | :description: How to customize error differences.
6 | :keywords: datatest, difference, customize
7 |
8 |
9 | ############################
10 | How to Customize Differences
11 | ############################
12 |
13 | When using a helper function for validation, datatest's default
14 | behavior is to produce :class:`Invalid` differences when the
15 | function returns False. But you can customize this behavior
16 | by returning a difference object instead of False. The returned
17 | difference is used in place of an automatically generated one.
18 |
19 |
20 | Default Behavior
21 | ================
22 |
23 | In the following example, the helper function checks that text
24 | values are upper case and have no extra whitespace. If the values
25 | are good, the function returns ``True``; if the values are bad, it
26 | returns ``False``:
27 |
28 | .. code-block:: python
29 | :linenos:
30 | :emphasize-lines: 6
31 |
32 | from datatest import validate
33 |
34 |
35 | def wellformed(x): # <- Helper function.
36 | """Must be uppercase and no extra whitespace."""
37 | return x == ' '.join(x.split()) and x.isupper()
38 |
39 | data = [
40 | 'CAPE GIRARDEAU',
41 | 'GREENE ',
42 | 'JACKSON',
43 | 'St. Louis',
44 | ]
45 |
46 | validate(data, wellformed)
47 |
48 |
49 | Each time the helper function returns ``False``, an :class:`Invalid`
50 | difference is created:
51 |
52 | .. code-block:: none
53 | :emphasize-lines: 5-6
54 |
55 | Traceback (most recent call last):
56 | File "example.py", line 15, in
57 | validate(data, wellformed)
58 | ValidationError: Must be uppercase and no extra whitespace. (2 differences): [
59 | Invalid('GREENE '),
60 | Invalid('St. Louis'),
61 | ]
62 |
63 |
64 | Custom Differences
65 | ==================
66 |
67 | In this example, the helper function returns a custom ``BadWhitespace``
68 | or ``NotUpperCase`` difference for each bad value:
69 |
70 | .. code-block:: python
71 | :linenos:
72 | :emphasize-lines: 15,17
73 |
74 | from datatest import validate, Invalid
75 |
76 |
77 | class BadWhitespace(Invalid):
78 | """For strings with leading, trailing, or irregular whitespace."""
79 |
80 |
81 | class NotUpperCase(Invalid):
82 | """For strings that aren't upper case."""
83 |
84 |
85 | def wellformed(x): # <- Helper function.
86 | """Must be uppercase and no extra whitespace."""
87 | if x != ' '.join(x.split()):
88 | return BadWhitespace(x)
89 | if not x.isupper():
90 | return NotUpperCase(x)
91 | return True
92 |
93 |
94 | data = [
95 | 'CAPE GIRARDEAU',
96 | 'GREENE ',
97 | 'JACKSON',
98 | 'St. Louis',
99 | ]
100 |
101 | validate(data, wellformed)
102 |
103 |
104 | These differences are used in the ValidationError:
105 |
106 | .. code-block:: none
107 | :emphasize-lines: 5-6
108 |
109 | Traceback (most recent call last):
110 | File "example.py", line 15, in
111 | validate(data, wellformed)
112 | ValidationError: Must be uppercase and no extra whitespace. (2 differences): [
113 | BadWhitespace('GREENE '),
114 | NotUpperCase('St. Louis'),
115 | ]
116 |
117 |
118 | .. caution::
119 |
120 | Typically, you should try to **stick with existing differences**
121 | in your data tests. Only create a custom subclass when its meaning
122 | is evident and doing so helps your data preparation workflow.
123 |
124 | Don't add a custom class when it doesn't benefit your testing
125 | process. At best, you're doing extra work for no added benefit.
126 | And at worst, an ambiguous or needlessly complex subclass can
127 | cause more problems than it solves.
128 |
129 | If you need to resolve ambiguity in a validation, you can split
130 | the check into multiple calls. Below, we perform the same check
131 | demonstrated earlier using two :func:`validate` calls:
132 |
133 | .. code-block:: python
134 | :linenos:
135 | :emphasize-lines: 14,21
136 |
137 | from datatest import validate
138 |
139 | data = [
140 | 'CAPE GIRARDEAU',
141 | 'GREENE ',
142 | 'JACKSON',
143 | 'St. Louis',
144 | ]
145 |
146 | def no_irregular_whitespace(x): # <- Helper function.
147 | """Must have no irregular whitespace."""
148 | return x == ' '.join(x.split())
149 |
150 | validate(data, no_irregular_whitespace)
151 |
152 |
153 | def is_upper_case(x): # <- Helper function.
154 | """Must be upper case."""
155 | return x.isupper()
156 |
157 | validate(data, is_upper_case)
158 |
159 |
160 | ..
161 | # In the future, after adding a comparator interface to validate(),
162 | # possibly change the example to something like the following.
163 |
164 | from enum import Enum
165 | from datatest import validate, Invalid
166 |
167 |
168 | # Likert Scale
169 | class response(Enum):
170 | STRONGLY_OPPOSE = 1
171 | OPPOSE = 2
172 | NEUTRAL = 3
173 | SUPPORT = 4
174 | STRONGLY_SUPPORT = 5
175 |
176 |
177 | # 7-Point Likert Scale
178 | #class response(Enum):
179 | # STRONGLY_OPPOSE = 1
180 | # OPPOSE = 2
181 | # SOMEWHAT_OPPOSE = 3
182 | # NEUTRAL = 4
183 | # SOMEWHAT_SUPPORT = 5
184 | # SUPPORT = 6
185 | # STRONGLY_SUPPORT = 7
186 |
187 |
188 | class Change(Invalid):
189 | """For differences of 1 point."""
190 |
191 |
192 | class LargeChange(Invalid):
193 | """For differences of 2 or more points."""
194 |
195 |
196 | latest_survey = {
197 | 'a': response.SUPPORT,
198 | 'b': response.STRONGLY_OPPOSE,
199 | 'c': response.STRONGLY_SUPPORT,
200 | 'd': response.OPPOSE,
201 | }
202 |
203 | previous_survey = {
204 | 'a': response.SUPPORT,
205 | 'b': response.OPPOSE,
206 | 'c': response.STRONGLY_SUPPORT,
207 | 'd': response.SUPPORT,
208 | }
209 |
210 | validate(latest_survey, previous_survey)
211 |
212 |
--------------------------------------------------------------------------------
/tests/past_api09_load_csv.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import sqlite3
4 | import sys
5 | import warnings
6 | from . import _io as io
7 | from . import _unittest as unittest
8 | from datatest._compatibility.builtins import *
9 |
10 | from datatest._vendor.load_csv import load_csv
11 |
12 | try:
13 | from StringIO import StringIO
14 | except ImportError:
15 | StringIO = None
16 |
17 |
class TestLoadCsv(unittest.TestCase):
    """Tests for load_csv(): loading with an explicit encoding,
    detecting an encoding mismatch, and the 'latin-1' fallback
    behavior used when no encoding argument is given.
    """
    def setUp(self):
        # Throw-away in-memory database; synchronous=OFF and
        # isolation_level=None (autocommit) keep loading fast.
        connection = sqlite3.connect(':memory:')
        connection.execute('PRAGMA synchronous=OFF')
        connection.isolation_level = None
        self.cursor = connection.cursor()

        # Run each test from the sample_files directory so tests can
        # refer to sample CSV files by bare filename.
        self.original_cwd = os.path.abspath(os.getcwd())
        os.chdir(os.path.join(os.path.dirname(__file__), 'sample_files'))

    def tearDown(self):              # It would be best to use addCleanup()
        os.chdir(self.original_cwd)  # but it is not available in Python 2.6.

    @staticmethod
    def get_stream(string, encoding=None):
        """Accepts a string and returns a file-like stream object.

        In Python 2, Unicode files should be opened in binary-mode
        but in Python 3, they should be opened in text-mode. This
        function emulates the appropriate opening behavior.
        """
        fh = io.BytesIO(string)
        if sys.version_info[0] == 2:
            return fh
        return io.TextIOWrapper(fh, encoding=encoding)

    def test_encoding_with_stream(self):
        # Stream plus a correct, explicit encoding argument.
        csvfile = self.get_stream((
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        ), encoding='latin-1')
        load_csv(self.cursor, 'testtable1', csvfile, encoding='latin-1')

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_encoding_with_file(self):
        # File path (relative to sample_files) plus a correct encoding.
        path = 'sample_text_iso88591.csv'
        load_csv(self.cursor, 'testtable', path, encoding='latin-1')

        expected = [
            ('iso88591', chr(0xe6)),  # chr(0xe6) -> æ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable')
        self.assertEqual(list(self.cursor), expected)

    def test_encoding_mismatch(self):
        # A wrong explicit encoding must surface as a decode error
        # rather than silently loading mojibake.
        path = 'sample_text_iso88591.csv'
        wrong_encoding = 'utf-8'  # <- Doesn't match file.

        with self.assertRaises(UnicodeDecodeError):
            load_csv(self.cursor, 'testtable', path, wrong_encoding)

    def test_fallback_with_stream(self):
        with warnings.catch_warnings(record=True):  # Catch warnings issued
            csvfile = self.get_stream((             # when running Python 2.
                b'col1,col2\n'
                b'1,\xe6\n'  # '\xe6' -> æ (ash)
                b'2,\xf0\n'  # '\xf0' -> ð (eth)
                b'3,\xfe\n'  # '\xfe' -> þ (thorn)
            ), encoding='latin-1')
            load_csv(self.cursor, 'testtable1', csvfile)  # <- No encoding arg.

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_StringIO(self):
        if not StringIO:  # <- Python 2.x only.
            return

        # Python 2 StringIO holding raw latin-1 bytes.
        csvfile = StringIO(
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        )

        with warnings.catch_warnings(record=True):
            load_csv(self.cursor, 'testtable1', csvfile)

        expected = [
            ('1', chr(0xe6)),  # chr(0xe6) -> æ
            ('2', chr(0xf0)),  # chr(0xf0) -> ð
            ('3', chr(0xfe)),  # chr(0xfe) -> þ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable1')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_file(self):
        # With no encoding given, the loader should warn that it fell
        # back to 'latin-1' and still load the data correctly.
        with warnings.catch_warnings(record=True) as warning_list:
            warnings.simplefilter('always')
            path = 'sample_text_iso88591.csv'
            load_csv(self.cursor, 'testtable', path)  # <- No encoding arg.

        self.assertEqual(len(warning_list), 1)
        expected = "using fallback 'latin-1'"
        self.assertIn(expected, str(warning_list[0].message))

        expected = [
            ('iso88591', chr(0xe6)),  # chr(0xe6) -> æ
        ]
        self.cursor.execute('SELECT col1, col2 FROM testtable')
        self.assertEqual(list(self.cursor), expected)

    def test_fallback_with_exhaustible_object(self):
        """Exhaustible iterators and unseekable file-like objects
        can only be iterated over once. This means that the usual
        fallback behavior can not be applied and the function must
        raise an exception.
        """
        if not sys.version_info[0] == 2:  # <- Python 2.x only.
            return

        csvfile = self.get_stream((
            b'col1,col2\n'
            b'1,\xe6\n'  # '\xe6' -> æ (ash)
            b'2,\xf0\n'  # '\xf0' -> ð (eth)
            b'3,\xfe\n'  # '\xfe' -> þ (thorn)
        ), encoding='latin-1')
        generator = (x for x in csvfile)  # <- Make stream unseekable.

        with self.assertRaises(UnicodeDecodeError) as cm:
            load_csv(self.cursor, 'testtable', generator)

        error_message = str(cm.exception)
        self.assertIn('cannot attempt fallback', error_message.lower())
156 |
--------------------------------------------------------------------------------
/docs/tutorial/testing-pandas.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. meta::
4 | :description: Datatest examples demonstrating use of pandas DataFrame objects.
5 | :keywords: datatest, pandas, DataFrame
6 |
7 |
8 | ###################
9 | Testing With Pandas
10 | ###################
11 |
12 | Datatest can validate :mod:`pandas` objects (:class:`DataFrame
13 | `, :class:`Series `, and
14 | :class:`Index `) the same way it does with
15 | built-in types.
16 |
17 |
18 | =============
19 | Some Examples
20 | =============
21 |
22 | This example uses a :class:`DataFrame ` to
23 | load and inspect data from a CSV file (:download:`movies.csv
24 | `). The CSV file uses the
25 | following format:
26 |
27 | .. csv-table::
28 | :header: title, rating, year, runtime
29 |
30 | Almost Famous, R, 2000, 122
31 | American Pie, R, 1999, 95
32 | Back to the Future, PG, 1985, 116
33 | Blade Runner, R, 1982, 117
34 | ..., ..., ..., ...
35 |
36 |
37 | .. tabs::
38 |
39 | .. group-tab:: Pytest
40 |
41 | The :download:`test_movies_df.py `
42 | script demonstrates pytest-style tests:
43 |
44 | .. literalinclude:: /_static/tutorial/test_movies_df.py
45 | :language: python
46 | :lineno-match:
47 |
48 | .. group-tab:: Unittest
49 |
50 | The :download:`test_movies_df_unit.py `
51 | script demonstrates unittest-style tests:
52 |
53 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
54 | :language: python
55 | :lineno-match:
56 |
57 |
58 | To run these tests, use the following command:
59 |
60 | .. tabs::
61 |
62 | .. group-tab:: Pytest
63 |
64 | .. code-block:: none
65 |
66 | pytest test_movies_df.py
67 |
68 | .. group-tab:: Unittest
69 |
70 | .. code-block:: none
71 |
72 | python -m datatest test_movies_df_unit.py
73 |
74 |
75 | ========================
76 | Step by Step Explanation
77 | ========================
78 |
79 |
80 | 1. Define a test fixture
81 | ------------------------
82 |
83 | Define a test fixture that loads the CSV file into a
84 | :class:`DataFrame `:
85 |
86 | .. tabs::
87 |
88 | .. group-tab:: Pytest
89 |
90 | .. literalinclude:: /_static/tutorial/test_movies_df.py
91 | :pyobject: df
92 | :lineno-match:
93 |
94 | .. group-tab:: Unittest
95 |
96 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
97 | :pyobject: setUpModule
98 | :lineno-match:
99 |
100 |
101 | 2. Check column names
102 | ---------------------
103 |
104 | Check that the data includes the expected column names:
105 |
106 | .. tabs::
107 |
108 | .. group-tab:: Pytest
109 |
110 | .. literalinclude:: /_static/tutorial/test_movies_df.py
111 | :pyobject: test_columns
112 | :lineno-match:
113 |
114 | .. group-tab:: Unittest
115 |
116 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
117 | :pyobject: TestMovies.test_columns
118 | :lineno-match:
119 |
120 | This validation requires that the set of values in ``df.columns``
121 | matches the required :py:class:`set`. The ``df.columns`` attribute is
122 | an :class:`Index ` object---datatest treats this the same
123 | as any other sequence of values.
124 |
125 | This test is marked ``mandatory`` because it's a prerequisite that must
126 | be satisfied before any of the other tests can pass. When a mandatory
127 | test fails, the test suite stops immediately and no more tests are run.
128 |
129 |
130 | 3. Check 'title' values
131 | -----------------------
132 |
133 | Check that values in the **title** column begin with an upper-case letter:
134 |
135 | .. tabs::
136 |
137 | .. group-tab:: Pytest
138 |
139 | .. literalinclude:: /_static/tutorial/test_movies_df.py
140 | :pyobject: test_title
141 | :lineno-match:
142 |
143 | .. group-tab:: Unittest
144 |
145 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
146 | :pyobject: TestMovies.test_title
147 | :lineno-match:
148 |
149 | This validation checks that each value in the ``df['title']`` matches
150 | the regular expression ``^[A-Z]``.
151 |
152 |
153 | 4. Check 'rating' values
154 | ------------------------
155 |
156 | Check that values in the **rating** column match one of the allowed codes:
157 |
158 | .. tabs::
159 |
160 | .. group-tab:: Pytest
161 |
162 | .. literalinclude:: /_static/tutorial/test_movies_df.py
163 | :pyobject: test_rating
164 | :lineno-match:
165 |
166 | .. group-tab:: Unittest
167 |
168 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
169 | :pyobject: TestMovies.test_rating
170 | :lineno-match:
171 |
172 | This validation checks that the values in ``df['rating']`` are also
173 | contained in the given set.
174 |
175 |
176 | 5. Check 'year' and 'runtime' types
177 | -----------------------------------
178 |
179 | Check that values in the **year** and **runtime** columns are integers:
180 |
181 | .. tabs::
182 |
183 | .. group-tab:: Pytest
184 |
185 | .. literalinclude:: /_static/tutorial/test_movies_df.py
186 | :pyobject: test_year
187 | :lineno-match:
188 |
189 | .. literalinclude:: /_static/tutorial/test_movies_df.py
190 | :pyobject: test_runtime
191 | :lineno-match:
192 |
193 | .. group-tab:: Unittest
194 |
195 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
196 | :pyobject: TestMovies.test_year
197 | :lineno-match:
198 |
199 | .. literalinclude:: /_static/tutorial/test_movies_df_unit.py
200 | :pyobject: TestMovies.test_runtime
201 | :lineno-match:
202 |
203 |
204 | ================
205 | More Information
206 | ================
207 |
208 | .. seealso::
209 |
210 | See the :doc:`../intro/validating-pandas` introduction docs
211 | for more information and examples.
212 |
213 | See :ref:`pandas-accessor-docs` to learn about the alternate
214 | validation syntax provided by pandas **accessor extensions**.
215 |
216 |
--------------------------------------------------------------------------------
/datatest/_normalize.py:
--------------------------------------------------------------------------------
1 | """Normalize objects for validation."""
2 |
3 | import sys
4 | from ._compatibility.collections.abc import Collection
5 | from ._compatibility.collections.abc import Iterable
6 | from ._compatibility.collections.abc import Iterator
7 | from ._compatibility.collections.abc import Mapping
8 |
9 | from ._utils import exhaustible
10 | from ._utils import iterpeek
11 | from ._utils import IterItems
12 |
13 |
class TypedIterator(Iterator):
    """An iterator that remembers which container type its values
    are meant to be evaluated to (see :meth:`fetch`).
    """
    def __init__(self, iterable, evaltype):
        self.evaltype = evaltype         # Target container type.
        self._iterator = iter(iterable)  # Underlying value stream.

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._iterator)

    def next(self):  # Python 2.x support.
        return self.__next__()

    def fetch(self):
        """Consume the remaining values and return them as an
        instance of *evaltype*.
        """
        return self.evaltype(self._iterator)
30 |
31 |
32 | NoneType = type(None)
33 |
34 |
def _normalize_lazy(obj):
    """Return an iterator for lazy evaluation.

    Recognized container types (squint queries/results, pandas
    DataFrame/Series, NumPy arrays, and DBAPI2 cursors) are adapted
    into plain iterators, TypedIterator, or IterItems objects. Any
    unrecognized object is returned unchanged.
    """
    if isinstance(obj, TypedIterator):
        if issubclass(obj.evaltype, Mapping):
            obj = IterItems(obj)
        return obj  # <- EXIT!

    # Separate Squint module.
    squint = sys.modules.get('squint', None)
    if squint:
        if isinstance(obj, squint.Query):
            obj = obj.execute()
            if issubclass(getattr(obj, 'evaltype', NoneType), Mapping):
                obj = IterItems(obj)
            return obj  # <- EXIT!

        if isinstance(obj, squint.Result):
            if issubclass(obj.evaltype, Mapping):
                obj = IterItems(obj)
            return obj  # <- EXIT!

    pandas = sys.modules.get('pandas', None)
    if pandas:
        if isinstance(obj, pandas.DataFrame):
            # Duplicate index labels would make the mapping ambiguous.
            if not obj.index.is_unique:
                msg = '{0} index contains duplicates, must be unique'
                raise ValueError(msg.format(obj.__class__.__name__))

            if isinstance(obj.index, pandas.RangeIndex):
                # DataFrame with RangeIndex is treated as an iterator.
                if len(obj.columns) == 1:
                    obj = (x[0] for x in obj.values)
                else:
                    obj = (tuple(x) for x in obj.values)
                return TypedIterator(obj, evaltype=list)  # <- EXIT!
            else:
                # DataFrame with another index type is treated as a mapping.
                if len(obj.columns) == 1:
                    gen = ((x[0], x[1]) for x in obj.itertuples())
                else:
                    gen = ((x[0], tuple(x[1:])) for x in obj.itertuples())
                return IterItems(gen)  # <- EXIT!
        elif isinstance(obj, pandas.Series):
            # Duplicate index labels would make the mapping ambiguous.
            if not obj.index.is_unique:
                msg = '{0} index contains duplicates, must be unique'
                raise ValueError(msg.format(obj.__class__.__name__))

            if isinstance(obj.index, pandas.RangeIndex):
                # Series with RangeIndex is treated as an iterator.
                return TypedIterator(obj.values, evaltype=list)  # <- EXIT!
            else:
                # Series with another index type is treated as a mapping.
                # Series.iteritems() was removed in pandas 2.0, so prefer
                # items() and keep iteritems() only as a fallback for
                # very old pandas versions.
                if hasattr(obj, 'items'):
                    return IterItems(obj.items())  # <- EXIT!
                return IterItems(obj.iteritems())  # <- EXIT!

    numpy = sys.modules.get('numpy', None)
    if numpy and isinstance(obj, numpy.ndarray):
        # Two-dimensional array, recarray, or structured array.
        if obj.ndim == 2 or (obj.ndim == 1 and len(obj.dtype) > 1):
            obj = (tuple(x) for x in obj)
            return TypedIterator(obj, evaltype=list)  # <- EXIT!

        # One-dimensional array, recarray, or structured array.
        if obj.ndim == 1:
            if len(obj.dtype) == 1:        # Unpack single-valued recarray
                obj = (x[0] for x in obj)  # or structured array.
            else:
                obj = iter(obj)
            return TypedIterator(obj, evaltype=list)  # <- EXIT!

    # Check for cursor-like object (if obj has DBAPI2 cursor attributes).
    if all(hasattr(obj, n) for n in ('fetchone', 'execute',
                                     'rowcount', 'description')):
        if not isinstance(obj, Iterable):
            def cursor_to_gen(cursor):       # While most cursor objects are
                while True:                  # iterable, it is not required
                    row = cursor.fetchone()  # by the DBAPI2 specification.
                    if row is None:
                        break
                    yield row
            obj = cursor_to_gen(obj)

        first, obj = iterpeek(obj)
        if first and len(first) == 1:
            obj = iter(x[0] for x in obj)  # Unwrap single-value records.
        return obj  # <- EXIT!

    return obj
122 |
123 |
def _normalize_eager(obj, default_type=None):
    """Eagerly evaluate *obj* when possible. When *obj* is exhaustible,
    a *default_type* must be specified. When provided, *default_type*
    must be a collection type (a sized iterable container).
    """
    if isinstance(obj, TypedIterator):
        return obj.fetch()  # Evaluate to the iterator's declared type.

    # Separate Squint module.
    squint = sys.modules.get('squint', None)
    if squint and isinstance(obj, squint.Result):
        return obj.fetch()

    if isinstance(obj, IterItems):
        return dict(obj)

    if isinstance(obj, Iterable) and exhaustible(obj):
        # An exhaustible iterable can only be consumed once, so a
        # concrete collection type is required to hold its values.
        valid_default = (isinstance(default_type, type)
                         and issubclass(default_type, Collection))
        if not valid_default:
            msg = ("exhaustible type '{0}' cannot be eagerly evaluated "
                   "without specifying a 'default_type' collection")
            raise TypeError(msg.format(obj.__class__.__name__))
        return default_type(obj)

    return obj
150 |
151 |
def normalize(obj, lazy_evaluation=False, default_type=None):
    """Normalize *obj* for validation, lazily or eagerly.

    With *lazy_evaluation* the normalized iterator is returned as-is;
    otherwise it is eagerly evaluated (see _normalize_eager).
    """
    normalized = _normalize_lazy(obj)
    if not lazy_evaluation:
        normalized = _normalize_eager(normalized, default_type)
    return normalized
157 |
--------------------------------------------------------------------------------