├── MANIFEST.in ├── .gitignore ├── example-data-good.csv ├── example-data-bad.csv ├── .project ├── .pydevproject ├── setup.py ├── LICENSE.txt ├── CHANGES.txt ├── README.txt ├── README.rst ├── example.py ├── tests.py └── csvvalidator.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .coverage 4 | dist 5 | build 6 | MANIFEST 7 | 8 | -------------------------------------------------------------------------------- /example-data-good.csv: -------------------------------------------------------------------------------- 1 | study_id patient_id gender age_years age_months date_inclusion 2 | 1 1 M 2 27 2011-01-11 3 | 1 2 M 5 61 2011-02-21 4 | 1 3 F 9 119 2011-03-01 5 | 2 1 M 2 32 2011-04-04 6 | 2 2 M 3 36 2011-05-07 7 | 2 3 F 7 90 2011-06-19 8 | 2 4 F 1 14 2011-07-22 -------------------------------------------------------------------------------- /example-data-bad.csv: -------------------------------------------------------------------------------- 1 | study_id patient_id gender age_years age_months date_inclusion x 2 | x 4 F 2 27 2011-01-01 3 | 1 x F 2 25 2011-01-01 4 | 1 1 x 2 27 2011-01-01 5 | 1 2 M x 61 2011-01-01 6 | 1 3 F 9 x 2011-01-01 7 | 1 3 M 1 17 2011-01-01 8 | 1 4 M 2 25 2011-01-01 x 9 | 2 1 M 200 32 2011-01-01 10 | 2 2 M 3 24 2011-01-01 11 | 2 3 F 7 90 2011-01-01 12 | 2 4 F 1 14 2011-01-01 13 | 2 5 F 2 25 1999-13-01 14 | 2 6 M 2 25 1999-12-32 -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | csvvalidator 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | 
-------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Default 6 | python 2.7 7 | 8 | /csvvalidator 9 | 10 | 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | setup(name='csvvalidator', 6 | version='1.3-SNAPSHOT', 7 | author='Alistair Miles', 8 | author_email='alimanfoo@googlemail.com', 9 | url='https://github.com/alimanfoo/csvvalidator', 10 | license='MIT License', 11 | py_modules=['csvvalidator'], 12 | description='A simple library for validating data contained in CSV files or similar row-oriented data sources.', 13 | long_description=open('README.txt').read(), 14 | classifiers=['Intended Audience :: Developers', 15 | 'License :: OSI Approved :: MIT License', 16 | 'Programming Language :: Python', 17 | 'Topic :: Software Development :: Libraries :: Python Modules' 18 | ] 19 | ) 20 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Alistair Miles 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v1.1, 2011-07-27 2 | ================ 3 | 4 | * The convention for record check functions has changed in this 5 | release. Record check functions should now raise a RecordError if 6 | there is a validation problem in the record. RecordError is a new 7 | class in the csvvalidator module in this release. This change makes 8 | it easier to distinguish between exceptions reporting validation 9 | problems (i.e., RecordError), and unexpected exceptions raised 10 | during record checks (i.e., all other exception classes, including 11 | ValueError). See also https://github.com/alimanfoo/csvvalidator/issues/3 12 | 13 | * This release supports a new convention for defining record check 14 | functions. In addition to the existing style, where record check 15 | functions are defined separately and added via the 16 | CSVValidator.add_record_check method, you can now also sub-class 17 | CSVValidator, and any method whose name starts with 'check' will be 18 | invoked as a record check function. This is similar to the style for 19 | 'assert' methods. See also https://github.com/alimanfoo/csvvalidator/issues/4 20 | 21 | v1.0, 2011-07-21 22 | ================ 23 | 24 | Initial release. 
25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | ============ 2 | csvvalidator 3 | ============ 4 | 5 | This module provides some simple utilities for validating data contained in CSV 6 | files, or other similar data sources. 7 | 8 | The source code for this module lives at: 9 | 10 | https://github.com/alimanfoo/csvvalidator 11 | 12 | Please report any bugs or feature requests via the issue tracker there. 13 | 14 | Installation 15 | ============ 16 | 17 | This module is registered with the Python package index, so you can do:: 18 | 19 | $ easy_install csvvalidator 20 | 21 | ... or download from http://pypi.python.org/pypi/csvvalidator and 22 | install in the usual way:: 23 | 24 | $ python setup.py install 25 | 26 | If you want the bleeding edge, clone the source code repository:: 27 | 28 | $ git clone git://github.com/alimanfoo/csvvalidator.git 29 | $ cd csvvalidator 30 | $ python setup.py install 31 | 32 | Usage 33 | ===== 34 | 35 | The `CSVValidator` class is the foundation for all validator objects that are 36 | capable of validating CSV data. 
37 | 38 | You can use the CSVValidator class to dynamically construct a validator, e.g.:: 39 | 40 | import sys 41 | import csv 42 | from csvvalidator import * 43 | 44 | field_names = ( 45 | 'study_id', 46 | 'patient_id', 47 | 'gender', 48 | 'age_years', 49 | 'age_months', 50 | 'date_inclusion' 51 | ) 52 | 53 | validator = CSVValidator(field_names) 54 | 55 | # basic header and record length checks 56 | validator.add_header_check('EX1', 'bad header') 57 | validator.add_record_length_check('EX2', 'unexpected record length') 58 | 59 | # some simple value checks 60 | validator.add_value_check('study_id', int, 61 | 'EX3', 'study id must be an integer') 62 | validator.add_value_check('patient_id', int, 63 | 'EX4', 'patient id must be an integer') 64 | validator.add_value_check('gender', enumeration('M', 'F'), 65 | 'EX5', 'invalid gender') 66 | validator.add_value_check('age_years', number_range_inclusive(0, 120, int), 67 | 'EX6', 'invalid age in years') 68 | validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'), 69 | 'EX7', 'invalid date') 70 | 71 | # a more complicated record check 72 | def check_age_variables(r): 73 | age_years = int(r['age_years']) 74 | age_months = int(r['age_months']) 75 | valid = (age_months >= age_years * 12 and 76 | age_months - age_years * 12 < 12) 77 | if not valid: 78 | raise RecordError('EX8', 'invalid age variables') 79 | validator.add_record_check(check_age_variables) 80 | 81 | # validate the data and write problems to stdout 82 | data = csv.reader(open('/path/to/data.csv', 'r'), delimiter='\t') 83 | problems = validator.validate(data) 84 | write_problems(problems, sys.stdout) 85 | 86 | For more complex use cases you can also sub-class `CSVValidator` to define 87 | re-usable validator classes for specific data sources. 88 | 89 | For a complete account of all of the functionality available from this module, 90 | see the example.py and tests.py modules in the source code repository. 
91 | 92 | Notes 93 | ===== 94 | 95 | Note that the `csvvalidator` module is intended to be used in combination with 96 | the standard Python `csv` module. The `csvvalidator` module **will not** 97 | validate the *syntax* of a CSV file. Rather, the `csvvalidator` module can be 98 | used to validate any source of row-oriented data, such as is provided by a 99 | `csv.reader` object. 100 | 101 | I.e., if you want to validate data from a CSV file, you have to first construct 102 | a CSV reader using the standard Python `csv` module, specifying the appropriate 103 | dialect, and then pass the CSV reader as the source of data to either the 104 | `CSVValidator.validate` or the `CSVValidator.ivalidate` method. 105 | 106 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | csvvalidator 3 | ============ 4 | 5 | **This package is no longer maintained.** Functionality for validating tables has been migrated to `petl <https://petl.readthedocs.io/en/stable/transform.html#petl.transform.validation.validate>`_. 6 | 7 | ---- 8 | 9 | This module provides some simple utilities for validating data contained in CSV 10 | files, or other similar data sources. 11 | 12 | The source code for this module lives at: 13 | 14 | https://github.com/alimanfoo/csvvalidator 15 | 16 | Please report any bugs or feature requests via the issue tracker there. 17 | 18 | Installation 19 | ============ 20 | 21 | This module is registered with the Python package index, so you can do:: 22 | 23 | $ easy_install csvvalidator 24 | 25 | ... 
or download from http://pypi.python.org/pypi/csvvalidator and 26 | install in the usual way:: 27 | 28 | $ python setup.py install 29 | 30 | If you want the bleeding edge, clone the source code repository:: 31 | 32 | $ git clone git://github.com/alimanfoo/csvvalidator.git 33 | $ cd csvvalidator 34 | $ python setup.py install 35 | 36 | Usage 37 | ===== 38 | 39 | The `CSVValidator` class is the foundation for all validator objects that are 40 | capable of validating CSV data. 41 | 42 | You can use the CSVValidator class to dynamically construct a validator, e.g.:: 43 | 44 | import sys 45 | import csv 46 | from csvvalidator import * 47 | 48 | field_names = ( 49 | 'study_id', 50 | 'patient_id', 51 | 'gender', 52 | 'age_years', 53 | 'age_months', 54 | 'date_inclusion' 55 | ) 56 | 57 | validator = CSVValidator(field_names) 58 | 59 | # basic header and record length checks 60 | validator.add_header_check('EX1', 'bad header') 61 | validator.add_record_length_check('EX2', 'unexpected record length') 62 | 63 | # some simple value checks 64 | validator.add_value_check('study_id', int, 65 | 'EX3', 'study id must be an integer') 66 | validator.add_value_check('patient_id', int, 67 | 'EX4', 'patient id must be an integer') 68 | validator.add_value_check('gender', enumeration('M', 'F'), 69 | 'EX5', 'invalid gender') 70 | validator.add_value_check('age_years', number_range_inclusive(0, 120, int), 71 | 'EX6', 'invalid age in years') 72 | validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'), 73 | 'EX7', 'invalid date') 74 | 75 | # a more complicated record check 76 | def check_age_variables(r): 77 | age_years = int(r['age_years']) 78 | age_months = int(r['age_months']) 79 | valid = (age_months >= age_years * 12 and 80 | age_months - age_years * 12 < 12) 81 | if not valid: 82 | raise RecordError('EX8', 'invalid age variables') 83 | validator.add_record_check(check_age_variables) 84 | 85 | # validate the data and write problems to stdout 86 | data = 
csv.reader(open('/path/to/data.csv', 'r'), delimiter='\t') 87 | problems = validator.validate(data) 88 | write_problems(problems, sys.stdout) 89 | 90 | For more complex use cases you can also sub-class `CSVValidator` to define 91 | re-usable validator classes for specific data sources. 92 | 93 | For a complete account of all of the functionality available from this module, 94 | see the example.py and tests.py modules in the source code repository. 95 | 96 | Notes 97 | ===== 98 | 99 | Note that the `csvvalidator` module is intended to be used in combination with 100 | the standard Python `csv` module. The `csvvalidator` module **will not** 101 | validate the *syntax* of a CSV file. Rather, the `csvvalidator` module can be 102 | used to validate any source of row-oriented data, such as is provided by a 103 | `csv.reader` object. 104 | 105 | I.e., if you want to validate data from a CSV file, you have to first construct 106 | a CSV reader using the standard Python `csv` module, specifying the appropriate 107 | dialect, and then pass the CSV reader as the source of data to either the 108 | `CSVValidator.validate` or the `CSVValidator.ivalidate` method. 109 | 110 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | An executable Python script illustrating the use of the CSVValidator module. 5 | 6 | This script illustrates some, but not all, of the features available. For a 7 | complete account of all features available, see the tests.py module. 
8 | """ 9 | 10 | import argparse 11 | import os 12 | import sys 13 | import csv 14 | from csvvalidator import CSVValidator, enumeration, number_range_inclusive,\ 15 | write_problems, datetime_string, RecordError 16 | 17 | 18 | def create_validator(): 19 | """Create an example CSV validator for patient demographic data.""" 20 | 21 | field_names = ( 22 | 'study_id', 23 | 'patient_id', 24 | 'gender', 25 | 'age_years', 26 | 'age_months', 27 | 'date_inclusion' 28 | ) 29 | validator = CSVValidator(field_names) 30 | 31 | # basic header and record length checks 32 | validator.add_header_check('EX1', 'bad header') 33 | validator.add_record_length_check('EX2', 'unexpected record length') 34 | 35 | # some simple value checks 36 | validator.add_value_check('study_id', int, 37 | 'EX3', 'study id must be an integer') 38 | validator.add_value_check('patient_id', int, 39 | 'EX4', 'patient id must be an integer') 40 | validator.add_value_check('gender', enumeration('M', 'F'), 41 | 'EX5', 'invalid gender') 42 | validator.add_value_check('age_years', number_range_inclusive(0, 120, int), 43 | 'EX6', 'invalid age in years') 44 | validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'), 45 | 'EX7', 'invalid date') 46 | 47 | # a more complicated record check 48 | def check_age_variables(r): 49 | age_years = int(r['age_years']) 50 | age_months = int(r['age_months']) 51 | valid = (age_months >= age_years * 12 and 52 | age_months % age_years < 12) 53 | if not valid: 54 | raise RecordError('EX8', 'invalid age variables') 55 | validator.add_record_check(check_age_variables) 56 | 57 | return validator 58 | 59 | 60 | def main(): 61 | """Main function.""" 62 | 63 | # define a command-line argument parser 64 | description = 'Validate a CSV data file.' 
65 | parser = argparse.ArgumentParser(description=description) 66 | parser.add_argument('file', 67 | metavar='FILE', 68 | help='a file to be validated') 69 | parser.add_argument('-l', '--limit', 70 | dest='limit', 71 | type=int, 72 | action='store', 73 | default=0, 74 | help='limit the number of problems reported' 75 | ) 76 | parser.add_argument('-s', '--summarize', 77 | dest='summarize', 78 | action='store_true', 79 | default=False, 80 | help='output only a summary of the different types of problem found' 81 | ) 82 | parser.add_argument('-e', '--report-unexpected-exceptions', 83 | dest='report_unexpected_exceptions', 84 | action='store_true', 85 | default=False, 86 | help='report any unexpected exceptions as problems' 87 | ) 88 | 89 | # parse arguments 90 | args = parser.parse_args() 91 | 92 | # sanity check arguments 93 | if not os.path.isfile(args.file): 94 | print '%s is not a file' % args.file 95 | sys.exit(1) 96 | 97 | with open(args.file, 'r') as f: 98 | 99 | # set up a csv reader for the data 100 | data = csv.reader(f, delimiter='\t') 101 | 102 | # create a validator 103 | validator = create_validator() 104 | 105 | # validate the data from the csv reader 106 | # N.B., validate() returns a list of problems; 107 | # if you expect a large number of problems, use ivalidate() instead 108 | # of validate(), but bear in mind that ivalidate() returns an iterator 109 | # so there is no len() 110 | problems = validator.validate(data, 111 | summarize=args.summarize, 112 | report_unexpected_exceptions=args.report_unexpected_exceptions, 113 | context={'file': args.file}) 114 | 115 | # write problems to stdout as restructured text 116 | write_problems(problems, sys.stdout, 117 | summarize=args.summarize, 118 | limit=args.limit) 119 | 120 | # decide how to exit 121 | if problems: # will not work with ivalidate() because it returns an iterator 122 | sys.exit(1) 123 | else: 124 | sys.exit(0) 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | 130 | 
-------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the `csvvalidator` module. 3 | 4 | """ 5 | 6 | 7 | import logging 8 | import math 9 | 10 | from csvvalidator import CSVValidator, VALUE_CHECK_FAILED, MESSAGES,\ 11 | HEADER_CHECK_FAILED, RECORD_LENGTH_CHECK_FAILED, enumeration, match_pattern,\ 12 | search_pattern, number_range_inclusive, number_range_exclusive,\ 13 | VALUE_PREDICATE_FALSE, RECORD_PREDICATE_FALSE, UNIQUE_CHECK_FAILED,\ 14 | ASSERT_CHECK_FAILED, UNEXPECTED_EXCEPTION, write_problems, datetime_string,\ 15 | RECORD_CHECK_FAILED, datetime_range_inclusive, datetime_range_exclusive,\ 16 | RecordError 17 | 18 | 19 | # logging setup 20 | logger = logging.getLogger(__name__) 21 | handler = logging.StreamHandler() 22 | handler.setLevel(logging.DEBUG) 23 | formatter = logging.Formatter('%(levelname)s - %(funcName)s - %(message)s') 24 | handler.setFormatter(formatter) 25 | logger.addHandler(handler) 26 | debug, info, warning, error = logger.debug, logger.info, logger.warning, logger.error 27 | 28 | 29 | def test_value_checks(): 30 | """Some very simple tests of value checks.""" 31 | 32 | # a simple validator to be tested 33 | field_names=('foo', 'bar') 34 | validator = CSVValidator(field_names) 35 | validator.add_value_check('foo', int) 36 | validator.add_value_check('bar', float) 37 | 38 | # some test data 39 | data = ( 40 | ('foo', 'bar'), # row 1 - header row 41 | ('12', '3.4'), # row 2 - valid 42 | ('1.2', '3.4'), # row 3 - foo invalid 43 | ('abc', '3.4'), # row 4 - foo invalid 44 | ('12', 'abc'), # row 5 - bar invalid 45 | ('', '3.4'), # row 6 - foo invalid (empty) 46 | ('12', ''), # row 7 - bar invalid (empty) 47 | ('abc', 'def') # row 8 - both invalid 48 | ) 49 | 50 | # run the validator on the test data 51 | problems = validator.validate(data) 52 | 53 | assert len(problems) == 7 54 | 55 | # N.B., expect row and 
column indices start from 1 56 | 57 | problems_row2 = [p for p in problems if p['row'] == 2] 58 | assert len(problems_row2) == 0 # should be valid 59 | 60 | problems_row3 = [p for p in problems if p['row'] == 3] 61 | assert len(problems_row3) == 1 62 | p = problems_row3[0] # convenience variable 63 | assert p['column'] == 1 # report column index 64 | assert p['field'] == 'foo' # report field name 65 | assert p['code'] == VALUE_CHECK_FAILED # default problem code for value checks 66 | assert p['message'] == MESSAGES[VALUE_CHECK_FAILED] # default message 67 | assert p['value'] == '1.2' # report bad value 68 | assert p['record'] == ('1.2', '3.4') # report record 69 | 70 | problems_row4 = [p for p in problems if p['row'] == 4] 71 | assert len(problems_row4) == 1 72 | p = problems_row4[0] # convenience variable 73 | assert p['column'] == 1 74 | assert p['field'] == 'foo' 75 | assert p['code'] == VALUE_CHECK_FAILED 76 | assert p['message'] == MESSAGES[VALUE_CHECK_FAILED] 77 | assert p['value'] == 'abc' 78 | assert p['record'] == ('abc', '3.4') 79 | 80 | problems_row5 = [p for p in problems if p['row'] == 5] 81 | assert len(problems_row5) == 1 82 | p = problems_row5[0] # convenience variable 83 | assert p['column'] == 2 84 | assert p['field'] == 'bar' 85 | assert p['code'] == VALUE_CHECK_FAILED 86 | assert p['message'] == MESSAGES[VALUE_CHECK_FAILED] 87 | assert p['value'] == 'abc' 88 | assert p['record'] == ('12', 'abc') 89 | 90 | problems_row6 = [p for p in problems if p['row'] == 6] 91 | assert len(problems_row6) == 1 92 | p = problems_row6[0] # convenience variable 93 | assert p['column'] == 1 94 | assert p['field'] == 'foo' 95 | assert p['code'] == VALUE_CHECK_FAILED 96 | assert p['message'] == MESSAGES[VALUE_CHECK_FAILED] 97 | assert p['value'] == '' 98 | assert p['record'] == ('', '3.4') 99 | 100 | problems_row7 = [p for p in problems if p['row'] == 7] 101 | assert len(problems_row7) == 1 102 | p = problems_row7[0] # convenience variable 103 | assert p['column'] == 
2 104 | assert p['field'] == 'bar' 105 | assert p['code'] == VALUE_CHECK_FAILED 106 | assert p['message'] == MESSAGES[VALUE_CHECK_FAILED] 107 | assert p['value'] == '' 108 | assert p['record'] == ('12', '') 109 | 110 | problems_row8 = [p for p in problems if p['row'] == 8] 111 | assert len(problems_row8) == 2 # expect both problems are found 112 | p0 = problems_row8[0] # convenience variable 113 | assert p0['column'] == 1 114 | assert p0['field'] == 'foo' 115 | assert p0['code'] == VALUE_CHECK_FAILED 116 | assert p0['message'] == MESSAGES[VALUE_CHECK_FAILED] 117 | assert p0['value'] == 'abc' 118 | assert p0['record'] == ('abc', 'def') 119 | p1 = problems_row8[1] # convenience variable 120 | assert p1['column'] == 2 121 | assert p1['field'] == 'bar' 122 | assert p1['code'] == VALUE_CHECK_FAILED 123 | assert p1['message'] == MESSAGES[VALUE_CHECK_FAILED] 124 | assert p1['value'] == 'def' 125 | assert p1['record'] == ('abc', 'def') 126 | 127 | 128 | def test_header_check(): 129 | """Test the header checks work.""" 130 | 131 | field_names = ('foo', 'bar') 132 | validator = CSVValidator(field_names) 133 | validator.add_header_check() # use default code and message 134 | validator.add_header_check(code='X1', message='custom message') # provide custom code and message 135 | 136 | data = ( 137 | ('foo', 'baz'), 138 | ('123', '456') 139 | ) 140 | 141 | problems = validator.validate(data) 142 | assert len(problems) == 2 143 | 144 | p0 = problems[0] 145 | assert p0['code'] == HEADER_CHECK_FAILED 146 | assert p0['message'] == MESSAGES[HEADER_CHECK_FAILED] 147 | assert p0['record'] == ('foo', 'baz') 148 | assert p0['missing'] == set(['bar']) 149 | assert p0['unexpected'] == set(['baz']) 150 | assert p0['row'] == 1 151 | 152 | p1 = problems[1] 153 | assert p1['code'] == 'X1' 154 | assert p1['message'] == 'custom message' 155 | assert p1['missing'] == set(['bar']) 156 | assert p1['unexpected'] == set(['baz']) 157 | assert p1['record'] == ('foo', 'baz') 158 | assert p1['row'] == 1 
159 | 160 | 161 | def test_ignore_lines(): 162 | """Test instructions to ignore lines works.""" 163 | 164 | field_names = ('foo', 'bar') 165 | validator = CSVValidator(field_names) 166 | validator.add_header_check() 167 | validator.add_value_check('foo', int) 168 | validator.add_value_check('bar', float) 169 | 170 | data = ( 171 | ('ignore', 'me', 'please'), 172 | ('ignore', 'me', 'too', 'please'), 173 | ('foo', 'baz'), 174 | ('1.2', 'abc') 175 | ) 176 | 177 | problems = validator.validate(data, ignore_lines=2) 178 | assert len(problems) == 3 179 | 180 | header_problems = [p for p in problems if p['code'] == HEADER_CHECK_FAILED] 181 | assert len(header_problems) == 1 182 | assert header_problems[0]['row'] == 3 183 | 184 | value_problems = [p for p in problems if p['code'] == VALUE_CHECK_FAILED] 185 | assert len(value_problems) == 2 186 | for p in value_problems: 187 | assert p['row'] == 4 188 | 189 | 190 | def test_record_length_checks(): 191 | """Test the record length checks.""" 192 | 193 | field_names = ('foo', 'bar') 194 | validator = CSVValidator(field_names) 195 | validator.add_record_length_check() # test default code and message 196 | validator.add_record_length_check('X2', 'custom message') 197 | 198 | data = ( 199 | ('foo', 'bar'), 200 | ('12', '3.4'), 201 | ('12',), # be careful with syntax for singleton tuples 202 | ('12', '3.4', 'spong') 203 | ) 204 | 205 | problems = validator.validate(data) 206 | assert len(problems) == 4, len(problems) 207 | 208 | # find problems reported under default code 209 | default_problems = [p for p in problems if p['code'] == RECORD_LENGTH_CHECK_FAILED] 210 | assert len(default_problems) == 2 211 | d0 = default_problems[0] 212 | assert d0['message'] == MESSAGES[RECORD_LENGTH_CHECK_FAILED] 213 | assert d0['row'] == 3 214 | assert d0['record'] == ('12',) 215 | assert d0['length'] == 1 216 | d1 = default_problems[1] 217 | assert d1['message'] == MESSAGES[RECORD_LENGTH_CHECK_FAILED] 218 | assert d1['row'] == 4 219 | assert 
d1['record'] == ('12', '3.4', 'spong') 220 | assert d1['length'] == 3 221 | 222 | # find problems reported under custom code 223 | custom_problems = [p for p in problems if p['code'] == 'X2'] 224 | assert len(custom_problems) == 2 225 | c0 = custom_problems[0] 226 | assert c0['message'] == 'custom message' 227 | assert c0['row'] == 3 228 | assert c0['record'] == ('12',) 229 | assert c0['length'] == 1 230 | c1 = custom_problems[1] 231 | assert c1['message'] == 'custom message' 232 | assert c1['row'] == 4 233 | assert c1['record'] == ('12', '3.4', 'spong') 234 | assert c1['length'] == 3 235 | 236 | 237 | def test_value_checks_with_missing_values(): 238 | """ 239 | Establish expected behaviour for value checks where there are missing values 240 | in the records. 241 | 242 | """ 243 | 244 | field_names = ('foo', 'bar') 245 | validator = CSVValidator(field_names) 246 | validator.add_value_check('bar', float) 247 | 248 | data = ( 249 | ('foo', 'bar'), 250 | ('12',) # this is missing value for bar, what happens to value check? 
251 | ) 252 | 253 | problems = validator.validate(data) 254 | 255 | # missing values are ignored - use record length checks to find these 256 | assert len(problems) == 0 257 | 258 | 259 | def test_value_check_enumeration(): 260 | """Test value checks with the enumeration() function.""" 261 | 262 | field_names = ('foo', 'bar', 'baz') 263 | validator = CSVValidator(field_names) 264 | # define an enumeration directly with arguments 265 | validator.add_value_check('bar', enumeration('M', 'F')) 266 | # define an enumeration by passing in a list or tuple 267 | flavours = ('chocolate', 'vanilla', 'strawberry') 268 | validator.add_value_check('baz', enumeration(flavours)) 269 | 270 | data = ( 271 | ('foo', 'bar', 'baz'), 272 | ('1', 'M', 'chocolate'), 273 | ('2', 'F', 'maple pecan'), 274 | ('3', 'X', 'strawberry') 275 | ) 276 | 277 | problems = validator.validate(data) 278 | assert len(problems) == 2 279 | 280 | p0 = problems[0] 281 | assert p0['code'] == VALUE_CHECK_FAILED 282 | assert p0['row'] == 3 283 | assert p0['column'] == 3 284 | assert p0['field'] == 'baz' 285 | assert p0['value'] == 'maple pecan' 286 | assert p0['record'] == ('2', 'F', 'maple pecan') 287 | 288 | p1 = problems[1] 289 | assert p1['code'] == VALUE_CHECK_FAILED 290 | assert p1['row'] == 4 291 | assert p1['column'] == 2 292 | assert p1['field'] == 'bar' 293 | assert p1['value'] == 'X' 294 | assert p1['record'] == ('3', 'X', 'strawberry') 295 | 296 | 297 | def test_value_check_match_pattern(): 298 | """Test value checks with the match_pattern() function.""" 299 | 300 | field_names = ('foo', 'bar') 301 | validator = CSVValidator(field_names) 302 | validator.add_value_check('bar', match_pattern('\d{4}-\d{2}-\d{2}')) 303 | 304 | data = ( 305 | ('foo', 'bar'), 306 | ('1', '1999-01-01'), 307 | ('2', 'abcd-ef-gh'), 308 | ('3', 'a1999-01-01'), 309 | ('4', '1999-01-01a') # this is valid - pattern attempts to match at beginning of line 310 | ) 311 | 312 | problems = validator.validate(data) 313 | assert 
len(problems) == 2, len(problems) 314 | for p in problems: 315 | assert p['code'] == VALUE_CHECK_FAILED 316 | 317 | assert problems[0]['row'] == 3 318 | assert problems[1]['row'] == 4 319 | 320 | 321 | def test_value_check_search_pattern(): 322 | """Test value checks with the search_pattern() function.""" 323 | 324 | field_names = ('foo', 'bar') 325 | validator = CSVValidator(field_names) 326 | validator.add_value_check('bar', search_pattern('\d{4}-\d{2}-\d{2}')) 327 | 328 | data = ( 329 | ('foo', 'bar'), 330 | ('1', '1999-01-01'), 331 | ('2', 'abcd-ef-gh'), 332 | ('3', 'a1999-01-01'), # this is valid - pattern attempts to match anywhere in line 333 | ('4', '1999-01-01a') # this is valid - pattern attempts to match anywhere in line 334 | ) 335 | 336 | problems = validator.validate(data) 337 | assert len(problems) == 1, len(problems) 338 | assert problems[0]['code'] == VALUE_CHECK_FAILED 339 | assert problems[0]['row'] == 3 340 | 341 | 342 | def test_value_check_numeric_ranges(): 343 | """Test value checks with numerical range functions.""" 344 | 345 | field_names = ('foo', 'bar', 'baz', 'quux') 346 | validator = CSVValidator(field_names) 347 | validator.add_value_check('foo', number_range_inclusive(2, 6, int)) 348 | validator.add_value_check('bar', number_range_exclusive(2, 6, int)) 349 | validator.add_value_check('baz', number_range_inclusive(2.0, 6.3, float)) 350 | validator.add_value_check('quux', number_range_exclusive(2.0, 6.3, float)) 351 | 352 | data = ( 353 | ('foo', 'bar', 'baz', 'quux'), 354 | ('2', '3', '2.0', '2.1'), # valid 355 | ('1', '3', '2.0', '2.1'), # foo invalid 356 | ('2', '2', '2.0', '2.1'), # bar invalid 357 | ('2', '3', '1.9', '2.1'), # baz invalid 358 | ('2', '3', '2.0', '2.0') # quux invalid 359 | ) 360 | 361 | problems = validator.validate(data) 362 | assert len(problems) == 4, len(problems) 363 | for p in problems: 364 | assert p['code'] == VALUE_CHECK_FAILED 365 | 366 | assert problems[0]['row'] == 3 and problems[0]['field'] == 'foo' 
367 | assert problems[1]['row'] == 4 and problems[1]['field'] == 'bar' 368 | assert problems[2]['row'] == 5 and problems[2]['field'] == 'baz' 369 | assert problems[3]['row'] == 6 and problems[3]['field'] == 'quux' 370 | 371 | 372 | def test_value_checks_datetime(): 373 | """Test value checks with datetimes.""" 374 | 375 | field_names = ('foo', 'bar') 376 | validator = CSVValidator(field_names) 377 | validator.add_value_check('bar', datetime_string('%Y-%m-%d')) 378 | 379 | data = ( 380 | ('foo', 'bar'), 381 | ('A', '1999-09-09'), # valid 382 | ('B', '1999-13-09'), # invalid month 383 | ('C', '1999-09-32'), # invalid day 384 | ('D', '1999-09-09ss') # invalid string 385 | ) 386 | 387 | problems = validator.validate(data) 388 | assert len(problems) == 3, problems 389 | for p in problems: 390 | assert p['code'] == VALUE_CHECK_FAILED 391 | 392 | assert problems[0]['row'] == 3 and problems[0]['field'] == 'bar' 393 | assert problems[1]['row'] == 4 and problems[1]['field'] == 'bar' 394 | assert problems[2]['row'] == 5 and problems[2]['field'] == 'bar' 395 | 396 | 397 | def test_value_checks_datetime_range(): 398 | """Test value checks with datetime ranges.""" 399 | 400 | field_names = ('foo', 'bar') 401 | validator = CSVValidator(field_names) 402 | validator.add_value_check('bar', datetime_range_inclusive('1999-09-09', 403 | '2009-09-09', 404 | '%Y-%m-%d')) 405 | validator.add_value_check('bar', datetime_range_exclusive('1999-09-09', 406 | '2009-09-09', 407 | '%Y-%m-%d')) 408 | 409 | data = ( 410 | ('foo', 'bar'), 411 | ('A', '1999-09-10'), # valid 412 | ('B', '1999-09-09'), # invalid (exclusive) 413 | ('C', '2009-09-09'), # invalid (exclusive) 414 | ('D', '1999-09-08'), # invalid (both) 415 | ('E', '2009-09-10') # invalid (both) 416 | ) 417 | 418 | problems = validator.validate(data) 419 | 420 | assert len(problems) == 6, len(problems) 421 | assert len([p for p in problems if p['row'] == 3]) == 1 422 | assert len([p for p in problems if p['row'] == 4]) == 1 423 | assert 
def test_value_predicates():
    """Exercise value predicates with default and custom code/message."""

    validator = CSVValidator(('foo', 'bar'))

    def foo_predicate(v):
        return math.pow(float(v), 2) < 64

    def bar_predicate(v):
        return math.sqrt(float(v)) > 8

    validator.add_value_predicate('foo', foo_predicate)
    validator.add_value_predicate('bar', bar_predicate, 'X3', 'custom message')

    data = (
        ('foo', 'bar'),
        ('4', '81'),  # valid
        ('9', '81'),  # foo invalid
        ('4', '49'),  # bar invalid
    )

    problems = validator.validate(data)
    assert len(problems) == 2, len(problems)

    # first problem: predicate registered with the default code and message
    expected_default = {
        'code': VALUE_PREDICATE_FALSE,
        'message': MESSAGES[VALUE_PREDICATE_FALSE],
        'row': 3,
        'column': 1,
        'field': 'foo',
        'value': '9',
        'record': ('9', '81'),
    }
    for name, wanted in expected_default.items():
        assert problems[0][name] == wanted

    # second problem: predicate registered with a custom code and message
    expected_custom = {
        'code': 'X3',
        'message': 'custom message',
        'row': 4,
        'column': 2,
        'field': 'bar',
        'value': '49',
        'record': ('4', '49'),
    }
    for name, wanted in expected_custom.items():
        assert problems[1][name] == wanted
'3'), # valid 489 | ('5', '3'), # invalid - not foo_gt_2bar 490 | ('1', '3') # invalid - both predicates false 491 | ) 492 | 493 | problems = validator.validate(data) 494 | n = len(problems) 495 | assert n == 3, n 496 | 497 | row3_problems = [p for p in problems if p['row'] == 3] 498 | assert len(row3_problems) == 1 499 | p = row3_problems[0] 500 | assert p['code'] == 'X4' 501 | assert p['message'] == 'custom message' 502 | assert p['record'] == ('5', '3') 503 | 504 | row4_problems = [p for p in problems if p['row'] == 4] 505 | assert len(row4_problems) == 2 506 | 507 | row4_problems_default = [p for p in row4_problems if p['code'] == RECORD_CHECK_FAILED] 508 | assert len(row4_problems_default) == 1 509 | p = row4_problems_default[0] 510 | assert p['message'] == MESSAGES[RECORD_CHECK_FAILED] 511 | assert p['record'] == ('1', '3') 512 | 513 | row4_problems_custom = [p for p in row4_problems if p['code'] == 'X4'] 514 | assert len(row4_problems_custom) == 1 515 | p = row4_problems_custom[0] 516 | assert p['message'] == 'custom message' 517 | assert p['record'] == ('1', '3') 518 | 519 | 520 | def test_record_predicates(): 521 | """Test the use of record predicates.""" 522 | 523 | field_names = ('foo', 'bar') 524 | validator = CSVValidator(field_names) 525 | 526 | def foo_gt_bar(r): 527 | return int(r['foo']) > int(r['bar']) # expect record will be a dictionary 528 | validator.add_record_predicate(foo_gt_bar) # use default code and message 529 | 530 | def foo_gt_2bar(r): 531 | return int(r['foo']) > 2 * int(r['bar']) 532 | validator.add_record_predicate(foo_gt_2bar, 'X4', 'custom message') 533 | 534 | data = ( 535 | ('foo', 'bar'), 536 | ('7', '3'), # valid 537 | ('5', '3'), # invalid - not foo_gt_2bar 538 | ('1', '3') # invalid - both predicates false 539 | ) 540 | 541 | problems = validator.validate(data) 542 | n = len(problems) 543 | assert n == 3, n 544 | 545 | row3_problems = [p for p in problems if p['row'] == 3] 546 | assert len(row3_problems) == 1 547 | p = 
def test_unique_checks():
    """Verify that a repeated value in a unique field is reported once."""

    validator = CSVValidator(('foo', 'bar'))
    validator.add_unique_check('foo')

    data = (
        ('foo', 'bar'),
        ('1', 'A'),
        ('2', 'B'),
        ('1', 'C'),  # repeats foo == '1' from the first data record
    )

    problems = validator.validate(data)
    count = len(problems)
    assert count == 1, count

    problem = problems[0]
    expected = {
        'code': UNIQUE_CHECK_FAILED,
        'message': MESSAGES[UNIQUE_CHECK_FAILED],
        'row': 4,
        'key': 'foo',
        'value': '1',
        'record': ('1', 'C'),
    }
    for name, wanted in expected.items():
        assert problem[name] == wanted
def test_compound_unique_checks_with_variable_record_lengths():
    """Test the uniqueness checks on compound keys when record lengths vary."""

    field_names = ('something', 'foo', 'bar')
    validator = CSVValidator(field_names)
    validator.add_unique_check(('foo', 'bar'), 'X5', 'custom message')

    data = (
        ('something', 'foo', 'bar'),
        ('Z', '1', 'A'),
        ('Z', '2', 'B'),
        # Short record. The trailing comma matters: ('Z') is just the string
        # 'Z', not a one-element record.
        ('Z',),
        ('Z', '2', 'A'),
        ('Z', '1', 'A'),  # repeats the ('1', 'A') key of the first data record
    )

    # NOTE: a leftover debugging `print problems` statement was removed here;
    # it produced noise on every run and is a syntax error on Python 3.
    problems = validator.validate(data)
    n = len(problems)
    assert n == 1, n

    p = problems[0]
    assert p['code'] == 'X5'
    assert p['message'] == 'custom message'
    assert p['row'] == 6
    assert p['key'] == ('foo', 'bar')
    assert p['value'] == ('1', 'A')
    assert p['record'] == ('Z', '1', 'A')
def test_check_methods():
    """Test 'check_*' methods defined on a CSVValidator subclass."""

    class MyValidator(CSVValidator):
        """Custom validator comparing foo/bar combinations to a threshold."""

        def __init__(self, threshold):
            super(MyValidator, self).__init__(('foo', 'bar'))
            self._threshold = threshold

        def check_foo_plus_bar_gt_threshold(self, r):
            total = int(r['foo']) + int(r['bar'])
            if total <= self._threshold:
                raise RecordError  # use default error code and message

        def check_foo_times_bar_gt_threshold(self, r):
            product = int(r['foo']) * int(r['bar'])
            if product <= self._threshold:
                raise RecordError('X6', 'custom message')

    def select(candidates, field, wanted):
        """Return the problems whose `field` entry equals `wanted`."""
        return [c for c in candidates if c[field] == wanted]

    data = (
        ('foo', 'bar'),
        ('33', '10'),  # valid
        ('7', '8'),  # invalid (foo + bar less than threshold)
        ('3', '4'),  # invalid (both)
    )

    validator = MyValidator(42)
    problems = validator.validate(data)
    total_found = len(problems)
    assert total_found == 3, total_found

    # row 3 fails only the sum check, which raises a bare RecordError
    row3 = select(problems, 'row', 3)
    assert len(row3) == 1
    only = row3[0]
    assert only['code'] == RECORD_CHECK_FAILED
    assert only['message'] == MESSAGES[RECORD_CHECK_FAILED]
    assert only['record'] == ('7', '8')

    # row 4 fails both checks: one custom problem and one default problem
    row4 = select(problems, 'row', 4)
    assert len(row4) == 2

    row4_custom = select(row4, 'code', 'X6')
    assert len(row4_custom) == 1
    custom = row4_custom[0]
    assert custom['message'] == 'custom message'
    assert custom['record'] == ('3', '4')

    row4_default = select(row4, 'code', RECORD_CHECK_FAILED)
    assert len(row4_default) == 1
    default = row4_default[0]
    assert default['message'] == MESSAGES[RECORD_CHECK_FAILED]
    assert default['record'] == ('3', '4')
float(r['bar']) 802 | self._bars.append(n) 803 | self._count += 1 804 | 805 | def finally_assert_mean_bar_gt_threshold(self): 806 | mean = sum(self._bars) / self._count 807 | assert mean > self._threshold, ('X7', 'custom message') 808 | 809 | data = [ 810 | ['foo', 'bar'], 811 | ['A', '2'], 812 | ['B', '3'], 813 | ['C', '7'] 814 | ] 815 | 816 | validator = MyValidator(5.0) 817 | problems = validator.validate(data) 818 | assert len(problems) == 1 819 | p = problems[0] 820 | assert p['code'] == 'X7' 821 | assert p['message'] == 'custom message' 822 | 823 | data.append(['D', '10']) 824 | validator = MyValidator(5.0) 825 | problems = validator.validate(data) 826 | assert len(problems) == 0 827 | 828 | 829 | def test_exception_handling(): 830 | """Establish expectations for exception handling.""" 831 | 832 | field_names = ('foo', 'bar') 833 | validator = CSVValidator(field_names) 834 | 835 | validator.add_value_check('foo', int) 836 | 837 | def buggy_value_check(v): 838 | """I am a buggy value check.""" 839 | raise Exception('something went wrong') 840 | validator.add_value_check('bar', buggy_value_check) 841 | 842 | def buggy_value_predicate(v): 843 | """I am a buggy value predicate.""" 844 | raise Exception('something went wrong') 845 | validator.add_value_predicate('bar', buggy_value_predicate) 846 | 847 | def buggy_record_check(r): 848 | """I am a buggy record check.""" 849 | raise Exception('something went wrong') 850 | validator.add_record_check(buggy_record_check) 851 | 852 | def buggy_record_predicate(r): 853 | """I am a buggy record predicate.""" 854 | raise Exception('something went wrong') 855 | validator.add_record_predicate(buggy_record_predicate) 856 | 857 | def buggy_assert(r): 858 | """I am a buggy assert.""" 859 | raise Exception('something went wrong') 860 | validator.assert_something_buggy = buggy_assert 861 | 862 | def buggy_check(r): 863 | """I am a buggy check.""" 864 | raise Exception('something went wrong') 865 | validator.check_something_buggy = 
def test_summarize():
    """Test that summarize=True omits message, row and record details."""

    validator = CSVValidator(('foo', 'bar'))

    def foo_exceeds_bar(r):
        return int(r['foo']) > int(r['bar'])

    validator.add_record_predicate(foo_exceeds_bar)

    data = (
        ('foo', 'bar'),
        ('7', '3'),  # valid
        ('1', '3'),  # invalid
    )

    problems = validator.validate(data, summarize=True)
    count = len(problems)
    assert count == 1, count

    summary = problems[0]
    assert summary['code'] == RECORD_PREDICATE_FALSE
    # none of the detail keys may be present on a summarized problem
    assert not any(k in summary for k in ('message', 'row', 'record'))
def test_context():
    """Test that a supplied context is attached to reported problems."""

    validator = CSVValidator(('foo', 'bar'))

    def foo_exceeds_bar(r):
        return int(r['foo']) > int(r['bar'])

    validator.add_record_predicate(foo_exceeds_bar)

    data = (
        ('foo', 'bar'),
        ('7', '3'),  # valid
        ('1', '3'),  # invalid
    )

    context = {'info': 'file X'}
    problems = validator.validate(data, context=context)
    count = len(problems)
    assert count == 1, count

    reported = problems[0]
    assert reported['context'] == context
bar 1030 | ---------------- 1031 | :field: bar 1032 | :row: 3 1033 | :info: very interesting 1034 | 1035 | Summary 1036 | ======= 1037 | 1038 | Found 2 problems in total. 1039 | 1040 | :X1: 1 1041 | :X2: 1 1042 | """ 1043 | 1044 | write_problems(problems, file) 1045 | assert file.content == expectation, file.content 1046 | 1047 | 1048 | def test_write_problems_summarize(): 1049 | """Test writing a problem summary as restructured text.""" 1050 | 1051 | class MockFile(object): 1052 | 1053 | def __init__(self): 1054 | self.content = '' 1055 | 1056 | def write(self, s): 1057 | self.content += s 1058 | 1059 | file = MockFile() 1060 | 1061 | problems = [ 1062 | { 1063 | 'code': 'X1', 1064 | 'message': 'invalid foo', 1065 | 'row': 2, 1066 | 'field': 'foo', 1067 | 'context': { 1068 | 'info': 'interesting' 1069 | } 1070 | }, 1071 | { 1072 | 'code': 'X2', 1073 | 'message': 'invalid bar', 1074 | 'row': 3, 1075 | 'field': 'bar', 1076 | 'context': { 1077 | 'info': 'very interesting' 1078 | } 1079 | }, 1080 | { 1081 | 'code': 'X2', 1082 | 'message': 'invalid bar', 1083 | 'row': 4, 1084 | 'field': 'bar', 1085 | 'context': { 1086 | 'info': 'very very interesting' 1087 | } 1088 | } 1089 | ] 1090 | 1091 | expectation = """ 1092 | ================= 1093 | Validation Report 1094 | ================= 1095 | 1096 | Summary 1097 | ======= 1098 | 1099 | Found 3 problems in total. 
1100 | 1101 | :X1: 1 1102 | :X2: 2 1103 | """ 1104 | 1105 | write_problems(problems, file, summarize=True) 1106 | assert file.content == expectation, file.content 1107 | 1108 | 1109 | def test_write_problems_with_limit(): 1110 | """Test writing problems with a limit as restructured text.""" 1111 | 1112 | class MockFile(object): 1113 | 1114 | def __init__(self): 1115 | self.content = '' 1116 | 1117 | def write(self, s): 1118 | self.content += s 1119 | 1120 | file = MockFile() 1121 | 1122 | problems = [ 1123 | { 1124 | 'code': 'X1', 1125 | 'message': 'invalid foo', 1126 | 'row': 2, 1127 | 'field': 'foo', 1128 | 'context': { 1129 | 'info': 'interesting' 1130 | } 1131 | }, 1132 | { 1133 | 'code': 'X2', 1134 | 'message': 'invalid bar', 1135 | 'row': 3, 1136 | 'field': 'bar', 1137 | 'context': { 1138 | 'info': 'very interesting' 1139 | } 1140 | } 1141 | ] 1142 | 1143 | expectation = """ 1144 | ================= 1145 | Validation Report 1146 | ================= 1147 | 1148 | Problems 1149 | ======== 1150 | 1151 | X1 - invalid foo 1152 | ---------------- 1153 | :field: foo 1154 | :row: 2 1155 | :info: interesting 1156 | 1157 | Summary 1158 | ======= 1159 | 1160 | Found at least 1 problem in total. 
def test_guard_conditions():
    """Test that add_value_check() rejects a non-callable value check."""

    validator = CSVValidator(('foo', 'bar'))

    rejected = False
    try:
        validator.add_value_check('foo', 'i am not callable')
    except AssertionError:
        rejected = True  # expected

    assert rejected, 'expected exception'
11 | 12 | I.e., if you want to validate data from a CSV file, you have to first construct 13 | a CSV reader using the standard Python `csv` module, specifying the appropriate 14 | dialect, and then pass the CSV reader as the source of data to either the 15 | `CSVValidator.validate` or the `CSVValidator.ivalidate` method. 16 | 17 | The `CSVValidator` class is the foundation for all validator objects that are 18 | capable of validating CSV data. 19 | 20 | You can use the CSVValidator class to dynamically construct a validator, e.g.:: 21 | 22 | import sys 23 | import csv 24 | from csvvalidator import * 25 | 26 | field_names = ( 27 | 'study_id', 28 | 'patient_id', 29 | 'gender', 30 | 'age_years', 31 | 'age_months', 32 | 'date_inclusion' 33 | ) 34 | 35 | validator = CSVValidator(field_names) 36 | 37 | # basic header and record length checks 38 | validator.add_header_check('EX1', 'bad header') 39 | validator.add_record_length_check('EX2', 'unexpected record length') 40 | 41 | # some simple value checks 42 | validator.add_value_check('study_id', int, 43 | 'EX3', 'study id must be an integer') 44 | validator.add_value_check('patient_id', int, 45 | 'EX4', 'patient id must be an integer') 46 | validator.add_value_check('gender', enumeration('M', 'F'), 47 | 'EX5', 'invalid gender') 48 | validator.add_value_check('age_years', number_range_inclusive(0, 120, int), 49 | 'EX6', 'invalid age in years') 50 | validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'), 51 | 'EX7', 'invalid date') 52 | 53 | # a more complicated record check 54 | def check_age_variables(r): 55 | age_years = int(r['age_years']) 56 | age_months = int(r['age_months']) 57 | valid = (age_months >= age_years * 12 and 58 | age_months % age_years < 12) 59 | if not valid: 60 | raise RecordError('EX8', 'invalid age variables') 61 | validator.add_record_check(check_age_variables) 62 | 63 | # validate the data and write problems to stdout 64 | data = csv.reader(open('/path/to/data.csv'), delimiter='\t') 65 |
problems = validator.validate(data) 66 | write_problems(problems, sys.stdout) 67 | 68 | For more complex use cases you can also sub-class `CSVValidator` to define 69 | re-usable validator classes for specific data sources. 70 | 71 | The source code for this module lives at: 72 | 73 | https://github.com/alimanfoo/csvvalidator 74 | 75 | For a complete account of all of the functionality available from this module, 76 | see the example.py and tests.py modules in the source code repository. 77 | 78 | """ 79 | 80 | 81 | import re 82 | from datetime import datetime 83 | 84 | 85 | UNEXPECTED_EXCEPTION = 0 86 | VALUE_CHECK_FAILED = 1 87 | HEADER_CHECK_FAILED = 2 88 | RECORD_LENGTH_CHECK_FAILED = 3 89 | VALUE_PREDICATE_FALSE = 4 90 | RECORD_CHECK_FAILED = 5 91 | RECORD_PREDICATE_FALSE = 6 92 | UNIQUE_CHECK_FAILED = 7 93 | ASSERT_CHECK_FAILED = 8 94 | FINALLY_ASSERT_CHECK_FAILED = 9 95 | 96 | MESSAGES = { 97 | UNEXPECTED_EXCEPTION: 'Unexpected exception [%s]: %s', 98 | VALUE_CHECK_FAILED: 'Value check failed.', 99 | HEADER_CHECK_FAILED: 'Header check failed.', 100 | RECORD_LENGTH_CHECK_FAILED: 'Record length check failed.', 101 | RECORD_CHECK_FAILED: 'Record check failed.', 102 | VALUE_PREDICATE_FALSE: 'Value predicate returned false.', 103 | RECORD_PREDICATE_FALSE: 'Record predicate returned false.', 104 | UNIQUE_CHECK_FAILED: 'Unique check failed.', 105 | ASSERT_CHECK_FAILED: 'Assertion check failed.', 106 | FINALLY_ASSERT_CHECK_FAILED: 'Final assertion check failed.' 
class RecordError(Exception):
    """
    Exception representing a validation problem in a record.

    Arguments
    ---------

    `code` - problem code carried by the exception, defaults to None

    `message` - problem message carried by the exception, defaults to None

    `details` - any additional information carried by the exception, defaults
    to None (NOTE(review): how the validator consumes `details` is not visible
    from this class - confirm against the record check handling)

    The three values are available as attributes of the same name and, in that
    order, as the standard `args` tuple.

    """

    def __init__(self, code=None, message=None, details=None):
        # Initialise the base class so that `args` carries the same payload
        # as the named attributes; previously `args` was always empty, which
        # hid the payload from generic code that inspects `e.args`.
        super(RecordError, self).__init__(code, message, details)
        self.code = code
        self.message = message
        self.details = details

    def __repr__(self):
        return repr((self.code, self.message, self.details))

    # str() and repr() deliberately render the same tuple
    __str__ = __repr__
def add_record_length_check(self,
                            code=RECORD_LENGTH_CHECK_FAILED,
                            message=MESSAGES[RECORD_LENGTH_CHECK_FAILED],
                            modulus=1):
    """
    Register a record length check, i.e., a check that the length of a
    record agrees with the number of expected fields.

    Arguments
    ---------

    `code` - problem code to report if a record is not valid, defaults to
    `RECORD_LENGTH_CHECK_FAILED`

    `message` - problem message to report if a record is not valid

    `modulus` - apply the check to every nth record, defaults to 1 (check
    every record)

    """

    # stored as a (code, message, modulus) triple
    self._record_length_checks.append((code, message, modulus))
206 | 207 | Arguments 208 | --------- 209 | 210 | `field_name` - the name of the field to attach the value check function 211 | to 212 | 213 | `value_check` - a function that accepts a single argument (a value) and 214 | raises a `ValueError` if the value is not valid 215 | 216 | `code` - problem code to report if a value is not valid, defaults to 217 | `VALUE_CHECK_FAILED` 218 | 219 | `message` - problem message to report if a value is not valid 220 | 221 | `modulus` - apply the check to every nth record, defaults to 1 (check 222 | every record) 223 | 224 | """ 225 | 226 | # guard conditions 227 | assert field_name in self._field_names, 'unexpected field name: %s' % field_name 228 | assert callable(value_check), 'value check must be a callable function' 229 | 230 | t = field_name, value_check, code, message, modulus 231 | self._value_checks.append(t) 232 | 233 | 234 | def add_value_predicate(self, field_name, value_predicate, 235 | code=VALUE_PREDICATE_FALSE, 236 | message=MESSAGES[VALUE_PREDICATE_FALSE], 237 | modulus=1): 238 | """ 239 | Add a value predicate function for the specified field. 240 | 241 | N.B., everything you can do with value predicates can also be done with 242 | value check functions, whether you use one or the other is a matter of 243 | style. 
def add_record_check(self, record_check, modulus=1):
    """
    Register a record check function.

    Arguments
    ---------

    `record_check` - a callable taking one argument (a record as a
    dictionary of values indexed by field name) that raises a `RecordError`
    if the record is not valid

    `modulus` - apply the check to every nth record, defaults to 1 (check
    every record)

    """

    # guard condition
    assert callable(record_check), 'record check must be a callable function'

    # stored as a (record_check, modulus) pair
    self._record_checks.append((record_check, modulus))
def add_unique_check(self, key,
                     code=UNIQUE_CHECK_FAILED,
                     message=MESSAGES[UNIQUE_CHECK_FAILED]):
    """
    Add a unique check on a single column or combination of columns.

    Arguments
    ---------

    `key` - a single field name (string) specifying a field in which all
    values are expected to be unique, or a sequence of field names (tuple
    or list of strings) specifying a compound key

    `code` - problem code to report if a record is not valid, defaults to
    `UNIQUE_CHECK_FAILED`

    `message` - problem message to report if a record is not valid

    Raises an `AssertionError` if `key` names a field that is not one of the
    validator's expected field names.

    """

    if isinstance(key, basestring):
        assert key in self._field_names, 'unexpected field name: %s' % key
    else:
        for f in key:
            # Report the offending name itself. Formatting with the whole
            # compound `key` breaks when it is a tuple of several names:
            # '%s' % key then raises TypeError instead of AssertionError.
            assert f in self._field_names, 'unexpected field name: %s' % (f,)

    # stored as a (key, code, message) triple
    t = key, code, message
    self._unique_checks.append(t)
361 | 362 | """ 363 | 364 | assert callable(skip), 'skip must be a callable function' 365 | self._skips.append(skip) 366 | 367 | 368 | def validate(self, data, 369 | expect_header_row=True, 370 | ignore_lines=0, 371 | summarize=False, 372 | limit=0, 373 | context=None, 374 | report_unexpected_exceptions=True): 375 | """ 376 | Validate `data` and return a list of validation problems found. 377 | 378 | Arguments 379 | --------- 380 | 381 | `data` - any source of row-oriented data, e.g., as provided by a 382 | `csv.reader`, or a list of lists of strings, or ... 383 | 384 | `expect_header_row` - does the data contain a header row (i.e., the 385 | first record is a list of field names)? Defaults to True. 386 | 387 | `ignore_lines` - ignore n lines (rows) at the beginning of the data 388 | 389 | `summarize` - only report problem codes, no other details 390 | 391 | `limit` - report at most n problems 392 | 393 | `context` - a dictionary of any additional information to be added to 394 | any problems found - useful if problems are being aggregated from 395 | multiple validators 396 | 397 | `report_unexpected_exceptions` - value check function, value predicates, 398 | record check functions, record predicates, and other user-supplied 399 | validation functions may raise unexpected exceptions. If this argument 400 | is true, any unexpected exceptions will be reported as validation 401 | problems; if False, unexpected exceptions will be handled silently. 
402 | 403 | """ 404 | 405 | problems = list() 406 | problem_generator = self.ivalidate(data, expect_header_row, 407 | ignore_lines, summarize, context, 408 | report_unexpected_exceptions) 409 | for i, p in enumerate(problem_generator): 410 | if not limit or i < limit: 411 | problems.append(p) 412 | return problems 413 | 414 | 415 | def ivalidate(self, data, 416 | expect_header_row=True, 417 | ignore_lines=0, 418 | summarize=False, 419 | context=None, 420 | report_unexpected_exceptions=True): 421 | """ 422 | Validate `data` and return a iterator over problems found. 423 | 424 | Use this function rather than validate() if you expect a large number 425 | of problems. 426 | 427 | Arguments 428 | --------- 429 | 430 | `data` - any source of row-oriented data, e.g., as provided by a 431 | `csv.reader`, or a list of lists of strings, or ... 432 | 433 | `expect_header_row` - does the data contain a header row (i.e., the 434 | first record is a list of field names)? Defaults to True. 435 | 436 | `ignore_lines` - ignore n lines (rows) at the beginning of the data 437 | 438 | `summarize` - only report problem codes, no other details 439 | 440 | `context` - a dictionary of any additional information to be added to 441 | any problems found - useful if problems are being aggregated from 442 | multiple validators 443 | 444 | `report_unexpected_exceptions` - value check function, value predicates, 445 | record check functions, record predicates, and other user-supplied 446 | validation functions may raise unexpected exceptions. If this argument 447 | is true, any unexpected exceptions will be reported as validation 448 | problems; if False, unexpected exceptions will be handled silently. 
449 | 450 | """ 451 | 452 | unique_sets = self._init_unique_sets() # used for unique checks 453 | for i, r in enumerate(data): 454 | if expect_header_row and i == ignore_lines: 455 | # r is the header row 456 | for p in self._apply_header_checks(i, r, summarize, context): 457 | yield p 458 | elif i >= ignore_lines: 459 | # r is a data row 460 | skip = False 461 | for p in self._apply_skips(i, r, summarize, 462 | report_unexpected_exceptions, 463 | context): 464 | if p is True: 465 | skip = True 466 | else: 467 | yield p 468 | if not skip: 469 | for p in self._apply_each_methods(i, r, summarize, 470 | report_unexpected_exceptions, 471 | context): 472 | yield p # may yield a problem if an exception is raised 473 | for p in self._apply_value_checks(i, r, summarize, 474 | report_unexpected_exceptions, 475 | context): 476 | yield p 477 | for p in self._apply_record_length_checks(i, r, summarize, 478 | context): 479 | yield p 480 | for p in self._apply_value_predicates(i, r, summarize, 481 | report_unexpected_exceptions, 482 | context): 483 | yield p 484 | for p in self._apply_record_checks(i, r, summarize, 485 | report_unexpected_exceptions, 486 | context): 487 | yield p 488 | for p in self._apply_record_predicates(i, r, summarize, 489 | report_unexpected_exceptions, 490 | context): 491 | yield p 492 | for p in self._apply_unique_checks(i, r, unique_sets, summarize): 493 | yield p 494 | for p in self._apply_check_methods(i, r, summarize, 495 | report_unexpected_exceptions, 496 | context): 497 | yield p 498 | for p in self._apply_assert_methods(i, r, summarize, 499 | report_unexpected_exceptions, 500 | context): 501 | yield p 502 | for p in self._apply_finally_assert_methods(summarize, 503 | report_unexpected_exceptions, 504 | context): 505 | yield p 506 | 507 | 508 | def _init_unique_sets(self): 509 | """Initialise sets used for uniqueness checking.""" 510 | 511 | ks = dict() 512 | for t in self._unique_checks: 513 | key = t[0] 514 | ks[key] = set() # empty set 515 | 
return ks 516 | 517 | 518 | def _apply_value_checks(self, i, r, 519 | summarize=False, 520 | report_unexpected_exceptions=True, 521 | context=None): 522 | """Apply value check functions on the given record `r`.""" 523 | 524 | for field_name, check, code, message, modulus in self._value_checks: 525 | if i % modulus == 0: # support sampling 526 | fi = self._field_names.index(field_name) 527 | if fi < len(r): # only apply checks if there is a value 528 | value = r[fi] 529 | try: 530 | check(value) 531 | except ValueError: 532 | p = {'code': code} 533 | if not summarize: 534 | p['message'] = message 535 | p['row'] = i + 1 536 | p['column'] = fi + 1 537 | p['field'] = field_name 538 | p['value'] = value 539 | p['record'] = r 540 | if context is not None: p['context'] = context 541 | yield p 542 | except Exception as e: 543 | if report_unexpected_exceptions: 544 | p = {'code': UNEXPECTED_EXCEPTION} 545 | if not summarize: 546 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 547 | p['row'] = i + 1 548 | p['column'] = fi + 1 549 | p['field'] = field_name 550 | p['value'] = value 551 | p['record'] = r 552 | p['exception'] = e 553 | p['function'] = '%s: %s' % (check.__name__, 554 | check.__doc__) 555 | if context is not None: p['context'] = context 556 | yield p 557 | 558 | 559 | def _apply_header_checks(self, i, r, summarize=False, context=None): 560 | """Apply header checks on the given record `r`.""" 561 | 562 | for code, message in self._header_checks: 563 | if tuple(r) != self._field_names: 564 | p = {'code': code} 565 | if not summarize: 566 | p['message'] = message 567 | p['row'] = i + 1 568 | p['record'] = tuple(r) 569 | p['missing'] = set(self._field_names) - set(r) 570 | p['unexpected'] = set(r) - set(self._field_names) 571 | if context is not None: p['context'] = context 572 | yield p 573 | 574 | 575 | def _apply_record_length_checks(self, i, r, summarize=False, context=None): 576 | """Apply record length checks on the given record 
`r`.""" 577 | 578 | for code, message, modulus in self._record_length_checks: 579 | if i % modulus == 0: # support sampling 580 | if len(r) != len(self._field_names): 581 | p = {'code': code} 582 | if not summarize: 583 | p['message'] = message 584 | p['row'] = i + 1 585 | p['record'] = r 586 | p['length'] = len(r) 587 | if context is not None: p['context'] = context 588 | yield p 589 | 590 | 591 | def _apply_value_predicates(self, i, r, 592 | summarize=False, 593 | report_unexpected_exceptions=True, 594 | context=None): 595 | """Apply value predicates on the given record `r`.""" 596 | 597 | for field_name, predicate, code, message, modulus in self._value_predicates: 598 | if i % modulus == 0: # support sampling 599 | fi = self._field_names.index(field_name) 600 | if fi < len(r): # only apply predicate if there is a value 601 | value = r[fi] 602 | try: 603 | valid = predicate(value) 604 | if not valid: 605 | p = {'code': code} 606 | if not summarize: 607 | p['message'] = message 608 | p['row'] = i + 1 609 | p['column'] = fi + 1 610 | p['field'] = field_name 611 | p['value'] = value 612 | p['record'] = r 613 | if context is not None: p['context'] = context 614 | yield p 615 | except Exception as e: 616 | if report_unexpected_exceptions: 617 | p = {'code': UNEXPECTED_EXCEPTION} 618 | if not summarize: 619 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 620 | p['row'] = i + 1 621 | p['column'] = fi + 1 622 | p['field'] = field_name 623 | p['value'] = value 624 | p['record'] = r 625 | p['exception'] = e 626 | p['function'] = '%s: %s' % (predicate.__name__, 627 | predicate.__doc__) 628 | if context is not None: p['context'] = context 629 | yield p 630 | 631 | 632 | def _apply_record_checks(self, i, r, 633 | summarize=False, 634 | report_unexpected_exceptions=True, 635 | context=None): 636 | """Apply record checks on `r`.""" 637 | 638 | for check, modulus in self._record_checks: 639 | if i % modulus == 0: # support sampling 640 | rdict = 
self._as_dict(r) 641 | try: 642 | check(rdict) 643 | except RecordError as e: 644 | code = e.code if e.code is not None else RECORD_CHECK_FAILED 645 | p = {'code': code} 646 | if not summarize: 647 | message = e.message if e.message is not None else MESSAGES[RECORD_CHECK_FAILED] 648 | p['message'] = message 649 | p['row'] = i + 1 650 | p['record'] = r 651 | if context is not None: p['context'] = context 652 | if e.details is not None: p['details'] = e.details 653 | yield p 654 | except Exception as e: 655 | if report_unexpected_exceptions: 656 | p = {'code': UNEXPECTED_EXCEPTION} 657 | if not summarize: 658 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 659 | p['row'] = i + 1 660 | p['record'] = r 661 | p['exception'] = e 662 | p['function'] = '%s: %s' % (check.__name__, 663 | check.__doc__) 664 | if context is not None: p['context'] = context 665 | yield p 666 | 667 | 668 | def _apply_record_predicates(self, i, r, 669 | summarize=False, 670 | report_unexpected_exceptions=True, 671 | context=None): 672 | """Apply record predicates on `r`.""" 673 | 674 | for predicate, code, message, modulus in self._record_predicates: 675 | if i % modulus == 0: # support sampling 676 | rdict = self._as_dict(r) 677 | try: 678 | valid = predicate(rdict) 679 | if not valid: 680 | p = {'code': code} 681 | if not summarize: 682 | p['message'] = message 683 | p['row'] = i + 1 684 | p['record'] = r 685 | if context is not None: p['context'] = context 686 | yield p 687 | except Exception as e: 688 | if report_unexpected_exceptions: 689 | p = {'code': UNEXPECTED_EXCEPTION} 690 | if not summarize: 691 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 692 | p['row'] = i + 1 693 | p['record'] = r 694 | p['exception'] = e 695 | p['function'] = '%s: %s' % (predicate.__name__, 696 | predicate.__doc__) 697 | if context is not None: p['context'] = context 698 | yield p 699 | 700 | 701 | def _apply_unique_checks(self, i, r, unique_sets, 702 | 
summarize=False, 703 | context=None): 704 | """Apply unique checks on `r`.""" 705 | 706 | for key, code, message in self._unique_checks: 707 | value = None 708 | values = unique_sets[key] 709 | if isinstance(key, basestring): # assume key is a field name 710 | fi = self._field_names.index(key) 711 | if fi >= len(r): 712 | continue 713 | value = r[fi] 714 | else: # assume key is a list or tuple, i.e., compound key 715 | value = [] 716 | for f in key: 717 | fi = self._field_names.index(f) 718 | if fi >= len(r): 719 | break 720 | value.append(r[fi]) 721 | value = tuple(value) # enable hashing 722 | if value in values: 723 | p = {'code': code} 724 | if not summarize: 725 | p['message'] = message 726 | p['row'] = i + 1 727 | p['record'] = r 728 | p['key'] = key 729 | p['value'] = value 730 | if context is not None: p['context'] = context 731 | yield p 732 | values.add(value) 733 | 734 | 735 | def _apply_each_methods(self, i, r, 736 | summarize=False, 737 | report_unexpected_exceptions=True, 738 | context=None): 739 | """Invoke 'each' methods on `r`.""" 740 | 741 | for a in dir(self): 742 | if a.startswith('each'): 743 | rdict = self._as_dict(r) 744 | f = getattr(self, a) 745 | try: 746 | f(rdict) 747 | except Exception as e: 748 | if report_unexpected_exceptions: 749 | p = {'code': UNEXPECTED_EXCEPTION} 750 | if not summarize: 751 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 752 | p['row'] = i + 1 753 | p['record'] = r 754 | p['exception'] = e 755 | p['function'] = '%s: %s' % (f.__name__, 756 | f.__doc__) 757 | if context is not None: p['context'] = context 758 | yield p 759 | 760 | 761 | def _apply_assert_methods(self, i, r, 762 | summarize=False, 763 | report_unexpected_exceptions=True, 764 | context=None): 765 | """Apply 'assert' methods on `r`.""" 766 | 767 | for a in dir(self): 768 | if a.startswith('assert'): 769 | rdict = self._as_dict(r) 770 | f = getattr(self, a) 771 | try: 772 | f(rdict) 773 | except AssertionError as e: 774 | 
code = ASSERT_CHECK_FAILED 775 | message = MESSAGES[ASSERT_CHECK_FAILED] 776 | if len(e.args) > 0: 777 | custom = e.args[0] 778 | if isinstance(custom, (list, tuple)): 779 | if len(custom) > 0: 780 | code = custom[0] 781 | if len(custom) > 1: 782 | message = custom[1] 783 | else: 784 | code = custom 785 | p = {'code': code} 786 | if not summarize: 787 | p['message'] = message 788 | p['row'] = i + 1 789 | p['record'] = r 790 | if context is not None: p['context'] = context 791 | yield p 792 | except Exception as e: 793 | if report_unexpected_exceptions: 794 | p = {'code': UNEXPECTED_EXCEPTION} 795 | if not summarize: 796 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 797 | p['row'] = i + 1 798 | p['record'] = r 799 | p['exception'] = e 800 | p['function'] = '%s: %s' % (f.__name__, 801 | f.__doc__) 802 | if context is not None: p['context'] = context 803 | yield p 804 | 805 | 806 | def _apply_check_methods(self, i, r, 807 | summarize=False, 808 | report_unexpected_exceptions=True, 809 | context=None): 810 | """Apply 'check' methods on `r`.""" 811 | 812 | for a in dir(self): 813 | if a.startswith('check'): 814 | rdict = self._as_dict(r) 815 | f = getattr(self, a) 816 | try: 817 | f(rdict) 818 | except RecordError as e: 819 | code = e.code if e.code is not None else RECORD_CHECK_FAILED 820 | p = {'code': code} 821 | if not summarize: 822 | message = e.message if e.message is not None else MESSAGES[RECORD_CHECK_FAILED] 823 | p['message'] = message 824 | p['row'] = i + 1 825 | p['record'] = r 826 | if context is not None: p['context'] = context 827 | if e.details is not None: p['details'] = e.details 828 | yield p 829 | except Exception as e: 830 | if report_unexpected_exceptions: 831 | p = {'code': UNEXPECTED_EXCEPTION} 832 | if not summarize: 833 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 834 | p['row'] = i + 1 835 | p['record'] = r 836 | p['exception'] = e 837 | p['function'] = '%s: %s' % (f.__name__, 838 | 
f.__doc__) 839 | if context is not None: p['context'] = context 840 | yield p 841 | 842 | 843 | def _apply_finally_assert_methods(self, 844 | summarize=False, 845 | report_unexpected_exceptions=True, 846 | context=None): 847 | """Apply 'finally_assert' methods.""" 848 | 849 | for a in dir(self): 850 | if a.startswith('finally_assert'): 851 | f = getattr(self, a) 852 | try: 853 | f() 854 | except AssertionError as e: 855 | code = ASSERT_CHECK_FAILED 856 | message = MESSAGES[ASSERT_CHECK_FAILED] 857 | if len(e.args) > 0: 858 | custom = e.args[0] 859 | if isinstance(custom, (list, tuple)): 860 | if len(custom) > 0: 861 | code = custom[0] 862 | if len(custom) > 1: 863 | message = custom[1] 864 | else: 865 | code = custom 866 | p = {'code': code} 867 | if not summarize: 868 | p['message'] = message 869 | if context is not None: p['context'] = context 870 | yield p 871 | except Exception as e: 872 | if report_unexpected_exceptions: 873 | p = {'code': UNEXPECTED_EXCEPTION} 874 | if not summarize: 875 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 876 | p['exception'] = e 877 | p['function'] = '%s: %s' % (f.__name__, 878 | f.__doc__) 879 | if context is not None: p['context'] = context 880 | yield p 881 | 882 | 883 | def _apply_skips(self, i, r, 884 | summarize=False, 885 | report_unexpected_exceptions=True, 886 | context=None): 887 | """Apply skip functions on `r`.""" 888 | 889 | for skip in self._skips: 890 | try: 891 | result = skip(r) 892 | if result is True: 893 | yield True 894 | except Exception as e: 895 | if report_unexpected_exceptions: 896 | p = {'code': UNEXPECTED_EXCEPTION} 897 | if not summarize: 898 | p['message'] = MESSAGES[UNEXPECTED_EXCEPTION] % (e.__class__.__name__, e) 899 | p['row'] = i + 1 900 | p['record'] = r 901 | p['exception'] = e 902 | p['function'] = '%s: %s' % (skip.__name__, 903 | skip.__doc__) 904 | if context is not None: p['context'] = context 905 | yield p 906 | 907 | 908 | def _as_dict(self, r): 909 | 
"""Convert the record to a dictionary using field names as keys.""" 910 | 911 | d = dict() 912 | for i, f in enumerate(self._field_names): 913 | d[f] = r[i] if i < len(r) else None 914 | return d 915 | 916 | 917 | def enumeration(*args): 918 | """ 919 | Return a value check function which raises a value error if the value is not 920 | in a pre-defined enumeration of values. 921 | 922 | If you pass in a list, tuple or set as the single argument, it is assumed 923 | that the list/tuple/set defines the membership of the enumeration. 924 | 925 | If you pass in more than on argument, it is assumed the arguments themselves 926 | define the enumeration. 927 | 928 | """ 929 | 930 | assert len(args) > 0, 'at least one argument is required' 931 | if len(args) == 1: 932 | # assume the first argument defines the membership 933 | members = args[0] 934 | else: 935 | # assume the arguments are the members 936 | members = args 937 | def checker(value): 938 | if value not in members: 939 | raise ValueError(value) 940 | return checker 941 | 942 | 943 | def match_pattern(regex): 944 | """ 945 | Return a value check function which raises a ValueError if the value does 946 | not match the supplied regular expression, see also `re.match`. 947 | 948 | """ 949 | 950 | prog = re.compile(regex) 951 | def checker(v): 952 | result = prog.match(v) 953 | if result is None: 954 | raise ValueError(v) 955 | return checker 956 | 957 | 958 | def search_pattern(regex): 959 | """ 960 | Return a value check function which raises a ValueError if the supplied 961 | regular expression does not match anywhere in the value, see also 962 | `re.search`. 
963 | 964 | """ 965 | 966 | prog = re.compile(regex) 967 | def checker(v): 968 | result = prog.search(v) 969 | if result is None: 970 | raise ValueError(v) 971 | return checker 972 | 973 | 974 | def number_range_inclusive(min, max, type=float): 975 | """ 976 | Return a value check function which raises a ValueError if the supplied 977 | value when cast as `type` is less than `min` or greater than `max`. 978 | 979 | """ 980 | 981 | def checker(v): 982 | if type(v) < min or type(v) > max: 983 | raise ValueError(v) 984 | return checker 985 | 986 | 987 | def number_range_exclusive(min, max, type=float): 988 | """ 989 | Return a value check function which raises a ValueError if the supplied 990 | value when cast as `type` is less than or equal to `min` or greater than 991 | or equal to `max`. 992 | 993 | """ 994 | 995 | def checker(v): 996 | if type(v) <= min or type(v) >= max: 997 | raise ValueError(v) 998 | return checker 999 | 1000 | 1001 | def datetime_string(format): 1002 | """ 1003 | Return a value check function which raises a ValueError if the supplied 1004 | value cannot be converted to a datetime using the supplied format string. 1005 | 1006 | See also `datetime.strptime`. 1007 | 1008 | """ 1009 | 1010 | def checker(v): 1011 | datetime.strptime(v, format) 1012 | return checker 1013 | 1014 | 1015 | def datetime_range_inclusive(min, max, format): 1016 | """ 1017 | Return a value check function which raises a ValueError if the supplied 1018 | value when converted to a datetime using the supplied `format` string is 1019 | less than `min` or greater than `max`. 
1020 | 1021 | """ 1022 | 1023 | dmin = datetime.strptime(min, format) 1024 | dmax = datetime.strptime(max, format) 1025 | def checker(v): 1026 | dv = datetime.strptime(v, format) 1027 | if dv < dmin or dv > dmax: 1028 | raise ValueError(v) 1029 | return checker 1030 | 1031 | 1032 | def datetime_range_exclusive(min, max, format): 1033 | """ 1034 | Return a value check function which raises a ValueError if the supplied 1035 | value when converted to a datetime using the supplied `format` string is 1036 | less than or equal to `min` or greater than or equal to `max`. 1037 | 1038 | """ 1039 | 1040 | dmin = datetime.strptime(min, format) 1041 | dmax = datetime.strptime(max, format) 1042 | def checker(v): 1043 | dv = datetime.strptime(v, format) 1044 | if dv <= dmin or dv >= dmax: 1045 | raise ValueError(v) 1046 | return checker 1047 | 1048 | 1049 | def write_problems(problems, file, summarize=False, limit=0): 1050 | """ 1051 | Write problems as restructured text to a file (or stdout/stderr). 
1052 | 1053 | """ 1054 | w = file.write # convenience variable 1055 | w(""" 1056 | ================= 1057 | Validation Report 1058 | ================= 1059 | """) 1060 | counts = dict() # store problem counts per problem code 1061 | total = 0 1062 | for i, p in enumerate(problems): 1063 | if limit and i >= limit: 1064 | break # bail out 1065 | if total == 0 and not summarize: 1066 | w(""" 1067 | Problems 1068 | ======== 1069 | """) 1070 | total += 1 1071 | code = p['code'] 1072 | if code in counts: 1073 | counts[code] += 1 1074 | else: 1075 | counts[code] = 1 1076 | if not summarize: 1077 | ptitle = '\n%s - %s\n' % (p['code'], p['message']) 1078 | w(ptitle) 1079 | underline = '' 1080 | for i in range(len(ptitle.strip())): 1081 | underline += '-' 1082 | underline += '\n' 1083 | w(underline) 1084 | for k in sorted(p.viewkeys() - set(['code', 'message', 'context'])): 1085 | w(':%s: %s\n' % (k, p[k])) 1086 | if 'context' in p: 1087 | c = p['context'] 1088 | for k in sorted(c.viewkeys()): 1089 | w(':%s: %s\n' % (k, c[k])) 1090 | 1091 | w(""" 1092 | Summary 1093 | ======= 1094 | 1095 | Found %s%s problem%s in total. 1096 | 1097 | """ % ('at least ' if limit else '', total, 's' if total != 1 else '')) 1098 | for code in sorted(counts.viewkeys()): 1099 | w(':%s: %s\n' % (code, counts[code])) 1100 | return total 1101 | 1102 | --------------------------------------------------------------------------------