├── LEAD.md ├── data ├── empty.csv ├── schema_invalid_empty.json ├── schema_invalid_wrong_type.json ├── data_missing_cols.csv ├── data_headers_field_names_mismatch.csv ├── data_invalid_extra_cols.csv ├── schema_invalid_pk_is_wrong_type.json ├── data_unique_primary_key_violation.csv ├── schema_invalid_pk_no_fields.json ├── data.csv ├── data_infer.csv ├── data_invalid_col_value.csv ├── data.csv.zip ├── storage.png ├── data_infer_boolean.xlsx ├── data_infer_iso-8859-7.csv ├── data_infer_utf8.csv ├── data_infer_missing_values.csv ├── schema_invalid_fk_no_reference.json ├── data_infer_row_limit.csv ├── schema_valid_simple.json ├── schema_invalid_pk_string.json ├── schema_valid_pk_array.json ├── schema_invalid_pk_array.json ├── schema_invalid_fk_string.json ├── schema_invalid_fk_array.json ├── schema_invalid_multiple_errors.json ├── schema_invalid_fk_array_string.json ├── schema_invalid_fk_array_string_ref.json ├── schema_valid_fk_array.json ├── schema_invalid_fk_string_array_ref.json ├── schema_invalid_fk_array_wrong_number.json ├── data_infer_increase_limit.csv └── schema_valid_full.json ├── tests ├── __init__.py ├── types │ ├── __init__.py │ ├── test_any.py │ ├── test_year.py │ ├── test_object.py │ ├── test_yearmonth.py │ ├── test_array.py │ ├── test_integer.py │ ├── test_string.py │ ├── test_duration.py │ ├── test_boolean.py │ ├── test_geopoint.py │ ├── test_time.py │ ├── test_date.py │ ├── test_datetime.py │ ├── test_geojson.py │ └── test_number.py ├── constraints │ ├── __init__.py │ ├── test_unique.py │ ├── test_enum.py │ ├── test_maximum.py │ ├── test_minimum.py │ ├── test_maxLength.py │ ├── test_minLength.py │ ├── test_required.py │ └── test_pattern.py ├── test_exceptions.py ├── test_profile.py ├── conftest.py ├── test_helpers.py ├── test_cli.py ├── test_infer.py ├── test_validate.py ├── test_schema_constraint_field_type.py ├── test_field.py └── test_schema.py ├── examples ├── __init__.py ├── table_validate.py ├── table_pandas.py ├── table_infer.py └── table_sql.py ├── tableschema ├── VERSION ├── __main__.py ├── types │ ├── any.py │ ├── duration.py │ ├── object.py │ ├── year.py │ ├── array.py │ ├── __init__.py │ ├── boolean.py │ ├── geojson.py │ ├── yearmonth.py │ ├── integer.py │ ├── datetime.py │ ├── time.py │ ├── string.py │ ├── date.py │ ├── geopoint.py │ └── number.py ├── constraints │ ├── unique.py │ ├── required.py │ ├── enum.py │ ├── maxLength.py │ ├── minLength.py │ ├── __init__.py │ ├── maximum.py │ ├── minimum.py │ └── pattern.py ├── plugins │ └── __init__.py ├── config.py ├── validate.py ├── __init__.py ├── infer.py ├── cli.py ├── exceptions.py ├── helpers.py ├── storage.py ├── field.py ├── profile.py ├── profiles │ └── geojson.json ├── schema.py └── table.py ├── setup.cfg ├── pytest.ini ├── MANIFEST.in ├── .github ├── pull_request_template.md ├── issue_template.md ├── stale.yml └── workflows │ └── general.yml ├── pylama.ini ├── LICENSE.md ├── .gitignore ├── Makefile └── setup.py /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /data/empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tableschema/VERSION: -------------------------------------------------------------------------------- 1 | 1.21.0 -------------------------------------------------------------------------------- /tests/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/schema_invalid_empty.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /data/schema_invalid_wrong_type.json: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /data/data_missing_cols.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | one 3 | two,2 4 | -------------------------------------------------------------------------------- /data/data_headers_field_names_mismatch.csv: -------------------------------------------------------------------------------- 1 | id,bad,name 2 | 1,39,Paul 3 | -------------------------------------------------------------------------------- /data/data_invalid_extra_cols.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | one,1,unexpected 3 | two,2 4 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_is_wrong_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": 1 3 | } 4 | -------------------------------------------------------------------------------- /data/data_unique_primary_key_violation.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 1,36,Jane 4 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_no_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": ["id", "title"] 3 | } 4 | -------------------------------------------------------------------------------- /data/data.csv: -------------------------------------------------------------------------------- 1 | city,location 2 | london,"51.50,-0.11" 3 | paris,"48.85,2.30" 4 | rome,N/A 5 | -------------------------------------------------------------------------------- /data/data_infer.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | 
-------------------------------------------------------------------------------- /data/data_invalid_col_value.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | zero,0 3 | one,not_an_integer 4 | two,2 5 | -------------------------------------------------------------------------------- /data/data.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data.csv.zip -------------------------------------------------------------------------------- /data/storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/storage.png -------------------------------------------------------------------------------- /data/data_infer_boolean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data_infer_boolean.xlsx -------------------------------------------------------------------------------- /data/data_infer_iso-8859-7.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data_infer_iso-8859-7.csv -------------------------------------------------------------------------------- /data/data_infer_utf8.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | 5,37,Iñtërnâtiônàlizætiøn 7 | -------------------------------------------------------------------------------- /tableschema/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | 4 | # Module API 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /data/data_infer_missing_values.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | -,25,Test 4 | 2,23,Jimmy 5 | -,25,Test 6 | 3,36,Jane 7 | -,25,Test 8 | 4,28,Judy 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylama.ini 5 | include pytest.ini 6 | include README.md 7 | include tox.ini 8 | 9 | global-include *.json 10 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_no_reference.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "state" 5 | } 6 | ], 7 | "foreignKeys": [ 8 | { 9 | "fields": "state" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. 
Make sure that tests pass before publishing it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug, share as much as possible to reproduce it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /data/data_infer_row_limit.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | qwerty,nineteen,Rose 7 | werty,nineteen,Red 8 | erty,nineteen,Rotem 9 | rty,nineteen,Ruth 10 | ty,nineteen,Amber 11 | y,nineteen,Angel 12 | _,nineteen,Angie 13 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E128,E301,E305,E731,C901 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:mccabe] 9 | complexity = 36 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | 14 | [pylama:*/compat.py] 15 | ignore = W0611,E0602 16 | -------------------------------------------------------------------------------- /tableschema/types/any.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def cast_any(format, value, **options): 11 | return value 12 | -------------------------------------------------------------------------------- /tableschema/constraints/unique.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_unique(constraint, unique): 11 | return True 12 | -------------------------------------------------------------------------------- /data/schema_valid_simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": "identifier" 15 | } 16 | -------------------------------------------------------------------------------- /data/schema_valid_pk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": 
"integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": ["id", "title"] 15 | } 16 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": ["id", "titel"] 15 | } 16 | -------------------------------------------------------------------------------- /tableschema/constraints/required.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_required(constraint, value): 11 | if not (constraint and value is None): 12 | return True 13 | return False 14 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "state" 5 | } 6 | ], 7 | "foreignKeys": [ 8 | { 9 | "fields": "doesnotexist", 10 | "reference": { 11 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 12 | "resource": "the-resource", 13 | "fields": "state_id" 14 | } 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /tableschema/constraints/enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_enum(constraint, value): 11 | if value is None: 12 | return True 13 | if value in constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/constraints/maxLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_maxLength(constraint, value): 11 | if value is None: 12 | return True 13 | if len(value) <= constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/constraints/minLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_minLength(constraint, value): 11 | if value is None: 12 | return True 13 | if len(value) >= constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/plugins/__init__.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from ..helpers import PluginImporter 8 | 9 | 10 | # Register importer 11 | importer = PluginImporter( 12 | virtual='tableschema.plugins.', actual='tableschema_') 13 | importer.register() 14 | 15 | # Delete variables 16 | del PluginImporter 17 | del importer 18 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema.exceptions import CastError 9 | 10 | 11 | # Tests 12 | 13 | def test_no_errors_reuse(): 14 | ce1 = CastError('message1') 15 | ce1.errors.append('error') 16 | ce2 = CastError('message2') 17 | assert len(ce2.errors) == 0 18 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | } 8 | ], 9 | "foreignKeys": [ 10 | { 11 | "fields": ["id", "title"], 12 | "reference": { 13 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 14 | "resource": "the-resource", 15 | "fields": "no" 16 | } 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/constraints/test_unique.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (False, 'any', True), 15 | (True, 'any', True), 16 | ]) 17 | def test_check_unique(constraint, value, result): 18 | assert constraints.check_unique(constraint, value) == result 19 | -------------------------------------------------------------------------------- /tableschema/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | 10 | 11 | # Module API 12 | 13 | VERSION = io.open(os.path.join(os.path.dirname(__file__), 'VERSION')).read().strip() 14 | ERROR = 'tableschema.error' 15 | DEFAULT_FIELD_TYPE = 'string' 16 | DEFAULT_FIELD_FORMAT = 'default' 17 | DEFAULT_MISSING_VALUES = [''] 18 | REMOTE_SCHEMES = ['http', 'https', 'ftp', 'ftps', 's3'] 19 | -------------------------------------------------------------------------------- /tableschema/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | from .enum 
import check_enum 11 | from .maximum import check_maximum 12 | from .maxLength import check_maxLength 13 | from .minimum import check_minimum 14 | from .minLength import check_minLength 15 | from .pattern import check_pattern 16 | from .required import check_required 17 | from .unique import check_unique 18 | -------------------------------------------------------------------------------- /tests/constraints/test_enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | ([1, 2], 1, True), 15 | ([0, 2], 1, False), 16 | ([], 1, False), 17 | ]) 18 | def test_check_enum(constraint, value, result): 19 | assert constraints.check_enum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_maximum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, 1, False), 15 | (1, 1, True), 16 | (2, 1, True), 17 | ]) 18 | def test_check_maximum(constraint, value, result): 19 | assert constraints.check_maximum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_minimum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, 1, True), 15 | (1, 1, True), 16 | (2, 1, False), 17 | ]) 18 | def test_check_minimum(constraint, value, result): 19 | assert constraints.check_minimum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_maxLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, [1], False), 15 | (1, [1], True), 16 | (2, [1], True), 17 | ]) 18 | def test_check_maxLength(constraint, value, result): 19 | assert constraints.check_maxLength(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_minLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ 
import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, [1], True), 15 | (1, [1], True), 16 | (2, [1], False), 17 | ]) 18 | def test_check_minLength(constraint, value, result): 19 | assert constraints.check_minLength(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_required.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (False, 1, True), 15 | (True, 0, True), 16 | (True, None, False), 17 | ]) 18 | def test_check_required(constraint, value, result): 19 | assert constraints.check_required(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tableschema/validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .schema import Schema 8 | 9 | 10 | # Module API 11 | 12 | 13 | def validate(descriptor): 14 | """Validate descriptor 15 | 16 | # Arguments 17 | descriptor (dict): descriptor to validate 18 | 19 | # Raises 20 | ValidationError: on validation errors 21 | 22 | # Returns 23 | bool: True 24 | 25 | """ 26 | Schema(descriptor, strict=True) 27 | return True 28 | -------------------------------------------------------------------------------- /examples/table_validate.py: -------------------------------------------------------------------------------- 1 | from tableschema import Table 2 | 3 | # Data from WEB, schema from MEMORY 4 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 5 | SCHEMA = {'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'age', 'type': 'integer'}, {'name': 'name', 'type': 'string'}] } 6 | 7 | # If schema is not passed it will be inferred 8 | table = Table(SOURCE, schema=SCHEMA) 9 | rows = table.iter() 10 | while True: 11 | try: 12 | print(next(rows)) 13 | except StopIteration: 14 | break 15 | except Exception as exception: 16 | print(exception) 17 | -------------------------------------------------------------------------------- /tableschema/constraints/maximum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import decimal 8 | 9 | 10 | # Module API 11 | 12 | def check_maximum(constraint, value): 13 | if value is None: 14 | return True 15 | try: 16 | if value <= constraint: 17 | return True 18 | except decimal.InvalidOperation: 19 | # For non-finite numbers NaN, INF and -INF 20 | # the constraint always is not satisfied 21 | return False 22 | return False 23 | 
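# --- Illustrative usage sketch (annotation, not part of the shipped file) ---
# Assuming only the check_maximum contract shown above (None always passes,
# values are compared with <=, and non-finite Decimals fail via
# decimal.InvalidOperation), a caller would see:
#
#   from decimal import Decimal
#   from tableschema import constraints
#   constraints.check_maximum(5, 4)               # True: 4 <= 5
#   constraints.check_maximum(5, None)            # True: missing values pass
#   constraints.check_maximum(5, Decimal('NaN'))  # False: InvalidOperation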
-------------------------------------------------------------------------------- /tableschema/constraints/minimum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import decimal 8 | 9 | 10 | # Module API 11 | 12 | def check_minimum(constraint, value): 13 | if value is None: 14 | return True 15 | try: 16 | if value >= constraint: 17 | return True 18 | except decimal.InvalidOperation: 19 | # For non-finite numbers NaN, INF and -INF 20 | # the constraint always is not satisfied 21 | return False 22 | return False 23 | -------------------------------------------------------------------------------- /data/schema_invalid_multiple_errors.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "magical_unicorn" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": "identifier", 15 | "foreignKeys": [ 16 | { 17 | "fields": ["id", "notafield"], 18 | "reference": { 19 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 20 | "fields": "no" 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tableschema/constraints/pattern.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | COMPILED_RE = type(re.compile("")) 9 | 10 | 11 | # Module API 12 | 13 | def check_pattern(constraint, value): 14 | if value is None: 15 | return True 16 | if not isinstance(constraint, COMPILED_RE): 17 | regex = re.compile('^{0}$'.format(constraint)) 18 | else: 19 | regex = constraint 20 | match = regex.match(value) 21 | if match: 22 | return True 23 | return False 24 | -------------------------------------------------------------------------------- /tests/types/test_any.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 1, 1), 16 | ('default', '1', '1'), 17 | ('default', '3.14', '3.14'), 18 | ('default', True, True), 19 | ('default', '', ''), 20 | ]) 21 | def test_cast_any(format, value, result): 22 | assert types.cast_any(format, value) == result 23 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": "id", 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": 
"the-resource", 20 | "fields": ["id_1", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_string_ref.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": "no" 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_valid_fk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["fk_id", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_string_array_ref.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": "id", 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["id_1", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/constraints/test_pattern.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | import re 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | ('^test$', 'test', True), 15 | ('^test$', 'TEST', False), 16 | (re.compile('^test$'), 'test', True), 17 | (re.compile('^test$'), 'TEST', False), 18 | ]) 19 | def test_check_pattern(constraint, value, result): 20 | assert constraints.check_pattern(constraint, value) == result 21 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_wrong_number.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["id", "title", "somethingelse"] 21 | } 22 | } 23 | ] 24 | } 25 | 
-------------------------------------------------------------------------------- /tableschema/types/duration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import datetime 9 | import isodate 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | def cast_duration(format, value, **options): 16 | if not isinstance(value, (isodate.Duration, datetime.timedelta)): 17 | if not isinstance(value, six.string_types): 18 | return ERROR 19 | try: 20 | value = isodate.parse_duration(value) 21 | except Exception: 22 | return ERROR 23 | return value 24 | -------------------------------------------------------------------------------- /tableschema/types/object.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_object(format, value, **options): 15 | if not isinstance(value, dict): 16 | if not isinstance(value, six.string_types): 17 | return ERROR 18 | try: 19 | value = json.loads(value) 20 | except Exception: 21 | return ERROR 22 | if not isinstance(value, dict): 23 | return ERROR 24 | return value 25 | -------------------------------------------------------------------------------- /tests/types/test_year.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 2000, 2000), 16 | ('default', '2000', 2000), 17 | ('default', -2000, ERROR), 18 | ('default', 20000, ERROR), 19 | ('default', '3.14', ERROR), 20 | ('default', '', ERROR), 21 | ]) 22 | def test_cast_year(format, value, result): 23 | assert types.cast_year(format, value) == result 24 | -------------------------------------------------------------------------------- /tableschema/types/year.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from ..config import ERROR 9 | 10 | 11 | # Module API 12 | 13 | def cast_year(format, value, **options): 14 | if not isinstance(value, int): 15 | if not isinstance(value, six.string_types): 16 | return ERROR 17 | if len(value) != 4: 18 | return ERROR 19 | try: 20 | value = int(value) 21 | except Exception: 22 | return ERROR 23 | if value < 0 or value > 9999: 24 | return ERROR 25 | return value 26 | -------------------------------------------------------------------------------- /examples/table_pandas.py: -------------------------------------------------------------------------------- 1 | # pip install tableschema-pandas 2 | from pprint import pprint 3 | from 
tableschema import Table 4 | 5 | # Data source 6 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 7 | 8 | # Data processor 9 | def skip_under_30(erows): 10 | for number, headers, row in erows: 11 | krow = dict(zip(headers, row)) 12 | if krow['age'] >= 30: 13 | yield (number, headers, row) 14 | 15 | # Export to pandas 16 | table = Table(SOURCE, post_convert=[skip_under_30]) 17 | storage = table.save('persons', storage='pandas') 18 | pprint(storage['persons']) 19 | # Will print (if use skip_under_30 filter) 20 | # id age name 21 | # 1 39 Paul 22 | # 3 36 Jane 23 | -------------------------------------------------------------------------------- /tests/test_profile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import pytest 11 | import requests 12 | from tableschema.profile import Profile 13 | 14 | 15 | # Tests 16 | 17 | @pytest.mark.skip 18 | @pytest.mark.skipif(os.environ.get('TRAVIS_BRANCH') != 'master', reason='CI') 19 | def test_specs_table_schema_is_up_to_date(): 20 | profile = Profile('table-schema') 21 | jsonschema = requests.get('https://specs.frictionlessdata.io/schemas/table-schema.json').json() 22 | assert profile.jsonschema == jsonschema, 'run `make profiles` to update profiles' 23 | -------------------------------------------------------------------------------- /tableschema/types/array.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_array(format, value, **options): 15 | if not isinstance(value, list): 16 | if isinstance(value, tuple): 17 | return list(value) 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | value = json.loads(value) 22 | except Exception: 23 | return ERROR 24 | if not isinstance(value, list): 25 | return ERROR 26 | return value 27 | -------------------------------------------------------------------------------- /tableschema/types/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | from .any import cast_any 11 | from .array import cast_array 12 | from .boolean import cast_boolean 13 | from .date import cast_date 14 | from .datetime import cast_datetime 15 | from .duration import cast_duration 16 | from .geojson import cast_geojson 17 | from .geopoint import cast_geopoint 18 | from .integer import cast_integer 19 | from .number import cast_number 20 | from .object import cast_object 21 | from .string import cast_string 22 | from .time import cast_time 23 | from .year import cast_year 24 | from .yearmonth import cast_yearmonth 25 | -------------------------------------------------------------------------------- /.github/stale.yml: 
-------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - feature 10 | - enhancement 11 | - bug 12 | 13 | # Label to use when marking an issue as stale 14 | staleLabel: wontfix 15 | 16 | # Comment to post when marking an issue as stale. Set to `false` to disable 17 | markComment: > 18 | This issue has been automatically marked as stale because it has not had 19 | recent activity. It will be closed if no further activity occurs. Thank you 20 | for your contributions. 21 | 22 | # Comment to post when closing a stale issue. Set to `false` to disable 23 | closeComment: false 24 | -------------------------------------------------------------------------------- /tests/types/test_object.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', {}, {}), 16 | ('default', '{}', {}), 17 | ('default', {'key': 'value'}, {'key': 'value'}), 18 | ('default', '{"key": "value"}', {'key': 'value'}), 19 | ('default', '["key", "value"]', ERROR), 20 | ('default', 'string', ERROR), 21 | ('default', 1, ERROR), 22 | ('default', '3.14', ERROR), 23 | ('default', '', ERROR), 24 | ]) 25 | def test_cast_object(format, value, result): 26 | assert types.cast_object(format, value) == result 27 | -------------------------------------------------------------------------------- /tableschema/types/boolean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from ..config import ERROR 9 | 10 | 11 | # Module API 12 | 13 | def cast_boolean(format, value, **options): 14 | if not isinstance(value, bool): 15 | if isinstance(value, six.string_types): 16 | value = value.strip() 17 | if value in options.get('trueValues', _TRUE_VALUES): 18 | value = True 19 | elif value in options.get('falseValues', _FALSE_VALUES): 20 | value = False 21 | else: 22 | return ERROR 23 | return value 24 | 25 | 26 | # Internal 27 | 28 | _TRUE_VALUES = ['true', 'True', 'TRUE', '1'] 29 | _FALSE_VALUES = ['false', 'False', 'FALSE', '0'] 30 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from copy import deepcopy 9 | 10 | 11 | # Fixtures 12 | 13 | @pytest.fixture(scope='session') 14 | def apply_defaults(): 15 | def function(descriptor): 16 | descriptor = deepcopy(descriptor) 17 | # Schema descriptor 18 | if descriptor.get('fields'): 19 | 
descriptor.setdefault('missingValues', ['']) 20 | for field in descriptor['fields']: 21 | field.setdefault('type', 'string') 22 | field.setdefault('format', 'default') 23 | # Field descriptor 24 | else: 25 | descriptor.setdefault('type', 'string') 26 | descriptor.setdefault('format', 'default') 27 | return descriptor 28 | return function 29 | -------------------------------------------------------------------------------- /tableschema/types/geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | from ..profile import Profile 11 | 12 | 13 | # Module API 14 | 15 | def cast_geojson(format, value, **options): 16 | if isinstance(value, six.string_types): 17 | try: 18 | value = json.loads(value) 19 | except Exception: 20 | return ERROR 21 | if not isinstance(value, dict): 22 | return ERROR 23 | if format == 'default': 24 | try: 25 | _profile.validate(value) 26 | except Exception: 27 | return ERROR 28 | elif format == 'topojson': 29 | pass # Accept any dict as possibly topojson for now 30 | return value 31 | 32 | 33 | # Internal 34 | 35 | _profile = Profile('geojson') 36 | -------------------------------------------------------------------------------- /tests/types/test_yearmonth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', [2000, 10], (2000, 10)), 16 | ('default', (2000, 10), (2000, 10)), 17 | ('default', '2000-10', (2000, 10)), 18 | ('default', (2000, 10, 20), ERROR), 19 | ('default', '2000-13-20', ERROR), 20 | ('default', '2000-13', ERROR), 21 | ('default', '2000-0', ERROR), 22 | ('default', '13', ERROR), 23 | ('default', -10, ERROR), 24 | ('default', 20, ERROR), 25 | ('default', '3.14', ERROR), 26 | ('default', '', ERROR), 27 | ]) 28 | def test_cast_yearmonth(format, value, result): 29 | assert types.cast_yearmonth(format, value) == result 30 | -------------------------------------------------------------------------------- /tests/types/test_array.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', [], []), 16 | ('default', (), []), 17 | ('default', '[]', []), 18 | ('default', ['key', 'value'], ['key', 'value']), 19 | ('default', ('key', 'value'), ['key', 'value']), 20 | ('default', '["key", "value"]', ['key', 'value']), 21 | ('default', {'key': 'value'}, ERROR), 22 | ('default', '{"key": "value"}', ERROR), 23 | ('default', 'string', ERROR), 24 | ('default', 1, ERROR), 25 | ('default', '3.14', ERROR), 26 | ('default', '', ERROR), 27 | ]) 28 | def 
test_cast_array(format, value, result): 29 | assert types.cast_array(format, value) == result 30 | -------------------------------------------------------------------------------- /tableschema/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from . import config 7 | __version__ = config.VERSION 8 | 9 | 10 | # Module API 11 | 12 | from .cli import cli 13 | from .table import Table 14 | from .schema import Schema 15 | from .field import Field 16 | from .storage import Storage 17 | from .validate import validate 18 | from .infer import infer 19 | from .schema import FailedCast 20 | from .exceptions import DataPackageException 21 | from .exceptions import TableSchemaException 22 | from .exceptions import LoadError 23 | from .exceptions import ValidationError 24 | from .exceptions import CastError 25 | from .exceptions import IntegrityError 26 | from .exceptions import UniqueKeyError 27 | from .exceptions import RelationError 28 | from .exceptions import UnresolvedFKError 29 | from .exceptions import StorageError 30 | 31 | # Deprecated 32 | 33 | from . import exceptions 34 | -------------------------------------------------------------------------------- /tableschema/types/yearmonth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from typing import NamedTuple 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_yearmonth(format, value, **options): 15 | if isinstance(value, (tuple, list)): 16 | if len(value) != 2: 17 | return ERROR 18 | value = _yearmonth(value[0], value[1]) 19 | elif isinstance(value, six.string_types): 20 | try: 21 | year, month = value.split('-') 22 | year = int(year) 23 | month = int(month) 24 | if month < 1 or month > 12: 25 | return ERROR 26 | value = _yearmonth(year, month) 27 | except Exception: 28 | return ERROR 29 | else: 30 | return ERROR 31 | return value 32 | 33 | 34 | # Internal 35 | class _yearmonth(NamedTuple): 36 | year: int 37 | month: int 38 | -------------------------------------------------------------------------------- /tests/types/test_integer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from decimal import Decimal 8 | 9 | import pytest 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result, options', [ 17 | ('default', 1, 1, {}), 18 | ('default', 1 << 63, 1 << 63, {}), 19 | ('default', '1', 1, {}), 20 | ('default', 1.0, 1, {}), 21 | ('default', Decimal('1.0'), 1, {}), 22 | ('default', '1$', 1, {'bareNumber': False}), 23 | ('default', 'ab1$', 1, {'bareNumber': False}), 24 | ('default', True, ERROR, {}), 25 | ('default', False, ERROR, {}), 26 | ('default', 3.14, ERROR, {}), 27 | ('default', '3.14', ERROR, {}), 28 | ('default', Decimal('3.14'), ERROR, {}), 29 | ('default', '', ERROR, {}), 30 | ]) 31 | def 
test_cast_integer(format, value, result, options): 32 | assert types.cast_integer(format, value, **options) == result 33 | -------------------------------------------------------------------------------- /examples/table_infer.py: -------------------------------------------------------------------------------- 1 | # pip install sqlalchemy tableschema-sql 2 | import sqlalchemy as sa 3 | from pprint import pprint 4 | from tableschema import Table 5 | 6 | # Data source 7 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 8 | 9 | # Create SQL database 10 | db = sa.create_engine('sqlite://') 11 | 12 | # Data processor 13 | def skip_under_30(erows): 14 | for number, headers, row in erows: 15 | krow = dict(zip(headers, row)) 16 | if krow['age'] >= 30: 17 | yield (number, headers, row) 18 | 19 | # Work with table 20 | table = Table(SOURCE, post_cast=[skip_under_30]) 21 | table.schema.save('tmp/persons.json') # Save INFERRED schema 22 | table.save('persons', storage='sql', engine=db) # Save data to SQL 23 | table.save('tmp/persons.csv') # Save data to DRIVE 24 | 25 | # Check the result 26 | pprint(Table('persons', storage='sql', engine=db).read(keyed=True)) 27 | pprint(Table('tmp/persons.csv').read(keyed=True)) 28 | # Will print (if use skip_under_30 filter) 29 | # [{'age': 39, 'id': 1, 'name': 'Paul'}, 30 | # {'age': 36, 'id': 3, 'name': 'Jane'}] 31 | -------------------------------------------------------------------------------- /examples/table_sql.py: -------------------------------------------------------------------------------- 1 | # pip install sqlalchemy tableschema-sql 2 | import sqlalchemy as sa 3 | from tableschema import Table 4 | 5 | # Create SQL database 6 | db = sa.create_engine('sqlite://') 7 | 8 | # Data from WEB, schema from MEMORY 9 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 10 | SCHEMA = {'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'age', 'type': 'integer'}, {'name': 'name', 'type': 'string'}] } 11 | 12 | # Open from WEB save to SQL database 13 | table = Table(SOURCE, schema=SCHEMA) 14 | table.save('articles', storage='sql', engine=db) 15 | 16 | # Open from SQL save to DRIVE 17 | table = Table('articles', storage='sql', engine=db) 18 | table.infer() 19 | table.schema.save('tmp/articles.json') 20 | table.save('tmp/articles.csv') 21 | 22 | # Open from DRIVE print to CONSOLE 23 | table = Table('tmp/articles.csv', schema='tmp/articles.json') 24 | print(table.read(keyed=True)) 25 | # Will print 26 | # [{'id': 1, 'age': 39, 'name': 'Paul'}, {'id': 2, 'age': 23, 'name': 'Jimmy'}, {'id': 3, 'age': 36, 'name': 'Jane'}, {'id': 4, 'age': 28, 'name': 'Judy'}] 27 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import pytest 10 | from tableschema import exceptions, helpers 11 | 12 | 13 | # Tests 14 | 15 | def test_retrieve_descriptor_dict(): 16 | source = {'this': 'that', 'other': ['thing']} 17 | assert helpers.retrieve_descriptor(source) 18 | 19 | 20 | def test_retrieve_descriptor_list(): 21 | source = [{'this': 'that', 'other': ['thing']}] 22 | assert helpers.retrieve_descriptor(source) 23 | 24 | 25 | def 
test_retrieve_descriptor_url(): 26 | source = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/schema_valid_full.json' 27 | assert helpers.retrieve_descriptor(source) 28 | 29 | 30 | def test_retrieve_descriptor_path(): 31 | source = 'data/schema_valid_full.json' 32 | assert helpers.retrieve_descriptor(source) 33 | 34 | 35 | def test_retrieve_descriptor_invalid(): 36 | source = 'data/data_infer.csv' 37 | with pytest.raises(exceptions.LoadError): 38 | helpers.retrieve_descriptor(source) 39 | -------------------------------------------------------------------------------- /tests/types/test_string.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 'string', 'string'), 16 | ('default', '', ''), 17 | ('default', 0, ERROR), 18 | ('uri', 'http://google.com', 'http://google.com'), 19 | ('uri', '://no-scheme.test', ERROR), 20 | ('uri', 'string', ERROR), 21 | ('uri', '', ERROR), 22 | ('uri', 0, ERROR), 23 | ('email', 'name@gmail.com', 'name@gmail.com'), 24 | ('email', 'http://google.com', ERROR), 25 | ('email', 'string', ERROR), 26 | ('email', '', ERROR), 27 | ('email', 0, ERROR), 28 | ('binary', 'dGVzdA==', 'dGVzdA=='), 29 | ('binary', '', ''), 30 | ('binary', 'string', ERROR), 31 | ('binary', 0, ERROR), 32 | ]) 33 | def test_cast_string(format, value, result): 34 | assert types.cast_string(format, value) == result 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | 23 | -------------------------------------------------------------------------------- /tableschema/types/integer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | from decimal import Decimal 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | 16 | def cast_integer(format, value, **options): 17 | if isinstance(value, six.integer_types): 18 | if value is True or value is False: 19 | return ERROR 20 | pass 21 | 22 | elif isinstance(value, six.string_types): 23 | if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): 24 | value = _RE_BARE_NUMBER.sub('', value) 25 | 26 | try: 27 | value = int(value) 28 | except Exception: 29 | return ERROR 30 | 31 | elif isinstance(value, float) and value.is_integer(): 32 | value = int(value) 33 | 34 | elif isinstance(value, Decimal) and value % 1 == 0: 35 | value = int(value) 36 | 37 | else: 38 | return ERROR 39 | 40 | return value 41 | 42 | 43 | # Internal 44 | _RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') 45 | _DEFAULT_BARE_NUMBER = True 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Extras 60 | tmp 61 | .DS_Store 62 | .idea 63 | .projectile 64 | *.sublime-project 65 | *.sublime-workspace 66 | shippable/* 67 | /docs/site 68 | .python-version 69 | tabulator 70 | jsontableschema_sql 71 | jsontableschema_bigquery 72 | jsontableschema_pandas 73 | README.rst 74 | .pytest_cache/ 75 | -------------------------------------------------------------------------------- /tests/types/test_duration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import datetime 9 | import isodate 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', isodate.Duration(years=1), isodate.Duration(years=1)), 18 | ('default', 'P1Y10M3DT5H11M7S', 19 | isodate.Duration(years=1, months=10, days=3, hours=5, minutes=11, seconds=7)), 20 | ('default', 'P1Y', isodate.Duration(years=1)), 21 | ('default', 'P1M', isodate.Duration(months=1)), 22 | ('default', 'PT1S', datetime.timedelta(seconds=1)), 23 | ('default', datetime.timedelta(seconds=1), datetime.timedelta(seconds=1)), 24 | ('default', 'P1M1Y', ERROR), 25 | ('default', 'P-1Y', ERROR), 26 | ('default', 'year', ERROR), 27 | ('default', True, ERROR), 28 | ('default', False, ERROR), 29 | ('default', 1, ERROR), 30 | ('default', '', ERROR), 31 | ('default', [], ERROR), 32 | ('default', {}, ERROR), 33 | ]) 34 | def test_cast_duration(format, value, result): 35 | assert types.cast_duration(format, value) == result 36 | -------------------------------------------------------------------------------- /tableschema/types/datetime.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_datetime(format, value, **options): 17 | if not isinstance(value, datetime): 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | if format == 'default': 22 | value = datetime.strptime(value, _DEFAULT_PATTERN) 23 | elif format == 'any': 24 | value = parse(value) 25 | else: 26 | if format.startswith('fmt:'): 27 | warnings.warn( 28 | 'Format "fmt:" is deprecated. 
' 29 | 'Please use "<PATTERN>" without "fmt:" prefix.', 30 | UserWarning) 31 | format = format.replace('fmt:', '') 32 | value = datetime.strptime(value, format) 33 | except Exception: 34 | return ERROR 35 | return value 36 | 37 | 38 | # Internal 39 | 40 | _DEFAULT_PATTERN = '%Y-%m-%dT%H:%M:%SZ' 41 | -------------------------------------------------------------------------------- /tableschema/types/time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime, time 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_time(format, value, **options): 17 | if not isinstance(value, time): 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | if format == 'default': 22 | value = datetime.strptime(value, _DEFAULT_PATTERN).time() 23 | elif format == 'any': 24 | value = parse(value).time() 25 | else: 26 | if format.startswith('fmt:'): 27 | warnings.warn( 28 | 'Format "fmt:" is deprecated. ' 29 | 'Please use "<PATTERN>" without "fmt:" prefix.', 30 | UserWarning) 31 | format = format.replace('fmt:', '') 32 | value = datetime.strptime(value, format).time() 33 | except Exception: 34 | return ERROR 35 | return value 36 | 37 | 38 | # Internal 39 | 40 | _DEFAULT_PATTERN = '%H:%M:%S' 41 | --------------------------------------------------------------------------------
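A short usage sketch for `cast_time` above — the cases are taken from `tests/types/test_time.py`, shown later in this listing:

```python
from datetime import time
from tableschema import types
from tableschema.config import ERROR

assert types.cast_time('default', '06:00:00') == time(6)  # strict %H:%M:%S
assert types.cast_time('any', '3:00 am') == time(3)       # dateutil-based parsing
assert types.cast_time('%H:%M', '06:00') == time(6)       # custom strptime pattern
assert types.cast_time('default', '09:00') == ERROR       # seconds are required
```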
/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list readme release templates test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | install: 12 | pip install --upgrade -e .[develop] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | profiles: 18 | wget -O tableschema/profiles/table-schema.json https://specs.frictionlessdata.io/schemas/table-schema.json 19 | 20 | readme: 21 | pip install md-toc 22 | pip install referencer 23 | referencer $(PACKAGE) README.md --in-place 24 | md_toc -p github --header-levels 3 README.md 25 | sed -i '/(#$(PACKAGE)-py)/,+2d' README.md 26 | 27 | release: 28 | git checkout master && git pull origin && git fetch -p 29 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20 30 | @echo "\nReleasing v$(VERSION) in 10 seconds. Press <CTRL+C> to abort\n" && sleep 10 31 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)' 32 | git push --follow-tags 33 | 34 | templates: 35 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 36 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 37 | 38 | test: 39 | pylama $(PACKAGE) 40 | pytest --cov ${PACKAGE} --cov-report term-missing --cov-fail-under 90 41 | 42 | version: 43 | @echo $(VERSION) 44 | -------------------------------------------------------------------------------- /tableschema/types/string.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | import uuid 10 | import base64 11 | import rfc3986.exceptions 12 | import rfc3986.validators 13 | import rfc3986.uri 14 | from ..config import ERROR 15 | 16 | 17 | # Module API 18 | 19 | def cast_string(format, value, **options): 20 | if not isinstance(value, six.string_types): 21 | return ERROR 22 | if format in _SIMPLE_FORMATS: 23 | return value 24 | if format == 'uri': 25 | uri = _uri_from_string(value) 26 | try: 27 | _uri_validator.validate(uri) 28 | except rfc3986.exceptions.ValidationError: 29 | return ERROR 30 | elif format == 'email': 31 | if not re.match(_EMAIL_PATTERN, value): 32 | return ERROR 33 | elif format == 'uuid': 34 | try: 35 | uuid.UUID(value, version=4) 36 | except Exception: 37 | return ERROR 38 | elif format == 'binary': 39 | try: 40 | base64.b64decode(value) 41 | except Exception: 42 | return ERROR 43 | return value 44 | 45 | 46 | # Internal 47 | 48 | _SIMPLE_FORMATS = {'default', None} 49 | _EMAIL_PATTERN = re.compile(r'[^@]+@[^@]+\.[^@]+') 50 | _uri_from_string = rfc3986.uri.URIReference.from_string 51 | _uri_validator = rfc3986.validators.Validator().require_presence_of('scheme') 52 | --------------------------------------------------------------------------------
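The string caster above validates rather than converts — the value is returned unchanged if it satisfies the format. A few illustrative calls, with cases taken from `tests/types/test_string.py` earlier in this listing:

```python
from tableschema import types
from tableschema.config import ERROR

assert types.cast_string('default', 'string') == 'string'
assert types.cast_string('uri', 'http://google.com') == 'http://google.com'
assert types.cast_string('uri', 'string') == ERROR            # scheme is required
assert types.cast_string('email', 'name@gmail.com') == 'name@gmail.com'
assert types.cast_string('binary', 'dGVzdA==') == 'dGVzdA=='  # valid base64
```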
/tableschema/types/date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime, date 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_date(format, value, **options): 17 | if isinstance(value, datetime): 18 | value_time = value.time() 19 | if value_time.hour == 0 and value_time.minute == 0 and value_time.second == 0: 20 | return datetime(value.year, value.month, value.day).date() 21 | else: 22 | return ERROR 23 | 24 | if isinstance(value, date): 25 | return value 26 | 27 | if not isinstance(value, six.string_types): 28 | return ERROR 29 | 30 | # Parse string date 31 | try: 32 | if format == 'default': 33 | value = datetime.strptime(value, _DEFAULT_PATTERN).date() 34 | elif format == 'any': 35 | value = parse(value).date() 36 | else: 37 | if format.startswith('fmt:'): 38 | warnings.warn( 39 | 'Format "fmt:" is deprecated. ' 40 | 'Please use "<PATTERN>" without "fmt:" prefix.', 41 | UserWarning) 42 | format = format.replace('fmt:', '') 43 | value = datetime.strptime(value, format).date() 44 | except Exception: 45 | return ERROR 46 | 47 | return value 48 | 49 | 50 | # Internal 51 | 52 | _DEFAULT_PATTERN = '%Y-%m-%d' 53 | -------------------------------------------------------------------------------- /tests/types/test_boolean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result, options', [ 15 | ('default', True, True, {}), 16 | ('default', 'true', True, {}), 17 | ('default', 'True', True, {}), 18 | ('default', 'TRUE', True, {}), 19 | ('default', '1', True, {}), 20 | ('default', 'yes', True, {'trueValues': ['yes']}), 21 | ('default', False, False, {}), 22 | ('default', 'false', False, {}), 23 | ('default', 'False', False, {}), 24 | ('default', 'FALSE', False, {}), 25 | ('default', '0', False, {}), 26 | ('default', 'no', False, {'falseValues': ['no']}), 27 | ('default', 't', ERROR, {}), 28 | ('default', 'YES', ERROR, {}), 29 | ('default', 'Yes', ERROR, {}), 30 | ('default', 'f', ERROR, {}), 31 | ('default', 'NO', ERROR, {}), 32 | ('default', 'No', ERROR, {}), 33 | ('default', 0, ERROR, {}), 34 | ('default', 1, ERROR, {}), 35 | ('default', 0, False, {'falseValues': [0], 'trueValues': [1]}), 36 | ('default', 1, True, {'falseValues': [0], 'trueValues': [1]}), 37 | ('default', '3.14', ERROR, {}), 38 | ('default', '', ERROR, {}), 39 | ('default', 'Yes', ERROR, {'trueValues': ['yes']}), 40 | ('default', 'No', ERROR, {'falseValues': ['no']}), 41 | ]) 42 | def test_cast_boolean(format, value, result, options): 43 | assert types.cast_boolean(format, value, **options) == result 44 | -------------------------------------------------------------------------------- /tableschema/types/geopoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from typing import NamedTuple 10 | from decimal import Decimal 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_geopoint(format, value, **options): 17 | 18 | # Parse 19 | if isinstance(value, six.string_types): 20 | try: 21 | if format == 'default': 22 | lon, lat = value.split(',') 23 | lon = lon.strip() 24 | lat = lat.strip() 25 | elif format == 'array': 26 | lon, lat = json.loads(value) 27 | elif format == 'object': 28 | if isinstance(value, six.string_types): 29 | value = json.loads(value) 30 | if len(value) != 2: 31 | return ERROR 32 | lon = value['lon'] 33 | lat = value['lat'] 34 | value = _geopoint(Decimal(lon), Decimal(lat)) 35 | except Exception: 36 | return ERROR 37 | 38 | # Validate 39 | try: 40 | value = _geopoint(*value) 41 | if value.lon > 180 or value.lon < -180: 42 | return ERROR 43 | if value.lat > 90 or value.lat < -90: 44 | return ERROR 45 | except Exception: 46 | return ERROR 47 | 48 | return value 49 | 50 | 51 | # Internal 52 | 53 | class _geopoint(NamedTuple): 54 | lon: Decimal 55 | lat: 
Decimal 56 | 57 | def __repr__(self): 58 | return '[%s, %s]' % (self.lon, self.lat) 59 | -------------------------------------------------------------------------------- /tests/types/test_geopoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', (180, 90), (180, 90)), 16 | ('default', [180, 90], (180, 90)), 17 | ('default', '180,90', (180, 90)), 18 | ('default', '180, -90', (180, -90)), 19 | ('default', {'lon': 180, 'lat': 90}, ERROR), 20 | ('default', '181,90', ERROR), 21 | ('default', '0,91', ERROR), 22 | ('default', 'string', ERROR), 23 | ('default', 1, ERROR), 24 | ('default', '3.14', ERROR), 25 | ('default', '', ERROR), 26 | ('array', (180, 90), (180, 90)), 27 | ('array', [180, 90], (180, 90)), 28 | ('array', '[180, -90]', (180, -90)), 29 | # ('array', {'lon': 180, 'lat': 90}, ERROR), 30 | ('array', [181, 90], ERROR), 31 | ('array', [0, 91], ERROR), 32 | ('array', '180,90', ERROR), 33 | ('array', 'string', ERROR), 34 | ('array', 1, ERROR), 35 | ('array', '3.14', ERROR), 36 | ('array', '', ERROR), 37 | # ('object', {'lon': 180, 'lat': 90}, (180, 90)), 38 | ('object', '{"lon": 180, "lat": 90}', (180, 90)), 39 | ('object', '[180, -90]', ERROR), 40 | ('object', {'lon': 181, 'lat': 90}, ERROR), 41 | ('object', {'lon': 180, 'lat': -91}, ERROR), 42 | # ('object', [180, -90], ERROR), 43 | ('object', '180,90', ERROR), 44 | ('object', 'string', ERROR), 45 | ('object', 1, ERROR), 46 | ('object', '3.14', ERROR), 47 | ('object', '', ERROR), 48 | ]) 49 | def test_cast_geopoint(format, value, result): 50 | assert types.cast_geopoint(format, value) == result 51 | -------------------------------------------------------------------------------- /tests/types/test_time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import time 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', time(6), time(6)), 18 | ('default', '06:00:00', time(6)), 19 | ('default', '09:00', ERROR), 20 | ('default', '3 am', ERROR), 21 | ('default', '3.00', ERROR), 22 | ('default', 'invalid', ERROR), 23 | ('default', True, ERROR), 24 | ('default', '', ERROR), 25 | ('any', time(6), time(6)), 26 | ('any', '06:00:00', time(6)), 27 | ('any', '3:00 am', time(3)), 28 | ('any', 'some night', ERROR), 29 | ('any', 'invalid', ERROR), 30 | ('any', True, ERROR), 31 | ('any', '', ERROR), 32 | ('%H:%M', time(6), time(6)), 33 | ('%H:%M', '06:00', time(6)), 34 | ('%M:%H', '06:50', ERROR), 35 | ('%H:%M', '3:00 am', ERROR), 36 | ('%H:%M', 'some night', ERROR), 37 | ('%H:%M', 'invalid', ERROR), 38 | ('%H:%M', True, ERROR), 39 | ('%H:%M', '', ERROR), 40 | ('invalid', '', ERROR), 41 | # Deprecated 42 | ('fmt:%H:%M', time(6), time(6)), 43 | ('fmt:%H:%M', '06:00', time(6)), 44 | ('fmt:%M:%H', '06:50', ERROR), 45 | ('fmt:%H:%M', 
'3:00 am', ERROR), 46 | ('fmt:%H:%M', 'some night', ERROR), 47 | ('fmt:%H:%M', 'invalid', ERROR), 48 | ('fmt:%H:%M', True, ERROR), 49 | ('fmt:%H:%M', '', ERROR), 50 | ]) 51 | def test_cast_time(format, value, result): 52 | with warnings.catch_warnings(): 53 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 54 | assert types.cast_time(format, value) == result 55 | -------------------------------------------------------------------------------- /.github/workflows/general.yml: -------------------------------------------------------------------------------- 1 | name: general 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - v*.*.* 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | 15 | # Test 16 | 17 | test: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.10', '3.11', '3.12'] 22 | steps: 23 | - name: Checkout repository 24 | uses: actions/checkout@v2 25 | - name: Install Python 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools wheel 33 | make install 34 | - name: Test software 35 | run: make test 36 | - name: Report coverage 37 | uses: codecov/codecov-action@v1 38 | 39 | # Release 40 | 41 | release: 42 | if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') 43 | runs-on: ubuntu-latest 44 | needs: [test] 45 | steps: 46 | - name: Checkout repository 47 | uses: actions/checkout@v2 48 | - name: Install Python 49 | uses: actions/setup-python@v2 50 | with: 51 | python-version: 3.11 52 | - name: Install dependencies 53 | run: | 54 | python -m pip install --upgrade pip 55 | pip install setuptools wheel 56 | - name: Build distribution 57 | run: | 58 | python setup.py sdist bdist_wheel 59 | - name: Publish to PYPI 60 | uses: pypa/gh-action-pypi-publish@master 61 | with: 62 | password: ${{ secrets.PYPI_API_KEY }} 63 | - name: Release to GitHub 64 | uses: softprops/action-gh-release@v1 65 | env: 66 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 67 | -------------------------------------------------------------------------------- /tableschema/types/number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | from decimal import Decimal 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | def cast_number(format, value, **options): 16 | if isinstance(value, six.string_types): 17 | group_char = options.get('groupChar', _DEFAULT_GROUP_CHAR) 18 | decimal_char = options.get('decimalChar', _DEFAULT_DECIMAL_CHAR) 19 | value = _RE_WHITESPACE.sub('', value) 20 | if decimal_char != '.': 21 | if group_char: 22 | value = value.replace(decimal_char, '__decimal_char__') 23 | value = value.replace(group_char, '') 24 | value = value.replace('__decimal_char__', '.') 25 | else: 26 | value = value.replace(decimal_char, '__decimal_char__') 27 | value = value.replace('__decimal_char__', '.') 28 | elif group_char: 29 | value = value.replace(group_char, '') 30 | 31 | if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): 32 | value = _RE_BARE_NUMBER.sub('', value) 33 | elif isinstance(value, Decimal): 34 | return value 35 | elif not isinstance(value, six.integer_types + (float,)): 36 | return 
ERROR 37 | elif value is True or value is False: 38 | return ERROR 39 | else: 40 | value = str(value) 41 | try: 42 | value = Decimal(value) 43 | except Exception: 44 | return ERROR 45 | return value 46 | 47 | 48 | # Internal 49 | 50 | _RE_WHITESPACE = re.compile(r'\s') 51 | _RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') 52 | _DEFAULT_GROUP_CHAR = '' 53 | _DEFAULT_DECIMAL_CHAR = '.' 54 | _DEFAULT_BARE_NUMBER = True 55 | -------------------------------------------------------------------------------- /tableschema/infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from . import config 10 | from .table import Table 11 | 12 | 13 | # Module API 14 | 15 | def infer(source, headers=1, limit=100, confidence=0.75, 16 | missing_values=config.DEFAULT_MISSING_VALUES, 17 | guesser_cls=None, resolver_cls=None, 18 | **options): 19 | """Infer source schema. 20 | 21 | # Arguments 22 | source (any): source as path, url or inline data 23 | headers (int/str[]): headers rows number or headers list 24 | confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1) 25 | missing_values (str[]): list of missing values (by default `['']`) 26 | guesser_cls (class): you can implement inferring strategies by 27 | providing type-guessing and type-resolving classes [experimental] 28 | resolver_cls (class): you can implement inferring strategies by 29 | providing type-guessing and type-resolving classes [experimental] 30 | 31 | # Raises 32 | TableSchemaException: raises any error that occurs during the process 33 | 34 | # Returns 35 | dict: returns schema descriptor 36 | 37 | """ 38 | 39 | # Deprecated arguments order 40 | is_string = lambda value: isinstance(value, six.string_types) 41 | if isinstance(source, list) and all(map(is_string, source)): 42 | warnings.warn('Correct arguments order infer(source, headers)', UserWarning) 43 | source, headers = headers, source 44 | 45 | table = Table(source, headers=headers, sample_size=limit, **options) 46 | descriptor = table.infer(limit=limit, confidence=confidence, 47 | missing_values=missing_values, guesser_cls=guesser_cls, 48 | resolver_cls=resolver_cls) 49 | return descriptor 50 | -------------------------------------------------------------------------------- /tests/types/test_date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import date, datetime 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', date(2019, 1, 1), date(2019, 1, 1)), 18 | ('default', '2019-01-01', date(2019, 1, 1)), 19 | ('default', '10th Jan 1969', ERROR), 20 | ('default', 'invalid', ERROR), 21 | ('default', True, ERROR), 22 | ('default', '', ERROR), 23 | ('default', datetime(2018, 1, 1), date(2018, 1, 1)), 24 | ('default', datetime(2018, 3, 1, 8, 30, 23), ERROR), 25 | ('any', date(2019, 1, 1), date(2019, 1, 1)), 26 | ('any', '2019-01-01', date(2019, 1, 1)), 27 | ('any', '10th Jan 1969', 
date(1969, 1, 10)), 28 | ('any', '10th Jan nineteen sixty nine', ERROR), 29 | ('any', 'invalid', ERROR), 30 | ('any', True, ERROR), 31 | ('any', '', ERROR), 32 | ('%d/%m/%y', date(2019, 1, 1), date(2019, 1, 1)), 33 | ('%d/%m/%y', '21/11/06', date(2006, 11, 21)), 34 | ('%y/%m/%d', '21/11/06 16:30', ERROR), 35 | ('%d/%m/%y', 'invalid', ERROR), 36 | ('%d/%m/%y', True, ERROR), 37 | ('%d/%m/%y', '', ERROR), 38 | ('invalid', '21/11/06 16:30', ERROR), 39 | # Deprecated 40 | ('fmt:%d/%m/%y', date(2019, 1, 1), date(2019, 1, 1)), 41 | ('fmt:%d/%m/%y', '21/11/06', date(2006, 11, 21)), 42 | ('fmt:%y/%m/%d', '21/11/06 16:30', ERROR), 43 | ('fmt:%d/%m/%y', 'invalid', ERROR), 44 | ('fmt:%d/%m/%y', True, ERROR), 45 | ('fmt:%d/%m/%y', '', ERROR), 46 | ]) 47 | def test_cast_date(format, value, result): 48 | with warnings.catch_warnings(): 49 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 50 | assert types.cast_date(format, value) == result 51 | -------------------------------------------------------------------------------- /tests/types/test_datetime.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import datetime 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', datetime(2014, 1, 1, 6), datetime(2014, 1, 1, 6)), 18 | ('default', '2014-01-01T06:00:00Z', datetime(2014, 1, 1, 6)), 19 | ('default', 'Mon 1st Jan 2014 9 am', ERROR), 20 | ('default', 'invalid', ERROR), 21 | ('default', True, ERROR), 22 | ('default', '', ERROR), 23 | ('any', datetime(2014, 1, 1, 6), datetime(2014, 1, 1, 6)), 24 | ('any', '10th Jan 1969 9 am', datetime(1969, 1, 10, 9)), 25 | ('any', 'invalid', ERROR), 26 | ('any', True, ERROR), 27 | ('any', '', ERROR), 28 | ('%d/%m/%y %H:%M', datetime(2006, 11, 21, 16, 30), datetime(2006, 11, 21, 16, 30)), 29 | ('%d/%m/%y %H:%M', '21/11/06 16:30', datetime(2006, 11, 21, 16, 30)), 30 | ('%H:%M %d/%m/%y', '21/11/06 16:30', ERROR), 31 | ('%d/%m/%y %H:%M', 'invalid', ERROR), 32 | ('%d/%m/%y %H:%M', True, ERROR), 33 | ('%d/%m/%y %H:%M', '', ERROR), 34 | ('invalid', '21/11/06 16:30', ERROR), 35 | # Deprecated 36 | ('fmt:%d/%m/%y %H:%M', datetime(2006, 11, 21, 16, 30), datetime(2006, 11, 21, 16, 30)), 37 | ('fmt:%d/%m/%y %H:%M', '21/11/06 16:30', datetime(2006, 11, 21, 16, 30)), 38 | ('fmt:%H:%M %d/%m/%y', '21/11/06 16:30', ERROR), 39 | ('fmt:%d/%m/%y %H:%M', 'invalid', ERROR), 40 | ('fmt:%d/%m/%y %H:%M', True, ERROR), 41 | ('fmt:%d/%m/%y %H:%M', '', ERROR), 42 | ]) 43 | def test_cast_datetime(format, value, result): 44 | with warnings.catch_warnings(): 45 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 46 | assert types.cast_datetime(format, value) == result 47 | -------------------------------------------------------------------------------- /data/data_infer_increase_limit.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 0,1 3 | 0,1 4 | 0,1 5 | 0,1 6 | 0,1 7 | 0,1 8 | 0,1 9 | 0,1 10 | 0,1 11 | 0,1 12 | 0,1 13 | 0,1 14 | 0,1 15 | 0,1 16 | 0,1 17 | 0,1 18 | 0,1 19 | 0,1 20 | 0,1 21 | 0,1 22 | 0,1 23 | 0,1 24 | 0,1 25 | 0,1 26 | 0,1 27 | 0,1 28 | 0,1 29 | 0,1 30 | 0,1 31 | 0,1 32 | 
0,1 33 | 0,1 34 | 0,1 35 | 0,1 36 | 0,1 37 | 0,1 38 | 0,1 39 | 0,1 40 | 0,1 41 | 0,1 42 | 0,1 43 | 0,1 44 | 0,1 45 | 0,1 46 | 0,1 47 | 0,1 48 | 0,1 49 | 0,1 50 | 0,1 51 | 0,1 52 | 0,1 53 | 0,1 54 | 0,1 55 | 0,1 56 | 0,1 57 | 0,1 58 | 0,1 59 | 0,1 60 | 0,1 61 | 0,1 62 | 0,1 63 | 0,1 64 | 0,1 65 | 0,1 66 | 0,1 67 | 0,1 68 | 0,1 69 | 0,1 70 | 0,1 71 | 0,1 72 | 0,1 73 | 0,1 74 | 0,1 75 | 0,1 76 | 0,1 77 | 0,1 78 | 0,1 79 | 0,1 80 | 0,1 81 | 0,1 82 | 0,1 83 | 0,1 84 | 0,1 85 | 0,1 86 | 0,1 87 | 0,1 88 | 0,1 89 | 0,1 90 | 0,1 91 | 0,1 92 | 0,1 93 | 0,1 94 | 0,1 95 | 0,1 96 | 0,1 97 | 0,1 98 | 0,1 99 | 0,1 100 | 0,1 101 | 0,1.1 102 | 0,1.1 103 | 0,1.1 104 | 0,1.1 105 | 0,1.1 106 | 0,1.1 107 | 0,1.1 108 | 0,1.1 109 | 0,1.1 110 | 0,1.1 111 | 0,1.1 112 | 0,1.1 113 | 0,1.1 114 | 0,1.1 115 | 0,1.1 116 | 0,1.1 117 | 0,1.1 118 | 0,1.1 119 | 0,1.1 120 | 0,1.1 121 | 0,1.1 122 | 0,1.1 123 | 0,1.1 124 | 0,1.1 125 | 0,1.1 126 | 0,1.1 127 | 0,1.1 128 | 0,1.1 129 | 0,1.1 130 | 0,1.1 131 | 0,1.1 132 | 0,1.1 133 | 0,1.1 134 | 0,1.1 135 | 0,1.1 136 | 0,1.1 137 | 0,1.1 138 | 0,1.1 139 | 0,1.1 140 | 0,1.1 141 | 0,1.1 142 | 0,1.1 143 | 0,1.1 144 | 0,1.1 145 | 0,1.1 146 | 0,1.1 147 | 0,1.1 148 | 0,1.1 149 | 0,1.1 150 | 0,1.1 151 | 0,1.1 152 | 0,1.1 153 | 0,1.1 154 | 0,1.1 155 | 0,1.1 156 | 0,1.1 157 | 0,1.1 158 | 0,1.1 159 | 0,1.1 160 | 0,1.1 161 | 0,1.1 162 | 0,1.1 163 | 0,1.1 164 | 0,1.1 165 | 0,1.1 166 | 0,1.1 167 | 0,1.1 168 | 0,1.1 169 | 0,1.1 170 | 0,1.1 171 | 0,1.1 172 | 0,1.1 173 | 0,1.1 174 | 0,1.1 175 | 0,1.1 176 | 0,1.1 177 | 0,1.1 178 | 0,1.1 179 | 0,1.1 180 | 0,1.1 181 | 0,1.1 182 | 0,1.1 183 | 0,1.1 184 | 0,1.1 185 | 0,1.1 186 | 0,1.1 187 | 0,1.1 188 | 0,1.1 189 | 0,1.1 190 | 0,1.1 191 | 0,1.1 192 | 0,1.1 193 | 0,1.1 194 | 0,1.1 195 | 0,1.1 196 | 0,1.1 197 | 0,1.1 198 | 0,1.1 199 | 0,1.1 200 | 0,1.1 201 | -------------------------------------------------------------------------------- /tests/types/test_geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from mock import patch 8 | import pytest 9 | from tableschema import types 10 | from tableschema.config import ERROR 11 | from tableschema.profile import Profile 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', 18 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}, 19 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}), 20 | ('default', 21 | '{"geometry": null, "type": "Feature", "properties": {"\\u00c3": "\\u00c3"}}', 22 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}), 23 | ('default', {'coordinates': [0, 0, 0], 'type': 'Point'}, ERROR), 24 | ('default', 'string', ERROR), 25 | ('default', 1, ERROR), 26 | ('default', '3.14', ERROR), 27 | ('default', '', ERROR), 28 | ('default', {}, ERROR), 29 | ('default', '{}', ERROR), 30 | ('topojson', 31 | {'type': 'LineString', 'arcs': [42]}, 32 | {'type': 'LineString', 'arcs': [42]}), 33 | ('topojson', 34 | '{"type": "LineString", "arcs": [42]}', 35 | {'type': 'LineString', 'arcs': [42]}), 36 | ('topojson', 'string', ERROR), 37 | ('topojson', 1, ERROR), 38 | ('topojson', '3.14', ERROR), 39 | ('topojson', '', ERROR), 40 | ]) 41 | def test_cast_geojson(format, value, result): 42 | assert types.cast_geojson(format, value) == result 43 | 44 | 45 | @pytest.mark.parametrize('format, 
value, validates', [ 46 | ('default', '', False), 47 | ('default', '""', False), 48 | ('default', '3.14', False), 49 | ('default', '{}', True), 50 | ('default', {}, True), 51 | ]) 52 | def test_validation(format, value, validates): 53 | """Only json object shaped inputs call Profile.validate().""" 54 | err = Exception('fake validation error') 55 | with patch.object(Profile, 'validate', side_effect=err) as mock_validate: 56 | assert types.cast_geojson(format, value) == ERROR 57 | assert mock_validate.call_count == int(validates) 58 | -------------------------------------------------------------------------------- /tableschema/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import io 7 | import sys 8 | import click 9 | import tableschema 10 | import json as json_module 11 | from . import config 12 | 13 | 14 | # Module API 15 | 16 | @click.group(help='') 17 | def cli(): 18 | """Command-line interface 19 | 20 | ``` 21 | Usage: tableschema [OPTIONS] COMMAND [ARGS]... 22 | 23 | Options: 24 | --help Show this message and exit. 25 | 26 | Commands: 27 | infer Infer a schema from data. 28 | info Return info on this version of Table Schema 29 | validate Validate that a supposed schema is in fact a Table Schema. 30 | ``` 31 | 32 | """ 33 | pass 34 | 35 | 36 | @cli.command() 37 | def info(): 38 | """Return info on this version of Table Schema""" 39 | click.echo(json_module.dumps({'version': config.VERSION}, ensure_ascii=False, indent=4)) 40 | 41 | 42 | @cli.command() 43 | @click.argument('data') 44 | @click.option('--row_limit', default=100, type=int) 45 | @click.option('--confidence', default=0.75, type=float) 46 | @click.option('--encoding', default='utf-8') 47 | @click.option('--to_file') 48 | @click.option('--json', is_flag=True) 49 | def infer(data, row_limit, confidence, encoding, to_file, json): 50 | """Infer a schema from data. 
51 | 52 | - data must be a local filepath 53 | - data must be CSV 54 | - the file encoding is assumed to be UTF-8 unless an encoding is passed 55 | with --encoding 56 | - the first line of data must be headers 57 | - these constraints are just for the CLI 58 | 59 | """ 60 | try: 61 | descriptor = tableschema.infer( 62 | data, encoding=encoding, limit=row_limit, confidence=confidence) 63 | except Exception as exception: 64 | click.echo(exception) 65 | sys.exit(1) 66 | 67 | if json: 68 | return click.secho(json_module.dumps(descriptor, ensure_ascii=False, indent=4)) 69 | if to_file: 70 | with io.open(to_file, mode='w+t', encoding='utf-8') as dest: 71 | dest.write(json_module.dumps(descriptor, ensure_ascii=False, indent=4)) 72 | click.echo(descriptor) 73 | 74 | 75 | @cli.command() 76 | @click.argument('schema') 77 | def validate(schema): 78 | """Validate that a supposed schema is in fact a Table Schema.""" 79 | try: 80 | tableschema.validate(schema) 81 | click.echo("Schema is valid") 82 | sys.exit(0) 83 | except tableschema.exceptions.ValidationError as exception: 84 | click.echo("Schema is not valid") 85 | click.echo(exception.errors) 86 | sys.exit(1) 87 | except Exception as exception: 88 | click.echo(exception) 89 | sys.exit(1) 90 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'tableschema' 22 | NAME = PACKAGE.replace('_', '-') 23 | INSTALL_REQUIRES = [ 24 | 'six>=1.9', 25 | 'click>=3.3', 26 | 'requests>=2.5', 27 | 'cached-property>=1.5', 28 | 'python-dateutil>=2.4', 29 | 'jsonschema>=2.5', 30 | 'unicodecsv>=0.14', 31 | 'isodate>=0.5.4', 32 | 'rfc3986>=1.1.0', 33 | 'dataflows-tabulator>=1.54.1', 34 | ] 35 | TESTS_REQUIRE = [ 36 | 'mock', 37 | 'pylama', 38 | 'pytest', 39 | 'pytest-cov', 40 | ] 41 | README = read('README.md') 42 | VERSION = read(PACKAGE, 'VERSION') 43 | PACKAGES = find_packages(exclude=['examples', 'tests']) 44 | 45 | 46 | # Run 47 | setup( 48 | name=NAME, 49 | version=VERSION, 50 | packages=PACKAGES, 51 | include_package_data=True, 52 | install_requires=INSTALL_REQUIRES, 53 | tests_require=TESTS_REQUIRE, 54 | extras_require={'develop': TESTS_REQUIRE}, 55 | entry_points={ 56 | 'console_scripts': [ 57 | 'tableschema = tableschema.__main__:cli', 58 | ] 59 | }, 60 | zip_safe=False, 61 | long_description=README, 62 | long_description_content_type="text/markdown", 63 | description='A utility library for working with Table Schema in Python', 64 | author='Open Knowledge Foundation', 65 | author_email='info@okfn.org', 66 | url='https://github.com/frictionlessdata/tableschema-py', 67 | license='MIT', 68 | keywords=[ 69 | 'frictionless data', 70 | 'open data', 71 | 'json schema', 72 | 'table schema', 73 | 'data package', 74 | 'tabular data package', 75 | ], 76 | classifiers=[ 77 | 'Development Status :: 4 - Beta', 78 | 'Environment :: Web Environment', 79 | 'Intended Audience :: Developers', 80 | 'License :: OSI Approved :: MIT License', 81 | 
'Operating System :: OS Independent', 82 | 'Programming Language :: Python :: 2', 83 | 'Programming Language :: Python :: 2.7', 84 | 'Programming Language :: Python :: 3', 85 | 'Programming Language :: Python :: 3.4', 86 | 'Programming Language :: Python :: 3.5', 87 | 'Programming Language :: Python :: 3.6', 88 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 89 | 'Topic :: Software Development :: Libraries :: Python Modules' 90 | ], 91 | ) 92 | -------------------------------------------------------------------------------- /tableschema/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | class DataPackageException(Exception): 11 | """Base class for all DataPackage/TableSchema exceptions. 12 | 13 | If there are multiple errors, they can be read from the exception object: 14 | 15 | ```python 16 | try: 17 | # lib action 18 | except DataPackageException as exception: 19 | if exception.multiple: 20 | for error in exception.errors: 21 | # handle error 22 | ``` 23 | 24 | """ 25 | 26 | # Public 27 | 28 | def __init__(self, message, errors=None): 29 | self.__errors = errors or [] 30 | super(Exception, self).__init__(message) 31 | 32 | @property 33 | def multiple(self): 34 | """Whether it's a nested exception 35 | 36 | # Returns 37 | bool: whether it's a nested exception 38 | 39 | """ 40 | return bool(self.__errors) 41 | 42 | @property 43 | def errors(self): 44 | """List of nested errors 45 | 46 | # Returns 47 | DataPackageException[]: list of nested errors 48 | 49 | """ 50 | return self.__errors 51 | 52 | 53 | class TableSchemaException(DataPackageException): 54 | """Base class for all TableSchema exceptions. 55 | """ 56 | pass 57 | 58 | 59 | class LoadError(TableSchemaException): 60 | """All loading errors. 61 | """ 62 | pass 63 | 64 | 65 | class ValidationError(TableSchemaException): 66 | """All validation errors. 67 | """ 68 | pass 69 | 70 | 71 | class CastError(TableSchemaException): 72 | """All value cast errors. 73 | """ 74 | pass 75 | 76 | 77 | class IntegrityError(TableSchemaException): 78 | """All integrity errors. 79 | """ 80 | pass 81 | 82 | 83 | class UniqueKeyError(CastError): 84 | """Unique key constraint violation (CastError subclass) 85 | """ 86 | pass 87 | 88 | 89 | class RelationError(TableSchemaException): 90 | """All relations errors. 91 | """ 92 | pass 93 | 94 | 95 | class UnresolvedFKError(RelationError): 96 | """Unresolved foreign key reference error (RelationError subclass). 97 | """ 98 | pass 99 | 100 | 101 | class StorageError(TableSchemaException): 102 | """All storage errors. 
103 | """ 104 | pass 105 | 106 | 107 | # Deprecated 108 | 109 | MultipleInvalid = TableSchemaException 110 | InvalidJSONError = LoadError 111 | SchemaValidationError = ValidationError 112 | InvalidSchemaError = ValidationError 113 | InvalidCastError = CastError 114 | ConstraintError = CastError 115 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import ast 8 | import pytest 9 | from click.testing import CliRunner 10 | from tableschema import Schema 11 | from tableschema.cli import infer, validate 12 | os.environ['LC_ALL'] = 'en_US.UTF-8' 13 | 14 | 15 | # Tests 16 | 17 | def test_infer_schema(): 18 | runner = CliRunner() 19 | result = runner.invoke(infer, ['data/data_infer.csv']) 20 | # output is a string, evaluate to a dict 21 | schema = ast.literal_eval(result.output) 22 | schema_model = Schema(schema) 23 | assert schema_model.get_field('id').type == 'integer' 24 | assert schema_model.get_field('age').type == 'integer' 25 | assert schema_model.get_field('name').type == 'string' 26 | 27 | 28 | def test_infer_schema_utf8(): 29 | """UTF8 encoded data containing non-ascii characters.""" 30 | runner = CliRunner() 31 | result = runner.invoke(infer, ['data/data_infer_utf8.csv']) 32 | # output is a string, evaluate to a dict 33 | schema = ast.literal_eval(result.output) 34 | schema_model = Schema(schema) 35 | assert schema_model.get_field('id').type == 'integer' 36 | assert schema_model.get_field('age').type == 'integer' 37 | assert schema_model.get_field('name').type == 'string' 38 | 39 | 40 | def test_infer_schema_greek(): 41 | """iso-8859-7 (greek) encoded data containing non-ascii characters.""" 42 | runner = CliRunner() 43 | result = runner.invoke(infer, 44 | ['data/data_infer_iso-8859-7.csv', 45 | '--encoding=iso-8859-7']) 46 | # output is a string, evaluate to a dict 47 | schema = ast.literal_eval(result.output) 48 | schema_model = Schema(schema) 49 | assert schema_model.get_field('id').type == 'integer' 50 | assert schema_model.get_field('age').type == 'integer' 51 | assert schema_model.get_field('name').type == 'string' 52 | 53 | def test_validate_schema(): 54 | runner = CliRunner() 55 | result = runner.invoke(validate, ['data/schema_valid_simple.json']) 56 | assert result.output.splitlines()[0] == 'Schema is valid' 57 | assert result.exit_code == 0 58 | result = runner.invoke(validate, ['data/schema_invalid_pk_no_fields.json']) 59 | assert result.output.splitlines()[0] == 'Schema is not valid' 60 | assert result.exit_code == 1 61 | 62 | 63 | @pytest.mark.skip 64 | def test_infer_schema_greek_no_encoding_defined(): 65 | """iso-8859-7 (greek) encoded data containing non-ascii characters, 66 | with no encoding arg passed returns an error message.""" 67 | runner = CliRunner() 68 | result = runner.invoke(cli.infer, ['data/data_infer_iso-8859-7.csv']) 69 | # There's an exception in the result 70 | assert 'Could not decode the data file as utf-8.' 
in result.output 71 | -------------------------------------------------------------------------------- /tests/types/test_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from decimal import Decimal 9 | from tableschema import types 10 | from tableschema.config import ERROR 11 | 12 | 13 | # Tests 14 | 15 | @pytest.mark.parametrize('format, value, result, options', [ 16 | ('default', Decimal(1), Decimal(1), {}), 17 | ('default', 1, Decimal(1), {}), 18 | ('default', 1.0, Decimal(1), {}), 19 | ('default', 1 << 63, Decimal(1 << 63), {}), 20 | ('default', '1', Decimal(1), {}), 21 | ('default', '10.00', Decimal(10), {}), 22 | ('default', '10.50', Decimal(10.5), {}), 23 | ('default', 24.122667, Decimal('24.122667'), {}), 24 | ('default', '100%', Decimal(100), {'bareNumber': False}), 25 | ('default', '1000‰', Decimal(1000), {'bareNumber': False}), 26 | ('default', '-1000', Decimal(-1000), {}), 27 | ('default', '1,000', Decimal(1000), {'groupChar': ','}), 28 | ('default', '10,000.00', Decimal(10000), {'groupChar': ','}), 29 | ('default', '10,000,000.50', Decimal(10000000.5), {'groupChar': ','}), 30 | ('default', '10#000.00', Decimal(10000), {'groupChar': '#'}), 31 | ('default', '10#000#000.50', Decimal(10000000.5), {'groupChar': '#'}), 32 | ('default', '10.50', Decimal(10.5), {'groupChar': '#'}), 33 | ('default', '1#000', Decimal(1000), {'groupChar': '#'}), 34 | ('default', '10#000@00', Decimal(10000), {'groupChar': '#', 'decimalChar': '@'}), 35 | ('default', '10#000#000@50', Decimal(10000000.5), {'groupChar': '#', 'decimalChar': '@'}), 36 | ('default', '10@50', Decimal(10.5), {'groupChar': '#', 'decimalChar': '@'}), 37 | ('default', '1#000', Decimal(1000), {'groupChar': '#', 'decimalChar': '@'}), 38 | ('default', '10,000.00', Decimal(10000), {'groupChar': ',', 'bareNumber': False}), 39 | ('default', '10,000,000.00', Decimal(10000000), {'groupChar': ',', 'bareNumber': False}), 40 | ('default', '10.000.000,00', Decimal(10000000), {'groupChar': '.', 'decimalChar': ','}), 41 | ('default', '$10000.00', Decimal(10000), {'bareNumber': False}), 42 | ('default', ' 10,000.00 €', Decimal(10000), {'groupChar': ',', 'bareNumber': False}), 43 | ('default', '10 000,00', Decimal(10000), {'groupChar': ' ', 'decimalChar': ','}), 44 | ('default', '10 000 000,00', Decimal(10000000), {'groupChar': ' ', 'decimalChar': ','}), 45 | ('default', '10000,00 ₪', Decimal(10000), {'groupChar': ' ', 'decimalChar': ',', 'bareNumber': False}), 46 | ('default', ' 10 000,00 £', Decimal(10000), {'groupChar': ' ', 'decimalChar': ',', 'bareNumber': False}), 47 | ('default', True, ERROR, {}), 48 | ('default', False, ERROR, {}), 49 | ('default', '10,000a.00', ERROR, {}), 50 | ('default', '10+000.00', ERROR, {}), 51 | ('default', '$10:000.00', ERROR, {}), 52 | ('default', 'string', ERROR, {}), 53 | ('default', '', ERROR, {}), 54 | ]) 55 | def test_cast_number(format, value, result, options): 56 | assert types.cast_number(format, value, **options) == result 57 | -------------------------------------------------------------------------------- /tests/test_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import 
print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | from tableschema import infer 10 | 11 | 12 | # Tests 13 | 14 | def test_infer_schema(): 15 | descriptor = infer('data/data_infer.csv') 16 | assert descriptor == { 17 | 'fields': [ 18 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 19 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 20 | {'name': 'name', 'type': 'string', 'format': 'default'}], 21 | 'missingValues': [''], 22 | } 23 | 24 | 25 | def test_infer_schema_utf8(): 26 | descriptor = infer('data/data_infer_utf8.csv') 27 | assert descriptor == { 28 | 'fields': [ 29 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 30 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 31 | {'name': 'name', 'type': 'string', 'format': 'default'}], 32 | 'missingValues': [''], 33 | } 34 | 35 | 36 | def test_infer_schema_with_row_limit(): 37 | descriptor = infer('data/data_infer_row_limit.csv', limit=4) 38 | assert descriptor == { 39 | 'fields': [ 40 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 41 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 42 | {'name': 'name', 'type': 'string', 'format': 'default'}], 43 | 'missingValues': [''], 44 | } 45 | 46 | 47 | def test_infer_schema_with_missing_values_default(): 48 | descriptor = infer('data/data_infer_missing_values.csv') 49 | assert descriptor == { 50 | 'fields': [ 51 | {'name': 'id', 'type': 'string', 'format': 'default'}, 52 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 53 | {'name': 'name', 'type': 'string', 'format': 'default'}], 54 | 'missingValues': [''], 55 | } 56 | 57 | 58 | def test_infer_schema_with_missing_values_using_the_argument(): 59 | descriptor = infer('data/data_infer_missing_values.csv', missing_values=['-']) 60 | assert descriptor == { 61 | 'fields': [ 62 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 63 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 64 | {'name': 'name', 'type': 'string', 'format': 'default'}], 65 | 'missingValues': ['-'], 66 | } 67 | 68 | 69 | def test_infer_check_type_boolean_string_tie(): 70 | descriptor = infer([['f'], ['stringish']], headers=['field']) 71 | assert descriptor['fields'][0]['type'] == 'string' 72 | 73 | 74 | def test_infer_xlsx_file_with_boolean_column_issue_203(): 75 | descriptor = infer('data/data_infer_boolean.xlsx') 76 | assert descriptor == { 77 | 'fields': [ 78 | {'name': 'number', 'type': 'integer', 'format': 'default'}, 79 | {'name': 'string', 'type': 'string', 'format': 'default'}, 80 | {'name': 'boolean', 'type': 'boolean', 'format': 'default'}], 81 | 'missingValues': [''], 82 | } 83 | 84 | 85 | def test_infer_increase_limit_issue_212(): 86 | descriptor = infer('data/data_infer_increase_limit.csv', limit=200) 87 | assert descriptor == { 88 | 'fields': [ 89 | {'name': 'a', 'type': 'integer', 'format': 'default'}, 90 | {'name': 'b', 'type': 'number', 'format': 'default'}, 91 | ], 92 | 'missingValues': [''], 93 | } 94 | -------------------------------------------------------------------------------- /data/schema_valid_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "first_name", 5 | "title": "First Name", 6 | "type": "string", 7 | "description": "The first name of the person" 8 | }, 9 | { 10 | "name": "last_name", 11 | "title": "Last Name", 12 | "type": "string", 13 | "description": "The last name of the person" 14 | }, 15 | { 16 | "name": "gender", 17 | "title": "Gender", 18 
| "type": "string", 19 | "description": "The gender of the person." 20 | }, 21 | { 22 | "name": "age", 23 | "title": "Age", 24 | "type": "integer", 25 | "description": "The age of this person." 26 | }, 27 | { 28 | "name": "period_employed", 29 | "title": "Period Employed", 30 | "type": "number", 31 | "description": "The period of employment, in years (eg: 2.6 Y)." 32 | }, 33 | { 34 | "name": "employment_start", 35 | "title": "Employment Start", 36 | "type": "date", 37 | "description": "The date this person started employment." 38 | }, 39 | { 40 | "name": "daily_start", 41 | "title": "Daily Start", 42 | "type": "time", 43 | "description": "Usual start time for this person." 44 | }, 45 | { 46 | "name": "daily_end", 47 | "title": "Daily End", 48 | "type": "time", 49 | "description": "Usual end time for this person." 50 | }, 51 | { 52 | "name": "is_management", 53 | "title": "Is Management", 54 | "type": "boolean", 55 | "description": "Is this person part of upper management." 56 | }, 57 | { 58 | "name": "photo", 59 | "title": "Photo", 60 | "type": "string", 61 | "format": "binary", 62 | "description": "A photo of this person." 63 | }, 64 | { 65 | "name": "interests", 66 | "title": "Interests", 67 | "type": "array", 68 | "description": "Declared interests of this person (work-related)." 69 | }, 70 | { 71 | "name": "home_location", 72 | "title": "Home Location", 73 | "type": "geopoint", 74 | "description": "A geopoint for this person's home address." 75 | }, 76 | { 77 | "name": "position_title", 78 | "title": "Position Title", 79 | "type": "string", 80 | "description": "This person's position in the company." 81 | }, 82 | { 83 | "name": "extra", 84 | "title": "Extra", 85 | "type": "object", 86 | "description": "Extra information about this person." 87 | }, 88 | { 89 | "name": "notes", 90 | "title": "Notes", 91 | "type": "any", 92 | "description": "Add any relevant notes for HR." 
93 | } 94 | ], 95 | "primaryKey": [ 96 | "first_name", 97 | "last_name", 98 | "period_employed", 99 | "home_location" 100 | ], 101 | "foreignKeys": [ 102 | { 103 | "fields": ["position_title"], 104 | "reference": { 105 | "resource": "positions", 106 | "fields": ["name"] 107 | } 108 | } 109 | ] 110 | } 111 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import pytest 11 | from tableschema import validate, exceptions 12 | 13 | 14 | # Tests 15 | 16 | def test_schema_valid_simple(): 17 | valid = validate('data/schema_valid_simple.json') 18 | assert valid 19 | 20 | 21 | def test_schema_valid_full(): 22 | valid = validate('data/schema_valid_full.json') 23 | assert valid 24 | 25 | 26 | def test_schema_valid_pk_array(): 27 | valid = validate('data/schema_valid_pk_array.json') 28 | assert valid 29 | 30 | 31 | def test_schema_invalid_empty(): 32 | with pytest.raises(exceptions.ValidationError): 33 | valid = validate('data/schema_invalid_empty.json') 34 | 35 | 36 | def test_schema_invalid_wrong_type(): 37 | with pytest.raises(exceptions.ValidationError): 38 | valid = validate([]) 39 | 40 | 41 | def test_schema_invalid_pk_string(): 42 | with pytest.raises(exceptions.ValidationError): 43 | valid = validate('data/schema_invalid_pk_string.json') 44 | 45 | 46 | def test_schema_invalid_pk_array(): 47 | with pytest.raises(exceptions.ValidationError): 48 | valid = validate('data/schema_invalid_pk_array.json') 49 | 50 | 51 | def test_schema_valid_fk_array(): 52 | valid = validate('data/schema_valid_fk_array.json') 53 | assert valid 54 | 55 | 56 | def test_schema_invalid_fk_string(): 57 | with pytest.raises(exceptions.ValidationError): 58 | valid = validate('data/schema_invalid_fk_string.json') 59 | 60 | 61 | def test_schema_invalid_fk_no_reference(): 62 | with pytest.raises(exceptions.ValidationError): 63 | valid = validate('data/schema_invalid_fk_no_reference.json') 64 | 65 | 66 | def test_schema_invalid_fk_array(): 67 | with pytest.raises(exceptions.ValidationError): 68 | valid = validate('data/schema_invalid_fk_array.json') 69 | 70 | 71 | def test_schema_invalid_fk_ref_is_an_array_fields_is_a_string(): 72 | with pytest.raises(exceptions.ValidationError): 73 | valid = validate('data/schema_invalid_fk_string_array_ref.json') 74 | 75 | 76 | def test_schema_invalid_fk_reference_is_a_string_fields_is_an_array(): 77 | with pytest.raises(exceptions.ValidationError): 78 | valid = validate('data/schema_invalid_fk_array_string_ref.json') 79 | 80 | 81 | def test_schema_invalid_fk_reference_array_number_mismatch(): 82 | with pytest.raises(exceptions.ValidationError): 83 | valid = validate('data/schema_invalid_fk_array_wrong_number.json') 84 | 85 | 86 | def test_primary_key_is_not_a_valid_type(): 87 | with pytest.raises(exceptions.ValidationError) as excinfo: 88 | valid = validate('data/schema_invalid_pk_is_wrong_type.json') 89 | assert len(excinfo.value.errors) == 2 90 | 91 | 92 | def test_schema_multiple_errors_no_fail_fast_true(): 93 | with pytest.raises(exceptions.ValidationError) as excinfo: 94 | valid = validate('data/schema_invalid_multiple_errors.json') 95 | assert len(excinfo.value.errors) == 5 96 | 97 | 98 | def 
test_validate_error_message(): 99 | descriptor = { 100 | 'fields': [ 101 | {'name': 'name', 'type': 'other'}, 102 | ], 103 | } 104 | with pytest.raises(exceptions.ValidationError) as excinfo: 105 | validate(descriptor) 106 | message = str(excinfo.value.errors[0]) 107 | assert 'Descriptor validation error' in message 108 | assert 'at "fields/0" in descriptor' in message 109 | assert 'at "properties/fields/items/anyOf" in profile' in message 110 | 111 | -------------------------------------------------------------------------------- /tableschema/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import sys 10 | import six 11 | import json 12 | import requests 13 | from copy import deepcopy 14 | from importlib.util import find_spec 15 | from . import exceptions 16 | from . import config 17 | 18 | 19 | # Retrieve descriptor 20 | 21 | def retrieve_descriptor(source): 22 | 23 | try: 24 | # Inline 25 | if isinstance(source, (dict, list)): 26 | return deepcopy(source) 27 | 28 | # String 29 | if isinstance(source, six.string_types): 30 | # Remote 31 | if six.moves.urllib.parse.urlparse(source).scheme in config.REMOTE_SCHEMES: 32 | return requests.get(source).json() 33 | 34 | # Local 35 | with io.open(source, encoding='utf-8') as file: 36 | return json.load(file) 37 | 38 | # Stream 39 | return json.load(source) 40 | 41 | except Exception: 42 | raise exceptions.LoadError('Can\'t load descriptor') 43 | 44 | 45 | # Expand descriptor 46 | 47 | def expand_schema_descriptor(descriptor): 48 | if isinstance(descriptor, dict): 49 | descriptor = deepcopy(descriptor) 50 | for field in descriptor.get('fields', []): 51 | field.setdefault('type', config.DEFAULT_FIELD_TYPE) 52 | field.setdefault('format', config.DEFAULT_FIELD_FORMAT) 53 | descriptor.setdefault('missingValues', config.DEFAULT_MISSING_VALUES) 54 | return descriptor 55 | 56 | 57 | def expand_field_descriptor(descriptor): 58 | descriptor = deepcopy(descriptor) 59 | descriptor.setdefault('type', config.DEFAULT_FIELD_TYPE) 60 | descriptor.setdefault('format', config.DEFAULT_FIELD_FORMAT) 61 | return descriptor 62 | 63 | 64 | # Miscellaneous 65 | 66 | def ensure_dir(path): 67 | """Ensure directory exists. 68 | 69 | Args: 70 | path(str): dir path 71 | 72 | """ 73 | dirpath = os.path.dirname(path) 74 | if dirpath and not os.path.exists(dirpath): 75 | os.makedirs(dirpath) 76 | 77 | 78 | def normalize_value(value): 79 | """Convert value to string and make it lower cased. 80 | """ 81 | cast = str 82 | if six.PY2: 83 | cast = unicode # noqa 84 | return cast(value).lower() 85 | 86 | 87 | def default_exc_handler(exc, *args, **kwargs): 88 | """Default exception handler function: raise exc, ignore other arguments. 89 | """ 90 | raise exc 91 | 92 | 93 | class PluginImporter(object): 94 | """Plugin importer. 
95 | 96 | Example: 97 | Add to myapp.plugins something like this: 98 | ``` 99 | importer = PluginImporter(virtual='myapp.plugins.', actual='myapp_') 100 | importer.register() 101 | del PluginImporter 102 | del importer 103 | ``` 104 | 105 | """ 106 | 107 | # Public 108 | 109 | def __init__(self, virtual, actual): 110 | self.__virtual = virtual 111 | self.__actual = actual 112 | 113 | def __eq__(self, other): 114 | if not isinstance(other, type(self)): 115 | return False 116 | return (self.virtual == other.virtual and 117 | self.actual == other.actual) 118 | 119 | @property 120 | def virtual(self): 121 | return self.__virtual 122 | 123 | @property 124 | def actual(self): 125 | return self.__actual 126 | 127 | def register(self): 128 | if self not in sys.meta_path: 129 | sys.meta_path.append(self) 130 | 131 | def find_spec(self, fullname, path=None, target=None): 132 | if fullname.startswith(self.virtual): 133 | # Transform the module name 134 | transformed_name = fullname.replace(self.virtual, self.actual) 135 | return find_spec(transformed_name) 136 | return None 137 | 138 | -------------------------------------------------------------------------------- /tableschema/storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from importlib import import_module 9 | from abc import ABCMeta, abstractmethod 10 | from . import exceptions 11 | 12 | 13 | # Module API 14 | 15 | @add_metaclass(ABCMeta) 16 | class Storage(object): 17 | """Storage factory/interface 18 | 19 | # For users 20 | 21 | > Use `Storage.connect` to instantiate a storage 22 | 23 | For instantiation of concrete storage instances, 24 | `tableschema.Storage` provides a unified factory method `connect` 25 | (which uses the plugin system under the hood): 26 | 27 | ```python 28 | # pip install tableschema_sql 29 | from tableschema import Storage 30 | 31 | storage = Storage.connect('sql', **options) 32 | storage.create('bucket', descriptor) 33 | storage.write('bucket', rows) 34 | storage.read('bucket') 35 | ``` 36 | 37 | # For integrators 38 | 39 | The library includes an interface declaration for implementing tabular `Storage` backends. 40 | This interface allows different data storage systems like SQL to be used 41 | with the `tableschema.Table` class (load/save), as well as at the data package level: 42 | 43 | ![Storage](https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/storage.png) 44 | 45 | An implementor must follow the `tableschema.Storage` interface 46 | to write their own storage backend. Concrete storage backends 47 | may include additional functionality specific to a concrete storage system. 48 | See `plugins` below to learn how to integrate a custom storage plugin into your workflow. 49 | 50 | """ 51 | 52 | # Public 53 | 54 | @abstractmethod 55 | def __init__(self, **options): 56 | pass 57 | 58 | @classmethod 59 | def connect(cls, name, **options): 60 | """Create tabular `storage` based on storage name. 
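        For example, `Storage.connect('sql', **options)` imports the `tableschema.plugins.sql` module (provided by the separately installed `tableschema_sql` plugin) and instantiates its `Storage` class with the given backend-specific options.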
61 | 62 | > This method is static: `Storage.connect()` 63 | 64 | # Arguments 65 | name (str): storage name like `sql` 66 | options (dict): concrete storage options 67 | 68 | # Raises 69 | StorageError: raises on any error 70 | 71 | # Returns 72 | Storage: returns `Storage` instance 73 | 74 | """ 75 | if cls is not Storage: 76 | message = 'Storage.connect is not available on concrete implementations' 77 | raise exceptions.StorageError(message) 78 | module = 'tableschema.plugins.%s' % name 79 | storage = import_module(module).Storage(**options) 80 | return storage 81 | 82 | @property 83 | @abstractmethod 84 | def buckets(self): 85 | """Return list of storage bucket names. 86 | 87 | A `bucket` is a special term which has almost the same meaning as `table`. 88 | You should consider `bucket` as a `table` stored in the `storage`. 89 | 90 | # Raises 91 | exceptions.StorageError: raises on any error 92 | 93 | # Returns 94 | str[]: return list of bucket names 95 | 96 | """ 97 | pass 98 | 99 | @abstractmethod 100 | def create(self, bucket, descriptor, force=False): 101 | """Create one/multiple buckets. 102 | 103 | # Arguments 104 | bucket (str/list): bucket name or list of bucket names 105 | descriptor (dict/dict[]): schema descriptor or list of descriptors 106 | force (bool): whether to delete and re-create already existing buckets 107 | 108 | # Raises 109 | exceptions.StorageError: raises on any error 110 | 111 | """ 112 | pass 113 | 114 | @abstractmethod 115 | def delete(self, bucket=None, ignore=False): 116 | """ Delete one/multiple/all buckets. 117 | 118 | # Arguments 119 | bucket (str/list/None): bucket name or list of bucket names to delete. 120 | If `None`, all buckets will be deleted 121 | 122 | ignore (bool): don't raise an error on non-existent bucket deletion 123 | 124 | # Raises 125 | exceptions.StorageError: raises on any error 126 | 127 | """ 128 | pass 129 | 130 | @abstractmethod 131 | def describe(self, bucket, descriptor=None): 132 | """ Get/set bucket's Table Schema descriptor 133 | 134 | # Arguments 135 | bucket (str): bucket name 136 | descriptor (dict/None): schema descriptor to set 137 | 138 | # Raises 139 | exceptions.StorageError: raises on any error 140 | 141 | # Returns 142 | dict: returns Table Schema descriptor 143 | 144 | """ 145 | pass 146 | 147 | @abstractmethod 148 | def iter(self, bucket): 149 | """Return an iterator of typed values based on the schema of this bucket. 150 | 151 | # Arguments 152 | bucket (str): bucket name 153 | 154 | # Raises 155 | exceptions.StorageError: raises on any error 156 | 157 | # Returns 158 | list[]: yields data rows 159 | 160 | """ 161 | pass 162 | 163 | @abstractmethod 164 | def read(self, bucket): 165 | """Read typed values based on the schema of this bucket. 166 | 167 | # Arguments 168 | bucket (str): bucket name 169 | # Raises 170 | exceptions.StorageError: raises on any error 171 | # Returns 172 | list[]: returns data rows 173 | 174 | """ 175 | pass 176 | 177 | @abstractmethod 178 | def write(self, bucket, rows): 179 | """ This method writes data rows into `storage`. 180 | 181 | It should store values of unsupported types as strings internally (like csv does). 
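        For example (a minimal sketch with illustrative values; each inner list is one row, ordered to match the bucket's schema fields):

        ```python
        storage.write('bucket', [[1, 'a'], [2, 'b']])
        ```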
182 | 183 | # Arguments 184 | bucket (str): bucket name 185 | rows (list[]): data rows to write 186 | 187 | # Raises 188 | exceptions.StorageError: raises on any error 189 | 190 | """ 191 | pass 192 | -------------------------------------------------------------------------------- /tests/test_schema_constraint_field_type.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import (absolute_import, division, print_function, 3 | unicode_literals) 4 | 5 | import io 6 | import json 7 | import os 8 | 9 | import pytest 10 | 11 | from tableschema import Schema, exceptions, validate 12 | 13 | # Tests on built-in constraints - field type consistency 14 | 15 | CONSTRAINT_FIELDTYPE_TESTCASES = [ 16 | # minLength constraint (applies to collections (string, array, object)) 17 | ('minLength', {'minLength': 4}, None, True), 18 | ('minLength', {'minLength': 4}, 'any', False), 19 | ('minLength', {'minLength': 4}, 'array', True), 20 | ('minLength', {'minLength': 4}, 'boolean', False), 21 | ('minLength', {'minLength': 4}, 'date', False), 22 | ('minLength', {'minLength': 4}, 'datetime', False), 23 | ('minLength', {'minLength': 4}, 'duration', False), 24 | ('minLength', {'minLength': 4}, 'geojson', False), 25 | ('minLength', {'minLength': 4}, 'geopoint', False), 26 | ('minLength', {'minLength': 4}, 'integer', False), 27 | ('minLength', {'minLength': 4}, 'number', False), 28 | ('minLength', {'minLength': 4}, 'object', True), 29 | ('minLength', {'minLength': 4}, 'string', True), 30 | ('minLength', {'minLength': 4}, 'time', False), 31 | ('minLength', {'minLength': 4}, 'year', False), 32 | ('minLength', {'minLength': 4}, 'yearmonth', False), 33 | 34 | # maxLength constraint (applies to collections (string, array, object)) 35 | ('maxLength', {'maxLength': 3}, None, True), 36 | ('maxLength', {'maxLength': 3}, 'any', False), 37 | ('maxLength', {'maxLength': 3}, 'array', True), 38 | ('maxLength', {'maxLength': 3}, 'boolean', False), 39 | ('maxLength', {'maxLength': 3}, 'date', False), 40 | ('maxLength', {'maxLength': 3}, 'datetime', False), 41 | ('maxLength', {'maxLength': 3}, 'duration', False), 42 | ('maxLength', {'maxLength': 3}, 'geojson', False), 43 | ('maxLength', {'maxLength': 3}, 'geopoint', False), 44 | ('maxLength', {'maxLength': 3}, 'integer', False), 45 | ('maxLength', {'maxLength': 3}, 'number', False), 46 | ('maxLength', {'maxLength': 3}, 'object', True), 47 | ('maxLength', {'maxLength': 3}, 'string', True), 48 | ('maxLength', {'maxLength': 3}, 'time', False), 49 | ('maxLength', {'maxLength': 3}, 'year', False), 50 | ('maxLength', {'maxLength': 3}, 'yearmonth', False), 51 | 52 | # minimum constraint (applies to integer, number, date, time, datetime, year, yearmonth) 53 | ('minimum', {'minimum': 4}, None, False), 54 | ('minimum', {'minimum': 4}, 'any', False), 55 | ('minimum', {'minimum': 4}, 'array', False), 56 | ('minimum', {'minimum': 4}, 'boolean', False), 57 | ('minimum', {'minimum': "1789-07-14"}, 'date', True), 58 | ('minimum', {'minimum': "1789-07-14T08:00:00Z"}, 'datetime', True), 59 | ('minimum', {'minimum': 4}, 'duration', False), 60 | ('minimum', {'minimum': 4}, 'geojson', False), 61 | ('minimum', {'minimum': 4}, 'geopoint', False), 62 | ('minimum', {'minimum': 4}, 'integer', True), 63 | ('minimum', {'minimum': 4}, 'number', True), 64 | ('minimum', {'minimum': 4}, 'object', False), 65 | ('minimum', {'minimum': 4}, 'string', False), 66 | ('minimum', {'minimum': "07:07:07"}, 'time', True), 67 | ('minimum', 
{'minimum': 4}, 'year', True), 68 | ('minimum', {'minimum': "1789-07"}, 'yearmonth', True), 69 | 70 | # maximum constraint (applies to integer, number, date, time, datetime, year, yearmonth) 71 | ('maximum', {'maximum': 4}, None, False), 72 | ('maximum', {'maximum': 4}, 'any', False), 73 | ('maximum', {'maximum': 4}, 'array', False), 74 | ('maximum', {'maximum': 4}, 'boolean', False), 75 | ('maximum', {'maximum': "2001-01-01"}, 'date', True), 76 | ('maximum', {'maximum': "2001-01-01T12:00:00Z"}, 'datetime', True), 77 | ('maximum', {'maximum': 4}, 'duration', False), 78 | ('maximum', {'maximum': 4}, 'geojson', False), 79 | ('maximum', {'maximum': 4}, 'geopoint', False), 80 | ('maximum', {'maximum': 4}, 'integer', True), 81 | ('maximum', {'maximum': 4}, 'number', True), 82 | ('maximum', {'maximum': 4}, 'object', False), 83 | ('maximum', {'maximum': 4}, 'string', False), 84 | ('maximum', {'maximum': "08:09:10"}, 'time', True), 85 | ('maximum', {'maximum': 4}, 'year', True), 86 | ('maximum', {'maximum': "2001-01"}, 'yearmonth', True), 87 | 88 | # pattern constraint (applies to string) 89 | ('pattern', {'pattern': '[0-9]+'}, None, True), 90 | ('pattern', {'pattern': '[0-9]+'}, 'any', False), 91 | ('pattern', {'pattern': '[0-9]+'}, 'array', False), 92 | ('pattern', {'pattern': '[0-9]+'}, 'boolean', False), 93 | ('pattern', {'pattern': '[0-9]+'}, 'date', False), 94 | ('pattern', {'pattern': '[0-9]+'}, 'datetime', False), 95 | ('pattern', {'pattern': '[0-9]+'}, 'duration', False), 96 | ('pattern', {'pattern': '[0-9]+'}, 'geojson', False), 97 | ('pattern', {'pattern': '[0-9]+'}, 'geopoint', False), 98 | ('pattern', {'pattern': '[0-9]+'}, 'integer', False), 99 | ('pattern', {'pattern': '[0-9]+'}, 'number', False), 100 | ('pattern', {'pattern': '[0-9]+'}, 'object', False), 101 | ('pattern', {'pattern': '[0-9]+'}, 'string', True), 102 | ('pattern', {'pattern': '[0-9]+'}, 'time', False), 103 | ('pattern', {'pattern': '[0-9]+'}, 'year', False), 104 | ('pattern', {'pattern': '[0-9]+'}, 'yearmonth', False) 105 | ] 106 | 107 | 108 | @pytest.mark.parametrize("constraint_name, constraint, field_type, expected", CONSTRAINT_FIELDTYPE_TESTCASES) 109 | def test_schema_constraint_field_type(constraint_name, constraint, field_type, expected): 110 | field = { 111 | 'name': 'f', 112 | 'constraints': constraint, 113 | } 114 | if field_type is not None: 115 | field['type'] = field_type 116 | test_descriptor = {'fields': [field]} 117 | 118 | message = 'constraint "{}" can{} be applied to "{}" field' \ 119 | .format(constraint_name, "" if expected else "not", 120 | "default" if field_type is None else field_type) 121 | 122 | table_schema = Schema(descriptor=test_descriptor) 123 | assert table_schema.valid == expected, message 124 | -------------------------------------------------------------------------------- /tests/test_field.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import json 9 | import pytest 10 | import requests 11 | from functools import partial 12 | from tableschema import Field, exceptions 13 | 14 | 15 | # Constants 16 | 17 | DESCRIPTOR_MIN = {'name': 'id'} 18 | DESCRIPTOR_MAX = { 19 | 'name': 'id', 20 | 'type': 'integer', 21 | 'format': 'default', 22 | 'constraints': {'required': True}, 23 | } 24 | 25 | 26 | # Tests [general] 27 | 28 | def 
test_descriptor(apply_defaults): 29 | assert Field(DESCRIPTOR_MIN).descriptor == apply_defaults(DESCRIPTOR_MIN) 30 | 31 | 32 | def test_name(): 33 | assert Field(DESCRIPTOR_MIN).name == 'id' 34 | 35 | 36 | def test_type(): 37 | assert Field(DESCRIPTOR_MIN).type == 'string' 38 | assert Field(DESCRIPTOR_MAX).type == 'integer' 39 | 40 | 41 | def test_format(): 42 | assert Field(DESCRIPTOR_MIN).format == 'default' 43 | assert Field(DESCRIPTOR_MAX).format == 'default' 44 | 45 | 46 | def test_constraints(): 47 | assert Field(DESCRIPTOR_MIN).constraints == {} 48 | assert Field(DESCRIPTOR_MAX).constraints == {'required': True} 49 | 50 | 51 | def test_required(): 52 | assert Field(DESCRIPTOR_MIN).required == False 53 | assert Field(DESCRIPTOR_MAX).required == True 54 | 55 | 56 | def test_cast_value(): 57 | assert Field(DESCRIPTOR_MAX).cast_value('1') == 1 58 | 59 | 60 | def test_cast_value_constraint_error(): 61 | with pytest.raises(exceptions.CastError): 62 | Field(DESCRIPTOR_MAX).cast_value('') 63 | 64 | 65 | def test_cast_value_constraints_false(): 66 | assert Field(DESCRIPTOR_MIN).cast_value('', constraints=False) == None 67 | 68 | 69 | def test_cast_value_null_with_missing_values(): 70 | field = Field({'name': 'name', 'type': 'number'}, missing_values=['null']) 71 | assert field.cast_value('null') == None 72 | 73 | 74 | def test_test_value(): 75 | assert Field(DESCRIPTOR_MAX).test_value('1') == True 76 | assert Field(DESCRIPTOR_MAX).test_value('string') == False 77 | assert Field(DESCRIPTOR_MAX).test_value('') == False 78 | 79 | 80 | def test_test_value_constraints_false(): 81 | assert Field(DESCRIPTOR_MIN).test_value('', constraints=False) == True 82 | 83 | 84 | def test_missing_values(): 85 | assert Field(DESCRIPTOR_MIN).missing_values == [''] 86 | assert Field(DESCRIPTOR_MIN, missing_values=['-']).missing_values == ['-'] 87 | 88 | 89 | # Tests [missingValues] 90 | 91 | def test_string_missingValues(): 92 | field = Field({ 93 | 'name': 'name', 94 | 'type': 'string', 95 | }, missing_values=['', 'NA', 'N/A']) 96 | cast = field.cast_value 97 | assert cast('') == None 98 | assert cast('NA') == None 99 | assert cast('N/A') == None 100 | 101 | 102 | def test_number_missingValues(): 103 | field = Field({ 104 | 'name': 'name', 105 | 'type': 'number', 106 | }, missing_values=['', 'NA', 'N/A']) 107 | cast = field.cast_value 108 | assert cast('') == None 109 | assert cast('NA') == None 110 | assert cast('N/A') == None 111 | 112 | 113 | # Tests [constraints] 114 | 115 | def test_test_value_required(): 116 | field = Field({ 117 | 'name': 'name', 118 | 'type': 'string', 119 | 'constraints': {'required': True} 120 | }, missing_values=['', 'NA', 'N/A']) 121 | test = partial(field.test_value, constraints=['required']) 122 | assert test('test') == True 123 | assert test('null') == True 124 | assert test('none') == True 125 | assert test('nil') == True 126 | assert test('nan') == True 127 | assert test('NA') == False 128 | assert test('N/A') == False 129 | assert test('-') == True 130 | assert test('') == False 131 | assert test(None) == False 132 | 133 | 134 | def test_test_value_pattern(): 135 | field = Field({ 136 | 'name': 'name', 137 | 'type': 'string', 138 | 'constraints': {'pattern': '3.*'} 139 | }) 140 | test = partial(field.test_value, constraints=['pattern']) 141 | assert test('3') == True 142 | assert test('321') == True 143 | assert test('123') == False 144 | 145 | 146 | def test_test_value_unique(): 147 | field = Field({ 148 | 'name': 'name', 149 | 'type': 'integer', 150 | 'constraints': 
{'unique': True} 151 | }) 152 | test = partial(field.test_value, constraints=['unique']) 153 | assert test(30000) == True 154 | assert test('bad') == False 155 | 156 | 157 | def test_test_value_enum(): 158 | field = Field({ 159 | 'name': 'name', 160 | 'type': 'integer', 161 | 'constraints': {'enum': ['1', '2', '3']} 162 | }) 163 | test = partial(field.test_value, constraints=['enum']) 164 | assert test('1') == True 165 | assert test(1) == True 166 | assert test('4') == False 167 | assert test(4) == False 168 | 169 | 170 | def test_test_value_minimum(): 171 | field = Field({ 172 | 'name': 'name', 173 | 'type': 'integer', 174 | 'constraints': {'minimum': 1} 175 | }) 176 | test = partial(field.test_value, constraints=['minimum']) 177 | assert test('2') == True 178 | assert test(2) == True 179 | assert test('1') == True 180 | assert test(1) == True 181 | assert test('0') == False 182 | assert test(0) == False 183 | 184 | 185 | def test_test_value_maximum(): 186 | field = Field({ 187 | 'name': 'name', 188 | 'type': 'integer', 189 | 'constraints': {'maximum': 1} 190 | }) 191 | test = partial(field.test_value, constraints=['maximum']) 192 | assert test('0') == True 193 | assert test(0) == True 194 | assert test('1') == True 195 | assert test(1) == True 196 | assert test('2') == False 197 | assert test(2) == False 198 | 199 | 200 | def test_test_value_minLength(): 201 | field = Field({ 202 | 'name': 'name', 203 | 'type': 'string', 204 | 'constraints': {'minLength': 1} 205 | }) 206 | test = partial(field.test_value, constraints=['minLength']) 207 | assert test('ab') == True 208 | assert test('a') == True 209 | # Null value passes 210 | assert test('') == True 211 | 212 | 213 | def test_test_value_maxLength(): 214 | field = Field({ 215 | 'name': 'name', 216 | 'type': 'string', 217 | 'constraints': {'maxLength': 1} 218 | }) 219 | test = partial(field.test_value, constraints=['maxLength']) 220 | assert test('') == True 221 | assert test('a') == True 222 | assert test('ab') == False 223 | -------------------------------------------------------------------------------- /tableschema/field.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | from functools import partial 9 | from cached_property import cached_property 10 | from .profile import Profile 11 | from . import constraints 12 | from . import exceptions 13 | from . import helpers 14 | from . import config 15 | from . 
import types 16 | 17 | 18 | # Module API 19 | 20 | class Field(object): 21 | """Field representation 22 | 23 | # Arguments 24 | descriptor (dict): schema field descriptor 25 | missing_values (str[]): an array of strings representing missing values 26 | 27 | # Raises 28 | TableSchemaException: raises any error that occurs during the process 29 | 30 | """ 31 | 32 | # Public 33 | 34 | ERROR = config.ERROR 35 | 36 | def __init__(self, descriptor, missing_values=config.DEFAULT_MISSING_VALUES, 37 | # Internal 38 | schema=None): 39 | 40 | # Process descriptor 41 | descriptor = helpers.expand_field_descriptor(descriptor) 42 | 43 | # Set attributes 44 | self.__descriptor = descriptor 45 | self.__missing_values = missing_values 46 | self.__schema = schema 47 | self.__cast_function = self.__get_cast_function() 48 | self.__check_functions = self.__get_check_functions() 49 | self.__preserve_missing_values = os.environ.get('TABLESCHEMA_PRESERVE_MISSING_VALUES') 50 | 51 | @cached_property 52 | def schema(self): 53 | """Returns a schema instance if the field belongs to some schema 54 | 55 | # Returns 56 | Schema: field's schema 57 | 58 | """ 59 | return self.__schema 60 | 61 | @cached_property 62 | def name(self): 63 | """Field name 64 | 65 | # Returns 66 | str: field name 67 | 68 | """ 69 | return self.__descriptor.get('name') 70 | 71 | @cached_property 72 | def type(self): 73 | """Field type 74 | 75 | # Returns 76 | str: field type 77 | 78 | """ 79 | return self.__descriptor.get('type') 80 | 81 | @cached_property 82 | def format(self): 83 | """Field format 84 | 85 | # Returns 86 | str: field format 87 | 88 | """ 89 | return self.__descriptor.get('format') 90 | 91 | @cached_property 92 | def missing_values(self): 93 | """Field's missing values 94 | 95 | # Returns 96 | str[]: missing values 97 | 98 | """ 99 | return self.__missing_values 100 | 101 | @cached_property 102 | def required(self): 103 | """Whether field is required 104 | 105 | # Returns 106 | bool: true if required 107 | 108 | """ 109 | return self.constraints.get('required', False) 110 | 111 | @cached_property 112 | def constraints(self): 113 | """Field constraints 114 | 115 | # Returns 116 | dict: dict of field constraints 117 | 118 | """ 119 | return self.__descriptor.get('constraints', {}) 120 | 121 | @cached_property 122 | def descriptor(self): 123 | """Field's descriptor 124 | 125 | # Returns 126 | dict: descriptor 127 | 128 | """ 129 | return self.__descriptor 130 | 131 | @cached_property 132 | def cast_function(self): 133 | return self.__cast_function 134 | 135 | @cached_property 136 | def check_functions(self): 137 | return self.__check_functions 138 | 139 | def cast_value(self, value, constraints=True): 140 | """Cast given value according to the field type and format. 141 | 142 | # Arguments 143 | value (any): value to cast against field 144 | constraints (bool/str[]): constraints configuration 145 | - it could be set to false to disable constraint checks 146 | - it could be an array of constraints to check, e.g.
['minimum', 'maximum'] 147 | 148 | # Raises 149 | TableSchemaException: raises any error that occurs during the process 150 | 151 | # Returns 152 | any: returns cast value 153 | 154 | """ 155 | 156 | # Null value 157 | if value in self.__missing_values: 158 | # Whether missing_values should be preserved without being cast 159 | if self.__preserve_missing_values: 160 | return value 161 | value = None 162 | 163 | # Cast value 164 | cast_value = value 165 | if value is not None: 166 | cast_value = self.__cast_function(value) 167 | if cast_value == config.ERROR: 168 | raise exceptions.CastError(( 169 | 'Field "{field.name}" can\'t cast value "{value}" ' 170 | 'for type "{field.type}" with format "{field.format}"' 171 | ).format(field=self, value=value)) 172 | 173 | # Check value 174 | if constraints: 175 | for name, check in self.__check_functions.items(): 176 | if isinstance(constraints, list): 177 | if name not in constraints: 178 | continue 179 | passed = check(cast_value) 180 | if not passed: 181 | raise exceptions.CastError(( 182 | 'Field "{field.name}" has constraint "{name}" ' 183 | 'which is not satisfied for value "{value}"' 184 | ).format(field=self, name=name, value=value)) 185 | 186 | return cast_value 187 | 188 | def test_value(self, value, constraints=True): 189 | """Test whether value is compliant to the field. 190 | 191 | # Arguments 192 | value (any): value to cast against field 193 | constraints (bool/str[]): constraints configuration 194 | 195 | # Returns 196 | bool: returns if value is compliant to the field 197 | 198 | """ 199 | try: 200 | self.cast_value(value, constraints=constraints) 201 | except exceptions.CastError: 202 | return False 203 | return True 204 | 205 | # Private 206 | 207 | def __get_cast_function(self): 208 | options = {} 209 | # Get cast options 210 | for key in ['decimalChar', 'groupChar', 'bareNumber', 'trueValues', 'falseValues']: 211 | value = self.descriptor.get(key) 212 | if value is not None: 213 | options[key] = value 214 | try: 215 | cast = getattr(types, 'cast_%s' % self.type) 216 | except AttributeError: 217 | message = 'Not supported field type: %s' % self.type 218 | raise exceptions.TableSchemaException(message) 219 | cast = partial(cast, self.format, **options) 220 | return cast 221 | 222 | def __get_check_functions(self): 223 | checks = {} 224 | cast = partial(self.cast_value, constraints=False) 225 | whitelist = _get_field_constraints(self.type) 226 | for name, constraint in self.constraints.items(): 227 | if name in whitelist: 228 | # Cast enum constraint 229 | if name in ['enum']: 230 | constraint = list(map(cast, constraint)) 231 | # Cast maximum/minimum constraint 232 | if name in ['maximum', 'minimum']: 233 | constraint = cast(constraint) 234 | check = getattr(constraints, 'check_%s' % name) 235 | checks[name] = partial(check, constraint) 236 | return checks 237 | 238 | 239 | # Internal 240 | 241 | def _get_field_constraints(type): 242 | # Extract list of constraints for given type from jsonschema 243 | jsonschema = Profile('table-schema').jsonschema 244 | profile_types = jsonschema['properties']['fields']['items']['anyOf'] 245 | for profile_type in profile_types: 246 | if type in profile_type['properties']['type']['enum']: 247 | return profile_type['properties']['constraints']['properties'].keys() 248 | -------------------------------------------------------------------------------- /tableschema/profile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from 
__future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import six 10 | import json 11 | import jsonschema 12 | from jsonschema.validators import validator_for 13 | from . import exceptions 14 | 15 | 16 | # Module API 17 | 18 | class Profile(object): 19 | 20 | # Public 21 | 22 | def __init__(self, profile): 23 | self.__profile = profile 24 | self.__jsonschema = _PROFILES.get(profile) 25 | if not self.__jsonschema: 26 | message = 'Can\'t load profile "%s"' % profile 27 | raise exceptions.LoadError(message) 28 | 29 | @property 30 | def name(self): 31 | return self.__jsonschema.get('title', '').replace(' ', '-').lower() or None 32 | 33 | @property 34 | def jsonschema(self): 35 | return self.__jsonschema 36 | 37 | def validate(self, descriptor): 38 | 39 | # Other profiles 40 | if self.name != 'table-schema': 41 | return jsonschema.validate(descriptor, self.jsonschema) 42 | 43 | # Collect errors 44 | errors = [] 45 | validator = _TableSchemaValidator( 46 | self.jsonschema, format_checker=jsonschema.FormatChecker()) 47 | for error in validator.iter_errors(descriptor): 48 | if isinstance(error, jsonschema.exceptions.ValidationError): 49 | message = str(error.message) 50 | if six.PY2: 51 | message = message.replace('u\'', '\'') 52 | descriptor_path = '/'.join(map(str, error.path)) 53 | profile_path = '/'.join(map(str, error.schema_path)) 54 | error = exceptions.ValidationError( 55 | 'Descriptor validation error: %s ' 56 | 'at "%s" in descriptor and ' 57 | 'at "%s" in profile' 58 | % (message, descriptor_path, profile_path)) 59 | errors.append(error) 60 | 61 | # Raise error 62 | if errors: 63 | message = 'There are %s validation errors (see exception.errors)' % len(errors) 64 | raise exceptions.ValidationError(message, errors=errors) 65 | 66 | return True 67 | 68 | 69 | # Internal 70 | 71 | def _load_profile(filename): 72 | path = os.path.join(os.path.dirname(__file__), 'profiles', filename) 73 | profile = json.load(io.open(path, encoding='utf-8')) 74 | return profile 75 | 76 | 77 | _PROFILES = { 78 | 'table-schema': _load_profile('table-schema.json'), 79 | 'geojson': _load_profile('geojson.json'), 80 | } 81 | 82 | _CONSTRAINT_ALLOWED_FIELD_TYPE = { 83 | 'minLength': {None, 'string', 'array', 'object'}, 84 | 'maxLength': {None, 'string', 'array', 'object'}, 85 | 'minimum': {'integer', 'number', 'date', 'time', 'datetime', 'year', 'yearmonth'}, 86 | 'maximum': {'integer', 'number', 'date', 'time', 'datetime', 'year', 'yearmonth'}, 87 | 'pattern': {None, 'string'}, 88 | } 89 | 90 | 91 | class _TableSchemaValidator(validator_for(_PROFILES['table-schema'])): 92 | @classmethod 93 | def check_schema(cls, schema): 94 | # When checking against the metaschema, we do not want to run the 95 | # additional checking added in iter_errors 96 | parent_cls = cls.__bases__[0] 97 | for error in parent_cls(cls.META_SCHEMA).iter_errors(schema): 98 | raise jsonschema.exceptions.SchemaError.create_from(error) 99 | 100 | def iter_errors(self, instance, _schema=None): 101 | 102 | # iter jsonschema validation errors 103 | for error in super(_TableSchemaValidator, self).iter_errors(instance, _schema): 104 | yield jsonschema.exceptions.ValidationError( 105 | error.message, error.validator, error.path, error.cause, 106 | error.context, error.validator_value, error.instance, 107 | error.schema, error.schema_path, error.parent) 108 | 109 | # get field names 110 | try: 111 | field_names = 
[f['name'] for f in instance['fields']] 112 | except (TypeError, KeyError): 113 | field_names = [] 114 | 115 | # ensure constraint and field type consistency 116 | if isinstance(instance, dict) and instance.get('fields'): 117 | for field in instance['fields']: 118 | if not isinstance(field, dict): 119 | continue 120 | field_type = field.get('type') 121 | field_type_str = 'default' if field_type is None else field_type 122 | field_name = field.get('name', '[noname]') 123 | constraints = field.get('constraints', {}) 124 | for constraint_name in constraints: 125 | if constraint_name in _CONSTRAINT_ALLOWED_FIELD_TYPE: 126 | if field_type not in _CONSTRAINT_ALLOWED_FIELD_TYPE[constraint_name]: 127 | yield exceptions.ValidationError( 128 | "field {}: built-in {} constraint can't be applied to {} type field" 129 | .format(field_name, constraint_name, field_type_str)) 130 | 131 | # the hash MAY contain a key `primaryKey` 132 | if isinstance(instance, dict) and instance.get('primaryKey'): 133 | 134 | # ensure that the primary key matches field names 135 | if isinstance(instance['primaryKey'], six.string_types): 136 | if not instance['primaryKey'] in field_names: 137 | yield exceptions.ValidationError( 138 | 'A JSON Table Schema primaryKey value must be found in' 139 | ' the schema field names') 140 | elif isinstance(instance['primaryKey'], list): 141 | for k in instance['primaryKey']: 142 | if k not in field_names: 143 | yield exceptions.ValidationError( 144 | 'A JSON Table Schema primaryKey value must be ' 145 | 'found in the schema field names') 146 | 147 | # the hash may contain a key `foreignKeys` 148 | if isinstance(instance, dict) and instance.get('foreignKeys'): 149 | for fk in instance['foreignKeys']: 150 | 151 | # ensure that `foreignKey.fields` match field names 152 | if isinstance(fk.get('fields'), six.string_types): 153 | if fk.get('fields') not in field_names: 154 | yield exceptions.ValidationError( 155 | 'A JSON Table Schema foreignKey.fields value must ' 156 | 'correspond with field names.') 157 | elif isinstance(fk.get('fields', []), list): 158 | for field in fk.get('fields'): 159 | if field not in field_names: 160 | yield exceptions.ValidationError( 161 | 'A JSON Table Schema foreignKey.fields value ' 162 | 'must correspond with field names.') 163 | 164 | # ensure that `foreignKey.reference.fields` 165 | # matches outer `fields` 166 | if isinstance(fk.get('fields'), six.string_types): 167 | fields = fk.get('reference', {}).get('fields', {}) 168 | if not isinstance(fields, six.string_types): 169 | yield exceptions.ValidationError( 170 | 'A JSON Table Schema foreignKey.reference.fields ' 171 | 'must match field names.') 172 | else: 173 | if isinstance(fk['reference']['fields'], six.string_types): 174 | yield exceptions.ValidationError( 175 | 'A JSON Table Schema foreignKey.fields cannot ' 176 | 'be a string when foreignKey.reference.fields '
177 | 'is a string') 178 | if not (len(fk.get('fields')) == 179 | len(fk['reference']['fields'])): 180 | yield exceptions.ValidationError( 181 | 'A JSON Table Schema foreignKey.fields must ' 182 | 'contain the same number entries as ' 183 | 'foreignKey.reference.fields.') 184 | -------------------------------------------------------------------------------- /tableschema/profiles/geojson.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "id": "https://raw.githubusercontent.com/fge/sample-json-schemas/master/geojson/geojson.json#", 4 | "title": "Geo JSON object", 5 | "description": "Schema for a Geo JSON object", 6 | "type": "object", 7 | "required": [ "type" ], 8 | "properties": { 9 | "crs": { "$ref": "#/definitions/crs" }, 10 | "bbox": { "$ref": "#/definitions/bbox" } 11 | }, 12 | "oneOf": [ 13 | { "$ref": "#/definitions/geometry" }, 14 | { "$ref": "#/definitions/geometryCollection" }, 15 | { "$ref": "#/definitions/feature" }, 16 | { "$ref": "#/definitions/featureCollection" } 17 | ], 18 | "definitions": { 19 | "geometryCollection": { 20 | "title": "GeometryCollection", 21 | "description": "A collection of geometry objects", 22 | "required": [ "geometries" ], 23 | "properties": { 24 | "type": { "enum": [ "GeometryCollection" ] }, 25 | "geometries": { 26 | "type": "array", 27 | "items": { "$ref": "#/definitions/geometry" } 28 | } 29 | } 30 | }, 31 | "feature": { 32 | "title": "Feature", 33 | "description": "A Geo JSON feature object", 34 | "required": [ "geometry", "properties" ], 35 | "properties": { 36 | "type": { "enum": [ "Feature" ] }, 37 | "geometry": { 38 | "oneOf": [ 39 | { "type": "null" }, 40 | { "$ref": "#/definitions/geometry" } 41 | ] 42 | }, 43 | "properties": { "type": [ "object", "null" ] }, 44 | "id": { "FIXME": "may be there, type not known (string? 
number?)" } 45 | } 46 | }, 47 | "featureCollection": { 48 | "title": "FeatureCollection", 49 | "description": "A Geo JSON feature collection", 50 | "required": [ "features" ], 51 | "properties": { 52 | "type": { "enum": [ "FeatureCollection" ] }, 53 | "features": { 54 | "type": "array", 55 | "items": { "$ref": "#/definitions/feature" } 56 | } 57 | } 58 | }, 59 | "geometry": { 60 | "title": "geometry", 61 | "description": "One geometry as defined by GeoJSON", 62 | "type": "object", 63 | "required": [ "type", "coordinates" ], 64 | "oneOf": [ 65 | { 66 | "title": "Point", 67 | "properties": { 68 | "type": { "enum": [ "Point" ] }, 69 | "coordinates": { "$ref": "#/definitions/geometry/definitions/position" } 70 | } 71 | }, 72 | { 73 | "title": "MultiPoint", 74 | "properties": { 75 | "type": { "enum": [ "MultiPoint" ] }, 76 | "coordinates": { "$ref": "#/definitions/geometry/definitions/positionArray" } 77 | } 78 | }, 79 | { 80 | "title": "LineString", 81 | "properties": { 82 | "type": { "enum": [ "LineString" ] }, 83 | "coordinates": { "$ref": "#/definitions/geometry/definitions/lineString" } 84 | } 85 | }, 86 | { 87 | "title": "MultiLineString", 88 | "properties": { 89 | "type": { "enum": [ "MultiLineString" ] }, 90 | "coordinates": { 91 | "type": "array", 92 | "items": { "$ref": "#/definitions/geometry/definitions/lineString" } 93 | } 94 | } 95 | }, 96 | { 97 | "title": "Polygon", 98 | "properties": { 99 | "type": { "enum": [ "Polygon" ] }, 100 | "coordinates": { "$ref": "#/definitions/geometry/definitions/polygon" } 101 | } 102 | }, 103 | { 104 | "title": "MultiPolygon", 105 | "properties": { 106 | "type": { "enum": [ "MultiPolygon" ] }, 107 | "coordinates": { 108 | "type": "array", 109 | "items": { "$ref": "#/definitions/geometry/definitions/polygon" } 110 | } 111 | } 112 | } 113 | ], 114 | "definitions": { 115 | "position": { 116 | "description": "A single position", 117 | "type": "array", 118 | "minItems": 2, 119 | "items": [ { "type": "number" }, { "type": "number" } ], 120 | "additionalItems": false 121 | }, 122 | "positionArray": { 123 | "description": "An array of positions", 124 | "type": "array", 125 | "items": { "$ref": "#/definitions/geometry/definitions/position" } 126 | }, 127 | "lineString": { 128 | "description": "An array of two or more positions", 129 | "allOf": [ 130 | { "$ref": "#/definitions/geometry/definitions/positionArray" }, 131 | { "minItems": 2 } 132 | ] 133 | }, 134 | "linearRing": { 135 | "description": "An array of four positions where the first equals the last", 136 | "allOf": [ 137 | { "$ref": "#/definitions/geometry/definitions/positionArray" }, 138 | { "minItems": 4 } 139 | ] 140 | }, 141 | "polygon": { 142 | "description": "An array of linear rings", 143 | "type": "array", 144 | "items": { "$ref": "#/definitions/geometry/definitions/linearRing" } 145 | } 146 | } 147 | }, 148 | "crs": { 149 | "title": "crs", 150 | "description": "a Coordinate Reference System object", 151 | "type": [ "object", "null" ], 152 | "required": [ "type", "properties" ], 153 | "properties": { 154 | "type": { "type": "string" }, 155 | "properties": { "type": "object" } 156 | }, 157 | "additionalProperties": false, 158 | "oneOf": [ 159 | { "$ref": "#/definitions/crs/definitions/namedCrs" }, 160 | { "$ref": "#/definitions/crs/definitions/linkedCrs" } 161 | ], 162 | "definitions": { 163 | "namedCrs": { 164 | "properties": { 165 | "type": { "enum": [ "name" ] }, 166 | "properties": { 167 | "required": [ "name" ], 168 | "additionalProperties": false, 169 | "properties": { 170 | "name": { 
171 | "type": "string", 172 | "FIXME": "semantic validation necessary" 173 | } 174 | } 175 | } 176 | } 177 | }, 178 | "linkedObject": { 179 | "type": "object", 180 | "required": [ "href" ], 181 | "properties": { 182 | "href": { 183 | "type": "string", 184 | "format": "uri", 185 | "FIXME": "spec says \"dereferenceable\", cannot enforce that" 186 | }, 187 | "type": { 188 | "type": "string", 189 | "description": "Suggested values: proj4, ogjwkt, esriwkt" 190 | } 191 | } 192 | }, 193 | "linkedCrs": { 194 | "properties": { 195 | "type": { "enum": [ "link" ] }, 196 | "properties": { "$ref": "#/definitions/crs/definitions/linkedObject" } 197 | } 198 | } 199 | } 200 | }, 201 | "bbox": { 202 | "description": "A bounding box as defined by GeoJSON", 203 | "FIXME": "unenforceable constraint: even number of elements in array", 204 | "type": "array", 205 | "items": { "type": "number" } 206 | } 207 | } 208 | } 209 | 210 | -------------------------------------------------------------------------------- /tests/test_schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import pytest 11 | import requests 12 | from collections import OrderedDict 13 | from decimal import Decimal 14 | from tableschema import Schema, FailedCast, exceptions 15 | 16 | 17 | # Constants 18 | 19 | BASE_URL = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/%s' 20 | DESCRIPTOR_MIN = {'fields': [{'name': 'id'}, {'name': 'height', 'type': 'integer'}]} 21 | DESCRIPTOR_MAX = { 22 | 'fields': [ 23 | {'name': 'id', 'type': 'string', 'constraints': {'required': True}}, 24 | {'name': 'height', 'type': 'number'}, 25 | {'name': 'age', 'type': 'integer'}, 26 | {'name': 'name', 'type': 'string'}, 27 | {'name': 'occupation', 'type': 'string'}, 28 | ], 29 | 'primaryKey': ['id'], 30 | 'foreignKeys': [{'fields': ['name'], 'reference': {'resource': '', 'fields': ['id']}}], 31 | 'missingValues': ['', '-', 'null'], 32 | } 33 | 34 | 35 | # General 36 | 37 | 38 | def test_init(): 39 | assert Schema(DESCRIPTOR_MIN) 40 | assert Schema(DESCRIPTOR_MAX) 41 | assert Schema('data/schema_valid_full.json') 42 | assert Schema('data/schema_valid_simple.json') 43 | 44 | 45 | def test_init_invalid_in_strict_mode(): 46 | with pytest.raises(exceptions.TableSchemaException) as exception: 47 | Schema('data/schema_invalid_multiple_errors.json', strict=True) 48 | 49 | 50 | def test_descriptor(apply_defaults): 51 | assert Schema(DESCRIPTOR_MIN).descriptor == apply_defaults(DESCRIPTOR_MIN) 52 | assert Schema(DESCRIPTOR_MAX).descriptor == apply_defaults(DESCRIPTOR_MAX) 53 | 54 | 55 | def test_descriptor_path(apply_defaults): 56 | path = 'data/schema_valid_simple.json' 57 | actual = Schema(path).descriptor 58 | with io.open(path, encoding='utf-8') as file: 59 | expect = apply_defaults(json.load(file)) 60 | assert actual == expect 61 | 62 | 63 | def test_descriptor_url(apply_defaults): 64 | url = BASE_URL % 'data/schema_valid_simple.json' 65 | actual = Schema(url).descriptor 66 | expect = apply_defaults(requests.get(url).json()) 67 | assert actual == expect 68 | 69 | 70 | def test_descriptor_applied_defaults(): 71 | assert Schema(DESCRIPTOR_MIN).descriptor == { 72 | 'fields': [ 73 | {'name': 'id', 'type': 'string', 'format': 'default'}, 74 | {'name': 'height', 'type': 
'integer', 'format': 'default'}, 75 | ], 76 | 'missingValues': [''], 77 | } 78 | 79 | 80 | def test_cast_row(): 81 | schema = Schema(DESCRIPTOR_MAX) 82 | source = ['string', '10.0', '1', 'string', 'string'] 83 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 84 | assert schema.cast_row(source) == target 85 | 86 | 87 | def test_cast_row_null_values(): 88 | schema = Schema(DESCRIPTOR_MAX) 89 | source = ['string', '', '-', 'string', 'null'] 90 | target = ['string', None, None, 'string', None] 91 | assert schema.cast_row(source) == target 92 | 93 | 94 | def test_cast_row_too_short(): 95 | schema = Schema(DESCRIPTOR_MAX) 96 | source = ['string', '10.0', '1', 'string'] 97 | with pytest.raises(exceptions.CastError): 98 | schema.cast_row(source) 99 | 100 | 101 | def test_cast_row_too_long(): 102 | schema = Schema(DESCRIPTOR_MAX) 103 | source = ['string', '10.0', '1', 'string', 'string', 'string'] 104 | with pytest.raises(exceptions.CastError): 105 | schema.cast_row(source) 106 | 107 | 108 | def test_cast_row_wrong_type(): 109 | schema = Schema(DESCRIPTOR_MAX) 110 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 111 | with pytest.raises(exceptions.CastError): 112 | schema.cast_row(source) 113 | 114 | 115 | def test_cast_row_wrong_type_multiple_errors(): 116 | schema = Schema(DESCRIPTOR_MAX) 117 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 118 | with pytest.raises(exceptions.CastError) as excinfo: 119 | schema.cast_row(source) 120 | assert len(excinfo.value.errors) == 2 121 | 122 | 123 | def test_missing_values(): 124 | assert Schema(DESCRIPTOR_MIN).missing_values == [''] 125 | assert Schema(DESCRIPTOR_MAX).missing_values == ['', '-', 'null'] 126 | 127 | 128 | # Test row casting with exception handler i.e. don't fail immediately 129 | 130 | def _check_error( 131 | error, expect_exc_class, expect_exc_str, expect_row_number=None, 132 | expect_row_data=None, expect_error_data=None): 133 | # Helper function to check all given expectations on handled errors. 134 | # error must be a (exc, row_number, row_data, error_data)-tuple 135 | 136 | # Make this a namedtuple? 
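    # Handlers receive (exc, row_number, row_data, error_data) positionally,
    # matching the exc_handler contract of Schema.cast_row; the checks below
    # assert each piece against the given expectations.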
137 | exc, row_number, row_data, error_data = error 138 | assert isinstance(exc, expect_exc_class) 139 | assert expect_exc_str in str(exc) 140 | if expect_row_number is not None: 141 | # actual row number including header line 142 | assert row_number == expect_row_number 143 | if expect_row_data is not None: 144 | assert row_data == expect_row_data 145 | if expect_error_data is not None: 146 | assert error_data == expect_error_data 147 | 148 | 149 | def test_cast_row_handled(): 150 | schema = Schema(DESCRIPTOR_MAX) 151 | source = ['string', '10.0', '1', 'string', 'string'] 152 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 153 | errors = [] 154 | def handler(exc, row_number, row_data, error_data): 155 | errors.append((exc, row_number, row_data, error_data)) 156 | assert schema.cast_row(source, exc_handler=handler) == target 157 | assert len(errors) == 0 158 | 159 | 160 | def test_cast_row_null_values_handled(): 161 | schema = Schema(DESCRIPTOR_MAX) 162 | source = ['string', '', '-', 'string', 'null'] 163 | target = ['string', None, None, 'string', None] 164 | errors = [] 165 | def handler(exc, row_number, row_data, error_data): 166 | errors.append((exc, row_number, row_data, error_data)) 167 | assert schema.cast_row(source, exc_handler=handler) == target 168 | assert len(errors) == 0 169 | 170 | 171 | def test_cast_row_too_short_handled(): 172 | schema = Schema(DESCRIPTOR_MAX) 173 | source = ['string', '10.0', '1', 'string'] 174 | # Missing values get substituted by None 175 | target = ['string', Decimal(10.0), 1, 'string', None] 176 | errors = [] 177 | def handler(exc, row_number, row_data, error_data): 178 | errors.append((exc, row_number, row_data, error_data)) 179 | assert schema.cast_row(source, exc_handler=handler) == target 180 | assert len(errors) == 1 181 | expect_row_data = OrderedDict( 182 | [('id', 'string'), ('height', '10.0'), ('age', '1'), 183 | ('name', 'string'), ('occupation', None)]) 184 | _check_error( 185 | errors[0], expect_exc_class=exceptions.CastError, 186 | expect_exc_str='Row length', expect_row_number=None, 187 | expect_row_data=expect_row_data, expect_error_data=expect_row_data) 188 | 189 | def test_cast_row_too_long_handled(): 190 | schema = Schema(DESCRIPTOR_MAX) 191 | source = ['string', '10.0', '1', 'string', 'string', 'string'] 192 | # superfluous values are left out 193 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 194 | errors = [] 195 | def handler(exc, row_number, row_data, error_data): 196 | errors.append((exc, row_number, row_data, error_data)) 197 | assert schema.cast_row(source, exc_handler=handler) == target 198 | assert len(errors) == 1 199 | # superfluous values are keyed with col num for error reporting 200 | expect_row_data = OrderedDict( 201 | [('id', 'string'), ('height', '10.0'), ('age', '1'), 202 | ('name', 'string'), ('occupation', 'string'), 203 | ('tableschema-cast-error-extra-col-6', 'string')]) 204 | _check_error( 205 | errors[0], expect_exc_class=exceptions.CastError, 206 | expect_exc_str='Row length', expect_row_number=None, 207 | expect_row_data=expect_row_data, expect_error_data=expect_row_data) 208 | 209 | 210 | def test_cast_row_wrong_type_handled(): 211 | schema = Schema(DESCRIPTOR_MAX) 212 | source = ['string', 'notdecimal', '1', 'string', 'string'] 213 | target = ['string', 'notdecimal', 1, 'string', 'string'] 214 | errors = [] 215 | def handler(exc, row_number, row_data, error_data): 216 | errors.append((exc, row_number, row_data, error_data)) 217 | actual = schema.cast_row(source, exc_handler=handler) 
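    # With an exception handler installed, cast_row does not raise; the
    # uncastable cell is returned wrapped in FailedCast (asserted below) so
    # consumers can distinguish it from successfully cast values.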
218 | assert actual == target 219 | assert isinstance(actual[1], FailedCast) 220 | assert len(errors) == 1 221 | expect_row_data = OrderedDict( 222 | [('id', 'string'), ('height', 'notdecimal'), ('age', '1'), 223 | ('name', 'string'), ('occupation', 'string')]) 224 | expect_error_data = OrderedDict([('height', 'notdecimal')]) 225 | _check_error( 226 | errors[0], expect_exc_class=exceptions.CastError, 227 | expect_exc_str='There are 1 cast errors', expect_row_number=None, 228 | expect_row_data=expect_row_data, expect_error_data=expect_error_data) 229 | exc = errors[0][0] 230 | assert len(exc.errors) == 1 231 | 232 | 233 | def test_cast_row_wrong_type_multiple_errors_handled(): 234 | schema = Schema(DESCRIPTOR_MAX) 235 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 236 | target = ['string', 'notdecimal', '10.6', 'string', 'string'] 237 | errors = [] 238 | def handler(exc, row_number, row_data, error_data): 239 | errors.append((exc, row_number, row_data, error_data)) 240 | actual = schema.cast_row(source, exc_handler=handler) 241 | assert actual == target 242 | assert isinstance(actual[1], FailedCast) 243 | assert isinstance(actual[2], FailedCast) 244 | assert len(errors) == 1 245 | expect_row_data = OrderedDict( 246 | [('id', 'string'), ('height', 'notdecimal'), ('age', '10.6'), 247 | ('name', 'string'), ('occupation', 'string')]) 248 | expect_error_data = OrderedDict( 249 | [('height', 'notdecimal'),('age', '10.6')]) 250 | _check_error( 251 | errors[0], expect_exc_class=exceptions.CastError, 252 | expect_exc_str='There are 2 cast errors', expect_row_number=None, 253 | expect_row_data=expect_row_data, expect_error_data=expect_error_data) 254 | exc = errors[0][0] 255 | assert len(exc.errors) == 2 256 | 257 | 258 | def test_fields(): 259 | expect = ['id', 'height'] 260 | actual = [field.name for field in Schema(DESCRIPTOR_MIN).fields] 261 | assert expect == actual 262 | 263 | 264 | def test_get_field(): 265 | schema = Schema(DESCRIPTOR_MIN) 266 | assert schema.get_field('id').name == 'id' 267 | assert schema.get_field('height').name == 'height' 268 | assert schema.get_field('undefined') is None 269 | 270 | 271 | def test_update_field(): 272 | schema = Schema(DESCRIPTOR_MIN) 273 | assert schema.update_field('id', {'type': 'number'}) is True 274 | assert schema.update_field('height', {'type': 'number'}) is True 275 | assert schema.update_field('unknown', {'type': 'number'}) is False 276 | schema.commit() 277 | assert schema.get_field('id').type == 'number' 278 | assert schema.get_field('height').type == 'number' 279 | 280 | 281 | def test_has_field(): 282 | schema = Schema(DESCRIPTOR_MIN) 283 | assert schema.has_field('id') 284 | assert schema.has_field('height') 285 | assert not schema.has_field('undefined') 286 | 287 | 288 | def test_headers(): 289 | assert Schema(DESCRIPTOR_MIN).headers == ['id', 'height'] 290 | 291 | 292 | def test_primary_key(): 293 | assert Schema(DESCRIPTOR_MIN).primary_key == [] 294 | assert Schema(DESCRIPTOR_MAX).primary_key == ['id'] 295 | 296 | 297 | def test_foreign_keys(): 298 | assert Schema(DESCRIPTOR_MIN).foreign_keys == [] 299 | assert Schema(DESCRIPTOR_MAX).foreign_keys == DESCRIPTOR_MAX['foreignKeys'] 300 | 301 | 302 | def test_save(tmpdir, apply_defaults): 303 | path = str(tmpdir.join('schema.json')) 304 | Schema(DESCRIPTOR_MIN).save(path) 305 | with io.open(path, encoding='utf-8') as file: 306 | descriptor = json.load(file) 307 | assert descriptor == apply_defaults(DESCRIPTOR_MIN) 308 | 309 | 310 | def test_infer(): 311 | data = [ 312 | 
['id', 'age', 'name', 'dob'], 313 | ['1','39','Paul','28/1/79'], 314 | ['2','23','Jimmy','13/6/95'], 315 | ['3','36','Jane','17/9/80'], 316 | ['4','N/A','Judy','19/4/83'], 317 | ] 318 | schema = Schema() 319 | schema.infer(data) 320 | assert schema.descriptor == { 321 | 'fields': [ 322 | {'format': 'default', 'name': 'id', 'type': 'integer'}, 323 | {'format': 'default', 'name': 'age', 'type': 'integer'}, 324 | {'format': 'default', 'name': 'name', 'type': 'string'}, 325 | {'format': '%d/%m/%y', 'name': 'dob', 'type': 'date'}, 326 | ], 327 | 'missingValues': ['']} 328 | data = [ 329 | ['id', 'age', 'name'], 330 | ['1','39','Paul'], 331 | ['2','23','Jimmy'], 332 | ['3','36','Jane'], 333 | ['4','N/A','Judy'], 334 | ] 335 | schema = Schema() 336 | schema.infer(data, confidence=0.8) 337 | assert schema.descriptor == { 338 | 'fields': [ 339 | {'format': 'default', 'name': 'id', 'type': 'integer'}, 340 | {'format': 'default', 'name': 'age', 'type': 'string'}, 341 | {'format': 'default', 'name': 'name', 'type': 'string'}], 342 | 'missingValues': ['']} 343 | 344 | class AllStrings(): 345 | def cast(self, value): 346 | return [('string', 'default', 0)] 347 | data = [ 348 | ['id', 'age', 'name'], 349 | ['1','39','Paul'], 350 | ['2','23','Jimmy'], 351 | ['3','36','Jane'], 352 | ['4','100','Judy'], 353 | ] 354 | 355 | schema = Schema() 356 | schema.infer(data, confidence=0.8, guesser_cls=AllStrings) 357 | assert schema.descriptor['fields'] == [ 358 | {'format': 'default', 'name': 'id', 'type': 'string'}, 359 | {'format': 'default', 'name': 'age', 'type': 'string'}, 360 | {'format': 'default', 'name': 'name', 'type': 'string'}] 361 | assert schema.descriptor == { 362 | 'fields': [ 363 | {'format': 'default', 'name': 'id', 'type': 'string'}, 364 | {'format': 'default', 'name': 'age', 'type': 'string'}, 365 | {'format': 'default', 'name': 'name', 'type': 'string'}], 366 | 'missingValues': ['']} 367 | 368 | 369 | def test_add_remove_field(): 370 | schema = Schema() 371 | schema.add_field({'name': 'name'}) 372 | field = schema.remove_field('name') 373 | assert field.name == 'name' 374 | 375 | 376 | def test_primary_foreign_keys_as_array(): 377 | descriptor = { 378 | 'fields': [{'name': 'name'}], 379 | 'primaryKey': ['name'], 380 | 'foreignKeys': [{ 381 | 'fields': ['parent_id'], 382 | 'reference': {'resource': 'resource', 'fields': ['id']} 383 | }] 384 | } 385 | schema = Schema(descriptor) 386 | assert schema.primary_key == ['name'] 387 | assert schema.foreign_keys == [{ 388 | 'fields': ['parent_id'], 389 | 'reference': {'resource': 'resource', 'fields': ['id']} 390 | }] 391 | 392 | 393 | def test_primary_foreign_keys_as_string(): 394 | descriptor = { 395 | 'fields': [{'name': 'name'}], 396 | 'primaryKey': 'name', 397 | 'foreignKeys': [{ 398 | 'fields': 'parent_id', 399 | 'reference': {'resource': 'resource', 'fields': 'id'} 400 | }] 401 | } 402 | schema = Schema(descriptor) 403 | assert schema.primary_key == ['name'] 404 | assert schema.foreign_keys == [{ 405 | 'fields': ['parent_id'], 406 | 'reference': {'resource': 'resource', 'fields': ['id']} 407 | }] 408 | 409 | 410 | def test_fields_have_public_backreference_to_schema(): 411 | schema = Schema('data/schema_valid_full.json') 412 | assert schema.get_field('first_name').schema == schema 413 | assert schema.get_field('last_name').schema == schema 414 | 415 | 416 | # Issues 417 | 418 | 419 | def test_schema_field_date_format_issue_177(): 420 | descriptor = {'fields':[{'name':'myfield', 'type':'date', 'format':'%d/%m/%y'}]} 421 | schema = Schema(descriptor) 
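    # A field with a strptime-style date format ('%d/%m/%y') must produce a valid schema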
422 | assert schema 423 | 424 | 425 | def test_schema_field_time_format_issue_177(): 426 | descriptor = {'fields':[{'name':'myfield', 'type':'time', 'format':'%H:%M:%S'}]} 427 | schema = Schema(descriptor) 428 | assert schema 429 | 430 | 431 | def test_schema_add_remove_field_issue_218(): 432 | descriptor = { 433 | 'fields': [ 434 | {'name': 'test_1', 'type': 'string', 'format': 'default'}, 435 | {'name': 'test_2', 'type': 'string', 'format': 'default'}, 436 | {'name': 'test_3', 'type': 'string', 'format': 'default'}, 437 | ] 438 | } 439 | test_schema = Schema(descriptor) 440 | test_schema.remove_field('test_1') 441 | test_schema.add_field({'name': 'test_4', 'type': 'string', 'format': 'default'}) 442 | 443 | 444 | def test_schema_not_supported_type_issue_goodtables_304(): 445 | schema = Schema({'fields': [ {'name': 'name'}, {'name': 'age', 'type': 'bad'} ]}) 446 | assert schema.valid is False 447 | assert schema.fields[1] is False 448 | 449 | 450 | def test_schema_infer_with_non_headers_issues_goodtables_258(): 451 | schema = Schema() 452 | schema.infer([[1],[2],[3]], headers=[None]) 453 | assert schema.field_names == ['field1'] 454 | -------------------------------------------------------------------------------- /tableschema/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import six 9 | import json 10 | from collections import OrderedDict 11 | from copy import deepcopy 12 | from six.moves import zip_longest 13 | from .profile import Profile 14 | from .field import Field 15 | from . import exceptions 16 | from . import helpers 17 | from . import config 18 | from . import types 19 | 20 | 21 | # Module API 22 | 23 | class Schema(object): 24 | """Schema representation 25 | 26 | # Arguments 27 | descriptor (str/dict): schema descriptor, one of: 28 | - local path 29 | - remote url 30 | - dictionary 31 | strict (bool): flag to specify validation behaviour: 32 | - if false, errors will not be raised but instead collected in `schema.errors` 33 | - if true, validation errors are raised immediately 34 | 35 | # Raises 36 | TableSchemaException: raises any error that occurs during the process 37 | 38 | """ 39 | 40 | # Public 41 | 42 | def __init__(self, descriptor={}, strict=False): 43 | 44 | # Process descriptor 45 | descriptor = helpers.retrieve_descriptor(descriptor) 46 | 47 | # Set attributes 48 | self.__strict = strict 49 | self.__current_descriptor = deepcopy(descriptor) 50 | self.__next_descriptor = deepcopy(descriptor) 51 | self.__profile = Profile('table-schema') 52 | self.__errors = [] 53 | self.__fields = [] 54 | 55 | # Build instance 56 | self.__build() 57 | 58 | @property 59 | def valid(self): 60 | """Validation status 61 | 62 | Always true in strict mode. 63 | 64 | # Returns 65 | bool: validation status 66 | 67 | """ 68 | return not bool(self.__errors) 69 | 70 | @property 71 | def errors(self): 72 | """Validation errors 73 | 74 | Always empty in strict mode. 75 | 76 | # Returns 77 | Exception[]: validation errors 78 | 79 | """ 80 | return self.__errors 81 | 82 | @property 83 | def descriptor(self): 84 | """Schema's descriptor 85 | 86 | # Returns 87 | dict: descriptor 88 | 89 | """ 90 | # Never use self.descriptor inside this class (!!!)
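        # Public reads see the next (possibly uncommitted) descriptor;
        # internal logic works against self.__current_descriptor until
        # commit() applies pending changes.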
91 |         return self.__next_descriptor
92 | 
93 |     @property
94 |     def missing_values(self):
95 |         """Schema's missing values
96 | 
97 |         # Returns
98 |             str[]: missing values
99 | 
100 |         """
101 |         return self.__current_descriptor.get('missingValues', [])
102 | 
103 |     @property
104 |     def primary_key(self):
105 |         """Schema's primary keys
106 | 
107 |         # Returns
108 |             str[]: primary keys
109 | 
110 |         """
111 |         primary_key = self.__current_descriptor.get('primaryKey', [])
112 |         if not isinstance(primary_key, list):
113 |             primary_key = [primary_key]
114 |         return primary_key
115 | 
116 |     @property
117 |     def foreign_keys(self):
118 |         """Schema's foreign keys
119 | 
120 |         # Returns
121 |             dict[]: foreign keys
122 | 
123 |         """
124 |         foreign_keys = self.__current_descriptor.get('foreignKeys', [])
125 |         for key in foreign_keys:
126 |             key.setdefault('fields', [])
127 |             key.setdefault('reference', {})
128 |             key['reference'].setdefault('resource', '')
129 |             key['reference'].setdefault('fields', [])
130 |             if not isinstance(key['fields'], list):
131 |                 key['fields'] = [key['fields']]
132 |             if not isinstance(key['reference']['fields'], list):
133 |                 key['reference']['fields'] = [key['reference']['fields']]
134 |         return foreign_keys
135 | 
136 |     @property
137 |     def fields(self):
138 |         """Schema's fields
139 | 
140 |         # Returns
141 |             Field[]: an array of field instances
142 | 
143 |         """
144 |         return self.__fields
145 | 
146 |     @property
147 |     def field_names(self):
148 |         """Schema's field names
149 | 
150 |         # Returns
151 |             str[]: an array of field names
152 | 
153 |         """
154 |         return [field.name for field in self.fields]
155 | 
156 |     def get_field(self, name):
157 |         """Get schema's field by name.
158 | 
159 |         > Use `schema.update_field` if you want to modify the field descriptor
160 | 
161 |         # Arguments
162 |             name (str): schema field name
163 | 
164 |         # Returns
165 |             Field/None: `Field` instance or `None` if not found
166 | 
167 |         """
168 |         for field in self.fields:
169 |             if field.name == name:
170 |                 return field
171 |         return None
172 | 
173 |     def add_field(self, descriptor):
174 |         """Add new field to schema.
175 | 
176 |         The schema descriptor will be validated with the newly added field descriptor.
177 | 
178 |         # Arguments
179 |             descriptor (dict): field descriptor
180 | 
181 |         # Raises
182 |             TableSchemaException: raises any error that occurs during the process
183 | 
184 |         # Returns
185 |             Field/None: added `Field` instance or `None` if not added
186 | 
187 |         """
188 |         self.__current_descriptor.setdefault('fields', [])
189 |         self.__current_descriptor['fields'].append(descriptor)
190 |         self.__build()
191 |         return self.__fields[-1]
192 | 
193 |     def update_field(self, name, update):
194 |         """Update existing descriptor field by name
195 | 
196 |         # Arguments
197 |             name (str): schema field name
198 |             update (dict): update to apply to field's descriptor
199 | 
200 |         # Returns
201 |             bool: true on success and false if no field is found to be modified
202 | 
203 |         """
204 |         for field in self.__next_descriptor['fields']:
205 |             if field['name'] == name:
206 |                 field.update(update)
207 |                 return True
208 |         return False
209 | 
210 |     def remove_field(self, name):
211 |         """Remove a field by name.
212 | 
213 |         The schema descriptor will be validated after field descriptor removal.
214 | 
215 |         # Arguments
216 |             name (str): schema field name
217 | 
218 |         # Raises
219 |             TableSchemaException: raises any error that occurs during the process
220 | 
221 |         # Returns
222 |             Field/None: removed `Field` instance or `None` if not found
223 | 
224 |         """
225 |         field = self.get_field(name)
226 |         if field:
227 |             predicate = lambda field: field.get('name') != name
228 |             self.__current_descriptor['fields'] = list(filter(
229 |                 predicate, self.__current_descriptor['fields']))
230 |             self.__build()
231 |         return field
232 | 
233 |     def cast_row(self, row, fail_fast=False, row_number=None, exc_handler=None):
234 |         """Cast row based on field types and formats.
235 | 
236 |         # Arguments
237 |             row (any[]): data row as an array of values
238 | 
239 |         # Returns
240 |             any[]: returns cast data row
241 | 
242 |         """
243 |         exc_handler = helpers.default_exc_handler if exc_handler is None else \
244 |             exc_handler
245 | 
246 |         # Prepare
247 |         result = []
248 |         errors = []
249 |         if row_number is not None:
250 |             row_number_info = ' for row "%s"' % row_number
251 |         else:
252 |             row_number_info = ''
253 |         # Check row length
254 |         if len(row) != len(self.fields):
255 |             message = (
256 |                 'Row length %s doesn\'t match fields count %s' +
257 |                 row_number_info) % (len(row), len(self.fields))
258 |             exc = exceptions.CastError(message)
259 |             # Some preparations for error reporting, relevant if custom error
260 |             # handling is in place.
261 |             if len(row) < len(self.fields):
262 |                 # Treat missing col values as None
263 |                 keyed_row = OrderedDict(
264 |                     zip_longest((field.name for field in self.fields), row))
265 |                 # Use added None values for further processing
266 |                 row = list(keyed_row.values())
267 |             else:
268 |                 fields = self.fields
269 |                 keyed_row = OrderedDict(
270 |                     # Use extra column number if value index exceeds fields
271 |                     (fields[i].name if fields[i:]
272 |                      else 'tableschema-cast-error-extra-col-{}'.format(i+1),
273 |                      value)
274 |                     for (i, value) in enumerate(row))
275 |             exc_handler(exc, row_number=row_number, row_data=keyed_row,
276 |                         error_data=keyed_row)
277 | 
278 |         # Cast row
279 |         for field, value in zip(self.fields, row):
280 |             try:
281 |                 result.append(field.cast_value(value))
282 |             except exceptions.CastError as exception:
283 |                 if fail_fast:
284 |                     raise
285 |                 # Wrap original value in a FailedCast object to be able to
286 |                 # further process/yield values and to distinguish uncast
287 |                 # values on the consuming side.
288 |                 result.append(FailedCast(value))
289 |                 errors.append(exception)
290 | 
291 |         # Raise errors
292 |         if errors:
293 |             message = (
294 |                 'There are %s cast errors (see exception.errors)' +
295 |                 row_number_info) % len(errors)
296 |             keyed_row = OrderedDict(zip(self.field_names, row))
297 |             # Add the cast failure-causing fields only to error data.
298 |             # Indexing results with the row field index should be ok at this
299 |             # point due to the previous processing.
300 |             error_data = OrderedDict(
301 |                 (name, value)
302 |                 for (i, (name, value)) in enumerate(keyed_row.items())
303 |                 if isinstance(result[i], FailedCast))
304 |             exc_handler(
305 |                 exceptions.CastError(message, errors=errors),
306 |                 row_number=row_number, row_data=keyed_row,
307 |                 error_data=error_data)
308 | 
309 |         return result
310 | 
311 |     def infer(self, rows, headers=1, confidence=0.75,
312 |               guesser_cls=None, resolver_cls=None):
313 |         """Infer and set `schema.descriptor` based on data sample.
314 | 
315 |         # Arguments
316 |             rows (list[]): array of arrays representing rows.
317 |             headers (int/str[]): data sample headers (one of):
318 |                 - row number containing headers (`rows` should contain headers rows)
319 |                 - array of headers (`rows` should NOT contain headers rows)
320 |             confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)
321 |             guesser_cls (class): you can implement inferring strategies by
322 |                 providing type-guessing and type-resolving classes [experimental]
323 |             resolver_cls (class): you can implement inferring strategies by
324 |                 providing type-guessing and type-resolving classes [experimental]
325 | 
326 |         # Returns
327 |             dict: Table Schema descriptor
328 | 
329 |         """
330 | 
331 |         # Get headers
332 |         if isinstance(headers, int):
333 |             headers_row = headers
334 |             while True:
335 |                 headers_row -= 1
336 |                 headers = rows.pop(0)
337 |                 if not headers_row:
338 |                     break
339 |         elif isinstance(headers, list):
340 |             seen_cells = []
341 |             headers = list(headers)
342 |             for index, cell in enumerate(headers):
343 |                 count = seen_cells.count(cell) + 1
344 |                 headers[index] = '%s%s' % (cell, count) if count > 1 else cell
345 |                 seen_cells.append(cell)
346 |         elif not isinstance(headers, list):
347 |             headers = []
348 | 
349 |         # Get descriptor
350 |         missing_values = self.__current_descriptor.get('missingValues', config.DEFAULT_MISSING_VALUES)
351 |         guesser = guesser_cls() if guesser_cls else _TypeGuesser(missing_values)
352 |         resolver = (resolver_cls or _TypeResolver)()
353 |         descriptor = {'fields': [], 'missingValues': missing_values}
354 |         type_matches = {}
355 |         for number, header in enumerate(headers, start=1):
356 |             descriptor['fields'].append({'name': header or 'field%s' % number})
357 |         for index, row in enumerate(rows):
358 |             # Normalize rows with invalid dimensions for sanity
359 |             row_length = len(row)
360 |             headers_length = len(headers)
361 |             if row_length > headers_length:
362 |                 row = row[:len(headers)]
363 |             if row_length < headers_length:
364 |                 diff = headers_length - row_length
365 |                 fill = [''] * diff
366 |                 row = row + fill
367 |             # build a column-wise lookup of type matches
368 |             for index, value in enumerate(row):
369 |                 rv = guesser.cast(value)
370 |                 if type_matches.get(index):
371 |                     type_matches[index].extend(rv)
372 |                 else:
373 |                     type_matches[index] = list(rv)
374 |         # choose a type/format for each column based on the matches
375 |         for index, results in type_matches.items():
376 |             rv = resolver.get(results, confidence)
377 |             descriptor['fields'][index].update(**rv)
378 | 
379 |         # Save descriptor
380 |         self.__current_descriptor = descriptor
381 |         self.__build()
382 | 
383 |         return descriptor
384 | 
385 |     def commit(self, strict=None):
386 |         """Update schema instance if there are in-place changes in the descriptor.
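        Edits made through `schema.descriptor` or `update_field` are staged in
        the next descriptor and only take effect after this method is called.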
387 | 
388 |         # Example
389 | 
390 |         ```python
391 |         from tableschema import Schema
392 |         descriptor = {'fields': [{'name': 'my_field', 'title': 'My Field', 'type': 'string'}]}
393 |         schema = Schema(descriptor)
394 |         print(schema.get_field('my_field').descriptor['type']) # string
395 | 
396 |         # Update descriptor by field position
397 |         schema.descriptor['fields'][0]['type'] = 'number'
398 |         # Update descriptor by field name
399 |         schema.update_field('my_field', {'title': 'My Pretty Field'}) # True
400 | 
401 |         # Changes are not committed yet
402 |         print(schema.get_field('my_field').descriptor['type']) # string
403 |         print(schema.get_field('my_field').descriptor['title']) # My Field
404 | 
405 |         # Commit the changes
406 |         schema.commit()
407 |         print(schema.get_field('my_field').descriptor['type']) # number
408 |         print(schema.get_field('my_field').descriptor['title']) # My Pretty Field
409 | 
410 |         ```
411 | 
412 |         # Arguments
413 |             strict (bool): alter `strict` mode for further work
414 | 
415 |         # Raises
416 |             TableSchemaException: raises any error that occurs during the process
417 | 
418 |         # Returns
419 |             bool: true on success and false if not modified
420 | 
421 |         """
422 |         if strict is not None:
423 |             self.__strict = strict
424 |         elif self.__current_descriptor == self.__next_descriptor:
425 |             return False
426 |         self.__current_descriptor = deepcopy(self.__next_descriptor)
427 |         self.__build()
428 |         return True
429 | 
430 |     def save(self, target, ensure_ascii=True):
431 |         """Save schema descriptor to target destination.
432 | 
433 |         # Arguments
434 |             target (str): path where to save a descriptor
435 |             ensure_ascii (bool): passed through to `json.dump` (default `True`)
436 |         # Raises
437 |             TableSchemaException: raises any error that occurs during the process
438 | 
439 |         # Returns
440 |             bool: true on success
441 | 
442 |         """
443 |         mode = 'w'
444 |         encoding = 'utf-8'
445 |         if six.PY2:
446 |             mode = 'wb'
447 |             encoding = None
448 |         helpers.ensure_dir(target)
449 |         with io.open(target, mode=mode, encoding=encoding) as file:
450 |             json.dump(self.__current_descriptor, file, indent=4, ensure_ascii=ensure_ascii)
451 | 
452 |     # Internal
453 | 
454 |     def __build(self):
455 | 
456 |         # Process descriptor
457 |         expand = helpers.expand_schema_descriptor
458 |         self.__current_descriptor = expand(self.__current_descriptor)
459 |         self.__next_descriptor = deepcopy(self.__current_descriptor)
460 | 
461 |         # Validate descriptor
462 |         try:
463 |             self.__profile.validate(self.__current_descriptor)
464 |             self.__errors = []
465 |         except exceptions.ValidationError as exception:
466 |             self.__errors = exception.errors
467 |             if self.__strict:
468 |                 raise exception
469 | 
470 |         # Populate fields
471 |         self.__fields = []
472 |         for field in self.__current_descriptor.get('fields', []):
473 |             missing_values = self.__current_descriptor['missingValues']
474 |             try:
475 |                 field = Field(field, missing_values=missing_values, schema=self)
476 |             except exceptions.TableSchemaException as e:
477 |                 if self.__strict:
478 |                     raise e
479 |                 else:
480 |                     field = False
481 |             self.__fields.append(field)
482 | 
483 |     # Deprecated
484 | 
485 |     headers = field_names
486 |     has_field = get_field
487 | 
488 | 
489 | class FailedCast(object):
490 |     """Wrap an original data field value that failed to be properly cast.
491 | 
492 |     FailedCast allows values to be further processed/yielded while still
493 |     being distinguishable as failed casts on the consuming side.
494 | 
495 |     Delegates attribute access and the basic rich comparison methods to the
496 |     underlying object. Supports the default hashability of user-defined classes,
497 |     i.e. it is hashable based on object identity (not based on the wrapped value).
498 | 
499 |     # Arguments
500 |         value (any): value
501 | 
502 |     """
503 | 
504 |     # Make this "reasonably immutable": don't support setting other attributes,
505 |     # don't support re-setting `value`
506 |     __slots__ = ('_value',)
507 | 
508 |     def __init__(self, value):
509 |         self._value = value
510 | 
511 |     @property
512 |     def value(self):
513 |         return self._value
514 | 
515 |     def __repr__(self):
516 |         return 'FailedCast(%r)' % self._value
517 | 
518 |     def __getattr__(self, name):
519 |         return getattr(self._value, name)
520 | 
521 |     def __lt__(self, other):
522 |         return self._value < other
523 | 
524 |     def __le__(self, other):
525 |         return self._value <= other
526 | 
527 |     def __eq__(self, other):
528 |         return self._value == other
529 | 
530 |     def __ne__(self, other):
531 |         return self._value != other
532 | 
533 |     def __gt__(self, other):
534 |         return self._value > other
535 | 
536 |     def __ge__(self, other):
537 |         return self._value >= other
538 | 
539 |     def __hash__(self):
540 |         return object.__hash__(self)
541 | 
542 | 
543 | # Internal
544 | _INFER_DATE_FORMATS = [
545 |     '%Y-%m-%d',
546 |     '%d/%m/%Y',
547 |     '%m/%d/%Y',
548 |     '%d/%m/%y',
549 |     '%m/%d/%y',
550 |     '%Y%m%d',
551 |     '%d-%m-%y',
552 |     '%Y/%m/%d',
553 |     '%d.%m.%Y',
554 |     '%d.%m.%y',
555 | ]
556 | 
557 | 
558 | _INFER_TYPE_ORDER = [
559 |     'duration',
560 |     'geojson',
561 |     'geopoint',
562 |     'object',
563 |     'array',
564 |     'datetime',
565 |     'time',
566 |     ('date', _INFER_DATE_FORMATS),
567 |     'integer',
568 |     'number',
569 |     'boolean',
570 |     'string',
571 |     'any',
572 | ]
573 | 
574 | 
575 | class _TypeGuesser(object):
576 |     """Guess possible types for a value, yielding ('type', 'format', priority) tuples.
577 |     """
578 | 
579 |     # Public
580 | 
581 |     def __init__(self, missing_values):
582 |         self.missing_values = missing_values
583 | 
584 |     def cast(self, value):
585 |         for priority, type_rec in enumerate(_INFER_TYPE_ORDER):
586 |             if isinstance(type_rec, tuple):
587 |                 name, formats = type_rec
588 |             else:
589 |                 name, formats = type_rec, ['default']
590 |             cast = getattr(types, 'cast_%s' % name)
591 |             if value not in self.missing_values:
592 |                 for format in formats:
593 |                     result = cast(format, value)
594 |                     if result != config.ERROR:
595 |                         yield (name, format, priority)
596 | 
597 | 
598 | class _TypeResolver(object):
599 |     """Get the best matching type/format from a list of possible ones.
600 |     """
601 | 
602 |     # Public
603 | 
604 |     def get(self, results, confidence):
605 |         variants = set(results)
606 |         # only one candidate... that's easy.
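        # otherwise: count each (type, format, priority) match, keep every
        # candidate whose count is within `confidence` of the best count,
        # and among those pick the most specific type (lowest priority number)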
607 |         if len(variants) == 1:
608 |             rv = {'type': results[0][0], 'format': results[0][1]}
609 |         else:
610 |             counts = {}
611 |             for result in results:
612 |                 if counts.get(result):
613 |                     counts[result] += 1
614 |                 else:
615 |                     counts[result] = 1
616 |             # tuple representation of `counts` dict sorted by values
617 |             sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)
618 |             if not sorted_counts:
619 |                 return {'type': 'string', 'format': 'default'}
620 |             # Allow also counts that are not the max, based on the confidence
621 |             max_count = sorted_counts[0][1]
622 |             sorted_counts = filter(lambda item: item[1] >= max_count * confidence,
623 |                                    sorted_counts)
624 |             # Choose the most specific data type
625 |             sorted_counts = sorted(sorted_counts,
626 |                                    key=lambda item: item[0][2])
627 |             rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]}
628 |         return rv
629 | 
--------------------------------------------------------------------------------
/tableschema/table.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 | 
7 | from copy import copy
8 | from tabulator import Stream
9 | from functools import partial
10 | from collections import OrderedDict
11 | from .storage import Storage
12 | from .schema import Schema
13 | from . import exceptions
14 | from . import helpers
15 | from . import config
16 | from collections import defaultdict
17 | 
18 | 
19 | # Module API
20 | 
21 | class Table(object):
22 |     """Table representation
23 | 
24 |     # Arguments
25 |         source (str/list[]): data source, one of:
26 |             - local file (path)
27 |             - remote file (url)
28 |             - array of arrays representing the rows
29 |         schema (any): data schema in all forms supported by `Schema` class
30 |         strict (bool): strictness option to pass to `Schema` constructor
31 |         post_cast (function[]): list of post cast processors
32 |         storage (None/str): storage name like `sql` or `bigquery`
33 |         options (dict): `tabulator` or storage's options
34 | 
35 |     # Raises
36 |         TableSchemaException: raises on any error
37 | 
38 |     """
39 | 
40 |     # Public
41 | 
42 |     def __init__(self, source, schema=None, strict=False,
43 |                  post_cast=[], storage=None, **options):
44 | 
45 |         # Set attributes
46 |         self.__source = source
47 |         self.__stream = None
48 |         self.__schema = None
49 |         self.__headers = None
50 |         self.__storage = None
51 |         self.__post_cast = copy(post_cast)
52 | 
53 |         # Schema
54 |         if isinstance(schema, Schema):
55 |             self.__schema = schema
56 |         elif schema is not None:
57 |             self.__schema = Schema(schema)
58 | 
59 |         # Stream (tabulator)
60 |         if storage is None:
61 |             options.setdefault('headers', 1)
62 |             self.__stream = Stream(source, **options)
63 | 
64 |         # Stream (storage)
65 |         else:
66 |             if not isinstance(storage, Storage):
67 |                 storage = Storage.connect(storage, **options)
68 |             if self.__schema:
69 |                 storage.describe(source, self.__schema.descriptor)
70 |             headers = Schema(storage.describe(source)).field_names
71 |             self.__stream = Stream(partial(storage.iter, source), headers=headers)
72 |             self.__storage = storage
73 | 
74 |     @property
75 |     def headers(self):
76 |         """Table's headers, if available
77 | 
78 |         # Returns
79 |             str[]: headers
80 | 
81 |         """
82 |         return self.__headers
83 | 
84 |     @property
85 |     def schema(self):
86 |         """Returns schema class instance if available
87 | 
88 |         # Returns
89 |             Schema: schema
90 | 
91 |         """
""" 92 | return self.__schema 93 | 94 | @property 95 | def size(self): 96 | """Table's size in BYTES if it's available 97 | 98 | If it's already read using e.g. `table.read`, otherwise returns `None`. 99 | In the middle of an iteration it returns size of already read contents 100 | 101 | # Returns 102 | int/None: size in BYTES 103 | 104 | """ 105 | if self.__stream: 106 | return self.__stream.size 107 | 108 | @property 109 | def hash(self): 110 | """Table's SHA256 hash if it's available. 111 | 112 | If it's already read using e.g. `table.read`, otherwise returns `None`. 113 | In the middle of an iteration it returns hash of already read contents 114 | 115 | # Returns 116 | str/None: SHA256 hash 117 | 118 | """ 119 | if self.__stream: 120 | return self.__stream.hash 121 | 122 | def iter(self, keyed=False, extended=False, cast=True, 123 | integrity=False, relations=False, 124 | foreign_keys_values=False, exc_handler=None): 125 | """Iterates through the table data and emits rows cast based on table schema. 126 | 127 | # Arguments 128 | 129 | keyed (bool): 130 | yield keyed rows in a form of `{header1\\: value1, header2\\: value2}` 131 | (default is false; the form of rows is `[value1, value2]`) 132 | 133 | extended (bool): 134 | yield extended rows in a for of `[rowNumber, [header1, header2], [value1, value2]]` 135 | (default is false; the form of rows is `[value1, value2]`) 136 | 137 | cast (bool): 138 | disable data casting if false 139 | (default is true) 140 | 141 | integrity (dict): 142 | dictionary in a form of `{'size'\\: , 'hash'\\: ''}` 143 | to check integrity of the table when it's read completely. 144 | Both keys are optional. 145 | 146 | relations (dict): 147 | dictionary of foreign key references in a form 148 | of `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`. 149 | If provided, foreign key fields will checked and resolved 150 | to one of their references (/!\\ one-to-many fk are not completely resolved). 151 | 152 | foreign_keys_values (dict): 153 | three-level dictionary of foreign key references optimized 154 | to speed up validation process in a form of 155 | `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`. 156 | If not provided but relations is true, it will be created 157 | before the validation process by *index_foreign_keys_values* method 158 | 159 | exc_handler (func): 160 | optional custom exception handler callable. 161 | Can be used to defer raising errors (i.e. "fail late"), e.g. 162 | for data validation purposes. Must support the signature below 163 | 164 | # Custom exception handler 165 | 166 | ```python 167 | def exc_handler(exc, row_number=None, row_data=None, error_data=None): 168 | '''Custom exception handler (example) 169 | 170 | # Arguments: 171 | exc(Exception): 172 | Deferred exception instance 173 | row_number(int): 174 | Data row number that triggers exception exc 175 | row_data(OrderedDict): 176 | Invalid data row source data 177 | error_data(OrderedDict): 178 | Data row source data field subset responsible for the error, if 179 | applicable (e.g. invalid primary or foreign key fields). May be 180 | identical to row_data. 181 | ''' 182 | # ... 
183 | ``` 184 | 185 | # Raises 186 | TableSchemaException: base class of any error 187 | CastError: data cast error 188 | IntegrityError: integrity checking error 189 | UniqueKeyError: unique key constraint violation 190 | UnresolvedFKError: unresolved foreign key reference error 191 | 192 | # Returns 193 | Iterator[list]: yields rows 194 | 195 | """ 196 | exc_handler = helpers.default_exc_handler if exc_handler is None else \ 197 | exc_handler 198 | 199 | # Prepare unique checks 200 | if cast: 201 | unique_fields_cache = {} 202 | if self.schema: 203 | unique_fields_cache = _create_unique_fields_cache(self.schema) 204 | # Prepare relation checks 205 | if relations and not foreign_keys_values: 206 | # we have to test relations but the index has not been precomputed 207 | # prepare the index to boost validation process 208 | foreign_keys_values = self.index_foreign_keys_values(relations) 209 | 210 | # Open/iterate stream 211 | with self.__stream as stream: 212 | iterator = stream.iter(extended=True) 213 | iterator = self.__apply_processors( 214 | iterator, cast=cast, exc_handler=exc_handler) 215 | for row_number, headers, row in iterator: 216 | 217 | # Get headers 218 | if not self.__headers: 219 | self.__headers = headers 220 | 221 | # Check headers 222 | if cast: 223 | if self.schema and self.headers: 224 | if self.headers != self.schema.field_names: 225 | message = ( 226 | 'Table headers (%r) don\'t match ' 227 | 'schema field names (%r) in row %s' % ( 228 | self.headers, self.schema.field_names, 229 | row_number)) 230 | keyed_row = OrderedDict(zip(headers, row)) 231 | exc_handler( 232 | exceptions.CastError(message), 233 | row_number=row_number, row_data=keyed_row, 234 | error_data=keyed_row) 235 | continue 236 | 237 | # Check unique 238 | if cast: 239 | for indexes, cache in unique_fields_cache.items(): 240 | keyed_values = OrderedDict( 241 | (headers[i], value) 242 | for i, value in enumerate(row) if i in indexes) 243 | values = tuple(keyed_values.values()) 244 | if not all(map(lambda value: value is None, values)): 245 | if values in cache['data']: 246 | message = ( 247 | 'Field(s) "%s" duplicates in row "%s" ' 248 | 'for values %r' % ( 249 | cache['name'], row_number, values)) 250 | exc_handler( 251 | exceptions.UniqueKeyError(message), 252 | row_number=row_number, 253 | row_data=OrderedDict(zip(headers, row)), 254 | error_data=keyed_values) 255 | cache['data'].add(values) 256 | 257 | # Resolve relations 258 | if relations: 259 | if self.schema: 260 | row_with_relations = dict(zip(headers, copy(row))) 261 | for foreign_key in self.schema.foreign_keys: 262 | refValue = _resolve_relations(row, headers, foreign_keys_values, 263 | foreign_key) 264 | if refValue is None: 265 | keyed_row = OrderedDict(zip(headers, row)) 266 | # local values of the FK 267 | local_keyed_values = { 268 | key: keyed_row[key] 269 | for key in foreign_key['fields'] 270 | } 271 | local_values = tuple(local_keyed_values.values()) 272 | message = ( 273 | 'Foreign key "%s" violation in row "%s": ' 274 | '%s not found in %s' % ( 275 | foreign_key['fields'], 276 | row_number, 277 | local_values, 278 | foreign_key['reference']['resource'])) 279 | exc_handler( 280 | exceptions.UnresolvedFKError(message), 281 | row_number=row_number, row_data=keyed_row, 282 | error_data=local_keyed_values) 283 | # If we reach this point we don't fail-early 284 | # i.e. no exception has been raised. As the 285 | # reference can't be resolved, use empty dict 286 | # as the "unresolved result". 
288 |                                 for field in foreign_key['fields']:
289 |                                     if not isinstance(
290 |                                             row_with_relations[field], dict):
291 |                                         row_with_relations[field] = {}
292 |                             elif type(refValue) is dict:
293 |                                 # Substitute resolved referenced object for
294 |                                 # original referencing field value.
295 |                                 # For a composite foreign key, this substitutes
296 |                                 # each part of the composite key with the
297 |                                 # referenced object.
298 |                                 for field in foreign_key['fields']:
299 |                                     if type(row_with_relations[field]) is not dict:
300 |                                         # no previous refValues injected on this field
301 |                                         row_with_relations[field] = refValue
302 |                                     else:
303 |                                         # already one ref injected: merge them
304 |                                         row_with_relations[field].update(refValue)
305 |                             else:
306 |                                 # case when all original values of the FK are empty:
307 |                                 # refValue == row, there is nothing to do
308 |                                 # (an empty dict might be a better return value here?)
309 |                                 pass
310 | 
311 |                         # mutate row now that we are done, in the right order
312 |                         row = [row_with_relations[f] for f in headers]
313 | 
314 |                 # Form row
315 |                 if extended:
316 |                     yield (row_number, headers, row)
317 |                 elif keyed:
318 |                     yield dict(zip(headers, row))
319 |                 else:
320 |                     yield row
321 | 
322 |             # Check integrity
323 |             if integrity:
324 |                 violations = []
325 |                 size = integrity.get('size')
326 |                 hash = integrity.get('hash')
327 |                 if size and size != self.__stream.size:
328 |                     violations.append('size "%s"' % self.__stream.size)
329 |                 if hash and hash != self.__stream.hash:
330 |                     violations.append('hash "%s"' % self.__stream.hash)
331 |                 if violations:
332 |                     message = 'Calculated %s differ(s) from declared value(s)'
333 |                     raise exceptions.IntegrityError(message % ' and '.join(violations))
334 | 
335 |     def read(self, keyed=False, extended=False, cast=True, limit=None,
336 |              integrity=False, relations=False, foreign_keys_values=False,
337 |              exc_handler=None):
338 |         """Read the whole table and return as array of rows
339 | 
340 |         > It has the same API as `table.iter` except for:
341 | 
342 |         # Arguments
343 |             limit (int): limit count of rows to read and return
344 | 
345 |         # Returns
346 |             list[]: returns rows
347 | 
348 |         """
349 |         result = []
350 |         rows = self.iter(
351 |             keyed=keyed, extended=extended, cast=cast, integrity=integrity,
352 |             relations=relations, foreign_keys_values=foreign_keys_values,
353 |             exc_handler=exc_handler)
354 |         for count, row in enumerate(rows, start=1):
355 |             result.append(row)
356 |             if count == limit:
357 |                 break
358 |         return result
359 | 
360 |     def infer(self, limit=100, confidence=0.75,
361 |               missing_values=config.DEFAULT_MISSING_VALUES,
362 |               guesser_cls=None, resolver_cls=None):
363 |         """Infer a schema for the table.
364 | 
365 |         It will infer and set Table Schema to `table.schema` based on table data.
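        Unlike `Schema.infer`, the data sample comes from the underlying stream
        (or from the storage, when the table is storage-backed). An already set
        schema or headers are kept as is.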
366 | 
367 |         # Arguments
368 |             limit (int): limit rows sample size
369 |             confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)
370 |             missing_values (str[]): list of missing values (by default `['']`)
371 |             guesser_cls (class): you can implement inferring strategies by
372 |                 providing type-guessing and type-resolving classes [experimental]
373 |             resolver_cls (class): you can implement inferring strategies by
374 |                 providing type-guessing and type-resolving classes [experimental]
375 | 
376 |         # Returns
377 |             dict: Table Schema descriptor
378 | 
379 |         """
380 |         if self.__schema is None or self.__headers is None:
381 | 
382 |             # Infer (tabulator)
383 |             if not self.__storage:
384 |                 with self.__stream as stream:
385 |                     if self.__schema is None:
386 |                         self.__schema = Schema({'missingValues': missing_values})
387 |                     self.__schema.infer(stream.sample[:limit],
388 |                                         headers=stream.headers,
389 |                                         confidence=confidence,
390 |                                         guesser_cls=guesser_cls,
391 |                                         resolver_cls=resolver_cls)
392 |                     if self.__headers is None:
393 |                         self.__headers = stream.headers
394 | 
395 |             # Infer (storage)
396 |             else:
397 |                 descriptor = self.__storage.describe(self.__source)
398 |                 if self.__schema is None:
399 |                     self.__schema = Schema(descriptor)
400 |                 if self.__headers is None:
401 |                     self.__headers = self.__schema.field_names
402 | 
403 |         return self.__schema.descriptor
404 | 
405 |     def save(self, target, storage=None, **options):
406 |         """Save data source to file locally in CSV format with `,` (comma) delimiter
407 | 
408 |         > To save schema use `table.schema.save()`
409 | 
410 |         # Arguments
411 |             target (str): saving target (e.g. file path)
412 |             storage (None/str): storage name like `sql` or `bigquery`
413 |             options (dict): `tabulator` or storage options
414 | 
415 |         # Raises
416 |             TableSchemaException: raises an error if there is a saving problem
417 | 
418 |         # Returns
419 |             True/Storage: returns true or a storage instance
420 | 
421 |         """
422 | 
423 |         # Save (tabulator)
424 |         if storage is None:
425 |             with Stream(self.iter, headers=self.__schema.headers) as stream:
426 |                 stream.save(target, **options)
427 |             return True
428 | 
429 |         # Save (storage)
430 |         else:
431 |             if not isinstance(storage, Storage):
432 |                 storage = Storage.connect(storage, **options)
433 |             storage.create(target, self.__schema.descriptor, force=True)
434 |             storage.write(target, self.iter(cast=False))
435 |             return storage
436 | 
437 |     def index_foreign_keys_values(self, relations):
438 |         """Creates a three-level dictionary of foreign key references
439 | 
440 |         We create it optimized to speed up the validation process, in a form of
441 |         `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.
442 | 
443 |         For each foreign key of the schema it will iterate through the corresponding
444 |         `relations['resource']` to create an index (i.e. a dict) of existing values
445 |         for the foreign fields and store one keyed row for each value combination.
446 | 
447 |         The optimization relies on the indexation of possible values for one foreign key
448 |         in a hashmap to later speed up resolution.
449 | 
450 |         This method is public to allow creating the index once and applying it
451 |         to multiple tables sharing the same schema
452 |         (typically [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))
453 | 
454 |         # Notes
455 | 
456 |         - the second key of the output is a tuple of the foreign fields,
457 |           a proxy identifier of the foreign key
458 |         - the same relation resource can be indexed multiple times,
459 |           as a schema can contain more than one foreign key
460 |           pointing to the same resource
461 | 
462 |         # Arguments
463 |             relations (dict):
464 |                 dict of foreign key references in a form of
465 |                 `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
466 |                 It must contain all resources pointed to in the foreign keys schema definition.
467 | 
468 |         # Returns
469 |             dict:
470 |                 returns a three-level dictionary of foreign key references
471 |                 optimized to speed up the validation process, in a form of
472 |                 `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`
473 | 
474 |         """
475 | 
476 |         # we don't need to load the complete reference table to test relations:
477 |         # we can lower the payload AND optimize testing foreign keys
478 |         # by preparing the right index based on the foreign key definition
479 |         # foreign_keys are sets of tuples of all possible values in the foreign table
480 |         # foreign keys =
481 |         #   [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
482 |         foreign_keys = defaultdict(dict)
483 |         if self.schema:
484 |             for fk in self.schema.foreign_keys:
485 |                 # load relation data
486 |                 relation = fk['reference']['resource']
487 | 
488 |                 # create a set of foreign keys
489 |                 # to optimize, we prepare an index of existing values
490 |                 # this index should use reference + foreign_keys as key,
491 |                 # because many foreign keys may use the same reference
492 |                 foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
493 |                 for row in relations[relation]:
494 |                     key = tuple([row[foreign_field] for foreign_field in fk['reference']['fields']])
495 |                     # here we could choose to pick the first or the nth matching row;
496 |                     # the previous implementation picked the first, so be it
497 |                     if key not in foreign_keys[relation][tuple(fk['reference']['fields'])]:
498 |                         foreign_keys[relation][tuple(fk['reference']['fields'])][key] = row
499 |         return foreign_keys
500 | 
501 |     # Private
502 | 
503 |     def __apply_processors(self, iterator, cast=True, exc_handler=None):
504 | 
505 |         # Apply processors to iterator
506 |         def builtin_processor(extended_rows):
507 |             for row_number, headers, row in extended_rows:
508 |                 if self.__schema and cast:
509 |                     row = self.__schema.cast_row(
510 |                         row, row_number=row_number, exc_handler=exc_handler)
511 |                 yield (row_number, headers, row)
512 |         processors = [builtin_processor] + self.__post_cast
513 |         for processor in processors:
514 |             iterator = processor(iterator)
515 | 
516 |         return iterator
517 | 
518 | 
519 | # Internal
520 | 
521 | def _create_unique_fields_cache(schema):
522 |     primary_key_indexes = []
523 |     cache = {}
524 | 
525 |     # Unique
526 |     for index, field in enumerate(schema.fields):
527 |         if field.name in schema.primary_key:
528 |             primary_key_indexes.append(index)
529 |         if field.constraints.get('unique'):
530 |             cache[tuple([index])] = {
531 |                 'name': field.name,
532 |                 'data': set(),
533 |             }
534 | 
535 |     # Primary key
536 |     if primary_key_indexes:
537 |         cache[tuple(primary_key_indexes)] = {
538 |             'name': ', '.join(schema.primary_key),
539 |             'data': set(),
540 |         }
541 | 
542 |     return cache
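# A minimal usage sketch for the index built by `index_foreign_keys_values`
# (hypothetical file and resource names, not fixtures from this repository;
# `visits_schema.json` is assumed to declare a foreign key such as
# `{'fields': ['person_id'], 'reference': {'resource': 'people', 'fields': ['id']}}`):
#
#     from tableschema import Table
#
#     relations = {
#         'people': [
#             {'id': '1', 'name': 'Paul'},
#             {'id': '2', 'name': 'Jane'},
#         ],
#     }
#     table = Table('visits.csv', schema='visits_schema.json')
#     index = table.index_foreign_keys_values(relations)
#     # index == {'people': {('id',): {('1',): {'id': '1', 'name': 'Paul'},
#     #                                ('2',): {'id': '2', 'name': 'Jane'}}}}
#     for row in table.iter(relations=relations, foreign_keys_values=index):
#         pass  # FK fields arrive resolved to the referenced keyed rows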
543 | 
544 | 
545 | def _resolve_relations(row, headers, foreign_keys_values, foreign_key):
546 | 
547 |     # Prepare helpers - needed data structures
548 |     keyed_row = OrderedDict(zip(headers, row))
549 |     # local values of the FK
550 |     local_values = tuple(keyed_row[f] for f in foreign_key['fields'])
551 |     if set(local_values) != {None}:
552 |         # test existence in the foreign resource
553 |         relation = foreign_key['reference']['resource']
554 |         keys = tuple(foreign_key['reference']['fields'])
555 |         foreign_values = foreign_keys_values[relation][keys]
556 |         if local_values in foreign_values:
557 |             return foreign_values[local_values]
558 |         else:
559 |             return None
560 |     else:
561 |         # empty values for all keys, return the original row
562 |         return row
--------------------------------------------------------------------------------