├── LEAD.md ├── data ├── empty.csv ├── schema_invalid_empty.json ├── schema_invalid_wrong_type.json ├── data_missing_cols.csv ├── data_headers_field_names_mismatch.csv ├── data_invalid_extra_cols.csv ├── schema_invalid_pk_is_wrong_type.json ├── data_unique_primary_key_violation.csv ├── schema_invalid_pk_no_fields.json ├── data.csv ├── data_infer.csv ├── data_invalid_col_value.csv ├── data.csv.zip ├── storage.png ├── data_infer_boolean.xlsx ├── data_infer_iso-8859-7.csv ├── data_infer_utf8.csv ├── data_infer_missing_values.csv ├── schema_invalid_fk_no_reference.json ├── data_infer_row_limit.csv ├── schema_valid_simple.json ├── schema_invalid_pk_string.json ├── schema_valid_pk_array.json ├── schema_invalid_pk_array.json ├── schema_invalid_fk_string.json ├── schema_invalid_fk_array.json ├── schema_invalid_multiple_errors.json ├── schema_invalid_fk_array_string.json ├── schema_invalid_fk_array_string_ref.json ├── schema_valid_fk_array.json ├── schema_invalid_fk_string_array_ref.json ├── schema_invalid_fk_array_wrong_number.json ├── data_infer_increase_limit.csv └── schema_valid_full.json ├── tests ├── __init__.py ├── types │ ├── __init__.py │ ├── test_any.py │ ├── test_year.py │ ├── test_object.py │ ├── test_yearmonth.py │ ├── test_array.py │ ├── test_integer.py │ ├── test_string.py │ ├── test_duration.py │ ├── test_boolean.py │ ├── test_geopoint.py │ ├── test_time.py │ ├── test_date.py │ ├── test_datetime.py │ ├── test_geojson.py │ └── test_number.py ├── constraints │ ├── __init__.py │ ├── test_unique.py │ ├── test_enum.py │ ├── test_maximum.py │ ├── test_minimum.py │ ├── test_maxLength.py │ ├── test_minLength.py │ ├── test_required.py │ └── test_pattern.py ├── test_exceptions.py ├── test_profile.py ├── conftest.py ├── test_helpers.py ├── test_cli.py ├── test_infer.py ├── test_validate.py ├── test_schema_constraint_field_type.py ├── test_field.py └── test_schema.py ├── examples ├── __init__.py ├── table_validate.py ├── table_pandas.py ├── table_infer.py └── table_sql.py ├── tableschema ├── VERSION ├── __main__.py ├── types │ ├── any.py │ ├── duration.py │ ├── object.py │ ├── year.py │ ├── array.py │ ├── __init__.py │ ├── boolean.py │ ├── geojson.py │ ├── yearmonth.py │ ├── integer.py │ ├── datetime.py │ ├── time.py │ ├── string.py │ ├── date.py │ ├── geopoint.py │ └── number.py ├── constraints │ ├── unique.py │ ├── required.py │ ├── enum.py │ ├── maxLength.py │ ├── minLength.py │ ├── __init__.py │ ├── maximum.py │ ├── minimum.py │ └── pattern.py ├── plugins │ └── __init__.py ├── config.py ├── validate.py ├── __init__.py ├── infer.py ├── cli.py ├── exceptions.py ├── helpers.py ├── storage.py ├── field.py ├── profile.py ├── profiles │ └── geojson.json ├── schema.py └── table.py ├── setup.cfg ├── pytest.ini ├── MANIFEST.in ├── .github ├── pull_request_template.md ├── issue_template.md ├── stale.yml └── workflows │ └── general.yml ├── pylama.ini ├── LICENSE.md ├── .gitignore ├── Makefile └── setup.py /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /data/empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tableschema/VERSION: -------------------------------------------------------------------------------- 1 | 1.21.0 -------------------------------------------------------------------------------- /tests/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/schema_invalid_empty.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /data/schema_invalid_wrong_type.json: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /data/data_missing_cols.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | one 3 | two,2 4 | -------------------------------------------------------------------------------- /data/data_headers_field_names_mismatch.csv: -------------------------------------------------------------------------------- 1 | id,bad,name 2 | 1,39,Paul 3 | -------------------------------------------------------------------------------- /data/data_invalid_extra_cols.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | one,1,unexpected 3 | two,2 4 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_is_wrong_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": 1 3 | } 4 | -------------------------------------------------------------------------------- /data/data_unique_primary_key_violation.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 1,36,Jane 4 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_no_fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": ["id", "title"] 3 | } 4 | -------------------------------------------------------------------------------- /data/data.csv: -------------------------------------------------------------------------------- 1 | city,location 2 | london,"51.50,-0.11" 3 | paris,"48.85,2.30" 4 | rome,N/A 5 | -------------------------------------------------------------------------------- /data/data_infer.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | 
-------------------------------------------------------------------------------- /data/data_invalid_col_value.csv: -------------------------------------------------------------------------------- 1 | key,value 2 | zero,0 3 | one,not_an_integer 4 | two,2 5 | -------------------------------------------------------------------------------- /data/data.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data.csv.zip -------------------------------------------------------------------------------- /data/storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/storage.png -------------------------------------------------------------------------------- /data/data_infer_boolean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data_infer_boolean.xlsx -------------------------------------------------------------------------------- /data/data_infer_iso-8859-7.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/tableschema-py/HEAD/data/data_infer_iso-8859-7.csv -------------------------------------------------------------------------------- /data/data_infer_utf8.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | 5,37,Iñtërnâtiônàlizætiøn 7 | -------------------------------------------------------------------------------- /tableschema/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | 4 | # Module API 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /data/data_infer_missing_values.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | -,25,Test 4 | 2,23,Jimmy 5 | -,25,Test 6 | 3,36,Jane 7 | -,25,Test 8 | 4,28,Judy 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylama.ini 5 | include pytest.ini 6 | include README.md 7 | include tox.ini 8 | 9 | global-include *.json 10 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_no_reference.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "state" 5 | } 6 | ], 7 | "foreignKeys": [ 8 | { 9 | "fields": "state" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. 
Make sure that tests pass before publishing it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug, share as much as possible to reproduce it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /data/data_infer_row_limit.csv: -------------------------------------------------------------------------------- 1 | id,age,name 2 | 1,39,Paul 3 | 2,23,Jimmy 4 | 3,36,Jane 5 | 4,28,Judy 6 | qwerty,nineteen,Rose 7 | werty,nineteen,Red 8 | erty,nineteen,Rotem 9 | rty,nineteen,Ruth 10 | ty,nineteen,Amber 11 | y,nineteen,Angel 12 | _,nineteen,Angie 13 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E128,E301,E305,E731,C901 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:mccabe] 9 | complexity = 36 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | 14 | [pylama:*/compat.py] 15 | ignore = W0611,E0602 16 | -------------------------------------------------------------------------------- /tableschema/types/any.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def cast_any(format, value, **options): 11 | return value 12 | -------------------------------------------------------------------------------- /tableschema/constraints/unique.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_unique(constraint, unique): 11 | return True 12 | -------------------------------------------------------------------------------- /data/schema_valid_simple.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": "identifier" 15 | } 16 | -------------------------------------------------------------------------------- /data/schema_valid_pk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": 
"integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": ["id", "title"] 15 | } 16 | -------------------------------------------------------------------------------- /data/schema_invalid_pk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": ["id", "titel"] 15 | } 16 | -------------------------------------------------------------------------------- /tableschema/constraints/required.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_required(constraint, value): 11 | if not (constraint and value is None): 12 | return True 13 | return False 14 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "state" 5 | } 6 | ], 7 | "foreignKeys": [ 8 | { 9 | "fields": "doesnotexist", 10 | "reference": { 11 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 12 | "resource": "the-resource", 13 | "fields": "state_id" 14 | } 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /tableschema/constraints/enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_enum(constraint, value): 11 | if value is None: 12 | return True 13 | if value in constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/constraints/maxLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_maxLength(constraint, value): 11 | if value is None: 12 | return True 13 | if len(value) <= constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/constraints/minLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | def check_minLength(constraint, value): 11 | if value is None: 12 | return True 13 | if len(value) >= constraint: 14 | return True 15 | return False 16 | -------------------------------------------------------------------------------- /tableschema/plugins/__init__.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from ..helpers import PluginImporter 8 | 9 | 10 | # Register importer 11 | importer = PluginImporter( 12 | virtual='tableschema.plugins.', actual='tableschema_') 13 | importer.register() 14 | 15 | # Delete variables 16 | del PluginImporter 17 | del importer 18 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema.exceptions import CastError 9 | 10 | 11 | # Tests 12 | 13 | def test_no_errors_reuse(): 14 | ce1 = CastError('message1') 15 | ce1.errors.append('error') 16 | ce2 = CastError('message2') 17 | assert len(ce2.errors) == 0 18 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | } 8 | ], 9 | "foreignKeys": [ 10 | { 11 | "fields": ["id", "title"], 12 | "reference": { 13 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 14 | "resource": "the-resource", 15 | "fields": "no" 16 | } 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/constraints/test_unique.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (False, 'any', True), 15 | (True, 'any', True), 16 | ]) 17 | def test_check_unique(constraint, value, result): 18 | assert constraints.check_unique(constraint, value) == result 19 | -------------------------------------------------------------------------------- /tableschema/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | 10 | 11 | # Module API 12 | 13 | VERSION = io.open(os.path.join(os.path.dirname(__file__), 'VERSION')).read().strip() 14 | ERROR = 'tableschema.error' 15 | DEFAULT_FIELD_TYPE = 'string' 16 | DEFAULT_FIELD_FORMAT = 'default' 17 | DEFAULT_MISSING_VALUES = [''] 18 | REMOTE_SCHEMES = ['http', 'https', 'ftp', 'ftps', 's3'] 19 | -------------------------------------------------------------------------------- /tableschema/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | from .enum 
import check_enum 11 | from .maximum import check_maximum 12 | from .maxLength import check_maxLength 13 | from .minimum import check_minimum 14 | from .minLength import check_minLength 15 | from .pattern import check_pattern 16 | from .required import check_required 17 | from .unique import check_unique 18 | -------------------------------------------------------------------------------- /tests/constraints/test_enum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | ([1, 2], 1, True), 15 | ([0, 2], 1, False), 16 | ([], 1, False), 17 | ]) 18 | def test_check_enum(constraint, value, result): 19 | assert constraints.check_enum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_maximum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, 1, False), 15 | (1, 1, True), 16 | (2, 1, True), 17 | ]) 18 | def test_check_maximum(constraint, value, result): 19 | assert constraints.check_maximum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_minimum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, 1, True), 15 | (1, 1, True), 16 | (2, 1, False), 17 | ]) 18 | def test_check_minimum(constraint, value, result): 19 | assert constraints.check_minimum(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_maxLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, [1], False), 15 | (1, [1], True), 16 | (2, [1], True), 17 | ]) 18 | def test_check_maxLength(constraint, value, result): 19 | assert constraints.check_maxLength(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_minLength.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ 
import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (0, [1], True), 15 | (1, [1], True), 16 | (2, [1], False), 17 | ]) 18 | def test_check_minLength(constraint, value, result): 19 | assert constraints.check_minLength(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tests/constraints/test_required.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | (False, 1, True), 15 | (True, 0, True), 16 | (True, None, False), 17 | ]) 18 | def test_check_required(constraint, value, result): 19 | assert constraints.check_required(constraint, value) == result 20 | -------------------------------------------------------------------------------- /tableschema/validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .schema import Schema 8 | 9 | 10 | # Module API 11 | 12 | 13 | def validate(descriptor): 14 | """Validate descriptor 15 | 16 | # Arguments 17 | descriptor (dict): descriptor to validate 18 | 19 | # Raises 20 | ValidationError: on validation errors 21 | 22 | # Returns 23 | bool: True 24 | 25 | """ 26 | Schema(descriptor, strict=True) 27 | return True 28 | -------------------------------------------------------------------------------- /examples/table_validate.py: -------------------------------------------------------------------------------- 1 | from tableschema import Table 2 | 3 | # Data from WEB, schema from MEMORY 4 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 5 | SCHEMA = {'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'age', 'type': 'integer'}, {'name': 'name', 'type': 'string'}] } 6 | 7 | # If schema is not passed it will be inferred 8 | table = Table(SOURCE, schema=SCHEMA) 9 | rows = table.iter() 10 | while True: 11 | try: 12 | print(next(rows)) 13 | except StopIteration: 14 | break 15 | except Exception as exception: 16 | print(exception) 17 | -------------------------------------------------------------------------------- /tableschema/constraints/maximum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import decimal 8 | 9 | 10 | # Module API 11 | 12 | def check_maximum(constraint, value): 13 | if value is None: 14 | return True 15 | try: 16 | if value <= constraint: 17 | return True 18 | except decimal.InvalidOperation: 19 | # For non-finite numbers NaN, INF and -INF 20 | # the constraint always is not satisfied 21 | return False 22 | return False 23 | 
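# --- Illustrative usage sketch (annotation, not part of the shipped file) ---
# Assuming only the check_maximum contract shown above (None always passes,
# values are compared with <=, and non-finite Decimals fail via
# decimal.InvalidOperation), a caller would see:
#
#   from decimal import Decimal
#   from tableschema import constraints
#   constraints.check_maximum(5, 4)               # True: 4 <= 5
#   constraints.check_maximum(5, None)            # True: missing values pass
#   constraints.check_maximum(5, Decimal('NaN'))  # False: InvalidOperation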
-------------------------------------------------------------------------------- /tableschema/constraints/minimum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import decimal 8 | 9 | 10 | # Module API 11 | 12 | def check_minimum(constraint, value): 13 | if value is None: 14 | return True 15 | try: 16 | if value >= constraint: 17 | return True 18 | except decimal.InvalidOperation: 19 | # For non-finite numbers NaN, INF and -INF 20 | # the constraint always is not satisfied 21 | return False 22 | return False 23 | -------------------------------------------------------------------------------- /data/schema_invalid_multiple_errors.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "magical_unicorn" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "primaryKey": "identifier", 15 | "foreignKeys": [ 16 | { 17 | "fields": ["id", "notafield"], 18 | "reference": { 19 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 20 | "fields": "no" 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tableschema/constraints/pattern.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | COMPILED_RE = type(re.compile("")) 9 | 10 | 11 | # Module API 12 | 13 | def check_pattern(constraint, value): 14 | if value is None: 15 | return True 16 | if not isinstance(constraint, COMPILED_RE): 17 | regex = re.compile('^{0}$'.format(constraint)) 18 | else: 19 | regex = constraint 20 | match = regex.match(value) 21 | if match: 22 | return True 23 | return False 24 | -------------------------------------------------------------------------------- /tests/types/test_any.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 1, 1), 16 | ('default', '1', '1'), 17 | ('default', '3.14', '3.14'), 18 | ('default', True, True), 19 | ('default', '', ''), 20 | ]) 21 | def test_cast_any(format, value, result): 22 | assert types.cast_any(format, value) == result 23 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_string.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": "id", 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": 
"the-resource", 20 | "fields": ["id_1", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_string_ref.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": "no" 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_valid_fk_array.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["fk_id", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_string_array_ref.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": "id", 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["id_1", "title_id"] 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/constraints/test_pattern.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import constraints 9 | import re 10 | 11 | # Tests 12 | 13 | @pytest.mark.parametrize('constraint, value, result', [ 14 | ('^test$', 'test', True), 15 | ('^test$', 'TEST', False), 16 | (re.compile('^test$'), 'test', True), 17 | (re.compile('^test$'), 'TEST', False), 18 | ]) 19 | def test_check_pattern(constraint, value, result): 20 | assert constraints.check_pattern(constraint, value) == result 21 | -------------------------------------------------------------------------------- /data/schema_invalid_fk_array_wrong_number.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "id", 5 | "title": "Identifier", 6 | "type": "integer" 7 | }, 8 | { 9 | "name": "title", 10 | "title": "Title", 11 | "type": "string" 12 | } 13 | ], 14 | "foreignKeys": [ 15 | { 16 | "fields": ["id", "title"], 17 | "reference": { 18 | "datapackage": "http://data.okfn.org/data/mydatapackage/", 19 | "resource": "the-resource", 20 | "fields": ["id", "title", "somethingelse"] 21 | } 22 | } 23 | ] 24 | } 25 | 
-------------------------------------------------------------------------------- /tableschema/types/duration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import datetime 9 | import isodate 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | def cast_duration(format, value, **options): 16 | if not isinstance(value, (isodate.Duration, datetime.timedelta)): 17 | if not isinstance(value, six.string_types): 18 | return ERROR 19 | try: 20 | value = isodate.parse_duration(value) 21 | except Exception: 22 | return ERROR 23 | return value 24 | -------------------------------------------------------------------------------- /tableschema/types/object.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_object(format, value, **options): 15 | if not isinstance(value, dict): 16 | if not isinstance(value, six.string_types): 17 | return ERROR 18 | try: 19 | value = json.loads(value) 20 | except Exception: 21 | return ERROR 22 | if not isinstance(value, dict): 23 | return ERROR 24 | return value 25 | -------------------------------------------------------------------------------- /tests/types/test_year.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 2000, 2000), 16 | ('default', '2000', 2000), 17 | ('default', -2000, ERROR), 18 | ('default', 20000, ERROR), 19 | ('default', '3.14', ERROR), 20 | ('default', '', ERROR), 21 | ]) 22 | def test_cast_year(format, value, result): 23 | assert types.cast_year(format, value) == result 24 | -------------------------------------------------------------------------------- /tableschema/types/year.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from ..config import ERROR 9 | 10 | 11 | # Module API 12 | 13 | def cast_year(format, value, **options): 14 | if not isinstance(value, int): 15 | if not isinstance(value, six.string_types): 16 | return ERROR 17 | if len(value) != 4: 18 | return ERROR 19 | try: 20 | value = int(value) 21 | except Exception: 22 | return ERROR 23 | if value < 0 or value > 9999: 24 | return ERROR 25 | return value 26 | -------------------------------------------------------------------------------- /examples/table_pandas.py: -------------------------------------------------------------------------------- 1 | # pip install tableschema-pandas 2 | from pprint import pprint 3 | from 
tableschema import Table 4 | 5 | # Data source 6 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 7 | 8 | # Data processor 9 | def skip_under_30(erows): 10 | for number, headers, row in erows: 11 | krow = dict(zip(headers, row)) 12 | if krow['age'] >= 30: 13 | yield (number, headers, row) 14 | 15 | # Export to pandas 16 | table = Table(SOURCE, post_convert=[skip_under_30]) 17 | storage = table.save('persons', storage='pandas') 18 | pprint(storage['persons']) 19 | # Will print (if use skip_under_30 filter) 20 | # id age name 21 | # 1 39 Paul 22 | # 3 36 Jane 23 | -------------------------------------------------------------------------------- /tests/test_profile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import pytest 11 | import requests 12 | from tableschema.profile import Profile 13 | 14 | 15 | # Tests 16 | 17 | @pytest.mark.skip 18 | @pytest.mark.skipif(os.environ.get('TRAVIS_BRANCH') != 'master', reason='CI') 19 | def test_specs_table_schema_is_up_to_date(): 20 | profile = Profile('table-schema') 21 | jsonschema = requests.get('https://specs.frictionlessdata.io/schemas/table-schema.json').json() 22 | assert profile.jsonschema == jsonschema, 'run `make profiles` to update profiles' 23 | -------------------------------------------------------------------------------- /tableschema/types/array.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_array(format, value, **options): 15 | if not isinstance(value, list): 16 | if isinstance(value, tuple): 17 | return list(value) 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | value = json.loads(value) 22 | except Exception: 23 | return ERROR 24 | if not isinstance(value, list): 25 | return ERROR 26 | return value 27 | -------------------------------------------------------------------------------- /tableschema/types/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | from .any import cast_any 11 | from .array import cast_array 12 | from .boolean import cast_boolean 13 | from .date import cast_date 14 | from .datetime import cast_datetime 15 | from .duration import cast_duration 16 | from .geojson import cast_geojson 17 | from .geopoint import cast_geopoint 18 | from .integer import cast_integer 19 | from .number import cast_number 20 | from .object import cast_object 21 | from .string import cast_string 22 | from .time import cast_time 23 | from .year import cast_year 24 | from .yearmonth import cast_yearmonth 25 | -------------------------------------------------------------------------------- /.github/stale.yml: 
-------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - feature 10 | - enhancement 11 | - bug 12 | 13 | # Label to use when marking an issue as stale 14 | staleLabel: wontfix 15 | 16 | # Comment to post when marking an issue as stale. Set to `false` to disable 17 | markComment: > 18 | This issue has been automatically marked as stale because it has not had 19 | recent activity. It will be closed if no further activity occurs. Thank you 20 | for your contributions. 21 | 22 | # Comment to post when closing a stale issue. Set to `false` to disable 23 | closeComment: false 24 | -------------------------------------------------------------------------------- /tests/types/test_object.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', {}, {}), 16 | ('default', '{}', {}), 17 | ('default', {'key': 'value'}, {'key': 'value'}), 18 | ('default', '{"key": "value"}', {'key': 'value'}), 19 | ('default', '["key", "value"]', ERROR), 20 | ('default', 'string', ERROR), 21 | ('default', 1, ERROR), 22 | ('default', '3.14', ERROR), 23 | ('default', '', ERROR), 24 | ]) 25 | def test_cast_object(format, value, result): 26 | assert types.cast_object(format, value) == result 27 | -------------------------------------------------------------------------------- /tableschema/types/boolean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from ..config import ERROR 9 | 10 | 11 | # Module API 12 | 13 | def cast_boolean(format, value, **options): 14 | if not isinstance(value, bool): 15 | if isinstance(value, six.string_types): 16 | value = value.strip() 17 | if value in options.get('trueValues', _TRUE_VALUES): 18 | value = True 19 | elif value in options.get('falseValues', _FALSE_VALUES): 20 | value = False 21 | else: 22 | return ERROR 23 | return value 24 | 25 | 26 | # Internal 27 | 28 | _TRUE_VALUES = ['true', 'True', 'TRUE', '1'] 29 | _FALSE_VALUES = ['false', 'False', 'FALSE', '0'] 30 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from copy import deepcopy 9 | 10 | 11 | # Fixtures 12 | 13 | @pytest.fixture(scope='session') 14 | def apply_defaults(): 15 | def function(descriptor): 16 | descriptor = deepcopy(descriptor) 17 | # Schema descriptor 18 | if descriptor.get('fields'): 19 | 
descriptor.setdefault('missingValues', ['']) 20 | for field in descriptor['fields']: 21 | field.setdefault('type', 'string') 22 | field.setdefault('format', 'default') 23 | # Field descriptor 24 | else: 25 | descriptor.setdefault('type', 'string') 26 | descriptor.setdefault('format', 'default') 27 | return descriptor 28 | return function 29 | -------------------------------------------------------------------------------- /tableschema/types/geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from ..config import ERROR 10 | from ..profile import Profile 11 | 12 | 13 | # Module API 14 | 15 | def cast_geojson(format, value, **options): 16 | if isinstance(value, six.string_types): 17 | try: 18 | value = json.loads(value) 19 | except Exception: 20 | return ERROR 21 | if not isinstance(value, dict): 22 | return ERROR 23 | if format == 'default': 24 | try: 25 | _profile.validate(value) 26 | except Exception: 27 | return ERROR 28 | elif format == 'topojson': 29 | pass # Accept any dict as possibly topojson for now 30 | return value 31 | 32 | 33 | # Internal 34 | 35 | _profile = Profile('geojson') 36 | -------------------------------------------------------------------------------- /tests/types/test_yearmonth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', [2000, 10], (2000, 10)), 16 | ('default', (2000, 10), (2000, 10)), 17 | ('default', '2000-10', (2000, 10)), 18 | ('default', (2000, 10, 20), ERROR), 19 | ('default', '2000-13-20', ERROR), 20 | ('default', '2000-13', ERROR), 21 | ('default', '2000-0', ERROR), 22 | ('default', '13', ERROR), 23 | ('default', -10, ERROR), 24 | ('default', 20, ERROR), 25 | ('default', '3.14', ERROR), 26 | ('default', '', ERROR), 27 | ]) 28 | def test_cast_yearmonth(format, value, result): 29 | assert types.cast_yearmonth(format, value) == result 30 | -------------------------------------------------------------------------------- /tests/types/test_array.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', [], []), 16 | ('default', (), []), 17 | ('default', '[]', []), 18 | ('default', ['key', 'value'], ['key', 'value']), 19 | ('default', ('key', 'value'), ['key', 'value']), 20 | ('default', '["key", "value"]', ['key', 'value']), 21 | ('default', {'key': 'value'}, ERROR), 22 | ('default', '{"key": "value"}', ERROR), 23 | ('default', 'string', ERROR), 24 | ('default', 1, ERROR), 25 | ('default', '3.14', ERROR), 26 | ('default', '', ERROR), 27 | ]) 28 | def 
test_cast_array(format, value, result): 29 | assert types.cast_array(format, value) == result 30 | -------------------------------------------------------------------------------- /tableschema/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from . import config 7 | __version__ = config.VERSION 8 | 9 | 10 | # Module API 11 | 12 | from .cli import cli 13 | from .table import Table 14 | from .schema import Schema 15 | from .field import Field 16 | from .storage import Storage 17 | from .validate import validate 18 | from .infer import infer 19 | from .schema import FailedCast 20 | from .exceptions import DataPackageException 21 | from .exceptions import TableSchemaException 22 | from .exceptions import LoadError 23 | from .exceptions import ValidationError 24 | from .exceptions import CastError 25 | from .exceptions import IntegrityError 26 | from .exceptions import UniqueKeyError 27 | from .exceptions import RelationError 28 | from .exceptions import UnresolvedFKError 29 | from .exceptions import StorageError 30 | 31 | # Deprecated 32 | 33 | from . import exceptions 34 | -------------------------------------------------------------------------------- /tableschema/types/yearmonth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | from typing import NamedTuple 9 | from ..config import ERROR 10 | 11 | 12 | # Module API 13 | 14 | def cast_yearmonth(format, value, **options): 15 | if isinstance(value, (tuple, list)): 16 | if len(value) != 2: 17 | return ERROR 18 | value = _yearmonth(value[0], value[1]) 19 | elif isinstance(value, six.string_types): 20 | try: 21 | year, month = value.split('-') 22 | year = int(year) 23 | month = int(month) 24 | if month < 1 or month > 12: 25 | return ERROR 26 | value = _yearmonth(year, month) 27 | except Exception: 28 | return ERROR 29 | else: 30 | return ERROR 31 | return value 32 | 33 | 34 | # Internal 35 | class _yearmonth(NamedTuple): 36 | year: int 37 | month: int 38 | -------------------------------------------------------------------------------- /tests/types/test_integer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from decimal import Decimal 8 | 9 | import pytest 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result, options', [ 17 | ('default', 1, 1, {}), 18 | ('default', 1 << 63, 1 << 63, {}), 19 | ('default', '1', 1, {}), 20 | ('default', 1.0, 1, {}), 21 | ('default', Decimal('1.0'), 1, {}), 22 | ('default', '1$', 1, {'bareNumber': False}), 23 | ('default', 'ab1$', 1, {'bareNumber': False}), 24 | ('default', True, ERROR, {}), 25 | ('default', False, ERROR, {}), 26 | ('default', 3.14, ERROR, {}), 27 | ('default', '3.14', ERROR, {}), 28 | ('default', Decimal('3.14'), ERROR, {}), 29 | ('default', '', ERROR, {}), 30 | ]) 31 | def 
test_cast_integer(format, value, result, options): 32 | assert types.cast_integer(format, value, **options) == result 33 | -------------------------------------------------------------------------------- /examples/table_infer.py: -------------------------------------------------------------------------------- 1 | # pip install sqlalchemy tableschema-sql 2 | import sqlalchemy as sa 3 | from pprint import pprint 4 | from tableschema import Table 5 | 6 | # Data source 7 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 8 | 9 | # Create SQL database 10 | db = sa.create_engine('sqlite://') 11 | 12 | # Data processor 13 | def skip_under_30(erows): 14 | for number, headers, row in erows: 15 | krow = dict(zip(headers, row)) 16 | if krow['age'] >= 30: 17 | yield (number, headers, row) 18 | 19 | # Work with table 20 | table = Table(SOURCE, post_cast=[skip_under_30]) 21 | table.schema.save('tmp/persons.json') # Save INFERRED schema 22 | table.save('persons', storage='sql', engine=db) # Save data to SQL 23 | table.save('tmp/persons.csv') # Save data to DRIVE 24 | 25 | # Check the result 26 | pprint(Table('persons', storage='sql', engine=db).read(keyed=True)) 27 | pprint(Table('tmp/persons.csv').read(keyed=True)) 28 | # Will print (if use skip_under_30 filter) 29 | # [{'age': 39, 'id': 1, 'name': 'Paul'}, 30 | # {'age': 36, 'id': 3, 'name': 'Jane'}] 31 | -------------------------------------------------------------------------------- /examples/table_sql.py: -------------------------------------------------------------------------------- 1 | # pip install sqlalchemy tableschema-sql 2 | import sqlalchemy as sa 3 | from tableschema import Table 4 | 5 | # Create SQL database 6 | db = sa.create_engine('sqlite://') 7 | 8 | # Data from WEB, schema from MEMORY 9 | SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv' 10 | SCHEMA = {'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'age', 'type': 'integer'}, {'name': 'name', 'type': 'string'}] } 11 | 12 | # Open from WEB save to SQL database 13 | table = Table(SOURCE, schema=SCHEMA) 14 | table.save('articles', storage='sql', engine=db) 15 | 16 | # Open from SQL save to DRIVE 17 | table = Table('articles', storage='sql', engine=db) 18 | table.infer() 19 | table.schema.save('tmp/articles.json') 20 | table.save('tmp/articles.csv') 21 | 22 | # Open from DRIVE print to CONSOLE 23 | table = Table('tmp/articles.csv', schema='tmp/articles.json') 24 | print(table.read(keyed=True)) 25 | # Will print 26 | # [{'id': 1, 'age': 39, 'name': 'Paul'}, {'id': 2, 'age': 23, 'name': 'Jimmy'}, {'id': 3, 'age': 36, 'name': 'Jane'}, {'id': 4, 'age': 28, 'name': 'Judy'}] 27 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import pytest 10 | from tableschema import exceptions, helpers 11 | 12 | 13 | # Tests 14 | 15 | def test_retrieve_descriptor_dict(): 16 | source = {'this': 'that', 'other': ['thing']} 17 | assert helpers.retrieve_descriptor(source) 18 | 19 | 20 | def test_retrieve_descriptor_list(): 21 | source = [{'this': 'that', 'other': ['thing']}] 22 | assert helpers.retrieve_descriptor(source) 23 | 24 | 25 | def 
test_retrieve_descriptor_url(): 26 | source = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/schema_valid_full.json' 27 | assert helpers.retrieve_descriptor(source) 28 | 29 | 30 | def test_retrieve_descriptor_path(): 31 | source = 'data/schema_valid_full.json' 32 | assert helpers.retrieve_descriptor(source) 33 | 34 | 35 | def test_retrieve_descriptor_invalid(): 36 | source = 'data/data_infer.csv' 37 | with pytest.raises(exceptions.LoadError): 38 | helpers.retrieve_descriptor(source) 39 | -------------------------------------------------------------------------------- /tests/types/test_string.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', 'string', 'string'), 16 | ('default', '', ''), 17 | ('default', 0, ERROR), 18 | ('uri', 'http://google.com', 'http://google.com'), 19 | ('uri', '://no-scheme.test', ERROR), 20 | ('uri', 'string', ERROR), 21 | ('uri', '', ERROR), 22 | ('uri', 0, ERROR), 23 | ('email', 'name@gmail.com', 'name@gmail.com'), 24 | ('email', 'http://google.com', ERROR), 25 | ('email', 'string', ERROR), 26 | ('email', '', ERROR), 27 | ('email', 0, ERROR), 28 | ('binary', 'dGVzdA==', 'dGVzdA=='), 29 | ('binary', '', ''), 30 | ('binary', 'string', ERROR), 31 | ('binary', 0, ERROR), 32 | ]) 33 | def test_cast_string(format, value, result): 34 | assert types.cast_string(format, value) == result 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | 23 | -------------------------------------------------------------------------------- /tableschema/types/integer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | from decimal import Decimal 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | 16 | def cast_integer(format, value, **options): 17 | if isinstance(value, six.integer_types): 18 | if value is True or value is False: 19 | return ERROR 20 | pass 21 | 22 | elif isinstance(value, six.string_types): 23 | if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): 24 | value = _RE_BARE_NUMBER.sub('', value) 25 | 26 | try: 27 | value = int(value) 28 | except Exception: 29 | return ERROR 30 | 31 | elif isinstance(value, float) and value.is_integer(): 32 | value = int(value) 33 | 34 | elif isinstance(value, Decimal) and value % 1 == 0: 35 | value = int(value) 36 | 37 | else: 38 | return ERROR 39 | 40 | return value 41 | 42 | 43 | # Internal 44 | _RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') 45 | _DEFAULT_BARE_NUMBER = True 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Extras 60 | tmp 61 | .DS_Store 62 | .idea 63 | .projectile 64 | *.sublime-project 65 | *.sublime-workspace 66 | shippable/* 67 | /docs/site 68 | .python-version 69 | tabulator 70 | jsontableschema_sql 71 | jsontableschema_bigquery 72 | jsontableschema_pandas 73 | README.rst 74 | .pytest_cache/ 75 | -------------------------------------------------------------------------------- /tests/types/test_duration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import datetime 9 | import isodate 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', isodate.Duration(years=1), isodate.Duration(years=1)), 18 | ('default', 'P1Y10M3DT5H11M7S', 19 | isodate.Duration(years=1, months=10, days=3, hours=5, minutes=11, seconds=7)), 20 | ('default', 'P1Y', isodate.Duration(years=1)), 21 | ('default', 'P1M', isodate.Duration(months=1)), 22 | ('default', 'PT1S', datetime.timedelta(seconds=1)), 23 | ('default', datetime.timedelta(seconds=1), datetime.timedelta(seconds=1)), 24 | ('default', 'P1M1Y', ERROR), 25 | ('default', 'P-1Y', ERROR), 26 | ('default', 'year', ERROR), 27 | ('default', True, ERROR), 28 | ('default', False, ERROR), 29 | ('default', 1, ERROR), 30 | ('default', '', ERROR), 31 | ('default', [], ERROR), 32 | ('default', {}, ERROR), 33 | ]) 34 | def test_cast_duration(format, value, result): 35 | assert types.cast_duration(format, value) == result 36 | -------------------------------------------------------------------------------- /tableschema/types/datetime.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_datetime(format, value, **options): 17 | if not isinstance(value, datetime): 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | if format == 'default': 22 | value = datetime.strptime(value, _DEFAULT_PATTERN) 23 | elif format == 'any': 24 | value = parse(value) 25 | else: 26 | if format.startswith('fmt:'): 27 | warnings.warn( 28 | 'Format "fmt:" is deprecated. 
' 29 | 'Please use "<PATTERN>" without "fmt:" prefix.', 30 | UserWarning) 31 | format = format.replace('fmt:', '') 32 | value = datetime.strptime(value, format) 33 | except Exception: 34 | return ERROR 35 | return value 36 | 37 | 38 | # Internal 39 | 40 | _DEFAULT_PATTERN = '%Y-%m-%dT%H:%M:%SZ' 41 | -------------------------------------------------------------------------------- /tableschema/types/time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime, time 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_time(format, value, **options): 17 | if not isinstance(value, time): 18 | if not isinstance(value, six.string_types): 19 | return ERROR 20 | try: 21 | if format == 'default': 22 | value = datetime.strptime(value, _DEFAULT_PATTERN).time() 23 | elif format == 'any': 24 | value = parse(value).time() 25 | else: 26 | if format.startswith('fmt:'): 27 | warnings.warn( 28 | 'Format "fmt:" is deprecated. ' 29 | 'Please use "<PATTERN>" without "fmt:" prefix.', 30 | UserWarning) 31 | format = format.replace('fmt:', '') 32 | value = datetime.strptime(value, format).time() 33 | except Exception: 34 | return ERROR 35 | return value 36 | 37 | 38 | # Internal 39 | 40 | _DEFAULT_PATTERN = '%H:%M:%S' 41 | --------------------------------------------------------------------------------
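A short usage sketch for `cast_time` above — the cases are taken from `tests/types/test_time.py`, shown later in this listing:

```python
from datetime import time
from tableschema import types
from tableschema.config import ERROR

assert types.cast_time('default', '06:00:00') == time(6)  # strict %H:%M:%S
assert types.cast_time('any', '3:00 am') == time(3)       # dateutil-based parsing
assert types.cast_time('%H:%M', '06:00') == time(6)       # custom strptime pattern
assert types.cast_time('default', '09:00') == ERROR       # seconds are required
```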
/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list readme release templates test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | install: 12 | pip install --upgrade -e .[develop] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | profiles: 18 | wget -O tableschema/profiles/table-schema.json https://specs.frictionlessdata.io/schemas/table-schema.json 19 | 20 | readme: 21 | pip install md-toc 22 | pip install referencer 23 | referencer $(PACKAGE) README.md --in-place 24 | md_toc -p github --header-levels 3 README.md 25 | sed -i '/(#$(PACKAGE)-py)/,+2d' README.md 26 | 27 | release: 28 | git checkout master && git pull origin && git fetch -p 29 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20 30 | @echo "\nReleasing v$(VERSION) in 10 seconds. Press <CTRL+C> to abort\n" && sleep 10 31 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)' 32 | git push --follow-tags 33 | 34 | templates: 35 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 36 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 37 | 38 | test: 39 | pylama $(PACKAGE) 40 | pytest --cov ${PACKAGE} --cov-report term-missing --cov-fail-under 90 41 | 42 | version: 43 | @echo $(VERSION) 44 | -------------------------------------------------------------------------------- /tableschema/types/string.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | import uuid 10 | import base64 11 | import rfc3986.exceptions 12 | import rfc3986.validators 13 | import rfc3986.uri 14 | from ..config import ERROR 15 | 16 | 17 | # Module API 18 | 19 | def cast_string(format, value, **options): 20 | if not isinstance(value, six.string_types): 21 | return ERROR 22 | if format in _SIMPLE_FORMATS: 23 | return value 24 | if format == 'uri': 25 | uri = _uri_from_string(value) 26 | try: 27 | _uri_validator.validate(uri) 28 | except rfc3986.exceptions.ValidationError: 29 | return ERROR 30 | elif format == 'email': 31 | if not re.match(_EMAIL_PATTERN, value): 32 | return ERROR 33 | elif format == 'uuid': 34 | try: 35 | uuid.UUID(value, version=4) 36 | except Exception: 37 | return ERROR 38 | elif format == 'binary': 39 | try: 40 | base64.b64decode(value) 41 | except Exception: 42 | return ERROR 43 | return value 44 | 45 | 46 | # Internal 47 | 48 | _SIMPLE_FORMATS = {'default', None} 49 | _EMAIL_PATTERN = re.compile(r'[^@]+@[^@]+\.[^@]+') 50 | _uri_from_string = rfc3986.uri.URIReference.from_string 51 | _uri_validator = rfc3986.validators.Validator().require_presence_of('scheme') 52 | --------------------------------------------------------------------------------
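The string caster above validates rather than converts — the value is returned unchanged if it satisfies the format. A few illustrative calls, with cases taken from `tests/types/test_string.py` earlier in this listing:

```python
from tableschema import types
from tableschema.config import ERROR

assert types.cast_string('default', 'string') == 'string'
assert types.cast_string('uri', 'http://google.com') == 'http://google.com'
assert types.cast_string('uri', 'string') == ERROR            # scheme is required
assert types.cast_string('email', 'name@gmail.com') == 'name@gmail.com'
assert types.cast_string('binary', 'dGVzdA==') == 'dGVzdA=='  # valid base64
```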
/tableschema/types/date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from datetime import datetime, date 10 | from dateutil.parser import parse 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_date(format, value, **options): 17 | if isinstance(value, datetime): 18 | value_time = value.time() 19 | if value_time.hour == 0 and value_time.minute == 0 and value_time.second == 0: 20 | return datetime(value.year, value.month, value.day).date() 21 | else: 22 | return ERROR 23 | 24 | if isinstance(value, date): 25 | return value 26 | 27 | if not isinstance(value, six.string_types): 28 | return ERROR 29 | 30 | # Parse string date 31 | try: 32 | if format == 'default': 33 | value = datetime.strptime(value, _DEFAULT_PATTERN).date() 34 | elif format == 'any': 35 | value = parse(value).date() 36 | else: 37 | if format.startswith('fmt:'): 38 | warnings.warn( 39 | 'Format "fmt:" is deprecated. ' 40 | 'Please use "<PATTERN>" without "fmt:" prefix.', 41 | UserWarning) 42 | format = format.replace('fmt:', '') 43 | value = datetime.strptime(value, format).date() 44 | except Exception: 45 | return ERROR 46 | 47 | return value 48 | 49 | 50 | # Internal 51 | 52 | _DEFAULT_PATTERN = '%Y-%m-%d' 53 | -------------------------------------------------------------------------------- /tests/types/test_boolean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result, options', [ 15 | ('default', True, True, {}), 16 | ('default', 'true', True, {}), 17 | ('default', 'True', True, {}), 18 | ('default', 'TRUE', True, {}), 19 | ('default', '1', True, {}), 20 | ('default', 'yes', True, {'trueValues': ['yes']}), 21 | ('default', False, False, {}), 22 | ('default', 'false', False, {}), 23 | ('default', 'False', False, {}), 24 | ('default', 'FALSE', False, {}), 25 | ('default', '0', False, {}), 26 | ('default', 'no', False, {'falseValues': ['no']}), 27 | ('default', 't', ERROR, {}), 28 | ('default', 'YES', ERROR, {}), 29 | ('default', 'Yes', ERROR, {}), 30 | ('default', 'f', ERROR, {}), 31 | ('default', 'NO', ERROR, {}), 32 | ('default', 'No', ERROR, {}), 33 | ('default', 0, ERROR, {}), 34 | ('default', 1, ERROR, {}), 35 | ('default', 0, False, {'falseValues': [0], 'trueValues': [1]}), 36 | ('default', 1, True, {'falseValues': [0], 'trueValues': [1]}), 37 | ('default', '3.14', ERROR, {}), 38 | ('default', '', ERROR, {}), 39 | ('default', 'Yes', ERROR, {'trueValues': ['yes']}), 40 | ('default', 'No', ERROR, {'falseValues': ['no']}), 41 | ]) 42 | def test_cast_boolean(format, value, result, options): 43 | assert types.cast_boolean(format, value, **options) == result 44 | -------------------------------------------------------------------------------- /tableschema/types/geopoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | from typing import NamedTuple 10 | from decimal import Decimal 11 | from ..config import ERROR 12 | 13 | 14 | # Module API 15 | 16 | def cast_geopoint(format, value, **options): 17 | 18 | # Parse 19 | if isinstance(value, six.string_types): 20 | try: 21 | if format == 'default': 22 | lon, lat = value.split(',') 23 | lon = lon.strip() 24 | lat = lat.strip() 25 | elif format == 'array': 26 | lon, lat = json.loads(value) 27 | elif format == 'object': 28 | if isinstance(value, six.string_types): 29 | value = json.loads(value) 30 | if len(value) != 2: 31 | return ERROR 32 | lon = value['lon'] 33 | lat = value['lat'] 34 | value = _geopoint(Decimal(lon), Decimal(lat)) 35 | except Exception: 36 | return ERROR 37 | 38 | # Validate 39 | try: 40 | value = _geopoint(*value) 41 | if value.lon > 180 or value.lon < -180: 42 | return ERROR 43 | if value.lat > 90 or value.lat < -90: 44 | return ERROR 45 | except Exception: 46 | return ERROR 47 | 48 | return value 49 | 50 | 51 | # Internal 52 | 53 | class _geopoint(NamedTuple): 54 | lon: Decimal 55 | lat: 
Decimal 56 | 57 | def __repr__(self): 58 | return '[%s, %s]' % (self.lon, self.lat) 59 | -------------------------------------------------------------------------------- /tests/types/test_geopoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from tableschema import types 9 | from tableschema.config import ERROR 10 | 11 | 12 | # Tests 13 | 14 | @pytest.mark.parametrize('format, value, result', [ 15 | ('default', (180, 90), (180, 90)), 16 | ('default', [180, 90], (180, 90)), 17 | ('default', '180,90', (180, 90)), 18 | ('default', '180, -90', (180, -90)), 19 | ('default', {'lon': 180, 'lat': 90}, ERROR), 20 | ('default', '181,90', ERROR), 21 | ('default', '0,91', ERROR), 22 | ('default', 'string', ERROR), 23 | ('default', 1, ERROR), 24 | ('default', '3.14', ERROR), 25 | ('default', '', ERROR), 26 | ('array', (180, 90), (180, 90)), 27 | ('array', [180, 90], (180, 90)), 28 | ('array', '[180, -90]', (180, -90)), 29 | # ('array', {'lon': 180, 'lat': 90}, ERROR), 30 | ('array', [181, 90], ERROR), 31 | ('array', [0, 91], ERROR), 32 | ('array', '180,90', ERROR), 33 | ('array', 'string', ERROR), 34 | ('array', 1, ERROR), 35 | ('array', '3.14', ERROR), 36 | ('array', '', ERROR), 37 | # ('object', {'lon': 180, 'lat': 90}, (180, 90)), 38 | ('object', '{"lon": 180, "lat": 90}', (180, 90)), 39 | ('object', '[180, -90]', ERROR), 40 | ('object', {'lon': 181, 'lat': 90}, ERROR), 41 | ('object', {'lon': 180, 'lat': -91}, ERROR), 42 | # ('object', [180, -90], ERROR), 43 | ('object', '180,90', ERROR), 44 | ('object', 'string', ERROR), 45 | ('object', 1, ERROR), 46 | ('object', '3.14', ERROR), 47 | ('object', '', ERROR), 48 | ]) 49 | def test_cast_geopoint(format, value, result): 50 | assert types.cast_geopoint(format, value) == result 51 | -------------------------------------------------------------------------------- /tests/types/test_time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import time 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', time(6), time(6)), 18 | ('default', '06:00:00', time(6)), 19 | ('default', '09:00', ERROR), 20 | ('default', '3 am', ERROR), 21 | ('default', '3.00', ERROR), 22 | ('default', 'invalid', ERROR), 23 | ('default', True, ERROR), 24 | ('default', '', ERROR), 25 | ('any', time(6), time(6)), 26 | ('any', '06:00:00', time(6)), 27 | ('any', '3:00 am', time(3)), 28 | ('any', 'some night', ERROR), 29 | ('any', 'invalid', ERROR), 30 | ('any', True, ERROR), 31 | ('any', '', ERROR), 32 | ('%H:%M', time(6), time(6)), 33 | ('%H:%M', '06:00', time(6)), 34 | ('%M:%H', '06:50', ERROR), 35 | ('%H:%M', '3:00 am', ERROR), 36 | ('%H:%M', 'some night', ERROR), 37 | ('%H:%M', 'invalid', ERROR), 38 | ('%H:%M', True, ERROR), 39 | ('%H:%M', '', ERROR), 40 | ('invalid', '', ERROR), 41 | # Deprecated 42 | ('fmt:%H:%M', time(6), time(6)), 43 | ('fmt:%H:%M', '06:00', time(6)), 44 | ('fmt:%M:%H', '06:50', ERROR), 45 | ('fmt:%H:%M', 
'3:00 am', ERROR), 46 | ('fmt:%H:%M', 'some night', ERROR), 47 | ('fmt:%H:%M', 'invalid', ERROR), 48 | ('fmt:%H:%M', True, ERROR), 49 | ('fmt:%H:%M', '', ERROR), 50 | ]) 51 | def test_cast_time(format, value, result): 52 | with warnings.catch_warnings(): 53 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 54 | assert types.cast_time(format, value) == result 55 | -------------------------------------------------------------------------------- /.github/workflows/general.yml: -------------------------------------------------------------------------------- 1 | name: general 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - v*.*.* 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | 15 | # Test 16 | 17 | test: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.10', '3.11', '3.12'] 22 | steps: 23 | - name: Checkout repository 24 | uses: actions/checkout@v2 25 | - name: Install Python 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools wheel 33 | make install 34 | - name: Test software 35 | run: make test 36 | - name: Report coverage 37 | uses: codecov/codecov-action@v1 38 | 39 | # Release 40 | 41 | release: 42 | if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') 43 | runs-on: ubuntu-latest 44 | needs: [test] 45 | steps: 46 | - name: Checkout repository 47 | uses: actions/checkout@v2 48 | - name: Install Python 49 | uses: actions/setup-python@v2 50 | with: 51 | python-version: 3.11 52 | - name: Install dependencies 53 | run: | 54 | python -m pip install --upgrade pip 55 | pip install setuptools wheel 56 | - name: Build distribution 57 | run: | 58 | python setup.py sdist bdist_wheel 59 | - name: Publish to PYPI 60 | uses: pypa/gh-action-pypi-publish@master 61 | with: 62 | password: ${{ secrets.PYPI_API_KEY }} 63 | - name: Release to GitHub 64 | uses: softprops/action-gh-release@v1 65 | env: 66 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 67 | -------------------------------------------------------------------------------- /tableschema/types/number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import six 9 | from decimal import Decimal 10 | from ..config import ERROR 11 | 12 | 13 | # Module API 14 | 15 | def cast_number(format, value, **options): 16 | if isinstance(value, six.string_types): 17 | group_char = options.get('groupChar', _DEFAULT_GROUP_CHAR) 18 | decimal_char = options.get('decimalChar', _DEFAULT_DECIMAL_CHAR) 19 | value = _RE_WHITESPACE.sub('', value) 20 | if decimal_char != '.': 21 | if group_char: 22 | value = value.replace(decimal_char, '__decimal_char__') 23 | value = value.replace(group_char, '') 24 | value = value.replace('__decimal_char__', '.') 25 | else: 26 | value = value.replace(decimal_char, '__decimal_char__') 27 | value = value.replace('__decimal_char__', '.') 28 | elif group_char: 29 | value = value.replace(group_char, '') 30 | 31 | if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): 32 | value = _RE_BARE_NUMBER.sub('', value) 33 | elif isinstance(value, Decimal): 34 | return value 35 | elif not isinstance(value, six.integer_types + (float,)): 36 | return 
ERROR 37 | elif value is True or value is False: 38 | return ERROR 39 | else: 40 | value = str(value) 41 | try: 42 | value = Decimal(value) 43 | except Exception: 44 | return ERROR 45 | return value 46 | 47 | 48 | # Internal 49 | 50 | _RE_WHITESPACE = re.compile(r'\s') 51 | _RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') 52 | _DEFAULT_GROUP_CHAR = '' 53 | _DEFAULT_DECIMAL_CHAR = '.' 54 | _DEFAULT_BARE_NUMBER = True 55 | -------------------------------------------------------------------------------- /tableschema/infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import warnings 9 | from . import config 10 | from .table import Table 11 | 12 | 13 | # Module API 14 | 15 | def infer(source, headers=1, limit=100, confidence=0.75, 16 | missing_values=config.DEFAULT_MISSING_VALUES, 17 | guesser_cls=None, resolver_cls=None, 18 | **options): 19 | """Infer source schema. 20 | 21 | # Arguments 22 | source (any): source as path, url or inline data 23 | headers (int/str[]): headers rows number or headers list 24 | confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1) 25 | missing_values (str[]): list of missing values (by default `['']`) 26 | guesser_cls (class): you can implement inferring strategies by 27 | providing type-guessing and type-resolving classes [experimental] 28 | resolver_cls (class): you can implement inferring strategies by 29 | providing type-guessing and type-resolving classes [experimental] 30 | 31 | # Raises 32 | TableSchemaException: raises any error that occurs during the process 33 | 34 | # Returns 35 | dict: returns schema descriptor 36 | 37 | """ 38 | 39 | # Deprecated arguments order 40 | is_string = lambda value: isinstance(value, six.string_types) 41 | if isinstance(source, list) and all(map(is_string, source)): 42 | warnings.warn('Correct arguments order infer(source, headers)', UserWarning) 43 | source, headers = headers, source 44 | 45 | table = Table(source, headers=headers, sample_size=limit, **options) 46 | descriptor = table.infer(limit=limit, confidence=confidence, 47 | missing_values=missing_values, guesser_cls=guesser_cls, 48 | resolver_cls=resolver_cls) 49 | return descriptor 50 | -------------------------------------------------------------------------------- /tests/types/test_date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import date, datetime 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', date(2019, 1, 1), date(2019, 1, 1)), 18 | ('default', '2019-01-01', date(2019, 1, 1)), 19 | ('default', '10th Jan 1969', ERROR), 20 | ('default', 'invalid', ERROR), 21 | ('default', True, ERROR), 22 | ('default', '', ERROR), 23 | ('default', datetime(2018, 1, 1), date(2018, 1, 1)), 24 | ('default', datetime(2018, 3, 1, 8, 30, 23), ERROR), 25 | ('any', date(2019, 1, 1), date(2019, 1, 1)), 26 | ('any', '2019-01-01', date(2019, 1, 1)), 27 | ('any', '10th Jan 1969', 
date(1969, 1, 10)), 28 | ('any', '10th Jan nineteen sixty nine', ERROR), 29 | ('any', 'invalid', ERROR), 30 | ('any', True, ERROR), 31 | ('any', '', ERROR), 32 | ('%d/%m/%y', date(2019, 1, 1), date(2019, 1, 1)), 33 | ('%d/%m/%y', '21/11/06', date(2006, 11, 21)), 34 | ('%y/%m/%d', '21/11/06 16:30', ERROR), 35 | ('%d/%m/%y', 'invalid', ERROR), 36 | ('%d/%m/%y', True, ERROR), 37 | ('%d/%m/%y', '', ERROR), 38 | ('invalid', '21/11/06 16:30', ERROR), 39 | # Deprecated 40 | ('fmt:%d/%m/%y', date(2019, 1, 1), date(2019, 1, 1)), 41 | ('fmt:%d/%m/%y', '21/11/06', date(2006, 11, 21)), 42 | ('fmt:%y/%m/%d', '21/11/06 16:30', ERROR), 43 | ('fmt:%d/%m/%y', 'invalid', ERROR), 44 | ('fmt:%d/%m/%y', True, ERROR), 45 | ('fmt:%d/%m/%y', '', ERROR), 46 | ]) 47 | def test_cast_date(format, value, result): 48 | with warnings.catch_warnings(): 49 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 50 | assert types.cast_date(format, value) == result 51 | -------------------------------------------------------------------------------- /tests/types/test_datetime.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import warnings 8 | import pytest 9 | from datetime import datetime 10 | from tableschema import types 11 | from tableschema.config import ERROR 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', datetime(2014, 1, 1, 6), datetime(2014, 1, 1, 6)), 18 | ('default', '2014-01-01T06:00:00Z', datetime(2014, 1, 1, 6)), 19 | ('default', 'Mon 1st Jan 2014 9 am', ERROR), 20 | ('default', 'invalid', ERROR), 21 | ('default', True, ERROR), 22 | ('default', '', ERROR), 23 | ('any', datetime(2014, 1, 1, 6), datetime(2014, 1, 1, 6)), 24 | ('any', '10th Jan 1969 9 am', datetime(1969, 1, 10, 9)), 25 | ('any', 'invalid', ERROR), 26 | ('any', True, ERROR), 27 | ('any', '', ERROR), 28 | ('%d/%m/%y %H:%M', datetime(2006, 11, 21, 16, 30), datetime(2006, 11, 21, 16, 30)), 29 | ('%d/%m/%y %H:%M', '21/11/06 16:30', datetime(2006, 11, 21, 16, 30)), 30 | ('%H:%M %d/%m/%y', '21/11/06 16:30', ERROR), 31 | ('%d/%m/%y %H:%M', 'invalid', ERROR), 32 | ('%d/%m/%y %H:%M', True, ERROR), 33 | ('%d/%m/%y %H:%M', '', ERROR), 34 | ('invalid', '21/11/06 16:30', ERROR), 35 | # Deprecated 36 | ('fmt:%d/%m/%y %H:%M', datetime(2006, 11, 21, 16, 30), datetime(2006, 11, 21, 16, 30)), 37 | ('fmt:%d/%m/%y %H:%M', '21/11/06 16:30', datetime(2006, 11, 21, 16, 30)), 38 | ('fmt:%H:%M %d/%m/%y', '21/11/06 16:30', ERROR), 39 | ('fmt:%d/%m/%y %H:%M', 'invalid', ERROR), 40 | ('fmt:%d/%m/%y %H:%M', True, ERROR), 41 | ('fmt:%d/%m/%y %H:%M', '', ERROR), 42 | ]) 43 | def test_cast_datetime(format, value, result): 44 | with warnings.catch_warnings(): 45 | warnings.simplefilter("error" if not format.startswith('fmt:') else "ignore") 46 | assert types.cast_datetime(format, value) == result 47 | -------------------------------------------------------------------------------- /data/data_infer_increase_limit.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 0,1 3 | 0,1 4 | 0,1 5 | 0,1 6 | 0,1 7 | 0,1 8 | 0,1 9 | 0,1 10 | 0,1 11 | 0,1 12 | 0,1 13 | 0,1 14 | 0,1 15 | 0,1 16 | 0,1 17 | 0,1 18 | 0,1 19 | 0,1 20 | 0,1 21 | 0,1 22 | 0,1 23 | 0,1 24 | 0,1 25 | 0,1 26 | 0,1 27 | 0,1 28 | 0,1 29 | 0,1 30 | 0,1 31 | 0,1 32 | 
0,1 33 | 0,1 34 | 0,1 35 | 0,1 36 | 0,1 37 | 0,1 38 | 0,1 39 | 0,1 40 | 0,1 41 | 0,1 42 | 0,1 43 | 0,1 44 | 0,1 45 | 0,1 46 | 0,1 47 | 0,1 48 | 0,1 49 | 0,1 50 | 0,1 51 | 0,1 52 | 0,1 53 | 0,1 54 | 0,1 55 | 0,1 56 | 0,1 57 | 0,1 58 | 0,1 59 | 0,1 60 | 0,1 61 | 0,1 62 | 0,1 63 | 0,1 64 | 0,1 65 | 0,1 66 | 0,1 67 | 0,1 68 | 0,1 69 | 0,1 70 | 0,1 71 | 0,1 72 | 0,1 73 | 0,1 74 | 0,1 75 | 0,1 76 | 0,1 77 | 0,1 78 | 0,1 79 | 0,1 80 | 0,1 81 | 0,1 82 | 0,1 83 | 0,1 84 | 0,1 85 | 0,1 86 | 0,1 87 | 0,1 88 | 0,1 89 | 0,1 90 | 0,1 91 | 0,1 92 | 0,1 93 | 0,1 94 | 0,1 95 | 0,1 96 | 0,1 97 | 0,1 98 | 0,1 99 | 0,1 100 | 0,1 101 | 0,1.1 102 | 0,1.1 103 | 0,1.1 104 | 0,1.1 105 | 0,1.1 106 | 0,1.1 107 | 0,1.1 108 | 0,1.1 109 | 0,1.1 110 | 0,1.1 111 | 0,1.1 112 | 0,1.1 113 | 0,1.1 114 | 0,1.1 115 | 0,1.1 116 | 0,1.1 117 | 0,1.1 118 | 0,1.1 119 | 0,1.1 120 | 0,1.1 121 | 0,1.1 122 | 0,1.1 123 | 0,1.1 124 | 0,1.1 125 | 0,1.1 126 | 0,1.1 127 | 0,1.1 128 | 0,1.1 129 | 0,1.1 130 | 0,1.1 131 | 0,1.1 132 | 0,1.1 133 | 0,1.1 134 | 0,1.1 135 | 0,1.1 136 | 0,1.1 137 | 0,1.1 138 | 0,1.1 139 | 0,1.1 140 | 0,1.1 141 | 0,1.1 142 | 0,1.1 143 | 0,1.1 144 | 0,1.1 145 | 0,1.1 146 | 0,1.1 147 | 0,1.1 148 | 0,1.1 149 | 0,1.1 150 | 0,1.1 151 | 0,1.1 152 | 0,1.1 153 | 0,1.1 154 | 0,1.1 155 | 0,1.1 156 | 0,1.1 157 | 0,1.1 158 | 0,1.1 159 | 0,1.1 160 | 0,1.1 161 | 0,1.1 162 | 0,1.1 163 | 0,1.1 164 | 0,1.1 165 | 0,1.1 166 | 0,1.1 167 | 0,1.1 168 | 0,1.1 169 | 0,1.1 170 | 0,1.1 171 | 0,1.1 172 | 0,1.1 173 | 0,1.1 174 | 0,1.1 175 | 0,1.1 176 | 0,1.1 177 | 0,1.1 178 | 0,1.1 179 | 0,1.1 180 | 0,1.1 181 | 0,1.1 182 | 0,1.1 183 | 0,1.1 184 | 0,1.1 185 | 0,1.1 186 | 0,1.1 187 | 0,1.1 188 | 0,1.1 189 | 0,1.1 190 | 0,1.1 191 | 0,1.1 192 | 0,1.1 193 | 0,1.1 194 | 0,1.1 195 | 0,1.1 196 | 0,1.1 197 | 0,1.1 198 | 0,1.1 199 | 0,1.1 200 | 0,1.1 201 | -------------------------------------------------------------------------------- /tests/types/test_geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from mock import patch 8 | import pytest 9 | from tableschema import types 10 | from tableschema.config import ERROR 11 | from tableschema.profile import Profile 12 | 13 | 14 | # Tests 15 | 16 | @pytest.mark.parametrize('format, value, result', [ 17 | ('default', 18 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}, 19 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}), 20 | ('default', 21 | '{"geometry": null, "type": "Feature", "properties": {"\\u00c3": "\\u00c3"}}', 22 | {'properties': {'Ã': 'Ã'}, 'type': 'Feature', 'geometry': None}), 23 | ('default', {'coordinates': [0, 0, 0], 'type': 'Point'}, ERROR), 24 | ('default', 'string', ERROR), 25 | ('default', 1, ERROR), 26 | ('default', '3.14', ERROR), 27 | ('default', '', ERROR), 28 | ('default', {}, ERROR), 29 | ('default', '{}', ERROR), 30 | ('topojson', 31 | {'type': 'LineString', 'arcs': [42]}, 32 | {'type': 'LineString', 'arcs': [42]}), 33 | ('topojson', 34 | '{"type": "LineString", "arcs": [42]}', 35 | {'type': 'LineString', 'arcs': [42]}), 36 | ('topojson', 'string', ERROR), 37 | ('topojson', 1, ERROR), 38 | ('topojson', '3.14', ERROR), 39 | ('topojson', '', ERROR), 40 | ]) 41 | def test_cast_geojson(format, value, result): 42 | assert types.cast_geojson(format, value) == result 43 | 44 | 45 | @pytest.mark.parametrize('format, 
value, validates', [ 46 | ('default', '', False), 47 | ('default', '""', False), 48 | ('default', '3.14', False), 49 | ('default', '{}', True), 50 | ('default', {}, True), 51 | ]) 52 | def test_validation(format, value, validates): 53 | """Only json object shaped inputs call Profile.validate().""" 54 | err = Exception('fake validation error') 55 | with patch.object(Profile, 'validate', side_effect=err) as mock_validate: 56 | assert types.cast_geojson(format, value) == ERROR 57 | assert mock_validate.call_count == int(validates) 58 | -------------------------------------------------------------------------------- /tableschema/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import io 7 | import sys 8 | import click 9 | import tableschema 10 | import json as json_module 11 | from . import config 12 | 13 | 14 | # Module API 15 | 16 | @click.group(help='') 17 | def cli(): 18 | """Command-line interface 19 | 20 | ``` 21 | Usage: tableschema [OPTIONS] COMMAND [ARGS]... 22 | 23 | Options: 24 | --help Show this message and exit. 25 | 26 | Commands: 27 | infer Infer a schema from data. 28 | info Return info on this version of Table Schema 29 | validate Validate that a supposed schema is in fact a Table Schema. 30 | ``` 31 | 32 | """ 33 | pass 34 | 35 | 36 | @cli.command() 37 | def info(): 38 | """Return info on this version of Table Schema""" 39 | click.echo(json_module.dumps({'version': config.VERSION}, ensure_ascii=False, indent=4)) 40 | 41 | 42 | @cli.command() 43 | @click.argument('data') 44 | @click.option('--row_limit', default=100, type=int) 45 | @click.option('--confidence', default=0.75, type=float) 46 | @click.option('--encoding', default='utf-8') 47 | @click.option('--to_file') 48 | @click.option('--json', is_flag=True) 49 | def infer(data, row_limit, confidence, encoding, to_file, json): 50 | """Infer a schema from data. 
51 | 52 | - data must be a local filepath 53 | - data must be CSV 54 | - the file encoding is assumed to be UTF-8 unless an encoding is passed 55 | with --encoding 56 | - the first line of data must be headers 57 | - these constraints are just for the CLI 58 | 59 | """ 60 | try: 61 | descriptor = tableschema.infer( 62 | data, encoding=encoding, limit=row_limit, confidence=confidence) 63 | except Exception as exception: 64 | click.echo(exception) 65 | sys.exit(1) 66 | 67 | if json: 68 | return click.secho(json_module.dumps(descriptor, ensure_ascii=False, indent=4)) 69 | if to_file: 70 | with io.open(to_file, mode='w+t', encoding='utf-8') as dest: 71 | dest.write(json_module.dumps(descriptor, ensure_ascii=False, indent=4)) 72 | click.echo(descriptor) 73 | 74 | 75 | @cli.command() 76 | @click.argument('schema') 77 | def validate(schema): 78 | """Validate that a supposed schema is in fact a Table Schema.""" 79 | try: 80 | tableschema.validate(schema) 81 | click.echo("Schema is valid") 82 | sys.exit(0) 83 | except tableschema.exceptions.ValidationError as exception: 84 | click.echo("Schema is not valid") 85 | click.echo(exception.errors) 86 | sys.exit(1) 87 | except Exception as exception: 88 | click.echo(exception) 89 | sys.exit(1) 90 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'tableschema' 22 | NAME = PACKAGE.replace('_', '-') 23 | INSTALL_REQUIRES = [ 24 | 'six>=1.9', 25 | 'click>=3.3', 26 | 'requests>=2.5', 27 | 'cached-property>=1.5', 28 | 'python-dateutil>=2.4', 29 | 'jsonschema>=2.5', 30 | 'unicodecsv>=0.14', 31 | 'isodate>=0.5.4', 32 | 'rfc3986>=1.1.0', 33 | 'dataflows-tabulator>=1.54.1', 34 | ] 35 | TESTS_REQUIRE = [ 36 | 'mock', 37 | 'pylama', 38 | 'pytest', 39 | 'pytest-cov', 40 | ] 41 | README = read('README.md') 42 | VERSION = read(PACKAGE, 'VERSION') 43 | PACKAGES = find_packages(exclude=['examples', 'tests']) 44 | 45 | 46 | # Run 47 | setup( 48 | name=NAME, 49 | version=VERSION, 50 | packages=PACKAGES, 51 | include_package_data=True, 52 | install_requires=INSTALL_REQUIRES, 53 | tests_require=TESTS_REQUIRE, 54 | extras_require={'develop': TESTS_REQUIRE}, 55 | entry_points={ 56 | 'console_scripts': [ 57 | 'tableschema = tableschema.__main__:cli', 58 | ] 59 | }, 60 | zip_safe=False, 61 | long_description=README, 62 | long_description_content_type="text/markdown", 63 | description='A utility library for working with Table Schema in Python', 64 | author='Open Knowledge Foundation', 65 | author_email='info@okfn.org', 66 | url='https://github.com/frictionlessdata/tableschema-py', 67 | license='MIT', 68 | keywords=[ 69 | 'frictionless data', 70 | 'open data', 71 | 'json schema', 72 | 'table schema', 73 | 'data package', 74 | 'tabular data package', 75 | ], 76 | classifiers=[ 77 | 'Development Status :: 4 - Beta', 78 | 'Environment :: Web Environment', 79 | 'Intended Audience :: Developers', 80 | 'License :: OSI Approved :: MIT License', 81 | 
'Operating System :: OS Independent', 82 | 'Programming Language :: Python :: 2', 83 | 'Programming Language :: Python :: 2.7', 84 | 'Programming Language :: Python :: 3', 85 | 'Programming Language :: Python :: 3.4', 86 | 'Programming Language :: Python :: 3.5', 87 | 'Programming Language :: Python :: 3.6', 88 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 89 | 'Topic :: Software Development :: Libraries :: Python Modules' 90 | ], 91 | ) 92 | -------------------------------------------------------------------------------- /tableschema/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | class DataPackageException(Exception): 11 | """Base class for all DataPackage/TableSchema exceptions. 12 | 13 | If there are multiple errors, they can be read from the exception object: 14 | 15 | ```python 16 | try: 17 | # lib action 18 | except DataPackageException as exception: 19 | if exception.multiple: 20 | for error in exception.errors: 21 | # handle error 22 | ``` 23 | 24 | """ 25 | 26 | # Public 27 | 28 | def __init__(self, message, errors=None): 29 | self.__errors = errors or [] 30 | super(Exception, self).__init__(message) 31 | 32 | @property 33 | def multiple(self): 34 | """Whether it's a nested exception 35 | 36 | # Returns 37 | bool: whether it's a nested exception 38 | 39 | """ 40 | return bool(self.__errors) 41 | 42 | @property 43 | def errors(self): 44 | """List of nested errors 45 | 46 | # Returns 47 | DataPackageException[]: list of nested errors 48 | 49 | """ 50 | return self.__errors 51 | 52 | 53 | class TableSchemaException(DataPackageException): 54 | """Base class for all TableSchema exceptions. 55 | """ 56 | pass 57 | 58 | 59 | class LoadError(TableSchemaException): 60 | """All loading errors. 61 | """ 62 | pass 63 | 64 | 65 | class ValidationError(TableSchemaException): 66 | """All validation errors. 67 | """ 68 | pass 69 | 70 | 71 | class CastError(TableSchemaException): 72 | """All value cast errors. 73 | """ 74 | pass 75 | 76 | 77 | class IntegrityError(TableSchemaException): 78 | """All integrity errors. 79 | """ 80 | pass 81 | 82 | 83 | class UniqueKeyError(CastError): 84 | """Unique key constraint violation (CastError subclass) 85 | """ 86 | pass 87 | 88 | 89 | class RelationError(TableSchemaException): 90 | """All relations errors. 91 | """ 92 | pass 93 | 94 | 95 | class UnresolvedFKError(RelationError): 96 | """Unresolved foreign key reference error (RelationError subclass). 97 | """ 98 | pass 99 | 100 | 101 | class StorageError(TableSchemaException): 102 | """All storage errors. 
103 | """ 104 | pass 105 | 106 | 107 | # Deprecated 108 | 109 | MultipleInvalid = TableSchemaException 110 | InvalidJSONError = LoadError 111 | SchemaValidationError = ValidationError 112 | InvalidSchemaError = ValidationError 113 | InvalidCastError = CastError 114 | ConstraintError = CastError 115 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import ast 8 | import pytest 9 | from click.testing import CliRunner 10 | from tableschema import Schema 11 | from tableschema.cli import infer, validate 12 | os.environ['LC_ALL'] = 'en_US.UTF-8' 13 | 14 | 15 | # Tests 16 | 17 | def test_infer_schema(): 18 | runner = CliRunner() 19 | result = runner.invoke(infer, ['data/data_infer.csv']) 20 | # output is a string, evaluate to a dict 21 | schema = ast.literal_eval(result.output) 22 | schema_model = Schema(schema) 23 | assert schema_model.get_field('id').type == 'integer' 24 | assert schema_model.get_field('age').type == 'integer' 25 | assert schema_model.get_field('name').type == 'string' 26 | 27 | 28 | def test_infer_schema_utf8(): 29 | """UTF8 encoded data containing non-ascii characters.""" 30 | runner = CliRunner() 31 | result = runner.invoke(infer, ['data/data_infer_utf8.csv']) 32 | # output is a string, evaluate to a dict 33 | schema = ast.literal_eval(result.output) 34 | schema_model = Schema(schema) 35 | assert schema_model.get_field('id').type == 'integer' 36 | assert schema_model.get_field('age').type == 'integer' 37 | assert schema_model.get_field('name').type == 'string' 38 | 39 | 40 | def test_infer_schema_greek(): 41 | """iso-8859-7 (greek) encoded data containing non-ascii characters.""" 42 | runner = CliRunner() 43 | result = runner.invoke(infer, 44 | ['data/data_infer_iso-8859-7.csv', 45 | '--encoding=iso-8859-7']) 46 | # output is a string, evaluate to a dict 47 | schema = ast.literal_eval(result.output) 48 | schema_model = Schema(schema) 49 | assert schema_model.get_field('id').type == 'integer' 50 | assert schema_model.get_field('age').type == 'integer' 51 | assert schema_model.get_field('name').type == 'string' 52 | 53 | def test_validate_schema(): 54 | runner = CliRunner() 55 | result = runner.invoke(validate, ['data/schema_valid_simple.json']) 56 | assert result.output.splitlines()[0] == 'Schema is valid' 57 | assert result.exit_code == 0 58 | result = runner.invoke(validate, ['data/schema_invalid_pk_no_fields.json']) 59 | assert result.output.splitlines()[0] == 'Schema is not valid' 60 | assert result.exit_code == 1 61 | 62 | 63 | @pytest.mark.skip 64 | def test_infer_schema_greek_no_encoding_defined(): 65 | """iso-8859-7 (greek) encoded data containing non-ascii characters, 66 | with no encoding arg passed returns an error message.""" 67 | runner = CliRunner() 68 | result = runner.invoke(cli.infer, ['data/data_infer_iso-8859-7.csv']) 69 | # There's an exception in the result 70 | assert 'Could not decode the data file as utf-8.' 
in result.output 71 | -------------------------------------------------------------------------------- /tests/types/test_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | from decimal import Decimal 9 | from tableschema import types 10 | from tableschema.config import ERROR 11 | 12 | 13 | # Tests 14 | 15 | @pytest.mark.parametrize('format, value, result, options', [ 16 | ('default', Decimal(1), Decimal(1), {}), 17 | ('default', 1, Decimal(1), {}), 18 | ('default', 1.0, Decimal(1), {}), 19 | ('default', 1 << 63, Decimal(1 << 63), {}), 20 | ('default', '1', Decimal(1), {}), 21 | ('default', '10.00', Decimal(10), {}), 22 | ('default', '10.50', Decimal(10.5), {}), 23 | ('default', 24.122667, Decimal('24.122667'), {}), 24 | ('default', '100%', Decimal(100), {'bareNumber': False}), 25 | ('default', '1000‰', Decimal(1000), {'bareNumber': False}), 26 | ('default', '-1000', Decimal(-1000), {}), 27 | ('default', '1,000', Decimal(1000), {'groupChar': ','}), 28 | ('default', '10,000.00', Decimal(10000), {'groupChar': ','}), 29 | ('default', '10,000,000.50', Decimal(10000000.5), {'groupChar': ','}), 30 | ('default', '10#000.00', Decimal(10000), {'groupChar': '#'}), 31 | ('default', '10#000#000.50', Decimal(10000000.5), {'groupChar': '#'}), 32 | ('default', '10.50', Decimal(10.5), {'groupChar': '#'}), 33 | ('default', '1#000', Decimal(1000), {'groupChar': '#'}), 34 | ('default', '10#000@00', Decimal(10000), {'groupChar': '#', 'decimalChar': '@'}), 35 | ('default', '10#000#000@50', Decimal(10000000.5), {'groupChar': '#', 'decimalChar': '@'}), 36 | ('default', '10@50', Decimal(10.5), {'groupChar': '#', 'decimalChar': '@'}), 37 | ('default', '1#000', Decimal(1000), {'groupChar': '#', 'decimalChar': '@'}), 38 | ('default', '10,000.00', Decimal(10000), {'groupChar': ',', 'bareNumber': False}), 39 | ('default', '10,000,000.00', Decimal(10000000), {'groupChar': ',', 'bareNumber': False}), 40 | ('default', '10.000.000,00', Decimal(10000000), {'groupChar': '.', 'decimalChar': ','}), 41 | ('default', '$10000.00', Decimal(10000), {'bareNumber': False}), 42 | ('default', ' 10,000.00 €', Decimal(10000), {'groupChar': ',', 'bareNumber': False}), 43 | ('default', '10 000,00', Decimal(10000), {'groupChar': ' ', 'decimalChar': ','}), 44 | ('default', '10 000 000,00', Decimal(10000000), {'groupChar': ' ', 'decimalChar': ','}), 45 | ('default', '10000,00 ₪', Decimal(10000), {'groupChar': ' ', 'decimalChar': ',', 'bareNumber': False}), 46 | ('default', ' 10 000,00 £', Decimal(10000), {'groupChar': ' ', 'decimalChar': ',', 'bareNumber': False}), 47 | ('default', True, ERROR, {}), 48 | ('default', False, ERROR, {}), 49 | ('default', '10,000a.00', ERROR, {}), 50 | ('default', '10+000.00', ERROR, {}), 51 | ('default', '$10:000.00', ERROR, {}), 52 | ('default', 'string', ERROR, {}), 53 | ('default', '', ERROR, {}), 54 | ]) 55 | def test_cast_number(format, value, result, options): 56 | assert types.cast_number(format, value, **options) == result 57 | -------------------------------------------------------------------------------- /tests/test_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import 
print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | from tableschema import infer 10 | 11 | 12 | # Tests 13 | 14 | def test_infer_schema(): 15 | descriptor = infer('data/data_infer.csv') 16 | assert descriptor == { 17 | 'fields': [ 18 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 19 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 20 | {'name': 'name', 'type': 'string', 'format': 'default'}], 21 | 'missingValues': [''], 22 | } 23 | 24 | 25 | def test_infer_schema_utf8(): 26 | descriptor = infer('data/data_infer_utf8.csv') 27 | assert descriptor == { 28 | 'fields': [ 29 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 30 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 31 | {'name': 'name', 'type': 'string', 'format': 'default'}], 32 | 'missingValues': [''], 33 | } 34 | 35 | 36 | def test_infer_schema_with_row_limit(): 37 | descriptor = infer('data/data_infer_row_limit.csv', limit=4) 38 | assert descriptor == { 39 | 'fields': [ 40 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 41 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 42 | {'name': 'name', 'type': 'string', 'format': 'default'}], 43 | 'missingValues': [''], 44 | } 45 | 46 | 47 | def test_infer_schema_with_missing_values_default(): 48 | descriptor = infer('data/data_infer_missing_values.csv') 49 | assert descriptor == { 50 | 'fields': [ 51 | {'name': 'id', 'type': 'string', 'format': 'default'}, 52 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 53 | {'name': 'name', 'type': 'string', 'format': 'default'}], 54 | 'missingValues': [''], 55 | } 56 | 57 | 58 | def test_infer_schema_with_missing_values_using_the_argument(): 59 | descriptor = infer('data/data_infer_missing_values.csv', missing_values=['-']) 60 | assert descriptor == { 61 | 'fields': [ 62 | {'name': 'id', 'type': 'integer', 'format': 'default'}, 63 | {'name': 'age', 'type': 'integer', 'format': 'default'}, 64 | {'name': 'name', 'type': 'string', 'format': 'default'}], 65 | 'missingValues': ['-'], 66 | } 67 | 68 | 69 | def test_infer_check_type_boolean_string_tie(): 70 | descriptor = infer([['f'], ['stringish']], headers=['field']) 71 | assert descriptor['fields'][0]['type'] == 'string' 72 | 73 | 74 | def test_infer_xlsx_file_with_boolean_column_issue_203(): 75 | descriptor = infer('data/data_infer_boolean.xlsx') 76 | assert descriptor == { 77 | 'fields': [ 78 | {'name': 'number', 'type': 'integer', 'format': 'default'}, 79 | {'name': 'string', 'type': 'string', 'format': 'default'}, 80 | {'name': 'boolean', 'type': 'boolean', 'format': 'default'}], 81 | 'missingValues': [''], 82 | } 83 | 84 | 85 | def test_infer_increase_limit_issue_212(): 86 | descriptor = infer('data/data_infer_increase_limit.csv', limit=200) 87 | assert descriptor == { 88 | 'fields': [ 89 | {'name': 'a', 'type': 'integer', 'format': 'default'}, 90 | {'name': 'b', 'type': 'number', 'format': 'default'}, 91 | ], 92 | 'missingValues': [''], 93 | } 94 | -------------------------------------------------------------------------------- /data/schema_valid_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "first_name", 5 | "title": "First Name", 6 | "type": "string", 7 | "description": "The first name of the person" 8 | }, 9 | { 10 | "name": "last_name", 11 | "title": "Last Name", 12 | "type": "string", 13 | "description": "The last name of the person" 14 | }, 15 | { 16 | "name": "gender", 17 | "title": "Gender", 18 
| "type": "string", 19 | "description": "The gender of the person." 20 | }, 21 | { 22 | "name": "age", 23 | "title": "Age", 24 | "type": "integer", 25 | "description": "The age of this person." 26 | }, 27 | { 28 | "name": "period_employed", 29 | "title": "Period Employed", 30 | "type": "number", 31 | "description": "The period of employment, in years (eg: 2.6 Y)." 32 | }, 33 | { 34 | "name": "employment_start", 35 | "title": "Employment Start", 36 | "type": "date", 37 | "description": "The date this person started employment." 38 | }, 39 | { 40 | "name": "daily_start", 41 | "title": "Daily Start", 42 | "type": "time", 43 | "description": "Usual start time for this person." 44 | }, 45 | { 46 | "name": "daily_end", 47 | "title": "Daily End", 48 | "type": "time", 49 | "description": "Usual end time for this person." 50 | }, 51 | { 52 | "name": "is_management", 53 | "title": "Is Management", 54 | "type": "boolean", 55 | "description": "Is this person part of upper management." 56 | }, 57 | { 58 | "name": "photo", 59 | "title": "Photo", 60 | "type": "string", 61 | "format": "binary", 62 | "description": "A photo of this person." 63 | }, 64 | { 65 | "name": "interests", 66 | "title": "Interests", 67 | "type": "array", 68 | "description": "Declared interests of this person (work-related)." 69 | }, 70 | { 71 | "name": "home_location", 72 | "title": "Home Location", 73 | "type": "geopoint", 74 | "description": "A geopoint for this person's home address." 75 | }, 76 | { 77 | "name": "position_title", 78 | "title": "Position Title", 79 | "type": "string", 80 | "description": "This person's position in the company." 81 | }, 82 | { 83 | "name": "extra", 84 | "title": "Extra", 85 | "type": "object", 86 | "description": "Extra information about this person." 87 | }, 88 | { 89 | "name": "notes", 90 | "title": "Notes", 91 | "type": "any", 92 | "description": "Add any relevant notes for HR." 
93 | } 94 | ], 95 | "primaryKey": [ 96 | "first_name", 97 | "last_name", 98 | "period_employed", 99 | "home_location" 100 | ], 101 | "foreignKeys": [ 102 | { 103 | "fields": ["position_title"], 104 | "reference": { 105 | "resource": "positions", 106 | "fields": ["name"] 107 | } 108 | } 109 | ] 110 | } 111 | -------------------------------------------------------------------------------- /tests/test_validate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import pytest 11 | from tableschema import validate, exceptions 12 | 13 | 14 | # Tests 15 | 16 | def test_schema_valid_simple(): 17 | valid = validate('data/schema_valid_simple.json') 18 | assert valid 19 | 20 | 21 | def test_schema_valid_full(): 22 | valid = validate('data/schema_valid_full.json') 23 | assert valid 24 | 25 | 26 | def test_schema_valid_pk_array(): 27 | valid = validate('data/schema_valid_pk_array.json') 28 | assert valid 29 | 30 | 31 | def test_schema_invalid_empty(): 32 | with pytest.raises(exceptions.ValidationError): 33 | valid = validate('data/schema_invalid_empty.json') 34 | 35 | 36 | def test_schema_invalid_wrong_type(): 37 | with pytest.raises(exceptions.ValidationError): 38 | valid = validate([]) 39 | 40 | 41 | def test_schema_invalid_pk_string(): 42 | with pytest.raises(exceptions.ValidationError): 43 | valid = validate('data/schema_invalid_pk_string.json') 44 | 45 | 46 | def test_schema_invalid_pk_array(): 47 | with pytest.raises(exceptions.ValidationError): 48 | valid = validate('data/schema_invalid_pk_array.json') 49 | 50 | 51 | def test_schema_valid_fk_array(): 52 | valid = validate('data/schema_valid_fk_array.json') 53 | assert valid 54 | 55 | 56 | def test_schema_invalid_fk_string(): 57 | with pytest.raises(exceptions.ValidationError): 58 | valid = validate('data/schema_invalid_fk_string.json') 59 | 60 | 61 | def test_schema_invalid_fk_no_reference(): 62 | with pytest.raises(exceptions.ValidationError): 63 | valid = validate('data/schema_invalid_fk_no_reference.json') 64 | 65 | 66 | def test_schema_invalid_fk_array(): 67 | with pytest.raises(exceptions.ValidationError): 68 | valid = validate('data/schema_invalid_fk_array.json') 69 | 70 | 71 | def test_schema_invalid_fk_ref_is_an_array_fields_is_a_string(): 72 | with pytest.raises(exceptions.ValidationError): 73 | valid = validate('data/schema_invalid_fk_string_array_ref.json') 74 | 75 | 76 | def test_schema_invalid_fk_reference_is_a_string_fields_is_an_array(): 77 | with pytest.raises(exceptions.ValidationError): 78 | valid = validate('data/schema_invalid_fk_array_string_ref.json') 79 | 80 | 81 | def test_schema_invalid_fk_reference_array_number_mismatch(): 82 | with pytest.raises(exceptions.ValidationError): 83 | valid = validate('data/schema_invalid_fk_array_wrong_number.json') 84 | 85 | 86 | def test_primary_key_is_not_a_valid_type(): 87 | with pytest.raises(exceptions.ValidationError) as excinfo: 88 | valid = validate('data/schema_invalid_pk_is_wrong_type.json') 89 | assert len(excinfo.value.errors) == 2 90 | 91 | 92 | def test_schema_multiple_errors_no_fail_fast_true(): 93 | with pytest.raises(exceptions.ValidationError) as excinfo: 94 | valid = validate('data/schema_invalid_multiple_errors.json') 95 | assert len(excinfo.value.errors) == 5 96 | 97 | 98 | def 
test_validate_error_message(): 99 | descriptor = { 100 | 'fields': [ 101 | {'name': 'name', 'type': 'other'}, 102 | ], 103 | } 104 | with pytest.raises(exceptions.ValidationError) as excinfo: 105 | validate(descriptor) 106 | message = str(excinfo.value.errors[0]) 107 | assert 'Descriptor validation error' in message 108 | assert 'at "fields/0" in descriptor' in message 109 | assert 'at "properties/fields/items/anyOf" in profile' in message 110 | 111 | -------------------------------------------------------------------------------- /tableschema/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import sys 10 | import six 11 | import json 12 | import requests 13 | from copy import deepcopy 14 | from importlib.util import find_spec 15 | from . import exceptions 16 | from . import config 17 | 18 | 19 | # Retrieve descriptor 20 | 21 | def retrieve_descriptor(source): 22 | 23 | try: 24 | # Inline 25 | if isinstance(source, (dict, list)): 26 | return deepcopy(source) 27 | 28 | # String 29 | if isinstance(source, six.string_types): 30 | # Remote 31 | if six.moves.urllib.parse.urlparse(source).scheme in config.REMOTE_SCHEMES: 32 | return requests.get(source).json() 33 | 34 | # Local 35 | with io.open(source, encoding='utf-8') as file: 36 | return json.load(file) 37 | 38 | # Stream 39 | return json.load(source) 40 | 41 | except Exception: 42 | raise exceptions.LoadError('Can\'t load descriptor') 43 | 44 | 45 | # Expand descriptor 46 | 47 | def expand_schema_descriptor(descriptor): 48 | if isinstance(descriptor, dict): 49 | descriptor = deepcopy(descriptor) 50 | for field in descriptor.get('fields', []): 51 | field.setdefault('type', config.DEFAULT_FIELD_TYPE) 52 | field.setdefault('format', config.DEFAULT_FIELD_FORMAT) 53 | descriptor.setdefault('missingValues', config.DEFAULT_MISSING_VALUES) 54 | return descriptor 55 | 56 | 57 | def expand_field_descriptor(descriptor): 58 | descriptor = deepcopy(descriptor) 59 | descriptor.setdefault('type', config.DEFAULT_FIELD_TYPE) 60 | descriptor.setdefault('format', config.DEFAULT_FIELD_FORMAT) 61 | return descriptor 62 | 63 | 64 | # Miscellaneous 65 | 66 | def ensure_dir(path): 67 | """Ensure directory exists. 68 | 69 | Args: 70 | path(str): dir path 71 | 72 | """ 73 | dirpath = os.path.dirname(path) 74 | if dirpath and not os.path.exists(dirpath): 75 | os.makedirs(dirpath) 76 | 77 | 78 | def normalize_value(value): 79 | """Convert value to string and make it lower cased. 80 | """ 81 | cast = str 82 | if six.PY2: 83 | cast = unicode # noqa 84 | return cast(value).lower() 85 | 86 | 87 | def default_exc_handler(exc, *args, **kwargs): 88 | """Default exception handler function: raise exc, ignore other arguments. 89 | """ 90 | raise exc 91 | 92 | 93 | class PluginImporter(object): 94 | """Plugin importer. 
95 | 96 | Example: 97 | Add to myapp.plugins something like this: 98 | ``` 99 | importer = PluginImporter(virtual='myapp.plugins.', actual='myapp_') 100 | importer.register() 101 | del PluginImporter 102 | del importer 103 | ``` 104 | 105 | """ 106 | 107 | # Public 108 | 109 | def __init__(self, virtual, actual): 110 | self.__virtual = virtual 111 | self.__actual = actual 112 | 113 | def __eq__(self, other): 114 | if not isinstance(other, type(self)): 115 | return False 116 | return (self.virtual == other.virtual and 117 | self.actual == other.actual) 118 | 119 | @property 120 | def virtual(self): 121 | return self.__virtual 122 | 123 | @property 124 | def actual(self): 125 | return self.__actual 126 | 127 | def register(self): 128 | if self not in sys.meta_path: 129 | sys.meta_path.append(self) 130 | 131 | def find_spec(self, fullname, path=None, target=None): 132 | if fullname.startswith(self.virtual): 133 | # Transform the module name 134 | transformed_name = fullname.replace(self.virtual, self.actual) 135 | return find_spec(transformed_name) 136 | return None 137 | 138 | -------------------------------------------------------------------------------- /tableschema/storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from six import add_metaclass 8 | from importlib import import_module 9 | from abc import ABCMeta, abstractmethod 10 | from . import exceptions 11 | 12 | 13 | # Module API 14 | 15 | @add_metaclass(ABCMeta) 16 | class Storage(object): 17 | """Storage factory/interface 18 | 19 | # For users 20 | 21 | > Use `Storage.connect` to instantiate a storage 22 | 23 | For instantiation of concrete storage instances, 24 | `tableschema.Storage` provides a unified factory method `connect` 25 | (which uses the plugin system under the hood): 26 | 27 | ```python 28 | # pip install tableschema_sql 29 | from tableschema import Storage 30 | 31 | storage = Storage.connect('sql', **options) 32 | storage.create('bucket', descriptor) 33 | storage.write('bucket', rows) 34 | storage.read('bucket') 35 | ``` 36 | 37 | # For integrators 38 | 39 | The library includes an interface declaration for implementing tabular `Storage` backends. 40 | This interface allows different data storage systems like SQL to be used 41 | with the `tableschema.Table` class (load/save), as well as at the data package level: 42 | 43 | ![Storage](https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/storage.png) 44 | 45 | An implementor must follow the `tableschema.Storage` interface 46 | to write their own storage backend. Concrete storage backends 47 | may include additional functionality specific to a concrete storage system. 48 | See `plugins` below to learn how to integrate a custom storage plugin into your workflow. 49 | 50 | """ 51 | 52 | # Public 53 | 54 | @abstractmethod 55 | def __init__(self, **options): 56 | pass 57 | 58 | @classmethod 59 | def connect(cls, name, **options): 60 | """Create tabular `storage` based on storage name. 
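        For example, `Storage.connect('sql', **options)` imports the `tableschema.plugins.sql` module (provided by the separately installed `tableschema_sql` plugin) and instantiates its `Storage` class with the given backend-specific options.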
61 | 62 | > This method is static: `Storage.connect()` 63 | 64 | # Arguments 65 | name (str): storage name like `sql` 66 | options (dict): concrete storage options 67 | 68 | # Raises 69 | StorageError: raises on any error 70 | 71 | # Returns 72 | Storage: returns `Storage` instance 73 | 74 | """ 75 | if cls is not Storage: 76 | message = 'Storage.connect is not available on concrete implementations' 77 | raise exceptions.StorageError(message) 78 | module = 'tableschema.plugins.%s' % name 79 | storage = import_module(module).Storage(**options) 80 | return storage 81 | 82 | @property 83 | @abstractmethod 84 | def buckets(self): 85 | """Return list of storage bucket names. 86 | 87 | A `bucket` is a special term which has almost the same meaning as `table`. 88 | You should consider `bucket` as a `table` stored in the `storage`. 89 | 90 | # Raises 91 | exceptions.StorageError: raises on any error 92 | 93 | # Returns 94 | str[]: return list of bucket names 95 | 96 | """ 97 | pass 98 | 99 | @abstractmethod 100 | def create(self, bucket, descriptor, force=False): 101 | """Create one/multiple buckets. 102 | 103 | # Arguments 104 | bucket (str/list): bucket name or list of bucket names 105 | descriptor (dict/dict[]): schema descriptor or list of descriptors 106 | force (bool): whether to delete and re-create already existing buckets 107 | 108 | # Raises 109 | exceptions.StorageError: raises on any error 110 | 111 | """ 112 | pass 113 | 114 | @abstractmethod 115 | def delete(self, bucket=None, ignore=False): 116 | """ Delete one/multiple/all buckets. 117 | 118 | # Arguments 119 | bucket (str/list/None): bucket name or list of bucket names to delete. 120 | If `None`, all buckets will be deleted 121 | 122 | ignore (bool): don't raise an error on non-existent bucket deletion 123 | 124 | # Raises 125 | exceptions.StorageError: raises on any error 126 | 127 | """ 128 | pass 129 | 130 | @abstractmethod 131 | def describe(self, bucket, descriptor=None): 132 | """ Get/set bucket's Table Schema descriptor 133 | 134 | # Arguments 135 | bucket (str): bucket name 136 | descriptor (dict/None): schema descriptor to set 137 | 138 | # Raises 139 | exceptions.StorageError: raises on any error 140 | 141 | # Returns 142 | dict: returns Table Schema descriptor 143 | 144 | """ 145 | pass 146 | 147 | @abstractmethod 148 | def iter(self, bucket): 149 | """Return an iterator of typed values based on the schema of this bucket. 150 | 151 | # Arguments 152 | bucket (str): bucket name 153 | 154 | # Raises 155 | exceptions.StorageError: raises on any error 156 | 157 | # Returns 158 | list[]: yields data rows 159 | 160 | """ 161 | pass 162 | 163 | @abstractmethod 164 | def read(self, bucket): 165 | """Read typed values based on the schema of this bucket. 166 | 167 | # Arguments 168 | bucket (str): bucket name 169 | # Raises 170 | exceptions.StorageError: raises on any error 171 | # Returns 172 | list[]: returns data rows 173 | 174 | """ 175 | pass 176 | 177 | @abstractmethod 178 | def write(self, bucket, rows): 179 | """ This method writes data rows into `storage`. 180 | 181 | It should store values of unsupported types as strings internally (like csv does). 
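        For example (a minimal sketch with illustrative values; each inner list is one row, ordered to match the bucket's schema fields):

        ```python
        storage.write('bucket', [[1, 'a'], [2, 'b']])
        ```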
182 | 183 | # Arguments 184 | bucket (str): bucket name 185 | rows (list[]): data rows to write 186 | 187 | # Raises 188 | exceptions.StorageError: raises on any error 189 | 190 | """ 191 | pass 192 | -------------------------------------------------------------------------------- /tests/test_schema_constraint_field_type.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import (absolute_import, division, print_function, 3 | unicode_literals) 4 | 5 | import io 6 | import json 7 | import os 8 | 9 | import pytest 10 | 11 | from tableschema import Schema, exceptions, validate 12 | 13 | # Tests on built-in constraints - field type consistency 14 | 15 | CONSTRAINT_FIELDTYPE_TESTCASES = [ 16 | # minLength constraint (applies to collections (string, array, object)) 17 | ('minLength', {'minLength': 4}, None, True), 18 | ('minLength', {'minLength': 4}, 'any', False), 19 | ('minLength', {'minLength': 4}, 'array', True), 20 | ('minLength', {'minLength': 4}, 'boolean', False), 21 | ('minLength', {'minLength': 4}, 'date', False), 22 | ('minLength', {'minLength': 4}, 'datetime', False), 23 | ('minLength', {'minLength': 4}, 'duration', False), 24 | ('minLength', {'minLength': 4}, 'geojson', False), 25 | ('minLength', {'minLength': 4}, 'geopoint', False), 26 | ('minLength', {'minLength': 4}, 'integer', False), 27 | ('minLength', {'minLength': 4}, 'number', False), 28 | ('minLength', {'minLength': 4}, 'object', True), 29 | ('minLength', {'minLength': 4}, 'string', True), 30 | ('minLength', {'minLength': 4}, 'time', False), 31 | ('minLength', {'minLength': 4}, 'year', False), 32 | ('minLength', {'minLength': 4}, 'yearmonth', False), 33 | 34 | # maxLength constraint (applies to collections (string, array, object)) 35 | ('maxLength', {'maxLength': 3}, None, True), 36 | ('maxLength', {'maxLength': 3}, 'any', False), 37 | ('maxLength', {'maxLength': 3}, 'array', True), 38 | ('maxLength', {'maxLength': 3}, 'boolean', False), 39 | ('maxLength', {'maxLength': 3}, 'date', False), 40 | ('maxLength', {'maxLength': 3}, 'datetime', False), 41 | ('maxLength', {'maxLength': 3}, 'duration', False), 42 | ('maxLength', {'maxLength': 3}, 'geojson', False), 43 | ('maxLength', {'maxLength': 3}, 'geopoint', False), 44 | ('maxLength', {'maxLength': 3}, 'integer', False), 45 | ('maxLength', {'maxLength': 3}, 'number', False), 46 | ('maxLength', {'maxLength': 3}, 'object', True), 47 | ('maxLength', {'maxLength': 3}, 'string', True), 48 | ('maxLength', {'maxLength': 3}, 'time', False), 49 | ('maxLength', {'maxLength': 3}, 'year', False), 50 | ('maxLength', {'maxLength': 3}, 'yearmonth', False), 51 | 52 | # minimum constraint (applies to integer, number, date, time, datetime, year, yearmonth) 53 | ('minimum', {'minimum': 4}, None, False), 54 | ('minimum', {'minimum': 4}, 'any', False), 55 | ('minimum', {'minimum': 4}, 'array', False), 56 | ('minimum', {'minimum': 4}, 'boolean', False), 57 | ('minimum', {'minimum': "1789-07-14"}, 'date', True), 58 | ('minimum', {'minimum': "1789-07-14T08:00:00Z"}, 'datetime', True), 59 | ('minimum', {'minimum': 4}, 'duration', False), 60 | ('minimum', {'minimum': 4}, 'geojson', False), 61 | ('minimum', {'minimum': 4}, 'geopoint', False), 62 | ('minimum', {'minimum': 4}, 'integer', True), 63 | ('minimum', {'minimum': 4}, 'number', True), 64 | ('minimum', {'minimum': 4}, 'object', False), 65 | ('minimum', {'minimum': 4}, 'string', False), 66 | ('minimum', {'minimum': "07:07:07"}, 'time', True), 67 | ('minimum', 
{'minimum': 4}, 'year', True), 68 | ('minimum', {'minimum': "1789-07"}, 'yearmonth', True), 69 | 70 | # maximum constraint (applies to integer, number, date, time, datetime, year, yearmonth) 71 | ('maximum', {'maximum': 4}, None, False), 72 | ('maximum', {'maximum': 4}, 'any', False), 73 | ('maximum', {'maximum': 4}, 'array', False), 74 | ('maximum', {'maximum': 4}, 'boolean', False), 75 | ('maximum', {'maximum': "2001-01-01"}, 'date', True), 76 | ('maximum', {'maximum': "2001-01-01T12:00:00Z"}, 'datetime', True), 77 | ('maximum', {'maximum': 4}, 'duration', False), 78 | ('maximum', {'maximum': 4}, 'geojson', False), 79 | ('maximum', {'maximum': 4}, 'geopoint', False), 80 | ('maximum', {'maximum': 4}, 'integer', True), 81 | ('maximum', {'maximum': 4}, 'number', True), 82 | ('maximum', {'maximum': 4}, 'object', False), 83 | ('maximum', {'maximum': 4}, 'string', False), 84 | ('maximum', {'maximum': "08:09:10"}, 'time', True), 85 | ('maximum', {'maximum': 4}, 'year', True), 86 | ('maximum', {'maximum': "2001-01"}, 'yearmonth', True), 87 | 88 | # pattern constraint (applies to string) 89 | ('pattern', {'pattern': '[0-9]+'}, None, True), 90 | ('pattern', {'pattern': '[0-9]+'}, 'any', False), 91 | ('pattern', {'pattern': '[0-9]+'}, 'array', False), 92 | ('pattern', {'pattern': '[0-9]+'}, 'boolean', False), 93 | ('pattern', {'pattern': '[0-9]+'}, 'date', False), 94 | ('pattern', {'pattern': '[0-9]+'}, 'datetime', False), 95 | ('pattern', {'pattern': '[0-9]+'}, 'duration', False), 96 | ('pattern', {'pattern': '[0-9]+'}, 'geojson', False), 97 | ('pattern', {'pattern': '[0-9]+'}, 'geopoint', False), 98 | ('pattern', {'pattern': '[0-9]+'}, 'integer', False), 99 | ('pattern', {'pattern': '[0-9]+'}, 'number', False), 100 | ('pattern', {'pattern': '[0-9]+'}, 'object', False), 101 | ('pattern', {'pattern': '[0-9]+'}, 'string', True), 102 | ('pattern', {'pattern': '[0-9]+'}, 'time', False), 103 | ('pattern', {'pattern': '[0-9]+'}, 'year', False), 104 | ('pattern', {'pattern': '[0-9]+'}, 'yearmonth', False) 105 | ] 106 | 107 | 108 | @pytest.mark.parametrize("constraint_name, constraint, field_type, expected", CONSTRAINT_FIELDTYPE_TESTCASES) 109 | def test_schema_constraint_field_type(constraint_name, constraint, field_type, expected): 110 | field = { 111 | 'name': 'f', 112 | 'constraints': constraint, 113 | } 114 | if field_type is not None: 115 | field['type'] = field_type 116 | test_descriptor = {'fields': [field]} 117 | 118 | message = 'constraint "{}" can{} be applied to "{}" field' \ 119 | .format(constraint_name, "" if expected else "not", 120 | "default" if field_type is None else field_type) 121 | 122 | table_schema = Schema(descriptor=test_descriptor) 123 | assert table_schema.valid == expected, message 124 | -------------------------------------------------------------------------------- /tests/test_field.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import json 9 | import pytest 10 | import requests 11 | from functools import partial 12 | from tableschema import Field, exceptions 13 | 14 | 15 | # Constants 16 | 17 | DESCRIPTOR_MIN = {'name': 'id'} 18 | DESCRIPTOR_MAX = { 19 | 'name': 'id', 20 | 'type': 'integer', 21 | 'format': 'default', 22 | 'constraints': {'required': True}, 23 | } 24 | 25 | 26 | # Tests [general] 27 | 28 | def 
test_descriptor(apply_defaults): 29 | assert Field(DESCRIPTOR_MIN).descriptor == apply_defaults(DESCRIPTOR_MIN) 30 | 31 | 32 | def test_name(): 33 | assert Field(DESCRIPTOR_MIN).name == 'id' 34 | 35 | 36 | def test_type(): 37 | assert Field(DESCRIPTOR_MIN).type == 'string' 38 | assert Field(DESCRIPTOR_MAX).type == 'integer' 39 | 40 | 41 | def test_format(): 42 | assert Field(DESCRIPTOR_MIN).format == 'default' 43 | assert Field(DESCRIPTOR_MAX).format == 'default' 44 | 45 | 46 | def test_constraints(): 47 | assert Field(DESCRIPTOR_MIN).constraints == {} 48 | assert Field(DESCRIPTOR_MAX).constraints == {'required': True} 49 | 50 | 51 | def test_required(): 52 | assert Field(DESCRIPTOR_MIN).required == False 53 | assert Field(DESCRIPTOR_MAX).required == True 54 | 55 | 56 | def test_cast_value(): 57 | assert Field(DESCRIPTOR_MAX).cast_value('1') == 1 58 | 59 | 60 | def test_cast_value_constraint_error(): 61 | with pytest.raises(exceptions.CastError): 62 | Field(DESCRIPTOR_MAX).cast_value('') 63 | 64 | 65 | def test_cast_value_constraints_false(): 66 | assert Field(DESCRIPTOR_MIN).cast_value('', constraints=False) == None 67 | 68 | 69 | def test_cast_value_null_with_missing_values(): 70 | field = Field({'name': 'name', 'type': 'number'}, missing_values=['null']) 71 | assert field.cast_value('null') == None 72 | 73 | 74 | def test_test_value(): 75 | assert Field(DESCRIPTOR_MAX).test_value('1') == True 76 | assert Field(DESCRIPTOR_MAX).test_value('string') == False 77 | assert Field(DESCRIPTOR_MAX).test_value('') == False 78 | 79 | 80 | def test_test_value_constraints_false(): 81 | assert Field(DESCRIPTOR_MIN).test_value('', constraints=False) == True 82 | 83 | 84 | def test_missing_values(): 85 | assert Field(DESCRIPTOR_MIN).missing_values == [''] 86 | assert Field(DESCRIPTOR_MIN, missing_values=['-']).missing_values == ['-'] 87 | 88 | 89 | # Tests [missingValues] 90 | 91 | def test_string_missingValues(): 92 | field = Field({ 93 | 'name': 'name', 94 | 'type': 'string', 95 | }, missing_values=['', 'NA', 'N/A']) 96 | cast = field.cast_value 97 | assert cast('') == None 98 | assert cast('NA') == None 99 | assert cast('N/A') == None 100 | 101 | 102 | def test_number_missingValues(): 103 | field = Field({ 104 | 'name': 'name', 105 | 'type': 'number', 106 | }, missing_values=['', 'NA', 'N/A']) 107 | cast = field.cast_value 108 | assert cast('') == None 109 | assert cast('NA') == None 110 | assert cast('N/A') == None 111 | 112 | 113 | # Tests [constraints] 114 | 115 | def test_test_value_required(): 116 | field = Field({ 117 | 'name': 'name', 118 | 'type': 'string', 119 | 'constraints': {'required': True} 120 | }, missing_values=['', 'NA', 'N/A']) 121 | test = partial(field.test_value, constraints=['required']) 122 | assert test('test') == True 123 | assert test('null') == True 124 | assert test('none') == True 125 | assert test('nil') == True 126 | assert test('nan') == True 127 | assert test('NA') == False 128 | assert test('N/A') == False 129 | assert test('-') == True 130 | assert test('') == False 131 | assert test(None) == False 132 | 133 | 134 | def test_test_value_pattern(): 135 | field = Field({ 136 | 'name': 'name', 137 | 'type': 'string', 138 | 'constraints': {'pattern': '3.*'} 139 | }) 140 | test = partial(field.test_value, constraints=['pattern']) 141 | assert test('3') == True 142 | assert test('321') == True 143 | assert test('123') == False 144 | 145 | 146 | def test_test_value_unique(): 147 | field = Field({ 148 | 'name': 'name', 149 | 'type': 'integer', 150 | 'constraints': 
{'unique': True} 151 | }) 152 | test = partial(field.test_value, constraints=['unique']) 153 | assert test(30000) == True 154 | assert test('bad') == False 155 | 156 | 157 | def test_test_value_enum(): 158 | field = Field({ 159 | 'name': 'name', 160 | 'type': 'integer', 161 | 'constraints': {'enum': ['1', '2', '3']} 162 | }) 163 | test = partial(field.test_value, constraints=['enum']) 164 | assert test('1') == True 165 | assert test(1) == True 166 | assert test('4') == False 167 | assert test(4) == False 168 | 169 | 170 | def test_test_value_minimum(): 171 | field = Field({ 172 | 'name': 'name', 173 | 'type': 'integer', 174 | 'constraints': {'minimum': 1} 175 | }) 176 | test = partial(field.test_value, constraints=['minimum']) 177 | assert test('2') == True 178 | assert test(2) == True 179 | assert test('1') == True 180 | assert test(1) == True 181 | assert test('0') == False 182 | assert test(0) == False 183 | 184 | 185 | def test_test_value_maximum(): 186 | field = Field({ 187 | 'name': 'name', 188 | 'type': 'integer', 189 | 'constraints': {'maximum': 1} 190 | }) 191 | test = partial(field.test_value, constraints=['maximum']) 192 | assert test('0') == True 193 | assert test(0) == True 194 | assert test('1') == True 195 | assert test(1) == True 196 | assert test('2') == False 197 | assert test(2) == False 198 | 199 | 200 | def test_test_value_minLength(): 201 | field = Field({ 202 | 'name': 'name', 203 | 'type': 'string', 204 | 'constraints': {'minLength': 1} 205 | }) 206 | test = partial(field.test_value, constraints=['minLength']) 207 | assert test('ab') == True 208 | assert test('a') == True 209 | # Null value passes 210 | assert test('') == True 211 | 212 | 213 | def test_test_value_maxLength(): 214 | field = Field({ 215 | 'name': 'name', 216 | 'type': 'string', 217 | 'constraints': {'maxLength': 1} 218 | }) 219 | test = partial(field.test_value, constraints=['maxLength']) 220 | assert test('') == True 221 | assert test('a') == True 222 | assert test('ab') == False 223 | -------------------------------------------------------------------------------- /tableschema/field.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | from functools import partial 9 | from cached_property import cached_property 10 | from .profile import Profile 11 | from . import constraints 12 | from . import exceptions 13 | from . import helpers 14 | from . import config 15 | from . 
import types 16 | 17 | 18 | # Module API 19 | 20 | class Field(object): 21 | """Field representation 22 | 23 | # Arguments 24 | descriptor (dict): schema field descriptor 25 | missing_values (str[]): an array of strings representing missing values 26 | 27 | # Raises 28 | TableSchemaException: raises any error that occurs during the process 29 | 30 | """ 31 | 32 | # Public 33 | 34 | ERROR = config.ERROR 35 | 36 | def __init__(self, descriptor, missing_values=config.DEFAULT_MISSING_VALUES, 37 | # Internal 38 | schema=None): 39 | 40 | # Process descriptor 41 | descriptor = helpers.expand_field_descriptor(descriptor) 42 | 43 | # Set attributes 44 | self.__descriptor = descriptor 45 | self.__missing_values = missing_values 46 | self.__schema = schema 47 | self.__cast_function = self.__get_cast_function() 48 | self.__check_functions = self.__get_check_functions() 49 | self.__preserve_missing_values = os.environ.get('TABLESCHEMA_PRESERVE_MISSING_VALUES') 50 | 51 | @cached_property 52 | def schema(self): 53 | """Returns a schema instance if the field belongs to some schema 54 | 55 | # Returns 56 | Schema: field's schema 57 | 58 | """ 59 | return self.__schema 60 | 61 | @cached_property 62 | def name(self): 63 | """Field name 64 | 65 | # Returns 66 | str: field name 67 | 68 | """ 69 | return self.__descriptor.get('name') 70 | 71 | @cached_property 72 | def type(self): 73 | """Field type 74 | 75 | # Returns 76 | str: field type 77 | 78 | """ 79 | return self.__descriptor.get('type') 80 | 81 | @cached_property 82 | def format(self): 83 | """Field format 84 | 85 | # Returns 86 | str: field format 87 | 88 | """ 89 | return self.__descriptor.get('format') 90 | 91 | @cached_property 92 | def missing_values(self): 93 | """Field's missing values 94 | 95 | # Returns 96 | str[]: missing values 97 | 98 | """ 99 | return self.__missing_values 100 | 101 | @cached_property 102 | def required(self): 103 | """Whether field is required 104 | 105 | # Returns 106 | bool: true if required 107 | 108 | """ 109 | return self.constraints.get('required', False) 110 | 111 | @cached_property 112 | def constraints(self): 113 | """Field constraints 114 | 115 | # Returns 116 | dict: dict of field constraints 117 | 118 | """ 119 | return self.__descriptor.get('constraints', {}) 120 | 121 | @cached_property 122 | def descriptor(self): 123 | """Field's descriptor 124 | 125 | # Returns 126 | dict: descriptor 127 | 128 | """ 129 | return self.__descriptor 130 | 131 | @cached_property 132 | def cast_function(self): 133 | return self.__cast_function 134 | 135 | @cached_property 136 | def check_functions(self): 137 | return self.__check_functions 138 | 139 | def cast_value(self, value, constraints=True): 140 | """Cast given value according to the field type and format. 141 | 142 | # Arguments 143 | value (any): value to cast against field 144 | constraints (bool/str[]): constraints configuration 145 | - it could be set to false to disable constraint checks 146 | - it could be an array of constraints to check, e.g.
['minimum', 'maximum'] 147 | 148 | # Raises 149 | TableSchemaException: raises any error that occurs during the process 150 | 151 | # Returns 152 | any: returns cast value 153 | 154 | """ 155 | 156 | # Null value 157 | if value in self.__missing_values: 158 | # Whether missing_values should be preserved without being cast 159 | if self.__preserve_missing_values: 160 | return value 161 | value = None 162 | 163 | # Cast value 164 | cast_value = value 165 | if value is not None: 166 | cast_value = self.__cast_function(value) 167 | if cast_value == config.ERROR: 168 | raise exceptions.CastError(( 169 | 'Field "{field.name}" can\'t cast value "{value}" ' 170 | 'for type "{field.type}" with format "{field.format}"' 171 | ).format(field=self, value=value)) 172 | 173 | # Check value 174 | if constraints: 175 | for name, check in self.__check_functions.items(): 176 | if isinstance(constraints, list): 177 | if name not in constraints: 178 | continue 179 | passed = check(cast_value) 180 | if not passed: 181 | raise exceptions.CastError(( 182 | 'Field "{field.name}" has constraint "{name}" ' 183 | 'which is not satisfied for value "{value}"' 184 | ).format(field=self, name=name, value=value)) 185 | 186 | return cast_value 187 | 188 | def test_value(self, value, constraints=True): 189 | """Test whether value is compliant to the field. 190 | 191 | # Arguments 192 | value (any): value to cast against field 193 | constraints (bool/str[]): constraints configuration 194 | 195 | # Returns 196 | bool: returns if value is compliant to the field 197 | 198 | """ 199 | try: 200 | self.cast_value(value, constraints=constraints) 201 | except exceptions.CastError: 202 | return False 203 | return True 204 | 205 | # Private 206 | 207 | def __get_cast_function(self): 208 | options = {} 209 | # Get cast options 210 | for key in ['decimalChar', 'groupChar', 'bareNumber', 'trueValues', 'falseValues']: 211 | value = self.descriptor.get(key) 212 | if value is not None: 213 | options[key] = value 214 | try: 215 | cast = getattr(types, 'cast_%s' % self.type) 216 | except AttributeError: 217 | message = 'Not supported field type: %s' % self.type 218 | raise exceptions.TableSchemaException(message) 219 | cast = partial(cast, self.format, **options) 220 | return cast 221 | 222 | def __get_check_functions(self): 223 | checks = {} 224 | cast = partial(self.cast_value, constraints=False) 225 | whitelist = _get_field_constraints(self.type) 226 | for name, constraint in self.constraints.items(): 227 | if name in whitelist: 228 | # Cast enum constraint 229 | if name in ['enum']: 230 | constraint = list(map(cast, constraint)) 231 | # Cast maximum/minimum constraint 232 | if name in ['maximum', 'minimum']: 233 | constraint = cast(constraint) 234 | check = getattr(constraints, 'check_%s' % name) 235 | checks[name] = partial(check, constraint) 236 | return checks 237 | 238 | 239 | # Internal 240 | 241 | def _get_field_constraints(type): 242 | # Extract list of constraints for given type from jsonschema 243 | jsonschema = Profile('table-schema').jsonschema 244 | profile_types = jsonschema['properties']['fields']['items']['anyOf'] 245 | for profile_type in profile_types: 246 | if type in profile_type['properties']['type']['enum']: 247 | return profile_type['properties']['constraints']['properties'].keys() 248 | -------------------------------------------------------------------------------- /tableschema/profile.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from 
__future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import six 10 | import json 11 | import jsonschema 12 | from jsonschema.validators import validator_for 13 | from . import exceptions 14 | 15 | 16 | # Module API 17 | 18 | class Profile(object): 19 | 20 | # Public 21 | 22 | def __init__(self, profile): 23 | self.__profile = profile 24 | self.__jsonschema = _PROFILES.get(profile) 25 | if not self.__jsonschema: 26 | message = 'Can\'t load profile "%s"' % profile 27 | raise exceptions.LoadError(message) 28 | 29 | @property 30 | def name(self): 31 | return self.__jsonschema.get('title', '').replace(' ', '-').lower() or None 32 | 33 | @property 34 | def jsonschema(self): 35 | return self.__jsonschema 36 | 37 | def validate(self, descriptor): 38 | 39 | # Other profiles 40 | if self.name != 'table-schema': 41 | return jsonschema.validate(descriptor, self.jsonschema) 42 | 43 | # Collect errors 44 | errors = [] 45 | validator = _TableSchemaValidator( 46 | self.jsonschema, format_checker=jsonschema.FormatChecker()) 47 | for error in validator.iter_errors(descriptor): 48 | if isinstance(error, jsonschema.exceptions.ValidationError): 49 | message = str(error.message) 50 | if six.PY2: 51 | message = message.replace('u\'', '\'') 52 | descriptor_path = '/'.join(map(str, error.path)) 53 | profile_path = '/'.join(map(str, error.schema_path)) 54 | error = exceptions.ValidationError( 55 | 'Descriptor validation error: %s ' 56 | 'at "%s" in descriptor and ' 57 | 'at "%s" in profile' 58 | % (message, descriptor_path, profile_path)) 59 | errors.append(error) 60 | 61 | # Raise error 62 | if errors: 63 | message = 'There are %s validation errors (see exception.errors)' % len(errors) 64 | raise exceptions.ValidationError(message, errors=errors) 65 | 66 | return True 67 | 68 | 69 | # Internal 70 | 71 | def _load_profile(filename): 72 | path = os.path.join(os.path.dirname(__file__), 'profiles', filename) 73 | profile = json.load(io.open(path, encoding='utf-8')) 74 | return profile 75 | 76 | 77 | _PROFILES = { 78 | 'table-schema': _load_profile('table-schema.json'), 79 | 'geojson': _load_profile('geojson.json'), 80 | } 81 | 82 | _CONSTRAINT_ALLOWED_FIELD_TYPE = { 83 | 'minLength': {None, 'string', 'array', 'object'}, 84 | 'maxLength': {None, 'string', 'array', 'object'}, 85 | 'minimum': {'integer', 'number', 'date', 'time', 'datetime', 'year', 'yearmonth'}, 86 | 'maximum': {'integer', 'number', 'date', 'time', 'datetime', 'year', 'yearmonth'}, 87 | 'pattern': {None, 'string'}, 88 | } 89 | 90 | 91 | class _TableSchemaValidator(validator_for(_PROFILES['table-schema'])): 92 | @classmethod 93 | def check_schema(cls, schema): 94 | # When checking against the metaschema, we do not want to run the 95 | # additional checking added in iter_errors 96 | parent_cls = cls.__bases__[0] 97 | for error in parent_cls(cls.META_SCHEMA).iter_errors(schema): 98 | raise jsonschema.exceptions.SchemaError.create_from(error) 99 | 100 | def iter_errors(self, instance, _schema=None): 101 | 102 | # iter jsonschema validation errors 103 | for error in super(_TableSchemaValidator, self).iter_errors(instance, _schema): 104 | yield jsonschema.exceptions.ValidationError( 105 | error.message, error.validator, error.path, error.cause, 106 | error.context, error.validator_value, error.instance, 107 | error.schema, error.schema_path, error.parent) 108 | 109 | # get field names 110 | try: 111 | field_names = 
[f['name'] for f in instance['fields']] 112 | except (TypeError, KeyError): 113 | field_names = [] 114 | 115 | # ensure constraint and field type consistency 116 | if isinstance(instance, dict) and instance.get('fields'): 117 | for field in instance['fields']: 118 | if not isinstance(field, dict): 119 | continue 120 | field_type = field.get('type') 121 | field_type_str = 'default' if field_type is None else field_type 122 | field_name = field.get('name', '[noname]') 123 | constraints = field.get('constraints', {}) 124 | for constraint_name in constraints: 125 | if constraint_name in _CONSTRAINT_ALLOWED_FIELD_TYPE: 126 | if field_type not in _CONSTRAINT_ALLOWED_FIELD_TYPE[constraint_name]: 127 | yield exceptions.ValidationError( 128 | "field {}: built-in {} constraint can't be applied to {} type field" 129 | .format(field_name, constraint_name, field_type_str)) 130 | 131 | # the hash MAY contain a key `primaryKey` 132 | if isinstance(instance, dict) and instance.get('primaryKey'): 133 | 134 | # ensure that the primary key matches field names 135 | if isinstance(instance['primaryKey'], six.string_types): 136 | if not instance['primaryKey'] in field_names: 137 | yield exceptions.ValidationError( 138 | 'A JSON Table Schema primaryKey value must be found in' 139 | ' the schema field names') 140 | elif isinstance(instance['primaryKey'], list): 141 | for k in instance['primaryKey']: 142 | if k not in field_names: 143 | yield exceptions.ValidationError( 144 | 'A JSON Table Schema primaryKey value must be ' 145 | 'found in the schema field names') 146 | 147 | # the hash may contain a key `foreignKeys` 148 | if isinstance(instance, dict) and instance.get('foreignKeys'): 149 | for fk in instance['foreignKeys']: 150 | 151 | # ensure that `foreignKey.fields` match field names 152 | if isinstance(fk.get('fields'), six.string_types): 153 | if fk.get('fields') not in field_names: 154 | yield exceptions.ValidationError( 155 | 'A JSON Table Schema foreignKey.fields value must ' 156 | 'correspond with field names.') 157 | elif isinstance(fk.get('fields', []), list): 158 | for field in fk.get('fields'): 159 | if field not in field_names: 160 | yield exceptions.ValidationError( 161 | 'A JSON Table Schema foreignKey.fields value ' 162 | 'must correspond with field names.') 163 | 164 | # ensure that `foreignKey.reference.fields` 165 | # matches outer `fields` 166 | if isinstance(fk.get('fields'), six.string_types): 167 | fields = fk.get('reference', {}).get('fields', {}) 168 | if not isinstance(fields, six.string_types): 169 | yield exceptions.ValidationError( 170 | 'A JSON Table Schema foreignKey.reference.fields ' 171 | 'must match field names.') 172 | else: 173 | if isinstance(fk['reference']['fields'], six.string_types): 174 | yield exceptions.ValidationError( 175 | 'A JSON Table Schema foreignKey.fields cannot ' 176 | 'be a string when foreignKey.reference.fields '
177 | 'is a string') 178 | if not (len(fk.get('fields')) == 179 | len(fk['reference']['fields'])): 180 | yield exceptions.ValidationError( 181 | 'A JSON Table Schema foreignKey.fields must ' 182 | 'contain the same number entries as ' 183 | 'foreignKey.reference.fields.') 184 | -------------------------------------------------------------------------------- /tableschema/profiles/geojson.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "id": "https://raw.githubusercontent.com/fge/sample-json-schemas/master/geojson/geojson.json#", 4 | "title": "Geo JSON object", 5 | "description": "Schema for a Geo JSON object", 6 | "type": "object", 7 | "required": [ "type" ], 8 | "properties": { 9 | "crs": { "$ref": "#/definitions/crs" }, 10 | "bbox": { "$ref": "#/definitions/bbox" } 11 | }, 12 | "oneOf": [ 13 | { "$ref": "#/definitions/geometry" }, 14 | { "$ref": "#/definitions/geometryCollection" }, 15 | { "$ref": "#/definitions/feature" }, 16 | { "$ref": "#/definitions/featureCollection" } 17 | ], 18 | "definitions": { 19 | "geometryCollection": { 20 | "title": "GeometryCollection", 21 | "description": "A collection of geometry objects", 22 | "required": [ "geometries" ], 23 | "properties": { 24 | "type": { "enum": [ "GeometryCollection" ] }, 25 | "geometries": { 26 | "type": "array", 27 | "items": { "$ref": "#/definitions/geometry" } 28 | } 29 | } 30 | }, 31 | "feature": { 32 | "title": "Feature", 33 | "description": "A Geo JSON feature object", 34 | "required": [ "geometry", "properties" ], 35 | "properties": { 36 | "type": { "enum": [ "Feature" ] }, 37 | "geometry": { 38 | "oneOf": [ 39 | { "type": "null" }, 40 | { "$ref": "#/definitions/geometry" } 41 | ] 42 | }, 43 | "properties": { "type": [ "object", "null" ] }, 44 | "id": { "FIXME": "may be there, type not known (string? 
number?)" } 45 | } 46 | }, 47 | "featureCollection": { 48 | "title": "FeatureCollection", 49 | "description": "A Geo JSON feature collection", 50 | "required": [ "features" ], 51 | "properties": { 52 | "type": { "enum": [ "FeatureCollection" ] }, 53 | "features": { 54 | "type": "array", 55 | "items": { "$ref": "#/definitions/feature" } 56 | } 57 | } 58 | }, 59 | "geometry": { 60 | "title": "geometry", 61 | "description": "One geometry as defined by GeoJSON", 62 | "type": "object", 63 | "required": [ "type", "coordinates" ], 64 | "oneOf": [ 65 | { 66 | "title": "Point", 67 | "properties": { 68 | "type": { "enum": [ "Point" ] }, 69 | "coordinates": { "$ref": "#/definitions/geometry/definitions/position" } 70 | } 71 | }, 72 | { 73 | "title": "MultiPoint", 74 | "properties": { 75 | "type": { "enum": [ "MultiPoint" ] }, 76 | "coordinates": { "$ref": "#/definitions/geometry/definitions/positionArray" } 77 | } 78 | }, 79 | { 80 | "title": "LineString", 81 | "properties": { 82 | "type": { "enum": [ "LineString" ] }, 83 | "coordinates": { "$ref": "#/definitions/geometry/definitions/lineString" } 84 | } 85 | }, 86 | { 87 | "title": "MultiLineString", 88 | "properties": { 89 | "type": { "enum": [ "MultiLineString" ] }, 90 | "coordinates": { 91 | "type": "array", 92 | "items": { "$ref": "#/definitions/geometry/definitions/lineString" } 93 | } 94 | } 95 | }, 96 | { 97 | "title": "Polygon", 98 | "properties": { 99 | "type": { "enum": [ "Polygon" ] }, 100 | "coordinates": { "$ref": "#/definitions/geometry/definitions/polygon" } 101 | } 102 | }, 103 | { 104 | "title": "MultiPolygon", 105 | "properties": { 106 | "type": { "enum": [ "MultiPolygon" ] }, 107 | "coordinates": { 108 | "type": "array", 109 | "items": { "$ref": "#/definitions/geometry/definitions/polygon" } 110 | } 111 | } 112 | } 113 | ], 114 | "definitions": { 115 | "position": { 116 | "description": "A single position", 117 | "type": "array", 118 | "minItems": 2, 119 | "items": [ { "type": "number" }, { "type": "number" } ], 120 | "additionalItems": false 121 | }, 122 | "positionArray": { 123 | "description": "An array of positions", 124 | "type": "array", 125 | "items": { "$ref": "#/definitions/geometry/definitions/position" } 126 | }, 127 | "lineString": { 128 | "description": "An array of two or more positions", 129 | "allOf": [ 130 | { "$ref": "#/definitions/geometry/definitions/positionArray" }, 131 | { "minItems": 2 } 132 | ] 133 | }, 134 | "linearRing": { 135 | "description": "An array of four positions where the first equals the last", 136 | "allOf": [ 137 | { "$ref": "#/definitions/geometry/definitions/positionArray" }, 138 | { "minItems": 4 } 139 | ] 140 | }, 141 | "polygon": { 142 | "description": "An array of linear rings", 143 | "type": "array", 144 | "items": { "$ref": "#/definitions/geometry/definitions/linearRing" } 145 | } 146 | } 147 | }, 148 | "crs": { 149 | "title": "crs", 150 | "description": "a Coordinate Reference System object", 151 | "type": [ "object", "null" ], 152 | "required": [ "type", "properties" ], 153 | "properties": { 154 | "type": { "type": "string" }, 155 | "properties": { "type": "object" } 156 | }, 157 | "additionalProperties": false, 158 | "oneOf": [ 159 | { "$ref": "#/definitions/crs/definitions/namedCrs" }, 160 | { "$ref": "#/definitions/crs/definitions/linkedCrs" } 161 | ], 162 | "definitions": { 163 | "namedCrs": { 164 | "properties": { 165 | "type": { "enum": [ "name" ] }, 166 | "properties": { 167 | "required": [ "name" ], 168 | "additionalProperties": false, 169 | "properties": { 170 | "name": { 
171 | "type": "string", 172 | "FIXME": "semantic validation necessary" 173 | } 174 | } 175 | } 176 | } 177 | }, 178 | "linkedObject": { 179 | "type": "object", 180 | "required": [ "href" ], 181 | "properties": { 182 | "href": { 183 | "type": "string", 184 | "format": "uri", 185 | "FIXME": "spec says \"dereferenceable\", cannot enforce that" 186 | }, 187 | "type": { 188 | "type": "string", 189 | "description": "Suggested values: proj4, ogjwkt, esriwkt" 190 | } 191 | } 192 | }, 193 | "linkedCrs": { 194 | "properties": { 195 | "type": { "enum": [ "link" ] }, 196 | "properties": { "$ref": "#/definitions/crs/definitions/linkedObject" } 197 | } 198 | } 199 | } 200 | }, 201 | "bbox": { 202 | "description": "A bounding box as defined by GeoJSON", 203 | "FIXME": "unenforceable constraint: even number of elements in array", 204 | "type": "array", 205 | "items": { "type": "number" } 206 | } 207 | } 208 | } 209 | 210 | -------------------------------------------------------------------------------- /tests/test_schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import pytest 11 | import requests 12 | from collections import OrderedDict 13 | from decimal import Decimal 14 | from tableschema import Schema, FailedCast, exceptions 15 | 16 | 17 | # Constants 18 | 19 | BASE_URL = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/%s' 20 | DESCRIPTOR_MIN = {'fields': [{'name': 'id'}, {'name': 'height', 'type': 'integer'}]} 21 | DESCRIPTOR_MAX = { 22 | 'fields': [ 23 | {'name': 'id', 'type': 'string', 'constraints': {'required': True}}, 24 | {'name': 'height', 'type': 'number'}, 25 | {'name': 'age', 'type': 'integer'}, 26 | {'name': 'name', 'type': 'string'}, 27 | {'name': 'occupation', 'type': 'string'}, 28 | ], 29 | 'primaryKey': ['id'], 30 | 'foreignKeys': [{'fields': ['name'], 'reference': {'resource': '', 'fields': ['id']}}], 31 | 'missingValues': ['', '-', 'null'], 32 | } 33 | 34 | 35 | # General 36 | 37 | 38 | def test_init(): 39 | assert Schema(DESCRIPTOR_MIN) 40 | assert Schema(DESCRIPTOR_MAX) 41 | assert Schema('data/schema_valid_full.json') 42 | assert Schema('data/schema_valid_simple.json') 43 | 44 | 45 | def test_init_invalid_in_strict_mode(): 46 | with pytest.raises(exceptions.TableSchemaException) as exception: 47 | Schema('data/schema_invalid_multiple_errors.json', strict=True) 48 | 49 | 50 | def test_descriptor(apply_defaults): 51 | assert Schema(DESCRIPTOR_MIN).descriptor == apply_defaults(DESCRIPTOR_MIN) 52 | assert Schema(DESCRIPTOR_MAX).descriptor == apply_defaults(DESCRIPTOR_MAX) 53 | 54 | 55 | def test_descriptor_path(apply_defaults): 56 | path = 'data/schema_valid_simple.json' 57 | actual = Schema(path).descriptor 58 | with io.open(path, encoding='utf-8') as file: 59 | expect = apply_defaults(json.load(file)) 60 | assert actual == expect 61 | 62 | 63 | def test_descriptor_url(apply_defaults): 64 | url = BASE_URL % 'data/schema_valid_simple.json' 65 | actual = Schema(url).descriptor 66 | expect = apply_defaults(requests.get(url).json()) 67 | assert actual == expect 68 | 69 | 70 | def test_descriptor_applied_defaults(): 71 | assert Schema(DESCRIPTOR_MIN).descriptor == { 72 | 'fields': [ 73 | {'name': 'id', 'type': 'string', 'format': 'default'}, 74 | {'name': 'height', 'type': 
'integer', 'format': 'default'}, 75 | ], 76 | 'missingValues': [''], 77 | } 78 | 79 | 80 | def test_cast_row(): 81 | schema = Schema(DESCRIPTOR_MAX) 82 | source = ['string', '10.0', '1', 'string', 'string'] 83 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 84 | assert schema.cast_row(source) == target 85 | 86 | 87 | def test_cast_row_null_values(): 88 | schema = Schema(DESCRIPTOR_MAX) 89 | source = ['string', '', '-', 'string', 'null'] 90 | target = ['string', None, None, 'string', None] 91 | assert schema.cast_row(source) == target 92 | 93 | 94 | def test_cast_row_too_short(): 95 | schema = Schema(DESCRIPTOR_MAX) 96 | source = ['string', '10.0', '1', 'string'] 97 | with pytest.raises(exceptions.CastError): 98 | schema.cast_row(source) 99 | 100 | 101 | def test_cast_row_too_long(): 102 | schema = Schema(DESCRIPTOR_MAX) 103 | source = ['string', '10.0', '1', 'string', 'string', 'string'] 104 | with pytest.raises(exceptions.CastError): 105 | schema.cast_row(source) 106 | 107 | 108 | def test_cast_row_wrong_type(): 109 | schema = Schema(DESCRIPTOR_MAX) 110 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 111 | with pytest.raises(exceptions.CastError): 112 | schema.cast_row(source) 113 | 114 | 115 | def test_cast_row_wrong_type_multiple_errors(): 116 | schema = Schema(DESCRIPTOR_MAX) 117 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 118 | with pytest.raises(exceptions.CastError) as excinfo: 119 | schema.cast_row(source) 120 | assert len(excinfo.value.errors) == 2 121 | 122 | 123 | def test_missing_values(): 124 | assert Schema(DESCRIPTOR_MIN).missing_values == [''] 125 | assert Schema(DESCRIPTOR_MAX).missing_values == ['', '-', 'null'] 126 | 127 | 128 | # Test row casting with exception handler i.e. don't fail immediately 129 | 130 | def _check_error( 131 | error, expect_exc_class, expect_exc_str, expect_row_number=None, 132 | expect_row_data=None, expect_error_data=None): 133 | # Helper function to check all given expectations on handled errors. 134 | # error must be a (exc, row_number, row_data, error_data)-tuple 135 | 136 | # Make this a namedtuple? 
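    # Handlers receive (exc, row_number, row_data, error_data) positionally,
    # matching the exc_handler contract of Schema.cast_row; the checks below
    # assert each piece against the given expectations.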
137 | exc, row_number, row_data, error_data = error 138 | assert isinstance(exc, expect_exc_class) 139 | assert expect_exc_str in str(exc) 140 | if expect_row_number is not None: 141 | # actual row number including header line 142 | assert row_number == expect_row_number 143 | if expect_row_data is not None: 144 | assert row_data == expect_row_data 145 | if expect_error_data is not None: 146 | assert error_data == expect_error_data 147 | 148 | 149 | def test_cast_row_handled(): 150 | schema = Schema(DESCRIPTOR_MAX) 151 | source = ['string', '10.0', '1', 'string', 'string'] 152 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 153 | errors = [] 154 | def handler(exc, row_number, row_data, error_data): 155 | errors.append((exc, row_number, row_data, error_data)) 156 | assert schema.cast_row(source, exc_handler=handler) == target 157 | assert len(errors) == 0 158 | 159 | 160 | def test_cast_row_null_values_handled(): 161 | schema = Schema(DESCRIPTOR_MAX) 162 | source = ['string', '', '-', 'string', 'null'] 163 | target = ['string', None, None, 'string', None] 164 | errors = [] 165 | def handler(exc, row_number, row_data, error_data): 166 | errors.append((exc, row_number, row_data, error_data)) 167 | assert schema.cast_row(source, exc_handler=handler) == target 168 | assert len(errors) == 0 169 | 170 | 171 | def test_cast_row_too_short_handled(): 172 | schema = Schema(DESCRIPTOR_MAX) 173 | source = ['string', '10.0', '1', 'string'] 174 | # Missing values get substituted by None 175 | target = ['string', Decimal(10.0), 1, 'string', None] 176 | errors = [] 177 | def handler(exc, row_number, row_data, error_data): 178 | errors.append((exc, row_number, row_data, error_data)) 179 | assert schema.cast_row(source, exc_handler=handler) == target 180 | assert len(errors) == 1 181 | expect_row_data = OrderedDict( 182 | [('id', 'string'), ('height', '10.0'), ('age', '1'), 183 | ('name', 'string'), ('occupation', None)]) 184 | _check_error( 185 | errors[0], expect_exc_class=exceptions.CastError, 186 | expect_exc_str='Row length', expect_row_number=None, 187 | expect_row_data=expect_row_data, expect_error_data=expect_row_data) 188 | 189 | def test_cast_row_too_long_handled(): 190 | schema = Schema(DESCRIPTOR_MAX) 191 | source = ['string', '10.0', '1', 'string', 'string', 'string'] 192 | # superfluous values are left out 193 | target = ['string', Decimal(10.0), 1, 'string', 'string'] 194 | errors = [] 195 | def handler(exc, row_number, row_data, error_data): 196 | errors.append((exc, row_number, row_data, error_data)) 197 | assert schema.cast_row(source, exc_handler=handler) == target 198 | assert len(errors) == 1 199 | # superfluous values are keyed with col num for error reporting 200 | expect_row_data = OrderedDict( 201 | [('id', 'string'), ('height', '10.0'), ('age', '1'), 202 | ('name', 'string'), ('occupation', 'string'), 203 | ('tableschema-cast-error-extra-col-6', 'string')]) 204 | _check_error( 205 | errors[0], expect_exc_class=exceptions.CastError, 206 | expect_exc_str='Row length', expect_row_number=None, 207 | expect_row_data=expect_row_data, expect_error_data=expect_row_data) 208 | 209 | 210 | def test_cast_row_wrong_type_handled(): 211 | schema = Schema(DESCRIPTOR_MAX) 212 | source = ['string', 'notdecimal', '1', 'string', 'string'] 213 | target = ['string', 'notdecimal', 1, 'string', 'string'] 214 | errors = [] 215 | def handler(exc, row_number, row_data, error_data): 216 | errors.append((exc, row_number, row_data, error_data)) 217 | actual = schema.cast_row(source, exc_handler=handler) 
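    # With an exception handler installed, cast_row does not raise; the
    # uncastable cell is returned wrapped in FailedCast (asserted below) so
    # consumers can distinguish it from successfully cast values.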
218 | assert actual == target 219 | assert isinstance(actual[1], FailedCast) 220 | assert len(errors) == 1 221 | expect_row_data = OrderedDict( 222 | [('id', 'string'), ('height', 'notdecimal'), ('age', '1'), 223 | ('name', 'string'), ('occupation', 'string')]) 224 | expect_error_data = OrderedDict([('height', 'notdecimal')]) 225 | _check_error( 226 | errors[0], expect_exc_class=exceptions.CastError, 227 | expect_exc_str='There are 1 cast errors', expect_row_number=None, 228 | expect_row_data=expect_row_data, expect_error_data=expect_error_data) 229 | exc = errors[0][0] 230 | assert len(exc.errors) == 1 231 | 232 | 233 | def test_cast_row_wrong_type_multiple_errors_handled(): 234 | schema = Schema(DESCRIPTOR_MAX) 235 | source = ['string', 'notdecimal', '10.6', 'string', 'string'] 236 | target = ['string', 'notdecimal', '10.6', 'string', 'string'] 237 | errors = [] 238 | def handler(exc, row_number, row_data, error_data): 239 | errors.append((exc, row_number, row_data, error_data)) 240 | actual = schema.cast_row(source, exc_handler=handler) 241 | assert actual == target 242 | assert isinstance(actual[1], FailedCast) 243 | assert isinstance(actual[2], FailedCast) 244 | assert len(errors) == 1 245 | expect_row_data = OrderedDict( 246 | [('id', 'string'), ('height', 'notdecimal'), ('age', '10.6'), 247 | ('name', 'string'), ('occupation', 'string')]) 248 | expect_error_data = OrderedDict( 249 | [('height', 'notdecimal'),('age', '10.6')]) 250 | _check_error( 251 | errors[0], expect_exc_class=exceptions.CastError, 252 | expect_exc_str='There are 2 cast errors', expect_row_number=None, 253 | expect_row_data=expect_row_data, expect_error_data=expect_error_data) 254 | exc = errors[0][0] 255 | assert len(exc.errors) == 2 256 | 257 | 258 | def test_fields(): 259 | expect = ['id', 'height'] 260 | actual = [field.name for field in Schema(DESCRIPTOR_MIN).fields] 261 | assert expect == actual 262 | 263 | 264 | def test_get_field(): 265 | schema = Schema(DESCRIPTOR_MIN) 266 | assert schema.get_field('id').name == 'id' 267 | assert schema.get_field('height').name == 'height' 268 | assert schema.get_field('undefined') is None 269 | 270 | 271 | def test_update_field(): 272 | schema = Schema(DESCRIPTOR_MIN) 273 | assert schema.update_field('id', {'type': 'number'}) is True 274 | assert schema.update_field('height', {'type': 'number'}) is True 275 | assert schema.update_field('unknown', {'type': 'number'}) is False 276 | schema.commit() 277 | assert schema.get_field('id').type == 'number' 278 | assert schema.get_field('height').type == 'number' 279 | 280 | 281 | def test_has_field(): 282 | schema = Schema(DESCRIPTOR_MIN) 283 | assert schema.has_field('id') 284 | assert schema.has_field('height') 285 | assert not schema.has_field('undefined') 286 | 287 | 288 | def test_headers(): 289 | assert Schema(DESCRIPTOR_MIN).headers == ['id', 'height'] 290 | 291 | 292 | def test_primary_key(): 293 | assert Schema(DESCRIPTOR_MIN).primary_key == [] 294 | assert Schema(DESCRIPTOR_MAX).primary_key == ['id'] 295 | 296 | 297 | def test_foreign_keys(): 298 | assert Schema(DESCRIPTOR_MIN).foreign_keys == [] 299 | assert Schema(DESCRIPTOR_MAX).foreign_keys == DESCRIPTOR_MAX['foreignKeys'] 300 | 301 | 302 | def test_save(tmpdir, apply_defaults): 303 | path = str(tmpdir.join('schema.json')) 304 | Schema(DESCRIPTOR_MIN).save(path) 305 | with io.open(path, encoding='utf-8') as file: 306 | descriptor = json.load(file) 307 | assert descriptor == apply_defaults(DESCRIPTOR_MIN) 308 | 309 | 310 | def test_infer(): 311 | data = [ 312 | 
['id', 'age', 'name', 'dob'], 313 | ['1','39','Paul','28/1/79'], 314 | ['2','23','Jimmy','13/6/95'], 315 | ['3','36','Jane','17/9/80'], 316 | ['4','N/A','Judy','19/4/83'], 317 | ] 318 | schema = Schema() 319 | schema.infer(data) 320 | assert schema.descriptor == { 321 | 'fields': [ 322 | {'format': 'default', 'name': 'id', 'type': 'integer'}, 323 | {'format': 'default', 'name': 'age', 'type': 'integer'}, 324 | {'format': 'default', 'name': 'name', 'type': 'string'}, 325 | {'format': '%d/%m/%y', 'name': 'dob', 'type': 'date'}, 326 | ], 327 | 'missingValues': ['']} 328 | data = [ 329 | ['id', 'age', 'name'], 330 | ['1','39','Paul'], 331 | ['2','23','Jimmy'], 332 | ['3','36','Jane'], 333 | ['4','N/A','Judy'], 334 | ] 335 | schema = Schema() 336 | schema.infer(data, confidence=0.8) 337 | assert schema.descriptor == { 338 | 'fields': [ 339 | {'format': 'default', 'name': 'id', 'type': 'integer'}, 340 | {'format': 'default', 'name': 'age', 'type': 'string'}, 341 | {'format': 'default', 'name': 'name', 'type': 'string'}], 342 | 'missingValues': ['']} 343 | 344 | class AllStrings(): 345 | def cast(self, value): 346 | return [('string', 'default', 0)] 347 | data = [ 348 | ['id', 'age', 'name'], 349 | ['1','39','Paul'], 350 | ['2','23','Jimmy'], 351 | ['3','36','Jane'], 352 | ['4','100','Judy'], 353 | ] 354 | 355 | schema = Schema() 356 | schema.infer(data, confidence=0.8, guesser_cls=AllStrings) 357 | assert schema.descriptor['fields'] == [ 358 | {'format': 'default', 'name': 'id', 'type': 'string'}, 359 | {'format': 'default', 'name': 'age', 'type': 'string'}, 360 | {'format': 'default', 'name': 'name', 'type': 'string'}] 361 | assert schema.descriptor == { 362 | 'fields': [ 363 | {'format': 'default', 'name': 'id', 'type': 'string'}, 364 | {'format': 'default', 'name': 'age', 'type': 'string'}, 365 | {'format': 'default', 'name': 'name', 'type': 'string'}], 366 | 'missingValues': ['']} 367 | 368 | 369 | def test_add_remove_field(): 370 | schema = Schema() 371 | schema.add_field({'name': 'name'}) 372 | field = schema.remove_field('name') 373 | assert field.name == 'name' 374 | 375 | 376 | def test_primary_foreign_keys_as_array(): 377 | descriptor = { 378 | 'fields': [{'name': 'name'}], 379 | 'primaryKey': ['name'], 380 | 'foreignKeys': [{ 381 | 'fields': ['parent_id'], 382 | 'reference': {'resource': 'resource', 'fields': ['id']} 383 | }] 384 | } 385 | schema = Schema(descriptor) 386 | assert schema.primary_key == ['name'] 387 | assert schema.foreign_keys == [{ 388 | 'fields': ['parent_id'], 389 | 'reference': {'resource': 'resource', 'fields': ['id']} 390 | }] 391 | 392 | 393 | def test_primary_foreign_keys_as_string(): 394 | descriptor = { 395 | 'fields': [{'name': 'name'}], 396 | 'primaryKey': 'name', 397 | 'foreignKeys': [{ 398 | 'fields': 'parent_id', 399 | 'reference': {'resource': 'resource', 'fields': 'id'} 400 | }] 401 | } 402 | schema = Schema(descriptor) 403 | assert schema.primary_key == ['name'] 404 | assert schema.foreign_keys == [{ 405 | 'fields': ['parent_id'], 406 | 'reference': {'resource': 'resource', 'fields': ['id']} 407 | }] 408 | 409 | 410 | def test_fields_have_public_backreference_to_schema(): 411 | schema = Schema('data/schema_valid_full.json') 412 | assert schema.get_field('first_name').schema == schema 413 | assert schema.get_field('last_name').schema == schema 414 | 415 | 416 | # Issues 417 | 418 | 419 | def test_schema_field_date_format_issue_177(): 420 | descriptor = {'fields':[{'name':'myfield', 'type':'date', 'format':'%d/%m/%y'}]} 421 | schema = Schema(descriptor) 
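    # A field with a strptime-style date format ('%d/%m/%y') must produce a valid schema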
422 | assert schema 423 | 424 | 425 | def test_schema_field_time_format_issue_177(): 426 | descriptor = {'fields':[{'name':'myfield', 'type':'time', 'format':'%H:%M:%S'}]} 427 | schema = Schema(descriptor) 428 | assert schema 429 | 430 | 431 | def test_schema_add_remove_field_issue_218(): 432 | descriptor = { 433 | 'fields': [ 434 | {'name': 'test_1', 'type': 'string', 'format': 'default'}, 435 | {'name': 'test_2', 'type': 'string', 'format': 'default'}, 436 | {'name': 'test_3', 'type': 'string', 'format': 'default'}, 437 | ] 438 | } 439 | test_schema = Schema(descriptor) 440 | test_schema.remove_field('test_1') 441 | test_schema.add_field({'name': 'test_4', 'type': 'string', 'format': 'default'}) 442 | 443 | 444 | def test_schema_not_supported_type_issue_goodtables_304(): 445 | schema = Schema({'fields': [ {'name': 'name'}, {'name': 'age', 'type': 'bad'} ]}) 446 | assert schema.valid is False 447 | assert schema.fields[1] is False 448 | 449 | 450 | def test_schema_infer_with_non_headers_issues_goodtables_258(): 451 | schema = Schema() 452 | schema.infer([[1],[2],[3]], headers=[None]) 453 | assert schema.field_names == ['field1'] 454 | -------------------------------------------------------------------------------- /tableschema/schema.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import six 9 | import json 10 | from collections import OrderedDict 11 | from copy import deepcopy 12 | from six.moves import zip_longest 13 | from .profile import Profile 14 | from .field import Field 15 | from . import exceptions 16 | from . import helpers 17 | from . import config 18 | from . import types 19 | 20 | 21 | # Module API 22 | 23 | class Schema(object): 24 | """Schema representation 25 | 26 | # Arguments 27 | descriptor (str/dict): schema descriptor, one of: 28 | - local path 29 | - remote url 30 | - dictionary 31 | strict (bool): flag to specify validation behaviour: 32 | - if false, errors will not be raised but instead collected in `schema.errors` 33 | - if true, validation errors are raised immediately 34 | 35 | # Raises 36 | TableSchemaException: raises any error that occurs during the process 37 | 38 | """ 39 | 40 | # Public 41 | 42 | def __init__(self, descriptor={}, strict=False): 43 | 44 | # Process descriptor 45 | descriptor = helpers.retrieve_descriptor(descriptor) 46 | 47 | # Set attributes 48 | self.__strict = strict 49 | self.__current_descriptor = deepcopy(descriptor) 50 | self.__next_descriptor = deepcopy(descriptor) 51 | self.__profile = Profile('table-schema') 52 | self.__errors = [] 53 | self.__fields = [] 54 | 55 | # Build instance 56 | self.__build() 57 | 58 | @property 59 | def valid(self): 60 | """Validation status 61 | 62 | Always true in strict mode. 63 | 64 | # Returns 65 | bool: validation status 66 | 67 | """ 68 | return not bool(self.__errors) 69 | 70 | @property 71 | def errors(self): 72 | """Validation errors 73 | 74 | Always empty in strict mode. 75 | 76 | # Returns 77 | Exception[]: validation errors 78 | 79 | """ 80 | return self.__errors 81 | 82 | @property 83 | def descriptor(self): 84 | """Schema's descriptor 85 | 86 | # Returns 87 | dict: descriptor 88 | 89 | """ 90 | # Never use self.descriptor inside this class (!!!)
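        # Public reads see the next (possibly uncommitted) descriptor;
        # internal logic works against self.__current_descriptor until
        # commit() applies pending changes.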
91 |         return self.__next_descriptor
92 | 
93 |     @property
94 |     def missing_values(self):
95 |         """Schema's missing values
96 | 
97 |         # Returns
98 |             str[]: missing values
99 | 
100 |         """
101 |         return self.__current_descriptor.get('missingValues', [])
102 | 
103 |     @property
104 |     def primary_key(self):
105 |         """Schema's primary keys
106 | 
107 |         # Returns
108 |             str[]: primary keys
109 | 
110 |         """
111 |         primary_key = self.__current_descriptor.get('primaryKey', [])
112 |         if not isinstance(primary_key, list):
113 |             primary_key = [primary_key]
114 |         return primary_key
115 | 
116 |     @property
117 |     def foreign_keys(self):
118 |         """Schema's foreign keys
119 | 
120 |         # Returns
121 |             dict[]: foreign keys
122 | 
123 |         """
124 |         foreign_keys = self.__current_descriptor.get('foreignKeys', [])
125 |         for key in foreign_keys:
126 |             key.setdefault('fields', [])
127 |             key.setdefault('reference', {})
128 |             key['reference'].setdefault('resource', '')
129 |             key['reference'].setdefault('fields', [])
130 |             if not isinstance(key['fields'], list):
131 |                 key['fields'] = [key['fields']]
132 |             if not isinstance(key['reference']['fields'], list):
133 |                 key['reference']['fields'] = [key['reference']['fields']]
134 |         return foreign_keys
135 | 
136 |     @property
137 |     def fields(self):
138 |         """Schema's fields
139 | 
140 |         # Returns
141 |             Field[]: an array of field instances
142 | 
143 |         """
144 |         return self.__fields
145 | 
146 |     @property
147 |     def field_names(self):
148 |         """Schema's field names
149 | 
150 |         # Returns
151 |             str[]: an array of field names
152 | 
153 |         """
154 |         return [field.name for field in self.fields]
155 | 
156 |     def get_field(self, name):
157 |         """Get schema's field by name.
158 | 
159 |         > Use `schema.update_field` if you want to modify the field descriptor
160 | 
161 |         # Arguments
162 |             name (str): schema field name
163 | 
164 |         # Returns
165 |             Field/None: `Field` instance or `None` if not found
166 | 
167 |         """
168 |         for field in self.fields:
169 |             if field.name == name:
170 |                 return field
171 |         return None
172 | 
173 |     def add_field(self, descriptor):
174 |         """Add new field to schema.
175 | 
176 |         The schema descriptor will be validated with the newly added field descriptor.
177 | 
178 |         # Arguments
179 |             descriptor (dict): field descriptor
180 | 
181 |         # Raises
182 |             TableSchemaException: raises any error that occurs during the process
183 | 
184 |         # Returns
185 |             Field/None: added `Field` instance or `None` if not added
186 | 
187 |         """
188 |         self.__current_descriptor.setdefault('fields', [])
189 |         self.__current_descriptor['fields'].append(descriptor)
190 |         self.__build()
191 |         return self.__fields[-1]
192 | 
193 |     def update_field(self, name, update):
194 |         """Update existing descriptor field by name
195 | 
196 |         # Arguments
197 |             name (str): schema field name
198 |             update (dict): update to apply to field's descriptor
199 | 
200 |         # Returns
201 |             bool: true on success and false if no field is found to be modified
202 | 
203 |         """
204 |         for field in self.__next_descriptor['fields']:
205 |             if field['name'] == name:
206 |                 field.update(update)
207 |                 return True
208 |         return False
209 | 
210 |     def remove_field(self, name):
211 |         """Remove a field by name.
212 | 
213 |         The schema descriptor will be validated after field descriptor removal.
214 | 
215 |         # Arguments
216 |             name (str): schema field name
217 | 
218 |         # Raises
219 |             TableSchemaException: raises any error that occurs during the process
220 | 
221 |         # Returns
222 |             Field/None: removed `Field` instance or `None` if not found
223 | 
224 |         """
225 |         field = self.get_field(name)
226 |         if field:
227 |             predicate = lambda field: field.get('name') != name
228 |             self.__current_descriptor['fields'] = list(filter(
229 |                 predicate, self.__current_descriptor['fields']))
230 |             self.__build()
231 |         return field
232 | 
233 |     def cast_row(self, row, fail_fast=False, row_number=None, exc_handler=None):
234 |         """Cast row based on field types and formats.
235 | 
236 |         # Arguments
237 |             row (any[]): data row as an array of values
238 | 
239 |         # Returns
240 |             any[]: returns cast data row
241 | 
242 |         """
243 |         exc_handler = helpers.default_exc_handler if exc_handler is None else \
244 |             exc_handler
245 | 
246 |         # Prepare
247 |         result = []
248 |         errors = []
249 |         if row_number is not None:
250 |             row_number_info = ' for row "%s"' % row_number
251 |         else:
252 |             row_number_info = ''
253 |         # Check row length
254 |         if len(row) != len(self.fields):
255 |             message = (
256 |                 'Row length %s doesn\'t match fields count %s' +
257 |                 row_number_info) % (len(row), len(self.fields))
258 |             exc = exceptions.CastError(message)
259 |             # Some preparations for error reporting, relevant if custom error
260 |             # handling is in place.
261 |             if len(row) < len(self.fields):
262 |                 # Treat missing col values as None
263 |                 keyed_row = OrderedDict(
264 |                     zip_longest((field.name for field in self.fields), row))
265 |                 # Use added None values for further processing
266 |                 row = list(keyed_row.values())
267 |             else:
268 |                 fields = self.fields
269 |                 keyed_row = OrderedDict(
270 |                     # Use extra column number if value index exceeds fields
271 |                     (fields[i].name if fields[i:]
272 |                      else 'tableschema-cast-error-extra-col-{}'.format(i+1),
273 |                      value)
274 |                     for (i, value) in enumerate(row))
275 |             exc_handler(exc, row_number=row_number, row_data=keyed_row,
276 |                         error_data=keyed_row)
277 | 
278 |         # Cast row
279 |         for field, value in zip(self.fields, row):
280 |             try:
281 |                 result.append(field.cast_value(value))
282 |             except exceptions.CastError as exception:
283 |                 if fail_fast:
284 |                     raise
285 |                 # Wrap original value in a FailedCast object to be able to
286 |                 # further process/yield values and to distinguish uncast
287 |                 # values on the consuming side.
288 |                 result.append(FailedCast(value))
289 |                 errors.append(exception)
290 | 
291 |         # Raise errors
292 |         if errors:
293 |             message = (
294 |                 'There are %s cast errors (see exception.errors)' +
295 |                 row_number_info) % len(errors)
296 |             keyed_row = OrderedDict(zip(self.field_names, row))
297 |             # Add the cast failure-causing fields only to error data.
298 |             # Indexing results with the row field index should be ok at this
299 |             # point due to the previous processing.
300 |             error_data = OrderedDict(
301 |                 (name, value)
302 |                 for (i, (name, value)) in enumerate(keyed_row.items())
303 |                 if isinstance(result[i], FailedCast))
304 |             exc_handler(
305 |                 exceptions.CastError(message, errors=errors),
306 |                 row_number=row_number, row_data=keyed_row,
307 |                 error_data=error_data)
308 | 
309 |         return result
310 | 
311 |     def infer(self, rows, headers=1, confidence=0.75,
312 |               guesser_cls=None, resolver_cls=None):
313 |         """Infer and set `schema.descriptor` based on data sample.
314 | 
315 |         # Arguments
316 |             rows (list[]): array of arrays representing rows.
317 |             headers (int/str[]): data sample headers (one of):
318 |                 - row number containing headers (`rows` should contain headers rows)
319 |                 - array of headers (`rows` should NOT contain headers rows)
320 |             confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)
321 |             guesser_cls (class): you can implement inferring strategies by
322 |                 providing type-guessing and type-resolving classes [experimental]
323 |             resolver_cls (class): you can implement inferring strategies by
324 |                 providing type-guessing and type-resolving classes [experimental]
325 | 
326 |         # Returns
327 |             dict: Table Schema descriptor
328 | 
329 |         """
330 | 
331 |         # Get headers
332 |         if isinstance(headers, int):
333 |             headers_row = headers
334 |             while True:
335 |                 headers_row -= 1
336 |                 headers = rows.pop(0)
337 |                 if not headers_row:
338 |                     break
339 |         elif isinstance(headers, list):
340 |             seen_cells = []
341 |             headers = list(headers)
342 |             for index, cell in enumerate(headers):
343 |                 count = seen_cells.count(cell) + 1
344 |                 headers[index] = '%s%s' % (cell, count) if count > 1 else cell
345 |                 seen_cells.append(cell)
346 |         elif not isinstance(headers, list):
347 |             headers = []
348 | 
349 |         # Get descriptor
350 |         missing_values = self.__current_descriptor.get('missingValues', config.DEFAULT_MISSING_VALUES)
351 |         guesser = guesser_cls() if guesser_cls else _TypeGuesser(missing_values)
352 |         resolver = (resolver_cls or _TypeResolver)()
353 |         descriptor = {'fields': [], 'missingValues': missing_values}
354 |         type_matches = {}
355 |         for number, header in enumerate(headers, start=1):
356 |             descriptor['fields'].append({'name': header or 'field%s' % number})
357 |         for index, row in enumerate(rows):
358 |             # Normalize rows with invalid dimensions for sanity
359 |             row_length = len(row)
360 |             headers_length = len(headers)
361 |             if row_length > headers_length:
362 |                 row = row[:len(headers)]
363 |             if row_length < headers_length:
364 |                 diff = headers_length - row_length
365 |                 fill = [''] * diff
366 |                 row = row + fill
367 |             # build a column-wise lookup of type matches
368 |             for index, value in enumerate(row):
369 |                 rv = guesser.cast(value)
370 |                 if type_matches.get(index):
371 |                     type_matches[index].extend(rv)
372 |                 else:
373 |                     type_matches[index] = list(rv)
374 |         # choose a type/format for each column based on the matches
375 |         for index, results in type_matches.items():
376 |             rv = resolver.get(results, confidence)
377 |             descriptor['fields'][index].update(**rv)
378 | 
379 |         # Save descriptor
380 |         self.__current_descriptor = descriptor
381 |         self.__build()
382 | 
383 |         return descriptor
384 | 
385 |     def commit(self, strict=None):
386 |         """Update schema instance if there are in-place changes in the descriptor.
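        Edits made through `schema.descriptor` or `update_field` are staged in
        the next descriptor and only take effect after this method is called.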
387 | 
388 |         # Example
389 | 
390 |         ```python
391 |         from tableschema import Schema
392 |         descriptor = {'fields': [{'name': 'my_field', 'title': 'My Field', 'type': 'string'}]}
393 |         schema = Schema(descriptor)
394 |         print(schema.get_field('my_field').descriptor['type']) # string
395 | 
396 |         # Update descriptor by field position
397 |         schema.descriptor['fields'][0]['type'] = 'number'
398 |         # Update descriptor by field name
399 |         schema.update_field('my_field', {'title': 'My Pretty Field'}) # True
400 | 
401 |         # Changes are not committed yet
402 |         print(schema.get_field('my_field').descriptor['type']) # string
403 |         print(schema.get_field('my_field').descriptor['title']) # My Field
404 | 
405 |         # Commit the changes
406 |         schema.commit()
407 |         print(schema.get_field('my_field').descriptor['type']) # number
408 |         print(schema.get_field('my_field').descriptor['title']) # My Pretty Field
409 | 
410 |         ```
411 | 
412 |         # Arguments
413 |             strict (bool): alter `strict` mode for further work
414 | 
415 |         # Raises
416 |             TableSchemaException: raises any error that occurs during the process
417 | 
418 |         # Returns
419 |             bool: true on success and false if not modified
420 | 
421 |         """
422 |         if strict is not None:
423 |             self.__strict = strict
424 |         elif self.__current_descriptor == self.__next_descriptor:
425 |             return False
426 |         self.__current_descriptor = deepcopy(self.__next_descriptor)
427 |         self.__build()
428 |         return True
429 | 
430 |     def save(self, target, ensure_ascii=True):
431 |         """Save schema descriptor to target destination.
432 | 
433 |         # Arguments
434 |             target (str): path where to save a descriptor
435 |             ensure_ascii (bool): passed through to `json.dump` (default `True`)
436 |         # Raises
437 |             TableSchemaException: raises any error that occurs during the process
438 | 
439 |         # Returns
440 |             bool: true on success
441 | 
442 |         """
443 |         mode = 'w'
444 |         encoding = 'utf-8'
445 |         if six.PY2:
446 |             mode = 'wb'
447 |             encoding = None
448 |         helpers.ensure_dir(target)
449 |         with io.open(target, mode=mode, encoding=encoding) as file:
450 |             json.dump(self.__current_descriptor, file, indent=4, ensure_ascii=ensure_ascii)
451 | 
452 |     # Internal
453 | 
454 |     def __build(self):
455 | 
456 |         # Process descriptor
457 |         expand = helpers.expand_schema_descriptor
458 |         self.__current_descriptor = expand(self.__current_descriptor)
459 |         self.__next_descriptor = deepcopy(self.__current_descriptor)
460 | 
461 |         # Validate descriptor
462 |         try:
463 |             self.__profile.validate(self.__current_descriptor)
464 |             self.__errors = []
465 |         except exceptions.ValidationError as exception:
466 |             self.__errors = exception.errors
467 |             if self.__strict:
468 |                 raise exception
469 | 
470 |         # Populate fields
471 |         self.__fields = []
472 |         for field in self.__current_descriptor.get('fields', []):
473 |             missing_values = self.__current_descriptor['missingValues']
474 |             try:
475 |                 field = Field(field, missing_values=missing_values, schema=self)
476 |             except exceptions.TableSchemaException as e:
477 |                 if self.__strict:
478 |                     raise e
479 |                 else:
480 |                     field = False
481 |             self.__fields.append(field)
482 | 
483 |     # Deprecated
484 | 
485 |     headers = field_names
486 |     has_field = get_field
487 | 
488 | 
489 | class FailedCast(object):
490 |     """Wrap an original data field value that failed to be properly cast.
491 | 
492 |     FailedCast allows values to be further processed/yielded while still
493 |     being distinguishable as failed casts on the consuming side.
494 | 
495 |     Delegates attribute access and the basic rich comparison methods to the
496 |     underlying object. Supports the default hashability of user-defined classes,
497 |     i.e. it is hashable based on object identity (not based on the wrapped value).
498 | 
499 |     # Arguments
500 |         value (any): value
501 | 
502 |     """
503 | 
504 |     # Make this "reasonably immutable": don't support setting other attributes,
505 |     # don't support re-setting `value`
506 |     __slots__ = ('_value',)
507 | 
508 |     def __init__(self, value):
509 |         self._value = value
510 | 
511 |     @property
512 |     def value(self):
513 |         return self._value
514 | 
515 |     def __repr__(self):
516 |         return 'FailedCast(%r)' % self._value
517 | 
518 |     def __getattr__(self, name):
519 |         return getattr(self._value, name)
520 | 
521 |     def __lt__(self, other):
522 |         return self._value < other
523 | 
524 |     def __le__(self, other):
525 |         return self._value <= other
526 | 
527 |     def __eq__(self, other):
528 |         return self._value == other
529 | 
530 |     def __ne__(self, other):
531 |         return self._value != other
532 | 
533 |     def __gt__(self, other):
534 |         return self._value > other
535 | 
536 |     def __ge__(self, other):
537 |         return self._value >= other
538 | 
539 |     def __hash__(self):
540 |         return object.__hash__(self)
541 | 
542 | 
543 | # Internal
544 | _INFER_DATE_FORMATS = [
545 |     '%Y-%m-%d',
546 |     '%d/%m/%Y',
547 |     '%m/%d/%Y',
548 |     '%d/%m/%y',
549 |     '%m/%d/%y',
550 |     '%Y%m%d',
551 |     '%d-%m-%y',
552 |     '%Y/%m/%d',
553 |     '%d.%m.%Y',
554 |     '%d.%m.%y',
555 | ]
556 | 
557 | 
558 | _INFER_TYPE_ORDER = [
559 |     'duration',
560 |     'geojson',
561 |     'geopoint',
562 |     'object',
563 |     'array',
564 |     'datetime',
565 |     'time',
566 |     ('date', _INFER_DATE_FORMATS),
567 |     'integer',
568 |     'number',
569 |     'boolean',
570 |     'string',
571 |     'any',
572 | ]
573 | 
574 | 
575 | class _TypeGuesser(object):
576 |     """Guess possible types for a value, yielding ('type', 'format', priority) tuples.
577 |     """
578 | 
579 |     # Public
580 | 
581 |     def __init__(self, missing_values):
582 |         self.missing_values = missing_values
583 | 
584 |     def cast(self, value):
585 |         for priority, type_rec in enumerate(_INFER_TYPE_ORDER):
586 |             if isinstance(type_rec, tuple):
587 |                 name, formats = type_rec
588 |             else:
589 |                 name, formats = type_rec, ['default']
590 |             cast = getattr(types, 'cast_%s' % name)
591 |             if value not in self.missing_values:
592 |                 for format in formats:
593 |                     result = cast(format, value)
594 |                     if result != config.ERROR:
595 |                         yield (name, format, priority)
596 | 
597 | 
598 | class _TypeResolver(object):
599 |     """Get the best matching type/format from a list of possible ones.
600 |     """
601 | 
602 |     # Public
603 | 
604 |     def get(self, results, confidence):
605 |         variants = set(results)
606 |         # only one candidate... that's easy.
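        # otherwise: count each (type, format, priority) match, keep every
        # candidate whose count is within `confidence` of the best count,
        # and among those pick the most specific type (lowest priority number)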
607 |         if len(variants) == 1:
608 |             rv = {'type': results[0][0], 'format': results[0][1]}
609 |         else:
610 |             counts = {}
611 |             for result in results:
612 |                 if counts.get(result):
613 |                     counts[result] += 1
614 |                 else:
615 |                     counts[result] = 1
616 |             # tuple representation of `counts` dict sorted by values
617 |             sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)
618 |             if not sorted_counts:
619 |                 return {'type': 'string', 'format': 'default'}
620 |             # Allow also counts that are not the max, based on the confidence
621 |             max_count = sorted_counts[0][1]
622 |             sorted_counts = filter(lambda item: item[1] >= max_count * confidence,
623 |                                    sorted_counts)
624 |             # Choose the most specific data type
625 |             sorted_counts = sorted(sorted_counts,
626 |                                    key=lambda item: item[0][2])
627 |             rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]}
628 |         return rv
629 | 
--------------------------------------------------------------------------------
/tableschema/table.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 | 
7 | from copy import copy
8 | from tabulator import Stream
9 | from functools import partial
10 | from collections import OrderedDict
11 | from .storage import Storage
12 | from .schema import Schema
13 | from . import exceptions
14 | from . import helpers
15 | from . import config
16 | from collections import defaultdict
17 | 
18 | 
19 | # Module API
20 | 
21 | class Table(object):
22 |     """Table representation
23 | 
24 |     # Arguments
25 |         source (str/list[]): data source, one of:
26 |             - local file (path)
27 |             - remote file (url)
28 |             - array of arrays representing the rows
29 |         schema (any): data schema in all forms supported by `Schema` class
30 |         strict (bool): strictness option to pass to `Schema` constructor
31 |         post_cast (function[]): list of post cast processors
32 |         storage (None/str): storage name like `sql` or `bigquery`
33 |         options (dict): `tabulator` or storage's options
34 | 
35 |     # Raises
36 |         TableSchemaException: raises on any error
37 | 
38 |     """
39 | 
40 |     # Public
41 | 
42 |     def __init__(self, source, schema=None, strict=False,
43 |                  post_cast=[], storage=None, **options):
44 | 
45 |         # Set attributes
46 |         self.__source = source
47 |         self.__stream = None
48 |         self.__schema = None
49 |         self.__headers = None
50 |         self.__storage = None
51 |         self.__post_cast = copy(post_cast)
52 | 
53 |         # Schema
54 |         if isinstance(schema, Schema):
55 |             self.__schema = schema
56 |         elif schema is not None:
57 |             self.__schema = Schema(schema)
58 | 
59 |         # Stream (tabulator)
60 |         if storage is None:
61 |             options.setdefault('headers', 1)
62 |             self.__stream = Stream(source, **options)
63 | 
64 |         # Stream (storage)
65 |         else:
66 |             if not isinstance(storage, Storage):
67 |                 storage = Storage.connect(storage, **options)
68 |             if self.__schema:
69 |                 storage.describe(source, self.__schema.descriptor)
70 |             headers = Schema(storage.describe(source)).field_names
71 |             self.__stream = Stream(partial(storage.iter, source), headers=headers)
72 |             self.__storage = storage
73 | 
74 |     @property
75 |     def headers(self):
76 |         """Table's headers, if available
77 | 
78 |         # Returns
79 |             str[]: headers
80 | 
81 |         """
82 |         return self.__headers
83 | 
84 |     @property
85 |     def schema(self):
86 |         """Returns schema class instance if available
87 | 
88 |         # Returns
89 |             Schema: schema
90 | 
91 |         """
""" 92 | return self.__schema 93 | 94 | @property 95 | def size(self): 96 | """Table's size in BYTES if it's available 97 | 98 | If it's already read using e.g. `table.read`, otherwise returns `None`. 99 | In the middle of an iteration it returns size of already read contents 100 | 101 | # Returns 102 | int/None: size in BYTES 103 | 104 | """ 105 | if self.__stream: 106 | return self.__stream.size 107 | 108 | @property 109 | def hash(self): 110 | """Table's SHA256 hash if it's available. 111 | 112 | If it's already read using e.g. `table.read`, otherwise returns `None`. 113 | In the middle of an iteration it returns hash of already read contents 114 | 115 | # Returns 116 | str/None: SHA256 hash 117 | 118 | """ 119 | if self.__stream: 120 | return self.__stream.hash 121 | 122 | def iter(self, keyed=False, extended=False, cast=True, 123 | integrity=False, relations=False, 124 | foreign_keys_values=False, exc_handler=None): 125 | """Iterates through the table data and emits rows cast based on table schema. 126 | 127 | # Arguments 128 | 129 | keyed (bool): 130 | yield keyed rows in a form of `{header1\\: value1, header2\\: value2}` 131 | (default is false; the form of rows is `[value1, value2]`) 132 | 133 | extended (bool): 134 | yield extended rows in a for of `[rowNumber, [header1, header2], [value1, value2]]` 135 | (default is false; the form of rows is `[value1, value2]`) 136 | 137 | cast (bool): 138 | disable data casting if false 139 | (default is true) 140 | 141 | integrity (dict): 142 | dictionary in a form of `{'size'\\: , 'hash'\\: ''}` 143 | to check integrity of the table when it's read completely. 144 | Both keys are optional. 145 | 146 | relations (dict): 147 | dictionary of foreign key references in a form 148 | of `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`. 149 | If provided, foreign key fields will checked and resolved 150 | to one of their references (/!\\ one-to-many fk are not completely resolved). 151 | 152 | foreign_keys_values (dict): 153 | three-level dictionary of foreign key references optimized 154 | to speed up validation process in a form of 155 | `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`. 156 | If not provided but relations is true, it will be created 157 | before the validation process by *index_foreign_keys_values* method 158 | 159 | exc_handler (func): 160 | optional custom exception handler callable. 161 | Can be used to defer raising errors (i.e. "fail late"), e.g. 162 | for data validation purposes. Must support the signature below 163 | 164 | # Custom exception handler 165 | 166 | ```python 167 | def exc_handler(exc, row_number=None, row_data=None, error_data=None): 168 | '''Custom exception handler (example) 169 | 170 | # Arguments: 171 | exc(Exception): 172 | Deferred exception instance 173 | row_number(int): 174 | Data row number that triggers exception exc 175 | row_data(OrderedDict): 176 | Invalid data row source data 177 | error_data(OrderedDict): 178 | Data row source data field subset responsible for the error, if 179 | applicable (e.g. invalid primary or foreign key fields). May be 180 | identical to row_data. 181 | ''' 182 | # ... 
183 | ``` 184 | 185 | # Raises 186 | TableSchemaException: base class of any error 187 | CastError: data cast error 188 | IntegrityError: integrity checking error 189 | UniqueKeyError: unique key constraint violation 190 | UnresolvedFKError: unresolved foreign key reference error 191 | 192 | # Returns 193 | Iterator[list]: yields rows 194 | 195 | """ 196 | exc_handler = helpers.default_exc_handler if exc_handler is None else \ 197 | exc_handler 198 | 199 | # Prepare unique checks 200 | if cast: 201 | unique_fields_cache = {} 202 | if self.schema: 203 | unique_fields_cache = _create_unique_fields_cache(self.schema) 204 | # Prepare relation checks 205 | if relations and not foreign_keys_values: 206 | # we have to test relations but the index has not been precomputed 207 | # prepare the index to boost validation process 208 | foreign_keys_values = self.index_foreign_keys_values(relations) 209 | 210 | # Open/iterate stream 211 | with self.__stream as stream: 212 | iterator = stream.iter(extended=True) 213 | iterator = self.__apply_processors( 214 | iterator, cast=cast, exc_handler=exc_handler) 215 | for row_number, headers, row in iterator: 216 | 217 | # Get headers 218 | if not self.__headers: 219 | self.__headers = headers 220 | 221 | # Check headers 222 | if cast: 223 | if self.schema and self.headers: 224 | if self.headers != self.schema.field_names: 225 | message = ( 226 | 'Table headers (%r) don\'t match ' 227 | 'schema field names (%r) in row %s' % ( 228 | self.headers, self.schema.field_names, 229 | row_number)) 230 | keyed_row = OrderedDict(zip(headers, row)) 231 | exc_handler( 232 | exceptions.CastError(message), 233 | row_number=row_number, row_data=keyed_row, 234 | error_data=keyed_row) 235 | continue 236 | 237 | # Check unique 238 | if cast: 239 | for indexes, cache in unique_fields_cache.items(): 240 | keyed_values = OrderedDict( 241 | (headers[i], value) 242 | for i, value in enumerate(row) if i in indexes) 243 | values = tuple(keyed_values.values()) 244 | if not all(map(lambda value: value is None, values)): 245 | if values in cache['data']: 246 | message = ( 247 | 'Field(s) "%s" duplicates in row "%s" ' 248 | 'for values %r' % ( 249 | cache['name'], row_number, values)) 250 | exc_handler( 251 | exceptions.UniqueKeyError(message), 252 | row_number=row_number, 253 | row_data=OrderedDict(zip(headers, row)), 254 | error_data=keyed_values) 255 | cache['data'].add(values) 256 | 257 | # Resolve relations 258 | if relations: 259 | if self.schema: 260 | row_with_relations = dict(zip(headers, copy(row))) 261 | for foreign_key in self.schema.foreign_keys: 262 | refValue = _resolve_relations(row, headers, foreign_keys_values, 263 | foreign_key) 264 | if refValue is None: 265 | keyed_row = OrderedDict(zip(headers, row)) 266 | # local values of the FK 267 | local_keyed_values = { 268 | key: keyed_row[key] 269 | for key in foreign_key['fields'] 270 | } 271 | local_values = tuple(local_keyed_values.values()) 272 | message = ( 273 | 'Foreign key "%s" violation in row "%s": ' 274 | '%s not found in %s' % ( 275 | foreign_key['fields'], 276 | row_number, 277 | local_values, 278 | foreign_key['reference']['resource'])) 279 | exc_handler( 280 | exceptions.UnresolvedFKError(message), 281 | row_number=row_number, row_data=keyed_row, 282 | error_data=local_keyed_values) 283 | # If we reach this point we don't fail-early 284 | # i.e. no exception has been raised. As the 285 | # reference can't be resolved, use empty dict 286 | # as the "unresolved result". 
288 |                                 for field in foreign_key['fields']:
289 |                                     if not isinstance(
290 |                                             row_with_relations[field], dict):
291 |                                         row_with_relations[field] = {}
292 |                             elif type(refValue) is dict:
293 |                                 # Substitute resolved referenced object for
294 |                                 # original referencing field value.
295 |                                 # For a composite foreign key, this substitutes
296 |                                 # each part of the composite key with the
297 |                                 # referenced object.
298 |                                 for field in foreign_key['fields']:
299 |                                     if type(row_with_relations[field]) is not dict:
300 |                                         # no previous refValues injected on this field
301 |                                         row_with_relations[field] = refValue
302 |                                     else:
303 |                                         # already one ref injected: merge them
304 |                                         row_with_relations[field].update(refValue)
305 |                             else:
306 |                                 # case when all original values of the FK are empty:
307 |                                 # refValue == row, there is nothing to do
308 |                                 # (an empty dict might be a better return value here?)
309 |                                 pass
310 | 
311 |                         # mutate row now that we are done, in the right order
312 |                         row = [row_with_relations[f] for f in headers]
313 | 
314 |                 # Form row
315 |                 if extended:
316 |                     yield (row_number, headers, row)
317 |                 elif keyed:
318 |                     yield dict(zip(headers, row))
319 |                 else:
320 |                     yield row
321 | 
322 |             # Check integrity
323 |             if integrity:
324 |                 violations = []
325 |                 size = integrity.get('size')
326 |                 hash = integrity.get('hash')
327 |                 if size and size != self.__stream.size:
328 |                     violations.append('size "%s"' % self.__stream.size)
329 |                 if hash and hash != self.__stream.hash:
330 |                     violations.append('hash "%s"' % self.__stream.hash)
331 |                 if violations:
332 |                     message = 'Calculated %s differ(s) from declared value(s)'
333 |                     raise exceptions.IntegrityError(message % ' and '.join(violations))
334 | 
335 |     def read(self, keyed=False, extended=False, cast=True, limit=None,
336 |              integrity=False, relations=False, foreign_keys_values=False,
337 |              exc_handler=None):
338 |         """Read the whole table and return as array of rows
339 | 
340 |         > It has the same API as `table.iter` except for:
341 | 
342 |         # Arguments
343 |             limit (int): limit count of rows to read and return
344 | 
345 |         # Returns
346 |             list[]: returns rows
347 | 
348 |         """
349 |         result = []
350 |         rows = self.iter(
351 |             keyed=keyed, extended=extended, cast=cast, integrity=integrity,
352 |             relations=relations, foreign_keys_values=foreign_keys_values,
353 |             exc_handler=exc_handler)
354 |         for count, row in enumerate(rows, start=1):
355 |             result.append(row)
356 |             if count == limit:
357 |                 break
358 |         return result
359 | 
360 |     def infer(self, limit=100, confidence=0.75,
361 |               missing_values=config.DEFAULT_MISSING_VALUES,
362 |               guesser_cls=None, resolver_cls=None):
363 |         """Infer a schema for the table.
364 | 
365 |         It will infer and set Table Schema to `table.schema` based on table data.
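        Unlike `Schema.infer`, the data sample comes from the underlying stream
        (or from the storage, when the table is storage-backed). An already set
        schema or headers are kept as is.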
366 | 
367 |         # Arguments
368 |             limit (int): limit rows sample size
369 |             confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)
370 |             missing_values (str[]): list of missing values (by default `['']`)
371 |             guesser_cls (class): you can implement inferring strategies by
372 |                 providing type-guessing and type-resolving classes [experimental]
373 |             resolver_cls (class): you can implement inferring strategies by
374 |                 providing type-guessing and type-resolving classes [experimental]
375 | 
376 |         # Returns
377 |             dict: Table Schema descriptor
378 | 
379 |         """
380 |         if self.__schema is None or self.__headers is None:
381 | 
382 |             # Infer (tabulator)
383 |             if not self.__storage:
384 |                 with self.__stream as stream:
385 |                     if self.__schema is None:
386 |                         self.__schema = Schema({'missingValues': missing_values})
387 |                     self.__schema.infer(stream.sample[:limit],
388 |                                         headers=stream.headers,
389 |                                         confidence=confidence,
390 |                                         guesser_cls=guesser_cls,
391 |                                         resolver_cls=resolver_cls)
392 |                     if self.__headers is None:
393 |                         self.__headers = stream.headers
394 | 
395 |             # Infer (storage)
396 |             else:
397 |                 descriptor = self.__storage.describe(self.__source)
398 |                 if self.__schema is None:
399 |                     self.__schema = Schema(descriptor)
400 |                 if self.__headers is None:
401 |                     self.__headers = self.__schema.field_names
402 | 
403 |         return self.__schema.descriptor
404 | 
405 |     def save(self, target, storage=None, **options):
406 |         """Save data source to file locally in CSV format with `,` (comma) delimiter
407 | 
408 |         > To save schema use `table.schema.save()`
409 | 
410 |         # Arguments
411 |             target (str): saving target (e.g. file path)
412 |             storage (None/str): storage name like `sql` or `bigquery`
413 |             options (dict): `tabulator` or storage options
414 | 
415 |         # Raises
416 |             TableSchemaException: raises an error if there is a saving problem
417 | 
418 |         # Returns
419 |             True/Storage: returns true or a storage instance
420 | 
421 |         """
422 | 
423 |         # Save (tabulator)
424 |         if storage is None:
425 |             with Stream(self.iter, headers=self.__schema.headers) as stream:
426 |                 stream.save(target, **options)
427 |             return True
428 | 
429 |         # Save (storage)
430 |         else:
431 |             if not isinstance(storage, Storage):
432 |                 storage = Storage.connect(storage, **options)
433 |             storage.create(target, self.__schema.descriptor, force=True)
434 |             storage.write(target, self.iter(cast=False))
435 |             return storage
436 | 
437 |     def index_foreign_keys_values(self, relations):
438 |         """Creates a three-level dictionary of foreign key references
439 | 
440 |         We create it optimized to speed up the validation process, in a form of
441 |         `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.
442 | 
443 |         For each foreign key of the schema it will iterate through the corresponding
444 |         `relations['resource']` to create an index (i.e. a dict) of existing values
445 |         for the foreign fields and store one keyed row for each value combination.
446 | 
447 |         The optimization relies on the indexation of possible values for one foreign key
448 |         in a hashmap to later speed up resolution.
449 | 
450 |         This method is public to allow creating the index once and applying it
451 |         to multiple tables sharing the same schema
452 |         (typically [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))
453 | 
454 |         # Notes
455 | 
456 |         - the second key of the output is a tuple of the foreign fields,
457 |           a proxy identifier of the foreign key
458 |         - the same relation resource can be indexed multiple times,
459 |           as a schema can contain more than one foreign key
460 |           pointing to the same resource
461 | 
462 |         # Arguments
463 |             relations (dict):
464 |                 dict of foreign key references in a form of
465 |                 `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
466 |                 It must contain all resources pointed to in the foreign keys schema definition.
467 | 
468 |         # Returns
469 |             dict:
470 |                 returns a three-level dictionary of foreign key references
471 |                 optimized to speed up the validation process, in a form of
472 |                 `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`
473 | 
474 |         """
475 | 
476 |         # we don't need to load the complete reference table to test relations:
477 |         # we can lower the payload AND optimize testing foreign keys
478 |         # by preparing the right index based on the foreign key definition
479 |         # foreign_keys are sets of tuples of all possible values in the foreign table
480 |         # foreign keys =
481 |         #   [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
482 |         foreign_keys = defaultdict(dict)
483 |         if self.schema:
484 |             for fk in self.schema.foreign_keys:
485 |                 # load relation data
486 |                 relation = fk['reference']['resource']
487 | 
488 |                 # create a set of foreign keys
489 |                 # to optimize, we prepare an index of existing values
490 |                 # this index should use reference + foreign_keys as key,
491 |                 # because many foreign keys may use the same reference
492 |                 foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
493 |                 for row in relations[relation]:
494 |                     key = tuple([row[foreign_field] for foreign_field in fk['reference']['fields']])
495 |                     # here we could choose to pick the first or the nth matching row;
496 |                     # the previous implementation picked the first, so be it
497 |                     if key not in foreign_keys[relation][tuple(fk['reference']['fields'])]:
498 |                         foreign_keys[relation][tuple(fk['reference']['fields'])][key] = row
499 |         return foreign_keys
500 | 
501 |     # Private
502 | 
503 |     def __apply_processors(self, iterator, cast=True, exc_handler=None):
504 | 
505 |         # Apply processors to iterator
506 |         def builtin_processor(extended_rows):
507 |             for row_number, headers, row in extended_rows:
508 |                 if self.__schema and cast:
509 |                     row = self.__schema.cast_row(
510 |                         row, row_number=row_number, exc_handler=exc_handler)
511 |                 yield (row_number, headers, row)
512 |         processors = [builtin_processor] + self.__post_cast
513 |         for processor in processors:
514 |             iterator = processor(iterator)
515 | 
516 |         return iterator
517 | 
518 | 
519 | # Internal
520 | 
521 | def _create_unique_fields_cache(schema):
522 |     primary_key_indexes = []
523 |     cache = {}
524 | 
525 |     # Unique
526 |     for index, field in enumerate(schema.fields):
527 |         if field.name in schema.primary_key:
528 |             primary_key_indexes.append(index)
529 |         if field.constraints.get('unique'):
530 |             cache[tuple([index])] = {
531 |                 'name': field.name,
532 |                 'data': set(),
533 |             }
534 | 
535 |     # Primary key
536 |     if primary_key_indexes:
537 |         cache[tuple(primary_key_indexes)] = {
538 |             'name': ', '.join(schema.primary_key),
539 |             'data': set(),
540 |         }
541 | 
542 |     return cache
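# A minimal usage sketch for the index built by `index_foreign_keys_values`
# (hypothetical file and resource names, not fixtures from this repository;
# `visits_schema.json` is assumed to declare a foreign key such as
# `{'fields': ['person_id'], 'reference': {'resource': 'people', 'fields': ['id']}}`):
#
#     from tableschema import Table
#
#     relations = {
#         'people': [
#             {'id': '1', 'name': 'Paul'},
#             {'id': '2', 'name': 'Jane'},
#         ],
#     }
#     table = Table('visits.csv', schema='visits_schema.json')
#     index = table.index_foreign_keys_values(relations)
#     # index == {'people': {('id',): {('1',): {'id': '1', 'name': 'Paul'},
#     #                                ('2',): {'id': '2', 'name': 'Jane'}}}}
#     for row in table.iter(relations=relations, foreign_keys_values=index):
#         pass  # FK fields arrive resolved to the referenced keyed rows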
543 | 
544 | 
545 | def _resolve_relations(row, headers, foreign_keys_values, foreign_key):
546 | 
547 |     # Prepare helpers - needed data structures
548 |     keyed_row = OrderedDict(zip(headers, row))
549 |     # local values of the FK
550 |     local_values = tuple(keyed_row[f] for f in foreign_key['fields'])
551 |     if set(local_values) != {None}:
552 |         # test existence in the foreign resource
553 |         relation = foreign_key['reference']['resource']
554 |         keys = tuple(foreign_key['reference']['fields'])
555 |         foreign_values = foreign_keys_values[relation][keys]
556 |         if local_values in foreign_values:
557 |             return foreign_values[local_values]
558 |         else:
559 |             return None
560 |     else:
561 |         # empty values for all keys, return the original row
562 |         return row
--------------------------------------------------------------------------------