├── excelschema ├── __init__.py ├── exception.py ├── constraint.py ├── util.py └── core.py ├── pyproject.toml ├── pyproject.lock ├── LICENSE ├── README.md └── .gitignore /excelschema/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import SchemaParser 2 | from .constraint import Constraint 3 | from .util import parse_record 4 | -------------------------------------------------------------------------------- /excelschema/exception.py: -------------------------------------------------------------------------------- 1 | class NonUniformTypeException(TypeError): 2 | pass 3 | 4 | 5 | class NotUniqueException(ValueError): 6 | pass 7 | 8 | 9 | class NotNullException(ValueError): 10 | pass 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "excelschema" 3 | version = "0.1.2.4" 4 | description = "Excel records' parser and schema viewing and validating tools." 5 | authors = ["patarapolw "] 6 | readme = "README.md" 7 | repository = "https://github.com/patarapolw/excelschema" 8 | homepage = "https://github.com/patarapolw/excelschema" 9 | keywords = ["excel"] 10 | 11 | [tool.poetry.dependencies] 12 | python = "*" 13 | python-dateutil = "^2.7" 14 | 15 | [tool.poetry.dev-dependencies] 16 | -------------------------------------------------------------------------------- /pyproject.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | category = "main" 3 | description = "Extensions to the standard Python datetime module" 4 | name = "python-dateutil" 5 | optional = false 6 | platform = "*" 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 8 | version = "2.7.3" 9 | 10 | [package.dependencies] 11 | six = ">=1.5" 12 | 13 | [[package]] 14 | category = "main" 15 | description = "Python 2 and 3 compatibility utilities" 16 | name = "six" 17 | optional = false 18 | platform = "*" 19 | python-versions = "*" 20 | version = "1.11.0" 21 | 22 | [metadata] 23 | content-hash = "ddb59c464d287486857167aa19fca617cd0116fc3d277ce662140e83cfc2fd3e" 24 | platform = "*" 25 | python-versions = "*" 26 | 27 | [metadata.hashes] 28 | python-dateutil = ["1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0", "e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8"] 29 | six = ["70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", "832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"] 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Pacharapol Withayasakpunt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # excelschema 2 | 3 | Excel records' parser and schema viewing and validating tools. 4 | 5 | ## Installation 6 | 7 | Method 1: 8 | 9 | ``` 10 | $ pip install excelschema 11 | ``` 12 | 13 | Method 2: 14 | - Clone the project from GitHub 15 | - `poetry install` 16 | 17 | ## Usage 18 | 19 | To read an Excel file, you may also need to install [`pyexcel`](https://github.com/pyexcel/pyexcel) and [`pyexcel-xlsx`](https://github.com/pyexcel/pyexcel-xlsx) as well. 20 | 21 | ```python 22 | >>> from excelschema import SchemaParser 23 | >>> import pyexcel 24 | >>> sp = SchemaParser(records=pyexcel.get_records(file_name='foo.xlsx', sheet_name='bar')) 25 | >>> sp.schema 26 | { 27 | 'record_id': , 28 | 'modified': , 29 | 'data': 30 | } 31 | ``` 32 | 33 | Validating records and convert it to a usable one. 34 | 35 | ```python 36 | >>> sp.ensure_one({'record_id': ' 12', 'data': 567}) 37 | {'record_id', 12, 'data': '567'} 38 | ``` 39 | 40 | Setting constraints 41 | 42 | ```python 43 | >>> from excelschema import Constraint 44 | >>> sp.update_schema({ 45 | ... 'user_id': Constraint(type_=int, unique=True, not_null=True) 46 | ... }) 47 | ``` 48 | 49 | It is also possible to create an custom schema without an Excel 50 | 51 | ```python 52 | >>> sp = SchemaParser(schema={ 53 | ... 'record_id': Constraint(type_=int, unique=True, not_null=True), 54 | ... 'modified': datetime 55 | ... }) 56 | ``` 57 | 58 | ## Bonus functions 59 | 60 | Cleaning dirty Excel records 61 | 62 | ```python 63 | >>> from excelschema import parse_record 64 | >>> parse_record({'foo': ' 1', 'bar': ' - ', 'baz': ' '}) 65 | {'foo', 1} 66 | ``` 67 | 68 | 69 | ## Related projects 70 | 71 | - https://github.com/patarapolw/tinydb-constraint 72 | -------------------------------------------------------------------------------- /excelschema/constraint.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, Any, Union 2 | 3 | 4 | class Constraint(NamedTuple): 5 | type_: Union[type, list, type(Any)] = Any 6 | unique: bool = False 7 | not_null: bool = False 8 | 9 | def __repr__(self): 10 | if not self.unique and not self.not_null: 11 | return repr(self.type_) 12 | else: 13 | return super(Constraint, self).__repr__() 14 | 15 | 16 | class ConstraintMapping: 17 | def __init__(self): 18 | self.type_ = dict() 19 | self.preexisting = dict() 20 | self.not_null = set() 21 | 22 | def update(self, schema_dict): 23 | if schema_dict: 24 | for k, c in schema_dict.items(): 25 | if isinstance(c, type): 26 | self._parse_type(k, c) 27 | else: 28 | assert isinstance(c, Constraint), repr(c) 29 | 30 | if c.type_: 31 | self._parse_type(k, c.type_) 32 | if c.unique: 33 | self.preexisting.setdefault(k, set()) 34 | if c.not_null: 35 | self.not_null.add(k) 36 | 37 | def _parse_type(self, k, type_): 38 | if k in self.type_.keys(): 39 | expected_type = self.type_[k] 40 | if expected_type is not Any: 41 | if type_ is not expected_type: 42 | raise TypeError 43 | else: 44 | self.type_[k] = type_ 45 | 46 | def _view(self): 47 | all_keys = set(self.type_.keys()) | set(self.preexisting.keys()) | self.not_null 48 | 49 | for k in all_keys: 50 | type_ = self.type_.get(k, Any) 51 | unique = k in self.preexisting.keys() 52 | not_null = k in self.not_null 53 | 54 | yield k, Constraint(type_, unique, not_null) 55 | 56 | def view(self): 57 | return dict(self._view()) 58 | 59 | def __repr__(self): 60 | return repr(self.view()) 61 | -------------------------------------------------------------------------------- /excelschema/util.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | from datetime import datetime, date 3 | import dateutil.parser 4 | from collections import OrderedDict 5 | import itertools 6 | 7 | 8 | def normalize_chars(s): 9 | return unicodedata.normalize("NFKD", s) 10 | 11 | 12 | def parse_record(record, yield_='type', as_datetime_str=False): 13 | return dict(_parse_record(record, yield_, as_datetime_str)) 14 | 15 | 16 | def _parse_record(record, yield_='type', as_datetime_str=False): 17 | def _yield_switch(x): 18 | if yield_ == 'type': 19 | return type(x) 20 | elif yield_ == 'record': 21 | if isinstance(x, (datetime, date)): 22 | x = x.isoformat() 23 | if not as_datetime_str: 24 | x = dateutil.parser.parse(x) 25 | 26 | return x 27 | else: 28 | raise ValueError 29 | 30 | for k, v in record.items(): 31 | if isinstance(v, str): 32 | v = normalize_chars(v.strip()) 33 | if v.isdigit(): 34 | v = int(v) 35 | elif '.' in v and v.replace('.', '', 1).isdigit(): 36 | v = float(v) 37 | elif v in {'', '-'}: 38 | continue 39 | else: 40 | try: 41 | v = dateutil.parser.parse(v) 42 | except ValueError: 43 | pass 44 | elif isinstance(v, date): 45 | v = datetime.combine(v, datetime.min.time()) 46 | 47 | yield k, _yield_switch(v) 48 | 49 | 50 | def parse_excel_array(records=None, array=None, header=True): 51 | if records and array: 52 | raise ValueError('Please specify either record or array') 53 | 54 | if array: 55 | if header: 56 | if not isinstance(header, (list, tuple)): 57 | header = array[0] 58 | array = array[1:] 59 | else: 60 | header = itertools.count() 61 | 62 | records = list() 63 | for row in array: 64 | records.append(OrderedDict(zip(header, row))) 65 | 66 | if isinstance(header, itertools.count): 67 | header = itertools.count() 68 | 69 | return records 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/osx,python 3 | 4 | ### OSX ### 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two \r 11 | Icon 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | ### Python ### 33 | # Byte-compiled / optimized / DLL files 34 | __pycache__/ 35 | *.py[cod] 36 | *$py.class 37 | 38 | # C extensions 39 | *.so 40 | 41 | # Distribution / packaging 42 | .Python 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | wheels/ 55 | *.egg-info/ 56 | .installed.cfg 57 | *.egg 58 | MANIFEST 59 | 60 | # PyInstaller 61 | # Usually these files are written by a python script from a template 62 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 63 | *.manifest 64 | *.spec 65 | 66 | # Installer logs 67 | pip-log.txt 68 | pip-delete-this-directory.txt 69 | 70 | # Unit test / coverage reports 71 | htmlcov/ 72 | .tox/ 73 | .nox/ 74 | .coverage 75 | .coverage.* 76 | .cache 77 | nosetests.xml 78 | coverage.xml 79 | *.cover 80 | .hypothesis/ 81 | .pytest_cache/ 82 | 83 | # Translations 84 | *.mo 85 | *.pot 86 | 87 | # Django stuff: 88 | *.log 89 | local_settings.py 90 | db.sqlite3 91 | 92 | # Flask stuff: 93 | instance/ 94 | .webassets-cache 95 | 96 | # Scrapy stuff: 97 | .scrapy 98 | 99 | # Sphinx documentation 100 | docs/_build/ 101 | 102 | # PyBuilder 103 | target/ 104 | 105 | # Jupyter Notebook 106 | .ipynb_checkpoints 107 | 108 | # IPython 109 | profile_default/ 110 | ipython_config.py 111 | 112 | # pyenv 113 | .python-version 114 | 115 | # celery beat schedule file 116 | celerybeat-schedule 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | ### Python Patch ### 146 | .venv/ 147 | 148 | ### Python.VirtualEnv Stack ### 149 | # Virtualenv 150 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 151 | [Bb]in 152 | [Ii]nclude 153 | [Ll]ib 154 | [Ll]ib64 155 | [Ll]ocal 156 | [Ss]cripts 157 | pyvenv.cfg 158 | pip-selfcheck.json 159 | 160 | 161 | # End of https://www.gitignore.io/api/osx,python 162 | 163 | .vscode/ 164 | .idea/ 165 | setup.py 166 | -------------------------------------------------------------------------------- /excelschema/core.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from .constraint import ConstraintMapping 4 | from .exception import NotUniqueException, NotNullException, NonUniformTypeException 5 | from .util import parse_record, parse_excel_array 6 | 7 | 8 | class SchemaParser: 9 | constraint_mapping = ConstraintMapping() 10 | records = list() 11 | 12 | def __init__(self, records=None, array=None, header=True, as_datetime_str=False, schema=None): 13 | self.as_datetime_str = as_datetime_str 14 | 15 | if schema: 16 | self.constraint_mapping.update(schema) 17 | 18 | if records and array: 19 | self.records = self.ensure_multiple(records=records, array=array, header=header) 20 | 21 | @property 22 | def schema(self): 23 | """Get table's latest schema 24 | 25 | Returns: 26 | dict -- dictionary of constraints 27 | """ 28 | 29 | return self.constraint_mapping.view() 30 | 31 | @schema.setter 32 | def schema(self, schema_dict): 33 | """Reset and set a new schema 34 | 35 | Arguments: 36 | schema_dict {dict} -- dictionary of constraints or types 37 | """ 38 | 39 | self.constraint_mapping = ConstraintMapping() 40 | self.update_schema(schema_dict) 41 | 42 | def update_schema(self, schema_dict): 43 | """Update the schema 44 | 45 | Arguments: 46 | schema_dict {dict} -- dictionary of constraints or types 47 | """ 48 | 49 | self.constraint_mapping.update(schema_dict) 50 | self.ensure_multiple(self.records) 51 | 52 | def ensure_multiple(self, records=None, update_schema=False, array=None, header=None): 53 | """Sanitizes records, e.g. from Excel spreadsheet 54 | 55 | Arguments: 56 | records {list, tuple} -- Iterable of records. Can be a 2-D array of list of dictionaries 57 | 58 | Returns: 59 | list -- List of records 60 | """ 61 | 62 | def _records(): 63 | nonlocal records 64 | 65 | records = parse_excel_array(records=records, array=array, header=header) 66 | for record_ in records: 67 | record_schema = parse_record(record_, yield_='type') 68 | num_to_str = set() 69 | for k, v in record_schema.items(): 70 | expected_type = self.constraint_mapping.type_.get(k, None) 71 | if expected_type and v is not expected_type: 72 | if expected_type is str and v in (int, float): 73 | num_to_str.add(k) 74 | else: 75 | raise NonUniformTypeException('{} not in table schema {}' 76 | .format(v, self.schema)) 77 | self.constraint_mapping.update(schema_dict=record_schema) 78 | 79 | record_ = parse_record(record_, yield_='record', 80 | as_datetime_str=self.as_datetime_str) 81 | for k, v in record_.items(): 82 | if k in num_to_str: 83 | record_[k] = str(v) 84 | 85 | is_null = self.constraint_mapping.not_null - set(record_.keys()) 86 | if len(is_null) > 0: 87 | raise NotNullException('{} is null'.format(list(is_null))) 88 | 89 | yield record_ 90 | 91 | temp_mapping = None 92 | if not update_schema: 93 | temp_mapping = deepcopy(self.constraint_mapping) 94 | 95 | for c in self.schema.values(): 96 | assert not isinstance(c.type_, list) 97 | 98 | records = list(_records()) 99 | for record in records: 100 | self._update_uniqueness(record) 101 | 102 | if not update_schema: 103 | self.constraint_mapping = ConstraintMapping() 104 | self.constraint_mapping.update(temp_mapping.view()) 105 | else: 106 | self.records.extend(records) 107 | 108 | return records 109 | 110 | def ensure_one(self, record, update_schema=False): 111 | return self.ensure_multiple([record], update_schema=update_schema)[0] 112 | 113 | def _update_uniqueness(self, record_dict): 114 | for k, v in parse_record(record_dict, yield_='type').items(): 115 | if k in self.constraint_mapping.preexisting.keys(): 116 | if v in self.constraint_mapping.preexisting[k]: 117 | raise NotUniqueException('Duplicate {} for {} exists'.format(v, k)) 118 | 119 | self.constraint_mapping.preexisting[k].add(v) 120 | --------------------------------------------------------------------------------