├── .github └── workflows │ └── test.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── itertable ├── __init__.py ├── __main__.py ├── base.py ├── commands.py ├── exceptions.py ├── gis │ ├── __init__.py │ └── mixins.py ├── loaders.py ├── mappers.py ├── parsers │ ├── __init__.py │ ├── base.py │ ├── readers.py │ ├── text.py │ └── xls.py └── util.py ├── pyproject.toml └── tests ├── __init__.py ├── base.py ├── files ├── .gitignore ├── custom.json ├── custom.xml ├── custom2.json ├── extra.xlsx ├── nodata.csv ├── nodata.xlsx ├── noextra.xlsx ├── test.csv ├── test.dbf ├── test.geojson ├── test.json ├── test.prj ├── test.shp ├── test.shx ├── test.xls ├── test.xlsx ├── test.xml ├── test2.csv ├── test3.csv ├── testcsv.zip ├── testmulti.zip └── testxlsx.zip ├── test_custom.py ├── test_dataframe.py ├── test_extra_data.py ├── test_gis.py ├── test_gis_dataframe.py ├── test_load_file.py ├── test_netloader.py ├── test_write.py └── test_zip.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: Python ${{ matrix.python-version }}, ${{ matrix.variant }} 8 | runs-on: ubuntu-22.04 9 | strategy: 10 | matrix: 11 | python-version: ["3.11", "3.10", 3.9, 3.8, 3.7] 12 | variant: [no-magic] 13 | include: 14 | - python-version: "3.11" 15 | variant: magic 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install build 26 | python -m pip install flake8 wheel httpretty beautifulsoup4 27 | python -m pip install requests openpyxl click 28 | python -m pip install Shapely Fiona pandas geopandas xlrd xlwt 29 | - name: Install python-magic 30 | if: ${{ matrix.variant == 
'magic' }} 31 | run: python -m pip install python-magic 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with unittest 39 | run: python -m unittest discover -s tests -t . -v 40 | - name: Test build 41 | run: python -m build 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg 3 | *.swp 4 | *.egg-info 5 | build 6 | dist 7 | README.rst 8 | wq/__init__.py 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at andrew@wq.io. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thanks for contributing to IterTable! Here are some guidelines to help you get started. 4 | 5 | ## Questions 6 | 7 | Questions and ideas can be submitted to the [Django Data Wizard discussion board](https://github.com/wq/django-data-wizard/discussions). 8 | 9 | ## Bug Reports 10 | 11 | Bug reports can be submitted to either [IterTable issues](https://github.com/wq/itertable/issues) or [Django Data Wizard issues](https://github.com/wq/django-data-wizard/issues). Reports can take any form as long as there is enough information to diagnose the problem. 
To speed up response time, try to include the following whenever possible: 12 | * Versions of Fiona and/or Pandas, if applicable 13 | * Expected (or ideal) behavior 14 | * Actual behavior 15 | 16 | ## Pull Requests 17 | 18 | Pull requests are very welcome and will be reviewed and merged as time allows. To speed up reviews, try to include the following whenever possible: 19 | * Reference the issue that the PR fixes (e.g. [#3](https://github.com/wq/itertable/issues/3)) 20 | * Failing test case fixed by the PR 21 | * If the PR provides new functionality, update [the documentation](https://github.com/wq/django-data-wizard/tree/main/docs/itertable) 22 | * Ensure the PR passes lint and unit tests. This happens automatically, but you can also run these locally with the following commands: 23 | 24 | ```bash 25 | python -m unittest discover -s tests -t . -v # run the test suite 26 | flake8 # run code style checking 27 | ``` 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2022, S. Andrew Sheppard, http://wq.io/ 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **IterTable** is a Pythonic API for iterating through tabular data formats, including CSV, XLSX, XML, and JSON. 2 | 3 | ```python 4 | from itertable import load_file 5 | 6 | for row in load_file("example.xlsx"): 7 | print(row.date, row.name) 8 | ``` 9 | 10 | [![Latest PyPI Release](https://img.shields.io/pypi/v/itertable.svg)](https://pypi.org/project/itertable) 11 | [![Release Notes](https://img.shields.io/github/release/wq/itertable.svg)](https://github.com/wq/itertable/releases) 12 | [![License](https://img.shields.io/pypi/l/itertable.svg)](https://github.com/wq/itertable/blob/master/LICENSE) 13 | [![GitHub Stars](https://img.shields.io/github/stars/wq/itertable.svg)](https://github.com/wq/itertable/stargazers) 14 | [![GitHub Forks](https://img.shields.io/github/forks/wq/itertable.svg)](https://github.com/wq/itertable/network) 15 | [![GitHub Issues](https://img.shields.io/github/issues/wq/itertable.svg)](https://github.com/wq/itertable/issues) 16 | 17 | [![Tests](https://github.com/wq/itertable/actions/workflows/test.yml/badge.svg)](https://github.com/wq/itertable/actions/workflows/test.yml) 18 | [![Python Support](https://img.shields.io/pypi/pyversions/itertable.svg)](https://pypi.python.org/pypi/itertable) 19 | 20 | ### [Documentation][docs] 21 | 22 | [**Installation**][installation] 23 | 24 | [**API**][api] 25 |
26 | [CLI][cli] 27 | • 28 | [GIS][gis] 29 | 30 | [**Extending IterTable**][custom] 31 |
32 | [BaseIter][base] 33 | • 34 | [Loaders][loaders] 35 | • 36 | [Parsers][parsers] 37 | • 38 | [Mappers][mappers] 39 | 40 | [docs]: https://django-data-wizard.wq.io/itertable/ 41 | 42 | [installation]: https://django-data-wizard.wq.io/itertable/#getting-started 43 | [api]: https://django-data-wizard.wq.io/itertable/#overview 44 | [cli]: https://django-data-wizard.wq.io/itertable/#command-line-interface 45 | [custom]: https://django-data-wizard.wq.io/itertable/custom 46 | [base]: https://django-data-wizard.wq.io/itertable/base 47 | [loaders]: https://django-data-wizard.wq.io/itertable/loaders 48 | [parsers]: https://django-data-wizard.wq.io/itertable/parsers 49 | [mappers]: https://django-data-wizard.wq.io/itertable/mappers 50 | [gis]: https://django-data-wizard.wq.io/itertable/gis 51 | -------------------------------------------------------------------------------- /itertable/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseIter 2 | 3 | from .loaders import ( 4 | BaseLoader, 5 | FileLoader, 6 | Zipper, 7 | ZipFileLoader, 8 | StringLoader, 9 | NetLoader, 10 | ZipNetLoader, 11 | ) 12 | 13 | from .parsers import ( 14 | CsvParser, 15 | JsonParser, 16 | XmlParser, 17 | OldExcelParser, 18 | ExcelParser, 19 | ) 20 | 21 | from .mappers import ( 22 | BaseMapper, 23 | DictMapper, 24 | TupleMapper, 25 | TimeSeriesMapper, 26 | make_date_mapper, 27 | ) 28 | 29 | from .util import ( 30 | make_iter, 31 | load_file, 32 | load_url, 33 | load_string, 34 | guess_type, 35 | flattened, 36 | ) 37 | 38 | try: 39 | from .version import __version__ as VERSION 40 | except ImportError: 41 | VERSION = "0.0.0" 42 | 43 | 44 | __all__ = ( 45 | "BaseIter", 46 | "BaseLoader", 47 | "FileLoader", 48 | "Zipper", 49 | "ZipFileLoader", 50 | "StringLoader", 51 | "NetLoader", 52 | "ZipNetLoader", 53 | "CsvParser", 54 | "JsonParser", 55 | "XmlParser", 56 | "ExcelParser", 57 | "OldExcelParser", 58 | "BaseMapper", 59 | "DictMapper", 60 
| "TupleMapper", 61 | "TimeSeriesMapper", 62 | "make_date_mapper", 63 | "make_iter", 64 | "load_file", 65 | "load_url", 66 | "load_string", 67 | "guess_type", 68 | "flattened", 69 | "VERSION", 70 | "CsvFileIter", 71 | "CsvNetIter", 72 | "CsvStringIter", 73 | "JsonFileIter", 74 | "JsonNetIter", 75 | "JsonStringIter", 76 | "XmlFileIter", 77 | "XmlNetIter", 78 | "XmlStringIter", 79 | "OldExcelFileIter", 80 | "ExcelFileIter", 81 | ) 82 | 83 | # Some useful pre-mixed classes 84 | CsvFileIter = make_iter(FileLoader, CsvParser) 85 | CsvNetIter = make_iter(NetLoader, CsvParser) 86 | CsvStringIter = make_iter(StringLoader, CsvParser) 87 | 88 | JsonFileIter = make_iter(FileLoader, JsonParser) 89 | JsonNetIter = make_iter(NetLoader, JsonParser) 90 | JsonStringIter = make_iter(StringLoader, JsonParser) 91 | 92 | XmlFileIter = make_iter(FileLoader, XmlParser) 93 | XmlNetIter = make_iter(NetLoader, XmlParser) 94 | XmlStringIter = make_iter(StringLoader, XmlParser) 95 | 96 | OldExcelFileIter = make_iter(FileLoader, OldExcelParser) 97 | ExcelFileIter = make_iter(FileLoader, ExcelParser) 98 | OldExcelNetIter = make_iter(NetLoader, OldExcelParser) 99 | ExcelNetIter = make_iter(NetLoader, ExcelParser) 100 | 101 | try: 102 | from .gis import GisIter, ShapeIter, WktIter 103 | 104 | __all__ += ( 105 | "GisIter", 106 | "ShapeIter", 107 | "WktIter", 108 | ) 109 | except ImportError: 110 | pass 111 | -------------------------------------------------------------------------------- /itertable/__main__.py: -------------------------------------------------------------------------------- 1 | from .commands import cat 2 | 3 | 4 | if __name__ == "__main__": 5 | cat() 6 | -------------------------------------------------------------------------------- /itertable/base.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableMapping, MutableSequence 2 | 3 | 4 | class BaseIter(MutableMapping, MutableSequence): 5 | "itertable.BaseIter: 
Base class for generic resource management" 6 | 7 | tabular = False 8 | nested = False 9 | binary = False 10 | loaded = False 11 | parsed = False 12 | 13 | def __init__(self, **kwargs): 14 | self.__dict__.update(kwargs) 15 | self.refresh() 16 | 17 | def refresh(self): 18 | if not self.loaded: 19 | self.load() 20 | self.loaded = True 21 | 22 | if self.parsed: 23 | return 24 | 25 | if getattr(self, "empty_file", False): 26 | self.data = [] 27 | else: 28 | self.parse() 29 | if hasattr(self, "file"): 30 | f = self.file 31 | if hasattr(f, "close") and not getattr(f, "closed", False): 32 | f.close() 33 | 34 | self.parsed = True 35 | 36 | def load(self): 37 | "Open a resource (defined by loader mixins)" 38 | # self.file = ... 39 | pass 40 | 41 | def parse(self): 42 | """ 43 | Parse a resource (defined by parser mixins). 44 | Result should be an iterable of dicts. 45 | """ 46 | # self.data = some_parse_method(self.file) 47 | pass 48 | 49 | def dump(self, file=None): 50 | """""" 51 | if file is None: 52 | file = self.file 53 | file.write(str(self.data)) 54 | 55 | def save(self): 56 | """""" 57 | self.dump(self.file) 58 | 59 | field_names = None 60 | scan_fields = False 61 | _auto_field_names = None 62 | 63 | def get_field_names(self): 64 | "Returns a list of raw fields to expect (defined by parser mixins)" 65 | if self.field_names is not None: 66 | # Support specifying field_names as string (like namedtuple does) 67 | if isinstance(self.field_names, str): 68 | return self.field_names.replace(",", " ").split() 69 | else: 70 | return self.field_names 71 | 72 | # If no defined field names, try to retrieve from data 73 | if not getattr(self, "data", None): 74 | return None 75 | 76 | if self._auto_field_names: 77 | return self._auto_field_names 78 | 79 | if self.scan_fields: 80 | # Scan all rows for field names 81 | field_names = set() 82 | for row in self.data: 83 | field_names.update(row.keys()) 84 | field_names = list(field_names) 85 | else: 86 | # Assume first row contains 
same keys as all other rows 87 | field_names = list(self.data[0].keys()) 88 | 89 | self._auto_field_names = field_names 90 | return field_names 91 | 92 | @property 93 | def key_field(self): 94 | "Assign a key_field to use the resource as a Map" 95 | return None 96 | 97 | def get_key_field(self): 98 | return self.key_field 99 | 100 | def usable_item(self, item): 101 | "Hook to allow items to be transformed" 102 | return item 103 | 104 | def parse_usable_item(self, uitem): 105 | "Hook to allow items to be untransformed" 106 | return uitem 107 | 108 | def compute_index(self, recompute=False): 109 | key_field = self.get_key_field() 110 | if key_field is None: 111 | return None 112 | 113 | if getattr(self, "_index_cache", None) is not None and not recompute: 114 | return self._index_cache 115 | 116 | index = {} 117 | for i, item in enumerate(self.data): 118 | uitem = self.usable_item(item) 119 | if isinstance(uitem, dict): 120 | key = uitem.get(key_field, None) 121 | else: 122 | key = getattr(uitem, key_field, None) 123 | if key is not None: 124 | index[key] = i 125 | 126 | self._index_cache = index 127 | return index 128 | 129 | def find_index(self, key): 130 | index = self.compute_index() 131 | if index is not None: 132 | return index.get(key, None) 133 | else: 134 | return key 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | 139 | def __getitem__(self, key): 140 | index = self.find_index(key) 141 | if index is None: 142 | raise KeyError 143 | return self.usable_item(self.data[index]) 144 | 145 | def __setitem__(self, key, uitem): 146 | item = self.parse_usable_item(uitem) 147 | index = self.find_index(key) 148 | if index is not None: 149 | self.data[index] = item 150 | else: 151 | self.data.append(item) 152 | self.compute_index(True) 153 | 154 | def __delitem__(self, key): 155 | index = self.find_index(key) 156 | if index is None: 157 | raise KeyError 158 | del self.data[index] 159 | self.compute_index(True) 160 | 161 | def insert(self, index, uitem): 
162 | item = self.parse_usable_item(uitem) 163 | self.data.insert(index, item) 164 | self.compute_index(True) 165 | 166 | def __iter__(self): 167 | for item in self.data: 168 | uitem = self.usable_item(item) 169 | if uitem is None: 170 | return 171 | pk = self.get_key_field() 172 | if pk is None: 173 | yield uitem 174 | elif isinstance(uitem, dict): 175 | yield uitem.get(pk, None) 176 | else: 177 | yield getattr(uitem, pk, None) 178 | 179 | def sync(self, other, save=True): 180 | if self.get_key_field() is None or other.get_key_field() is None: 181 | raise Exception("Key field required to sync!") 182 | for key in self: 183 | other[key] = self[key] 184 | if save: 185 | other.save() 186 | 187 | def copy(self, other, save=True): 188 | del other.data[:] 189 | for item in self.data: 190 | uitem = self.usable_item(item) 191 | other.append(uitem) 192 | if save: 193 | other.save() 194 | 195 | # Slots to track things that can't be pickled 196 | # (need a separate slot for each expected mixin class, since they don't 197 | # extend BaseIter) 198 | no_pickle = [] 199 | no_pickle_loader = [] 200 | no_pickle_mapper = [] 201 | no_pickle_parser = [] 202 | 203 | def get_no_pickle(self): 204 | return ( 205 | self.no_pickle 206 | + self.no_pickle_loader 207 | + self.no_pickle_mapper 208 | + self.no_pickle_parser 209 | ) 210 | 211 | def __getstate__(self): 212 | """ 213 | Don't include auto-created and unpicklable properties in state. 
214 | """ 215 | state = self.__dict__.copy() 216 | for name in self.get_no_pickle(): 217 | state.pop(name, None) 218 | return state 219 | 220 | def item_dict(self, item): 221 | return item 222 | 223 | def as_dataframe(self): 224 | from pandas import DataFrame 225 | 226 | key = self.get_key_field() 227 | if key: 228 | data = [self.item_dict(row) for row in self.values()] 229 | else: 230 | data = [self.item_dict(row) for row in self] 231 | df = DataFrame(data) 232 | if key: 233 | df.set_index(key, inplace=True) 234 | return df 235 | -------------------------------------------------------------------------------- /itertable/commands.py: -------------------------------------------------------------------------------- 1 | from . import load_file, load_url, flattened, JsonStringIter, CsvStringIter 2 | from .exceptions import IterException 3 | import click 4 | import os 5 | import importlib 6 | 7 | 8 | @click.command() 9 | @click.argument("source") 10 | @click.argument("source_options", required=False) 11 | @click.option("--format", "-f", default="csv", help="Output format") 12 | def cat(source, source_options, format): 13 | """ 14 | Display contents of a file or IterTable class. SOURCE can be either a 15 | filename or a Python path. SOURCE_OPTIONS is an optional string 16 | specifying init options in "name=value" format, separated by commas. 17 | 18 | The data will be printed to the terminal in CSV form, unless the format is 19 | set to JSON. 
20 | 21 | Examples: 22 | 23 | \b 24 | python3 -m itertable example.json # JSON to CSV 25 | python3 -m itertable -f json example.csv # CSV to JSON 26 | python3 -m itertable example.xlsx "start_row=5" 27 | python3 -m itertable http://example.com/example.csv 28 | python3 -m itertable itertable.CsvNetIter "url=http://example.com/example.csv" 29 | """ # noqa 30 | 31 | # Parse option string 32 | options = {} 33 | if source_options: 34 | for opt in source_options.split(","): 35 | key, val = opt.split("=") 36 | if val.isdigit(): 37 | val = int(val) 38 | options[key] = val 39 | 40 | if os.path.exists(source): 41 | try: 42 | input = load_file(source, options=options) 43 | except IterException as e: 44 | raise click.ClickException(str(e)) 45 | elif "http" in source and "://" in source: 46 | try: 47 | input = load_url(source, options=options) 48 | except IterException as e: 49 | raise click.ClickException(str(e)) 50 | else: 51 | parts = source.split(".") 52 | class_name = parts[-1] 53 | module_name = ".".join(parts[:-1]) 54 | try: 55 | module = importlib.import_module(module_name) 56 | Iter = getattr(module, class_name) 57 | input = flattened(Iter, **options) 58 | except (ImportError, ValueError, AttributeError, IterException) as e: 59 | raise click.ClickException(str(e)) 60 | 61 | if format == "json": 62 | OutputIter = JsonStringIter 63 | init = "[]" 64 | else: 65 | OutputIter = CsvStringIter 66 | init = "" 67 | output = OutputIter(data=input.data, string=init) 68 | output.data = input.data 69 | output.save() 70 | result = output.string 71 | if output.binary: 72 | result = result.decode("utf-8") 73 | print(result) 74 | -------------------------------------------------------------------------------- /itertable/exceptions.py: -------------------------------------------------------------------------------- 1 | try: 2 | from bs4 import BeautifulSoup 3 | except ImportError: 4 | BeautifulSoup = None 5 | 6 | 7 | class IterException(Exception): 8 | def __str__(self): 9 | if self.args 
and self.args[0]: 10 | return self.args[0] 11 | return self.__doc__ 12 | 13 | 14 | class LoadFailed(IterException): 15 | """Error loading data!""" 16 | 17 | def __init__(self, message, path=None, code=None): 18 | super(LoadFailed, self).__init__(message) 19 | self.path = path 20 | self.code = code 21 | 22 | def __str__(self): 23 | if self.args and self.args[0]: 24 | text = self.args[0] 25 | has_html = False 26 | for tag in " 1 and self.layer_id is None: 44 | cls = type(self) 45 | self.data = [ 46 | { 47 | "id": id, 48 | "name": name, 49 | "data": cls(filename=self.filename, layer_id=id), 50 | } 51 | for id, name in enumerate(self.layers) 52 | ] 53 | else: 54 | # One layer, load & parse GIS data 55 | with fiona.open(self.filename, layer=self.layer_id) as f: 56 | self.meta = f.meta 57 | if "id" in f.meta.get("schema", {}).get("properties", {}): 58 | # TODO: Is this correct? 59 | del f.meta["schema"]["properties"]["id"] 60 | self.data = list(map(self.parse_feature, f)) 61 | 62 | def parse_feature(self, f): 63 | # Flatten Fiona's GeoJSON-style representation into something more 64 | # amenable to namedtuple-ing 65 | feat = {key: value for key, value in f["properties"].items()} 66 | if "id" not in feat and "ID" not in feat: 67 | feat["id"] = f["id"] 68 | feat["geometry"] = f["geometry"] 69 | return feat 70 | 71 | def dump_feature(self, feat, i): 72 | # Undo aforementioned flattening 73 | return { 74 | "id": feat.get("id", feat.get("ID", i)), 75 | "geometry": feat["geometry"], 76 | "properties": { 77 | key: value 78 | for key, value in feat.items() 79 | if key 80 | not in ( 81 | "geometry", 82 | "id", 83 | ) 84 | }, 85 | } 86 | 87 | def dump(self): 88 | # Dump and save the dataset at the same time via Fiona 89 | pass 90 | 91 | def save(self): 92 | with fiona.open(self.filename, "w", **self.meta) as f: 93 | for i, feat in enumerate(self.data): 94 | f.write(self.dump_feature(feat, i)) 95 | 96 | 97 | class GisMapper(TupleMapper): 98 | """ 99 | GIS-aware tuple mapper 100 | 
""" 101 | 102 | def as_dataframe(self): 103 | # Mimic BaseIter.as_dataframe() but with GeoDataFrame 104 | # (also, key_field is always set) 105 | from geopandas import GeoDataFrame 106 | 107 | key = self.get_key_field() 108 | data = [self.item_dict(row) for row in self.values()] 109 | df = GeoDataFrame(data) 110 | df.set_index(key, inplace=True) 111 | return df 112 | 113 | def item_dict(self, uitem): 114 | # Turn usable item into GeoDataFrame-friendly dict 115 | data = uitem._asdict() 116 | data["geometry"] = geometry.shape(data["geometry"]) 117 | return data 118 | 119 | 120 | class ShapeMapper(GisMapper): 121 | """ 122 | Map Fiona's GeoJSON-style geometries to and from Shapely shapes 123 | """ 124 | 125 | def map_value(self, field, value): 126 | value = super(ShapeMapper, self).map_value(field, value) 127 | if field == "geometry": 128 | value = geometry.shape(value) 129 | return value 130 | 131 | def unmap_value(self, field, value): 132 | if field == "geometry": 133 | value = geometry.mapping(value) 134 | return super(ShapeMapper, self).unmap_value(field, value) 135 | 136 | def item_dict(self, uitem): 137 | return uitem._asdict() 138 | 139 | 140 | class WktMapper(ShapeMapper): 141 | """ 142 | Map geometries to and from WKT (good for Django integration) 143 | """ 144 | 145 | def map_value(self, field, value): 146 | value = super(WktMapper, self).map_value(field, value) 147 | if field == "geometry": 148 | value = wkt.dumps(value) 149 | return value 150 | 151 | def unmap_value(self, field, value): 152 | if field == "geometry": 153 | value = wkt.loads(value) 154 | return super(WktMapper, self).unmap_value(field, value) 155 | 156 | def item_dict(self, uitem): 157 | data = uitem._asdict() 158 | data["geometry"] = wkt.loads(data["geometry"]) 159 | return data 160 | 161 | 162 | def guess_driver(filename): 163 | if filename.endswith(".shp"): 164 | return "ESRI Shapefile" 165 | else: 166 | return "GeoJSON" 167 | 
-------------------------------------------------------------------------------- /itertable/loaders.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import requests 3 | from io import StringIO, BytesIO 4 | from .exceptions import LoadFailed 5 | from zipfile import ZipFile 6 | 7 | try: 8 | from .version import VERSION 9 | except ImportError: 10 | VERSION = "0.0.0" 11 | 12 | 13 | class BaseLoader(object): 14 | no_pickle_loader = ["file"] 15 | empty_file = None 16 | 17 | def load(self): 18 | raise NotImplementedError 19 | 20 | 21 | class FileLoader(BaseLoader): 22 | filename = None 23 | require_existing = True 24 | 25 | @property 26 | def read_mode(self): 27 | return "rb" if self.binary else "r" 28 | 29 | @property 30 | def write_mode(self): 31 | return "wb+" if self.binary else "w+" 32 | 33 | def load(self): 34 | try: 35 | self.file = open(self.filename, self.read_mode) 36 | self.empty_file = False 37 | except OSError as e: 38 | if self.require_existing: 39 | raise LoadFailed( 40 | e.strerror, 41 | path=self.filename, 42 | code=e.errno, 43 | ) 44 | elif self.binary: 45 | self.file = BytesIO() 46 | else: 47 | self.file = StringIO() 48 | self.empty_file = True 49 | 50 | def save(self): 51 | file = open(self.filename, self.write_mode) 52 | self.dump(file) 53 | file.close() 54 | 55 | 56 | class Zipper(object): 57 | inner_filename = None 58 | inner_binary = False 59 | 60 | def unzip_file(self): 61 | zipfile = ZipFile(self.file) 62 | inner_file = zipfile.read(self.get_inner_filename(zipfile)) 63 | if self.inner_binary: 64 | self.file = BytesIO(inner_file) 65 | else: 66 | self.file = StringIO(inner_file.decode("utf-8")) 67 | zipfile.fp.close() 68 | zipfile.close() 69 | 70 | def get_inner_filename(self, zipfile): 71 | if self.inner_filename: 72 | return self.inner_filename 73 | names = zipfile.namelist() 74 | if len(names) == 1: 75 | return names[0] 76 | 77 | zipfile.fp.close() 78 | 
zipfile.close() 79 | raise LoadFailed("Multiple Inner Files!") 80 | 81 | 82 | class ZipFileLoader(Zipper, FileLoader): 83 | binary = True 84 | 85 | def load(self): 86 | super(ZipFileLoader, self).load() 87 | self.unzip_file() 88 | 89 | 90 | class StringLoader(BaseLoader): 91 | string = "" 92 | 93 | @property 94 | def _io_class(self): 95 | return BytesIO if self.binary else StringIO 96 | 97 | def load(self): 98 | if self.binary and not self.string: 99 | self.string = b"" 100 | self.file = self._io_class(self.string) 101 | 102 | def save(self): 103 | file = self._io_class() 104 | self.dump(file) 105 | self.string = file.getvalue() 106 | file.close() 107 | 108 | 109 | class NetLoader(StringLoader): 110 | "NetLoader: opens HTTP/REST resources for use in IterTable" 111 | 112 | username = None 113 | password = None 114 | debug = False 115 | url = None 116 | client = requests 117 | 118 | @property 119 | def user_agent(self): 120 | return "IterTable/%s (%s)" % ( 121 | VERSION, 122 | requests.utils.default_user_agent(), 123 | ) 124 | 125 | @property 126 | def headers(self): 127 | return { 128 | "User-Agent": self.user_agent, 129 | } 130 | 131 | def load(self, **kwargs): 132 | result = self.GET() 133 | self.file = self._io_class(result) 134 | 135 | def req(self, url=None, method=None, params=None, body=None, headers={}): 136 | if url is None: 137 | url = self.url 138 | if url is None: 139 | raise LoadFailed("No URL provided") 140 | 141 | if params is None: 142 | params = getattr(self, "params", None) 143 | 144 | if isinstance(params, str): 145 | url += "?" + params 146 | params = None 147 | 148 | if self.debug: 149 | if params: 150 | from requests.compat import urlencode 151 | 152 | debug_url = url + "?" 
+ urlencode(params, doseq=True) 153 | else: 154 | debug_url = url 155 | self.debug_string = "%s: %s" % (method, debug_url) 156 | print(self.debug_string) 157 | 158 | if self.username is not None and self.password is not None: 159 | auth = (self.username, self.password) 160 | else: 161 | auth = None 162 | 163 | all_headers = self.headers.copy() 164 | all_headers.update(headers) 165 | 166 | resp = self.client.request( 167 | method, 168 | url, 169 | params=params, 170 | headers=all_headers, 171 | auth=auth, 172 | data=body, 173 | ) 174 | resp.connection.close() 175 | 176 | if resp.status_code < 200 or resp.status_code > 299: 177 | raise LoadFailed( 178 | resp.text, 179 | path=url, 180 | code=resp.status_code, 181 | ) 182 | 183 | if self.binary: 184 | return resp.content 185 | else: 186 | return resp.text 187 | 188 | def GET(self, **kwargs): 189 | return self.req(method="GET", **kwargs) 190 | 191 | def POST(self, **kwargs): 192 | return self.req(method="POST", **kwargs) 193 | 194 | def PUT(self, **kwargs): 195 | return self.req(method="PUT", **kwargs) 196 | 197 | def DELETE(self, **kwargs): 198 | return self.req(method="DELETE", **kwargs) 199 | 200 | 201 | class ZipNetLoader(Zipper, NetLoader): 202 | binary = True 203 | 204 | def load(self): 205 | super(ZipNetLoader, self).load() 206 | self.unzip_file() 207 | -------------------------------------------------------------------------------- /itertable/mappers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple, OrderedDict 2 | import re 3 | from datetime import datetime 4 | from .exceptions import NoData, MappingFailed 5 | from unicodedata import normalize 6 | 7 | 8 | class BaseMapper(object): 9 | def get_key_field(self): 10 | return self.map_field(self.key_field) 11 | 12 | def map_field(self, field): 13 | return field 14 | 15 | def map_value(self, field, value): 16 | return value 17 | 18 | def unmap_field(self, field): 19 | return field 20 | 21 | 
class DictMapper(BaseMapper):
    """Mapper that translates field names and string values via static
    lookup tables (field_map / value_map)."""

    # Keys are raw (file-side) names/values; values are the mapped
    # (usable-side) equivalents.  Unmapped entries pass through unchanged.
    field_map = {}
    value_map = {}

    def map_field(self, field):
        field = self.field_map[field] if field in self.field_map else field
        return field

    def map_value(self, field, value):
        # Only string values are translated; other types pass through.
        if not isinstance(value, str):
            return value
        value = self.value_map[value] if value in self.value_map else value
        return value

    def unmap_field(self, field):
        # Reverse lookup over field_map; first match wins.
        for f in self.field_map:
            if self.field_map[f] == field:
                return f
        return field

    def unmap_value(self, field, value):
        if not isinstance(value, str):
            return value
        # Reverse lookup over value_map; first match wins.
        for v in self.value_map:
            if self.value_map[v] == value:
                return v
        return value


class TupleMapper(DictMapper):
    """Mapper that exposes each row as a namedtuple, with field names
    sanitized into valid Python identifiers."""

    # Cached attributes that must not be pickled (regenerated on demand).
    no_pickle_mapper = ["_tuple_class", "_tuple_prototype"]

    @property
    def field_map(self):
        # Lazily build (and cache) raw-name -> identifier mapping.
        field_names = self.get_field_names()
        if not field_names and not getattr(self, "data", None):
            raise NoData

        # FIXME: check for duplicates
        if not hasattr(self, "_field_map"):
            items = [
                (field, self.tuple_field_name(field)) for field in field_names
            ]
            self._field_map = OrderedDict(items)
        return self._field_map

    def tuple_field_name(self, field):
        """Sanitize a raw field name into a namedtuple-safe identifier."""
        field = self.clean_field_name(field)
        field = re.sub(r"\W", "", field.lower())
        # normalize identifiers for consistency with namedtuple
        # http://bugs.python.org/issue23091
        field = normalize("NFKC", field)
        return field

    def clean_field_name(self, field):
        # Hook for subclasses to pre-process names before sanitization.
        return field

    @property
    def tuple_class(self):
        "Returns a class to use for individual items"

        if not hasattr(self, "_tuple_class"):
            cls = namedtuple(
                self.__class__.__name__ + "Tuple",
                list(self.field_map.values()),
            )
            self._tuple_class = cls

        return self._tuple_class

    @property
    def tuple_prototype(self):
        # All-None instance used as a template; rows are built via _replace.
        if not hasattr(self, "_tuple_prototype"):
            vals = {field: None for field in self.field_map.values()}
            self._tuple_prototype = self.tuple_class(**vals)
        return self._tuple_prototype

    def usable_item(self, item):
        """Convert a raw dict row into a namedtuple instance."""
        mapped = super(TupleMapper, self).usable_item(item)
        try:
            return self.tuple_prototype._replace(**mapped)
        except ValueError as e:
            # _replace raises ValueError on unexpected field names.
            raise MappingFailed(str(e))

    def parse_usable_item(self, uitem):
        """Convert a namedtuple row back into a raw dict."""
        mapped = {key: getattr(uitem, key) for key in self.field_map.values()}
        return super(TupleMapper, self).parse_usable_item(mapped)

    def item_dict(self, uitem):
        return uitem._asdict()

    def create(self, **kwargs):
        """Build a new row tuple, defaulting unspecified fields to None."""
        return self.tuple_prototype._replace(**kwargs)
def make_date_mapper(fmt):
    """
    Generate a function that maps strings to dates/times using *fmt*.

    ``fmt`` is either a ``strptime`` format string or the literal
    ``"iso8601"``.  Formats without a year component (``%Y``/``%y``)
    yield ``time`` objects; all others yield ``datetime`` objects.
    """
    # The original evaluated these checks on every call of the returned
    # mapper; they only depend on fmt, so decide once up front.
    if fmt == "iso8601":
        return parse_iso8601

    has_date = "%Y" in fmt or "%y" in fmt

    def mapper(val):
        val = datetime.strptime(val, fmt)
        # No year in the format means the caller wants a time-of-day.
        return val if has_date else val.time()

    return mapper
class SkipPreludeReader(csv.DictReader):
    """
    A specialized version of DictReader that attempts to find where the "real"
    CSV data is in a file that may contain a prelude of non-CSV text.
    """

    # Maximum number of rows scanned when looking for the header row.
    max_header_row = 20

    def __init__(
        self,
        f,
        fieldnames=None,
        restkey=None,
        restval=None,
        dialect="excel",
        *args,
        **kwds
    ):
        # Preserve file since we're going to start reading it
        self._file = f

        # Preserve reader options since we'll need to make another one
        readeropts = [f, dialect]
        readeropts.extend(args)
        self._readeropts = (readeropts, kwds)
        super().__init__(
            f, fieldnames, restkey, restval, dialect, *args, **kwds
        )

    @property
    def fieldnames(self):
        """Column names from the detected header row (computed lazily)."""
        if self._fieldnames is not None:
            return self._fieldnames

        # Create a new reader just to figure out which row is the header
        args, kwds = self._readeropts
        data = csv.reader(*args, **kwds)
        rows = []
        for _ in range(self.max_header_row):
            try:
                rows.append(next(data))
            except StopIteration:
                # FIX: was `pass`, which kept polling the exhausted reader
                # for the remaining iterations; stop at end of input.
                break
        header_row, field_names = self.choose_header(rows)

        # Reset file and advance reader so it starts in the right spot
        if hasattr(self._file, "seek"):
            self._file.seek(0)
        for _ in range(header_row + 1):
            try:
                next(self.reader)
            except StopIteration:
                break  # FIX: same as above — no point advancing past EOF

        self._fieldnames = field_names
        self._header_row = header_row
        return field_names

    @property
    def header_row(self):
        """Index of the detected header row."""
        self.fieldnames  # used for side effect (sets self._header_row)
        return self._header_row

    def choose_header(self, rows):
        """
        Determine which row contains column headers from the provided set.
        Default is to assume that the first longest row is the header.
        """
        header_row = 0
        field_names = []

        # Select header from available rows
        for i, row in enumerate(rows):
            if len(row) > len(field_names):
                header_row = i
                field_names = row
        return header_row, field_names
class JsonParser(BaseParser):
    """Parser for JSON documents.

    ``namespace`` is an optional dotted path locating the list of items
    inside the document; ``indent`` controls output formatting on dump.
    """

    indent = None
    namespace = None
    binary = False

    def parse(self):
        """Load JSON from self.file into self.data (a list of items)."""
        try:
            document = json.load(self.file)
            if self.namespace:
                # Descend through the dotted namespace to the item list.
                for part in self.namespace.split("."):
                    document = document[part]
            self.data = [self.parse_item(entry) for entry in document]
        except ValueError:
            raise ParseFailed

    def parse_item(self, item):
        """Hook for subclasses to transform each incoming item."""
        return item

    def dump(self, file=None):
        """Serialize self.data (re-wrapped in namespace, if any) to file."""
        if file is None:
            file = self.file
        document = [self.dump_item(entry) for entry in self.data]
        if self.namespace:
            # Rebuild the nesting removed during parse(), innermost first.
            for part in reversed(self.namespace.split(".")):
                document = {part: document}
        json.dump(document, file, indent=self.indent)

    def dump_item(self, item):
        """Hook for subclasses to transform each outgoing item."""
        return item
    def parse(self):
        """Parse the loaded workbook into self.data.

        With sheet_name=None, yields one record per sheet, each holding a
        nested parser instance; otherwise parses the selected sheet,
        auto-detecting the header row when not configured.
        """
        if not self.workbook:
            self.parse_workbook()

        if self.sheet_name is None:
            # Multi-sheet mode: one nested parser per worksheet, sharing
            # the already-loaded workbook.
            SpreadsheetIter = type(self)
            self.data = [
                {
                    "name": name,
                    "data": SpreadsheetIter(
                        loaded=True,
                        workbook=self.workbook,
                        sheet_name=name,
                    ),
                }
                for name in self.sheet_names
            ]
            return

        # An integer sheet_name is a positional index into sheet_names.
        sheet_name = self.sheet_name
        if isinstance(self.sheet_name, int):
            sheet_name = self.sheet_names[sheet_name]

        self.parse_worksheet(sheet_name)

        if self.header_row is None:
            if self.start_row is not None:
                # Data start is known; assume the header directly precedes.
                self.header_row = self.start_row - 1
            else:
                # Auto-detect: scanning bottom-up with >=, pick the
                # earliest row (within max_header_row) that has the most
                # non-empty cells.
                self.column_count = 0

                def checkval(cell):
                    if cell.value is not None and cell.value != "":
                        return True
                    return False

                search_rows = min(len(self.worksheet) - 1, self.max_header_row)
                for row in range(search_rows, -1, -1):
                    count = len(list(filter(checkval, self.worksheet[row])))
                    if count >= self.column_count:
                        self.column_count = count
                        self.header_row = row

        if self.header_row is None:
            # Empty worksheet: nothing to parse.
            return

        if self.start_row is None:
            self.start_row = self.header_row + 1

        if self.field_names is None:
            # Header may span multiple rows; join the pieces with newlines.
            rows = self.worksheet[self.header_row : self.start_row]
            self.field_names = [
                str(c.value) or "c%s" % i for i, c in enumerate(rows[0])
            ]
            for row in rows[1:]:
                for i, c in enumerate(row):
                    self.field_names[i] += "\n" + str(c.value)

            # De-duplicate names by appending the column index.
            seen_fields = set()
            for i, field in enumerate(self.field_names):
                if field in seen_fields:
                    field += str(i)
                    self.field_names[i] = field
                seen_fields.add(field)

        self.data = list(map(self.parse_row, self.worksheet[self.start_row :]))

        # Preserve any non-empty cells found above the header row.
        self.extra_data = {}
        if self.header_row > 0:
            for r in range(0, self.header_row):
                for c, cell in enumerate(self.worksheet[r]):
                    val = self.get_value(cell)
                    if val is not None and val != "":
                        self.extra_data.setdefault(r, {})
                        self.extra_data[r][c] = val
    def parse_row(self, row):
        """Build a dict for one worksheet row, keyed by field name.

        Rows shorter than the header simply omit the trailing fields.
        """
        return {
            name: self.get_value(row[i])
            for i, name in enumerate(self.get_field_names())
            if i < len(row)
        }

    def get_value(self, cell):
        # Subclasses convert a backend-specific cell object to a value.
        raise NotImplementedError

    def dump(self, file=None):
        """Write field names (row 0) and data rows via the subclass's
        (write, close) worksheet writer pair."""
        if file is None:
            file = self.file
        write, close = self.open_worksheet(file)
        for i, field in enumerate(self.field_names):
            write(0, i, field)
        for r, row in enumerate(self.data):
            for c, field in enumerate(self.field_names):
                write(r + 1, c, row[field])
        close()

    def calc_width(self, val):
        """Approximate the display width of a cell value by weighting
        characters by typical glyph width.

        The 1.4 scale suits openpyxl's column units; OldExcelParser
        overrides this without the scale factor for xlwt.
        """
        val = str(val) if val is not None else ""
        size = 0
        for c in val:
            if c in ".,;:'\"iIlt1":
                size += 0.5
            elif c in "MW":
                size += 1.3
            elif c.isupper():
                size += 1.2
            elif c.islower():
                size += 1
            else:
                size += 1.1
        return size * 1.4
class ExcelParser(WorkbookParser):
    """Parser for .xlsx workbooks, backed by openpyxl."""

    def parse_workbook(self):
        import openpyxl

        # data_only=True reads cached formula results instead of formulas.
        self.workbook = openpyxl.open(self.file, data_only=True)

    @property
    def sheet_names(self):
        return self.workbook.sheetnames

    def get_sheet_by_name(self, name):
        return self.workbook[name]

    def parse_worksheet(self, name):
        """Materialize the named sheet as a list of cell rows."""
        worksheet = self.get_sheet_by_name(name)
        self.worksheet = [row for row in worksheet.rows]

    def get_value(self, cell):
        """Extract a cell value, collapsing midnight datetimes to dates."""
        value = cell.internal_value
        if isinstance(value, datetime.datetime):
            if value.time() == datetime.time(0, 0):
                return value.date()
        return value

    def open_worksheet(self, file):
        """Return (write, close) callables for dumping to *file*."""
        from openpyxl import Workbook, styles, utils

        workbook = Workbook()
        worksheet = workbook.active

        formats = {
            datetime.date: styles.NamedStyle(
                name="date",
                number_format=self.date_format,
            ),
            datetime.time: styles.NamedStyle(
                name="time",
                number_format=self.time_format,
            ),
            datetime.datetime: styles.NamedStyle(
                name="datetime",
                number_format=self.datetime_format,
            ),
            "header": styles.NamedStyle(
                name="header",
                font=styles.Font(bold=True),
                border=styles.Border(bottom=styles.Side(style="thick")),
            ),
        }
        widths = {}

        def write(r, c, val):
            # Track per-column width so close() can size the columns.
            widths.setdefault(c, 0)
            widths[c] = max(widths[c], self.calc_width(val))
            cell = worksheet.cell(r + 1, c + 1, val)

            fmt = formats.get(type(val))
            if fmt:
                cell.style = fmt
            elif r == 0:
                cell.style = formats["header"]

        def close():
            for c, width in widths.items():
                col = utils.get_column_letter(c + 1)
                worksheet.column_dimensions[col].width = width
            # FIX: save to the file object that was passed to
            # open_worksheet() (matching OldExcelParser), not
            # self.filename — dump() may be called with an in-memory
            # buffer that has no corresponding filename.
            workbook.save(file)

        return write, close
def guess_type(filename, buffer=None):
    """Guess the MIME type of *filename*.

    Tries the extension first (via mimetypes); if that fails and
    python-magic is installed, falls back to content sniffing using
    *buffer* (the first bytes of the file) or the file itself.
    Returns None when the type cannot be determined.
    """
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is not None:
        return mimetype

    try:
        import magic
    except ImportError:
        # Content sniffing is optional; give up quietly without magic.
        return None

    if buffer:
        mimetype = magic.from_buffer(buffer, mime=True)
        if mimetype == "text/plain":
            # FIX: load_file() may pass a bytes buffer, and
            # bytes.startswith("{") raises TypeError; normalize to text
            # before inspecting the leading characters.
            if isinstance(buffer, bytes):
                text = buffer.decode("utf-8", "replace")
            else:
                text = buffer
            if text.startswith(("{", "[")):
                mimetype = "application/json"
            elif text.startswith("<"):
                mimetype = "application/xml"
            elif "," in text:
                mimetype = "text/csv"
    else:
        mimetype = magic.from_file(filename, mime=True)
    return mimetype
def load_url(url, mapper=TupleMapper, options=None):
    """Create an Iter for the resource at *url*, choosing the parser from
    the URL's apparent MIME type.

    Raises ParseFailed when no parser matches.
    """
    # FIX: options previously defaulted to a shared mutable {}; use None
    # for consistency with load_file() and to avoid cross-call mutation.
    if options is None:
        options = {}
    mimetype = guess_type(url)
    if mimetype not in PARSERS:
        raise ParseFailed("Could not determine parser for %s" % mimetype)
    parser = PARSERS[mimetype]
    loader = NetLoader
    Iter = make_iter(loader, parser, mapper)
    return Iter(url=url, **options)


def load_string(string, mapper=TupleMapper, options=None):
    """Create an Iter for an in-memory *string*, sniffing XML/JSON/CSV
    from its leading characters.

    Raises Exception when the format cannot be determined.
    """
    # FIX: same shared-mutable-default pitfall as load_url above.
    if options is None:
        options = {}
    if string.startswith("<"):
        parser = XmlParser
    elif string.startswith("[") or (
        string.startswith("{") and "namespace" in options
    ):
        parser = JsonParser
    elif "," in string:
        parser = CsvParser
    else:
        raise Exception("Could not determine parser for string!")

    loader = StringLoader
    Iter = make_iter(loader, parser, mapper)
    if Iter.binary:
        string = string.encode("utf-8")
    return Iter(string=string, **options)
def flattened(iter_class, *args, **kwargs):
    """Instantiate *iter_class*, wrapping it in a FlatIter when the class
    reports itself as nested (so callers always get a flat iterable)."""
    if not iter_class.nested:
        return iter_class(*args, **kwargs)
    return FlatIter(*args, iter_class=iter_class, **kwargs)
11 | readme = "README.md" 12 | requires-python = ">=3.7" 13 | license = {text = "MIT" } 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.7", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Intended Audience :: Science/Research", 25 | "Intended Audience :: Developers", 26 | "Topic :: Text Processing :: Markup :: XML", 27 | "Topic :: Scientific/Engineering :: GIS", 28 | "Topic :: Utilities", 29 | ] 30 | dependencies = [ 31 | "requests", 32 | "openpyxl", 33 | "click" 34 | ] 35 | 36 | [project.urls] 37 | Homepage = "https://django-data-wizard.wq.io/itertable/" 38 | Documentation = "https://django-data-wizard.wq.io/itertable/" 39 | Source = "https://github.com/wq/itertable/" 40 | "Release Notes" = "https://github.com/wq/itertable/releases" 41 | Issues = "https://github.com/wq/itertable/issues" 42 | CI = "https://github.com/wq/itertable/actions/workflows/test.yml" 43 | 44 | [project.optional-dependencies] 45 | gis = ["Fiona", "geopandas"] 46 | pandas = ["pandas"] 47 | oldexel = ["xlrd", "xlwt"] 48 | 49 | [tool.setuptools] 50 | packages = ["itertable", "itertable.parsers", "itertable.gis"] 51 | 52 | [tool.setuptools_scm] 53 | write_to = "itertable/version.py" 54 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/__init__.py -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | from os.path 
class IterTestCase(unittest.TestCase):
    """Shared fixtures and assertion helpers for the itertable test suite."""

    # Canonical two-row dataset that most fixture files mirror.
    data = [
        {
            "one": 1,
            "two": 2,
            "three": 3,
        },
        {
            "one": 4,
            "two": 5,
            "three": 6,
        },
    ]

    def get_filename(self, filename, ext, remove_existing=False):
        """Return tests/files/<filename>.<ext>, optionally deleting it first."""
        path = join(dirname(__file__), "files", "%s.%s" % (filename, ext))
        if remove_existing:
            try:
                unlink(path)
            except OSError:
                # Missing file is fine; we only want a clean slate.
                pass
        return path

    def check_instance(self, instance):
        """Assert that *instance* yields exactly the rows in self.data.

        Values read back from text formats may be strings (or floats), so
        each value is coerced through int(float(...)) before comparison
        when possible.
        """
        self.assertEqual(len(instance), len(self.data))

        for row, expected in zip(instance, self.data):
            for key, want in expected.items():
                value = getattr(row, key)
                try:
                    value = int(float(value))
                except ValueError:
                    pass
                self.assertEqual(value, want)
3 | "data": { 4 | "info": "Items Array", 5 | "items": [ 6 | { 7 | "one": 1, 8 | "two": 2, 9 | "three": 3 10 | }, 11 | { 12 | "one": 4, 13 | "two": 5, 14 | "three": 6, 15 | "four": "extra" 16 | } 17 | ] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tests/files/extra.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/extra.xlsx -------------------------------------------------------------------------------- /tests/files/nodata.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/nodata.csv -------------------------------------------------------------------------------- /tests/files/nodata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/nodata.xlsx -------------------------------------------------------------------------------- /tests/files/noextra.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/noextra.xlsx -------------------------------------------------------------------------------- /tests/files/test.csv: -------------------------------------------------------------------------------- 1 | one,two,three 2 | 1,2,3 3 | 4,5,6 4 | -------------------------------------------------------------------------------- /tests/files/test.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.dbf 
-------------------------------------------------------------------------------- /tests/files/test.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "id": "one", 7 | "properties": { 8 | "one": 1, 9 | "two": 2, 10 | "three": 3 11 | }, 12 | "geometry": { 13 | "type": "Polygon", 14 | "coordinates": [ 15 | [ 16 | [ 17 | -93.28044891357422, 18 | 44.9771852553236 19 | ], 20 | [ 21 | -93.28611373901367, 22 | 44.972084916104706 23 | ], 24 | [ 25 | -93.27701568603516, 26 | 44.9715991458543 27 | ], 28 | [ 29 | -93.27220916748047, 30 | 44.9810709235921 31 | ], 32 | [ 33 | -93.27924728393555, 34 | 44.983985001986305 35 | ], 36 | [ 37 | -93.28044891357422, 38 | 44.9771852553236 39 | ] 40 | ] 41 | ] 42 | } 43 | }, 44 | { 45 | "type": "Feature", 46 | "id": "two", 47 | "properties": { 48 | "one": 4, 49 | "two": 5, 50 | "three": 6 51 | }, 52 | "geometry": { 53 | "type": "Polygon", 54 | "coordinates": [ 55 | [ 56 | [ 57 | -93.25349807739258, 58 | 44.968927335931234 59 | ], 60 | [ 61 | -93.25349807739258, 62 | 44.977670978257756 63 | ], 64 | [ 65 | -93.24045181274414, 66 | 44.977670978257756 67 | ], 68 | [ 69 | -93.24045181274414, 70 | 44.968927335931234 71 | ], 72 | [ 73 | -93.25349807739258, 74 | 44.968927335931234 75 | ] 76 | ] 77 | ] 78 | } 79 | } 80 | ] 81 | } 82 | -------------------------------------------------------------------------------- /tests/files/test.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "one": 1, 4 | "two": 2, 5 | "three": 3 6 | }, 7 | { 8 | "one": 4, 9 | "two": 5, 10 | "three": 6 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /tests/files/test.prj: -------------------------------------------------------------------------------- 1 | 
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]] -------------------------------------------------------------------------------- /tests/files/test.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.shp -------------------------------------------------------------------------------- /tests/files/test.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.shx -------------------------------------------------------------------------------- /tests/files/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.xls -------------------------------------------------------------------------------- /tests/files/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.xlsx -------------------------------------------------------------------------------- /tests/files/test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1 5 | 2 6 | 3 7 | 8 | 9 | 4 10 | 5 11 | 6 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/files/test2.csv: -------------------------------------------------------------------------------- 1 | Non-CSV Header, to test SkipPreludeReader 2 | Name: Test 3 | 4 | one,two,three 5 | 1,2,3 6 | 4,5,6 7 | -------------------------------------------------------------------------------- /tests/files/test3.csv: 
class CustomJsonFileIter(JsonFileIter):
    # Read records from the nested "data" -> "items" array instead of the
    # JSON document root (see tests/files/custom.json).
    namespace = "data.items"


class ExtraJsonFileIter(CustomJsonFileIter):
    # Scan every record for field names, so rows that introduce extra keys
    # (e.g. "four" in custom2.json) still map cleanly onto the namedtuple.
    scan_fields = True


class CustomXmlFileIter(XmlFileIter):
    # Read <item> elements nested under an <items> element
    # (see tests/files/custom.xml).
    root_tag = "items"
    item_tag = "item"
class LoadFileTestCase(IterTestCase):
    """Exercise as_dataframe() on in-memory and CSV-backed Iters."""

    def setUp(self):
        self.csv_data = "one,two,three\n1,2,3\n4,5,6"

    def test_base_dataframe(self):
        # Default (unkeyed) Iter: dataframe keeps integer positions.
        frame = BaseIter(data=self.data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame[frame.two == 2].three[0], 3)

    def test_index_dataframe(self):
        # key_field should become the dataframe index.
        class KeyIter(BaseIter):
            key_field = "one"

        frame = KeyIter(data=self.data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame.loc[4].three, 6)

    def test_csv_dataframe(self):
        # CSV values stay strings unless explicitly converted.
        frame = load_string(self.csv_data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame[frame.two == "2"].three[0], "3")
class LoadFileTestCase(IterTestCase):
    """Verify that rows above start_row are exposed as extra_data."""

    def test_extra_data(self):
        instance = ExtraDataIter(filename=self.get_filename("extra", "xlsx"))
        self.check_instance(instance)

        # The prelude rows (name/type/date header) end up in extra_data.
        extra = instance.extra_data
        self.assertEqual(extra[0][0], "Name")
        self.assertEqual(extra[0][1], "Test")
        self.assertEqual(extra[1][0], "Type")
        self.assertEqual(extra[1][1], "Test")
        self.assertEqual(extra[0][3], "Date")
        self.assertEqual(extra[0][4], date(2014, 12, 12))

    def test_no_extra_data(self):
        # Loading the file that *has* a prelude first ensures no state leaks
        # between instances.
        ExtraDataIter(filename=self.get_filename("extra", "xlsx"))
        instance = ExtraDataIter(filename=self.get_filename("noextra", "xlsx"))
        self.check_instance(instance)
        self.assertFalse(instance.extra_data)

    def check_instance(self, instance):
        """Like the base check, but coerce digit strings from Excel to int."""
        self.assertEqual(len(instance), len(self.data))

        for row, expected in zip(instance, self.data):
            for key, want in expected.items():
                value = getattr(row, key)
                if isinstance(value, str) and value.isdigit():
                    value = int(value)
                self.assertEqual(value, want)
    def check_instance(self, instance):
        # Override the base check: in addition to the attribute values,
        # verify each row's geometry against a known reference point.
        self.assertEqual(len(instance), len(self.data))

        for row, data, point in zip(instance.values(), self.data, self.points):
            for key in data:
                val = getattr(row, key)
                try:
                    # Shapefile/GeoJSON attributes may come back as strings;
                    # coerce to int where possible before comparing.
                    val = int(val)
                except ValueError:
                    pass
                self.assertEqual(val, data[key])
            # Each test polygon should contain its expected reference point.
            self.assertTrue(row.geometry.contains(point))
self.assertGreater(df.geometry.area.sum(), 0) 20 | -------------------------------------------------------------------------------- /tests/test_load_file.py: -------------------------------------------------------------------------------- 1 | from itertable import load_file 2 | from itertable.exceptions import LoadFailed, NoData 3 | from .base import IterTestCase 4 | import unittest 5 | import pickle 6 | import io 7 | 8 | try: 9 | import magic 10 | except ImportError: 11 | magic = None 12 | 13 | 14 | class LoadFileTestCase(IterTestCase): 15 | def setUp(self): 16 | self.types = ("csv", "json", "xml", "xls", "xlsx") 17 | 18 | def test_load_file(self): 19 | for ext in self.types: 20 | filename = self.get_filename("test", ext) 21 | instance = load_file(filename) 22 | self.check_instance(instance) 23 | 24 | def test_load_file_object(self): 25 | for ext in self.types: 26 | filename = self.get_filename("test", ext) 27 | if ext in ("xls", "xlsx"): 28 | mode = "rb" 29 | else: 30 | mode = "r" 31 | with open(filename, mode) as f: 32 | instance = load_file(f) 33 | self.check_instance(instance) 34 | 35 | def test_load_file_object_binary(self): 36 | for ext in self.types: 37 | filename = self.get_filename("test", ext) 38 | with open(filename, "rb") as f: 39 | instance = load_file(f) 40 | self.check_instance(instance) 41 | 42 | @unittest.skipUnless(magic, "magic required for buffer-based detection") 43 | def test_load_file_object_no_name(self): 44 | for ext in self.types: 45 | filename = self.get_filename("test", ext) 46 | if ext in ("xls", "xlsx"): 47 | mode = "rb" 48 | IO = io.BytesIO 49 | else: 50 | mode = "r" 51 | IO = io.StringIO 52 | 53 | with open(filename, mode) as f: 54 | obj = IO(f.read()) 55 | 56 | instance = load_file(obj) 57 | self.check_instance(instance) 58 | 59 | def test_load_file_like(self): 60 | class FileLike: 61 | name = "test.csv" 62 | 63 | def read(self, *args, **kwargs): 64 | return "one,two,three\n1,2,3\n4,5,6" 65 | 66 | def __iter__(self): 67 | yield 
    def test_load_csv_unicode(self):
        # Load a CSV whose header includes a non-ASCII column name.
        filename = self.get_filename("test3", "csv")
        instance = load_file(filename)
        self.check_instance(instance)
        # NOTE(review): the fixture header appears to use U+00B5 (micro sign)
        # while this lookup uses U+03BC (Greek mu); this presumably relies on
        # NFKC normalization of generated identifiers -- confirm.
        self.assertTrue(hasattr(instance[0], "μ"))
        self.assertEqual(instance[0].μ, "test")
self.assertRaises(LoadFailed) as cm: 122 | load_file(filename) 123 | self.assertEqual(str(cm.exception), "No such file or directory") 124 | 125 | def test_load_init_empty(self): 126 | filename = self.get_filename("nonexisting", "csv") 127 | instance = load_file(filename, options={"require_existing": False}) 128 | with self.assertRaises(NoData) as cm: 129 | instance[0] 130 | self.assertEqual(str(cm.exception), "No data returned!") 131 | 132 | def test_pickle(self): 133 | for ext in self.types: 134 | filename = self.get_filename("test", ext) 135 | instance = load_file(filename) 136 | instance = pickle.loads(pickle.dumps(instance)) 137 | self.check_instance(instance) 138 | 139 | def test_auto_pickle(self): 140 | for ext in self.types: 141 | filename = self.get_filename("test", ext) 142 | instance = load_file(filename) 143 | # Run through the io once to ensure auto-generated data is present 144 | self.check_instance(instance) 145 | instance = pickle.loads(pickle.dumps(instance)) 146 | self.check_instance(instance) 147 | -------------------------------------------------------------------------------- /tests/test_netloader.py: -------------------------------------------------------------------------------- 1 | import httpretty 2 | from itertable import CsvNetIter, load_url 3 | from itertable.exceptions import LoadFailed 4 | import pickle 5 | from .base import IterTestCase 6 | 7 | 8 | class TestIter(CsvNetIter): 9 | url = "http://example.com/test.csv" 10 | 11 | 12 | class NetLoaderTestCase(IterTestCase): 13 | def setUp(self): 14 | httpretty.enable() 15 | 16 | httpretty.register_uri( 17 | httpretty.GET, 18 | "http://example.com/test.csv", 19 | body="one,two,three\n1,2,3\n4,5,6", 20 | content_type="text/csv", 21 | ) 22 | httpretty.register_uri( 23 | httpretty.GET, 24 | "http://example.com/fail.txt", 25 | body="Not Found", 26 | content_type="text/plain", 27 | status=404, 28 | ) 29 | httpretty.register_uri( 30 | httpretty.GET, 31 | "http://example.com/fail.html", 32 | 
body="Not Found", 33 | content_type="text/html", 34 | status=404, 35 | ) 36 | 37 | def tearDown(self): 38 | httpretty.disable() 39 | httpretty.reset() 40 | 41 | def test_load_csv(self): 42 | self.check_instance(TestIter()) 43 | 44 | def test_load_url(self): 45 | self.check_instance(load_url("http://example.com/test.csv")) 46 | 47 | def test_load_csv_params(self): 48 | self.check_instance(TestIter(params={"test": 1})) 49 | qs = httpretty.last_request().querystring 50 | self.assertEqual(qs, {"test": ["1"]}) 51 | 52 | self.check_instance(TestIter(params="test=1")) 53 | qs = httpretty.last_request().querystring 54 | self.assertEqual(qs, {"test": ["1"]}) 55 | 56 | self.check_instance(TestIter(params=None)) 57 | qs = httpretty.last_request().querystring 58 | self.assertEqual(qs, {}) 59 | 60 | def test_debug_string(self): 61 | instance = TestIter(debug=True) 62 | self.assertEqual( 63 | instance.debug_string, "GET: http://example.com/test.csv" 64 | ) 65 | instance = TestIter(params={"test": 1}, debug=True) 66 | self.assertEqual( 67 | instance.debug_string, "GET: http://example.com/test.csv?test=1" 68 | ) 69 | 70 | def test_load_csv_auth(self): 71 | class AuthTestIter(CsvNetIter): 72 | url = "http://example.com/test.csv" 73 | username = "user" 74 | password = "pass" 75 | 76 | self.check_instance(AuthTestIter()) 77 | headers = httpretty.last_request().headers 78 | auth = "Basic dXNlcjpwYXNz" # b64encode("user:pass") 79 | self.assertEqual(headers.get("Authorization", None), auth) 80 | 81 | def test_load_csv_pickle(self): 82 | instance = TestIter() 83 | self.check_instance(instance) 84 | instance = pickle.loads(pickle.dumps(instance)) 85 | self.check_instance(instance) 86 | 87 | def test_load_fail(self): 88 | class TestIter(CsvNetIter): 89 | url = "http://example.com/fail.txt" 90 | 91 | with self.assertRaises(LoadFailed) as cm: 92 | TestIter() 93 | self.assertEqual(str(cm.exception), "Not Found") 94 | 95 | def test_load_fail_html(self): 96 | class TestIter(CsvNetIter): 97 | 
url = "http://example.com/fail.html" 98 | 99 | with self.assertRaises(LoadFailed) as cm: 100 | TestIter() 101 | self.assertEqual(str(cm.exception), "Not Found") 102 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | from itertable import load_file 2 | from itertable import ( 3 | CsvFileIter, 4 | JsonFileIter, 5 | XmlFileIter, 6 | OldExcelFileIter, 7 | ExcelFileIter, 8 | ) 9 | from .base import IterTestCase 10 | 11 | 12 | class LoadFileTestCase(IterTestCase): 13 | def setUp(self): 14 | self.data = [ 15 | { 16 | "one": 1, 17 | "two": 2, 18 | "three": 3, 19 | }, 20 | { 21 | "one": 4, 22 | "two": 5, 23 | "three": 6, 24 | }, 25 | ] 26 | self.types = ("csv", "json", "xml", "xls", "xlsx") 27 | self.classes = ( 28 | CsvFileIter, 29 | JsonFileIter, 30 | XmlFileIter, 31 | OldExcelFileIter, 32 | ExcelFileIter, 33 | ) 34 | 35 | def test_write_file(self): 36 | """ 37 | Test BaseIter.save() when starting from an empty Iter instance 38 | """ 39 | for ext, cls in zip(self.types, self.classes): 40 | filename = self.get_filename("output", ext, True) 41 | 42 | # Create an empty instance of the class 43 | instance = cls( 44 | filename=filename, 45 | require_existing=False, 46 | field_names=["one", "two", "three"], 47 | # These only apply to XmlFileIter, will be ignored by others 48 | root_tag="root", 49 | item_tag="item", 50 | ) 51 | 52 | # Add rows to the instance using list-style BaseIter.append() 53 | for row in self.data: 54 | instance.append(instance.create(**row)) 55 | 56 | # Save the instance, which should write to output.[ext] 57 | instance.save() 58 | 59 | # The contents of the saved file should match the original data 60 | self.check_instance(load_file(filename)) 61 | 62 | def duplicate(self, mode, xform): 63 | """ 64 | Test BaseIter.copy/sync() (and implicit save()) between combinations of 65 | the default Iter classes. 
66 | """ 67 | for source_ext, source_cls in zip(self.types, self.classes): 68 | for dest_ext, dest_cls in zip(self.types, self.classes): 69 | source_file = self.get_filename("test", source_ext) 70 | dest_file = self.get_filename(mode, dest_ext, True) 71 | 72 | # Sync requires key_field to be set on both classes 73 | source_cls = xform(source_cls) 74 | dest_cls = xform(dest_cls) 75 | 76 | # Load source data into Iter instance 77 | source_instance = source_cls(filename=source_file) 78 | 79 | # Create empty instance of the destination Iter class 80 | dest_instance = dest_cls( 81 | filename=dest_file, 82 | require_existing=False, 83 | field_names=["one", "two", "three"], 84 | root_tag="root", 85 | item_tag="item", 86 | ) 87 | 88 | # The Sync 89 | getattr(source_instance, mode)(dest_instance) 90 | 91 | # Load the destination file again and check contents 92 | self.check_instance(load_file(dest_file)) 93 | 94 | def test_copy_io(self): 95 | self.duplicate("copy", lambda d: d) 96 | 97 | def test_sync_io(self): 98 | self.duplicate("sync", self.with_key_field) 99 | 100 | def with_key_field(self, cls): 101 | class new_class(cls): 102 | key_field = "one" 103 | 104 | new_class.__name__ = "Dict" + cls.__name__ 105 | return new_class 106 | -------------------------------------------------------------------------------- /tests/test_zip.py: -------------------------------------------------------------------------------- 1 | from itertable import ( 2 | ZipFileLoader, 3 | ZipNetLoader, 4 | CsvParser, 5 | ExcelParser, 6 | TupleMapper, 7 | BaseIter, 8 | ) 9 | from .base import IterTestCase 10 | from itertable.exceptions import LoadFailed 11 | import httpretty 12 | 13 | 14 | class CsvZipFileIter(ZipFileLoader, CsvParser, TupleMapper, BaseIter): 15 | inner_binary = False 16 | 17 | 18 | class ExcelZipFileIter(ZipFileLoader, ExcelParser, TupleMapper, BaseIter): 19 | inner_binary = True 20 | 21 | 22 | class CsvZipNetIter(ZipNetLoader, CsvParser, TupleMapper, BaseIter): 23 | url = 
"http://example.com/testcsv.zip" 24 | inner_binary = False 25 | 26 | 27 | class ExcelZipNetIter(ZipNetLoader, ExcelParser, TupleMapper, BaseIter): 28 | url = "http://example.com/testxlsx.zip" 29 | inner_binary = True 30 | 31 | 32 | class ZipFileTestCase(IterTestCase): 33 | def test_csv_zip(self): 34 | filename = self.get_filename("testcsv", "zip") 35 | instance = CsvZipFileIter(filename=filename) 36 | self.check_instance(instance) 37 | 38 | def test_xlsx_zip(self): 39 | filename = self.get_filename("testxlsx", "zip") 40 | instance = ExcelZipFileIter(filename=filename) 41 | self.check_instance(instance) 42 | 43 | def test_multi_zip(self): 44 | filename = self.get_filename("testmulti", "zip") 45 | with self.assertRaises(LoadFailed) as cm: 46 | CsvZipFileIter(filename=filename) 47 | self.assertEqual(str(cm.exception), "Multiple Inner Files!") 48 | 49 | def test_multi_zip_name(self): 50 | filename = self.get_filename("testmulti", "zip") 51 | instance = CsvZipFileIter(filename=filename, inner_filename="test.csv") 52 | self.check_instance(instance) 53 | 54 | 55 | class NetZipFileTestCase(IterTestCase): 56 | def setUp(self): 57 | httpretty.enable() 58 | self.register_url("testcsv") 59 | self.register_url("testxlsx") 60 | 61 | def register_url(self, name): 62 | filename = self.get_filename(name, "zip") 63 | zipfile = open(filename, "rb") 64 | zipdata = zipfile.read() 65 | zipfile.close() 66 | httpretty.register_uri( 67 | httpretty.GET, 68 | "http://example.com/%s.zip" % name, 69 | body=zipdata, 70 | content_type="application/zip", 71 | ) 72 | 73 | def tearDown(self): 74 | httpretty.disable() 75 | httpretty.reset() 76 | 77 | def test_load_zip(self): 78 | self.check_instance(CsvZipNetIter()) 79 | 80 | def test_xlsx_zip(self): 81 | self.check_instance(ExcelZipNetIter()) 82 | --------------------------------------------------------------------------------