├── .github └── workflows │ └── test.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── itertable ├── __init__.py ├── __main__.py ├── base.py ├── commands.py ├── exceptions.py ├── gis │ ├── __init__.py │ └── mixins.py ├── loaders.py ├── mappers.py ├── parsers │ ├── __init__.py │ ├── base.py │ ├── readers.py │ ├── text.py │ └── xls.py └── util.py ├── pyproject.toml └── tests ├── __init__.py ├── base.py ├── files ├── .gitignore ├── custom.json ├── custom.xml ├── custom2.json ├── extra.xlsx ├── nodata.csv ├── nodata.xlsx ├── noextra.xlsx ├── test.csv ├── test.dbf ├── test.geojson ├── test.json ├── test.prj ├── test.shp ├── test.shx ├── test.xls ├── test.xlsx ├── test.xml ├── test2.csv ├── test3.csv ├── testcsv.zip ├── testmulti.zip └── testxlsx.zip ├── test_custom.py ├── test_dataframe.py ├── test_extra_data.py ├── test_gis.py ├── test_gis_dataframe.py ├── test_load_file.py ├── test_netloader.py ├── test_write.py └── test_zip.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: Python ${{ matrix.python-version }}, ${{ matrix.variant }} 8 | runs-on: ubuntu-22.04 9 | strategy: 10 | matrix: 11 | python-version: ["3.11", "3.10", 3.9, 3.8, 3.7] 12 | variant: [no-magic] 13 | include: 14 | - python-version: "3.11" 15 | variant: magic 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install build 26 | python -m pip install flake8 wheel httpretty beautifulsoup4 27 | python -m pip install requests openpyxl click 28 | python -m pip install Shapely Fiona pandas geopandas xlrd xlwt 29 | - name: Install python-magic 30 | if: ${{ matrix.variant == 
'magic' }} 31 | run: python -m pip install python-magic 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with unittest 39 | run: python -m unittest discover -s tests -t . -v 40 | - name: Test build 41 | run: python -m build 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg 3 | *.swp 4 | *.egg-info 5 | build 6 | dist 7 | README.rst 8 | wq/__init__.py 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at andrew@wq.io. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thanks for contributing to IterTable! Here are some guidelines to help you get started. 4 | 5 | ## Questions 6 | 7 | Questions and ideas can be submitted to the [Django Data Wizard discussion board](https://github.com/wq/django-data-wizard/discussions). 8 | 9 | ## Bug Reports 10 | 11 | Bug reports can be submitted to either [IterTable issues](https://github.com/wq/itertable/issues) or [Django Data Wizard issues](https://github.com/wq/django-data-wizard/issues). Reports can take any form as long as there is enough information to diagnose the problem. 
To speed up response time, try to include the following whenever possible: 12 | * Versions of Fiona and/or Pandas, if applicable 13 | * Expected (or ideal) behavior 14 | * Actual behavior 15 | 16 | ## Pull Requests 17 | 18 | Pull requests are very welcome and will be reviewed and merged as time allows. To speed up reviews, try to include the following whenever possible: 19 | * Reference the issue that the PR fixes (e.g. [#3](https://github.com/wq/itertable/issues/3)) 20 | * Failing test case fixed by the PR 21 | * If the PR provides new functionality, update [the documentation](https://github.com/wq/django-data-wizard/tree/main/docs/itertable) 22 | * Ensure the PR passes lint and unit tests. This happens automatically, but you can also run these locally with the following commands: 23 | 24 | ```bash 25 | python -m unittest discover -s tests -t . -v # run the test suite 26 | flake8 # run code style checking 27 | ``` 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2022, S. Andrew Sheppard, http://wq.io/ 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **IterTable** is a Pythonic API for iterating through tabular data formats, including CSV, XLSX, XML, and JSON. 2 | 3 | ```python 4 | from itertable import load_file 5 | 6 | for row in load_file("example.xlsx"): 7 | print(row.date, row.name) 8 | ``` 9 | 10 | [![Latest PyPI Release](https://img.shields.io/pypi/v/itertable.svg)](https://pypi.org/project/itertable) 11 | [![Release Notes](https://img.shields.io/github/release/wq/itertable.svg)](https://github.com/wq/itertable/releases) 12 | [![License](https://img.shields.io/pypi/l/itertable.svg)](https://github.com/wq/itertable/blob/master/LICENSE) 13 | [![GitHub Stars](https://img.shields.io/github/stars/wq/itertable.svg)](https://github.com/wq/itertable/stargazers) 14 | [![GitHub Forks](https://img.shields.io/github/forks/wq/itertable.svg)](https://github.com/wq/itertable/network) 15 | [![GitHub Issues](https://img.shields.io/github/issues/wq/itertable.svg)](https://github.com/wq/itertable/issues) 16 | 17 | [![Tests](https://github.com/wq/itertable/actions/workflows/test.yml/badge.svg)](https://github.com/wq/itertable/actions/workflows/test.yml) 18 | [![Python Support](https://img.shields.io/pypi/pyversions/itertable.svg)](https://pypi.python.org/pypi/itertable) 19 | 20 | ### [Documentation][docs] 21 | 22 | [**Installation**][installation] 23 | 24 | [**API**][api] 25 |
26 | [CLI][cli] 27 | • 28 | [GIS][gis] 29 | 30 | [**Extending IterTable**][custom] 31 |
32 | [BaseIter][base] 33 | • 34 | [Loaders][loaders] 35 | • 36 | [Parsers][parsers] 37 | • 38 | [Mappers][mappers] 39 | 40 | [docs]: https://django-data-wizard.wq.io/itertable/ 41 | 42 | [installation]: https://django-data-wizard.wq.io/itertable/#getting-started 43 | [api]: https://django-data-wizard.wq.io/itertable/#overview 44 | [cli]: https://django-data-wizard.wq.io/itertable/#command-line-interface 45 | [custom]: https://django-data-wizard.wq.io/itertable/custom 46 | [base]: https://django-data-wizard.wq.io/itertable/base 47 | [loaders]: https://django-data-wizard.wq.io/itertable/loaders 48 | [parsers]: https://django-data-wizard.wq.io/itertable/parsers 49 | [mappers]: https://django-data-wizard.wq.io/itertable/mappers 50 | [gis]: https://django-data-wizard.wq.io/itertable/gis 51 | -------------------------------------------------------------------------------- /itertable/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseIter 2 | 3 | from .loaders import ( 4 | BaseLoader, 5 | FileLoader, 6 | Zipper, 7 | ZipFileLoader, 8 | StringLoader, 9 | NetLoader, 10 | ZipNetLoader, 11 | ) 12 | 13 | from .parsers import ( 14 | CsvParser, 15 | JsonParser, 16 | XmlParser, 17 | OldExcelParser, 18 | ExcelParser, 19 | ) 20 | 21 | from .mappers import ( 22 | BaseMapper, 23 | DictMapper, 24 | TupleMapper, 25 | TimeSeriesMapper, 26 | make_date_mapper, 27 | ) 28 | 29 | from .util import ( 30 | make_iter, 31 | load_file, 32 | load_url, 33 | load_string, 34 | guess_type, 35 | flattened, 36 | ) 37 | 38 | try: 39 | from .version import __version__ as VERSION 40 | except ImportError: 41 | VERSION = "0.0.0" 42 | 43 | 44 | __all__ = ( 45 | "BaseIter", 46 | "BaseLoader", 47 | "FileLoader", 48 | "Zipper", 49 | "ZipFileLoader", 50 | "StringLoader", 51 | "NetLoader", 52 | "ZipNetLoader", 53 | "CsvParser", 54 | "JsonParser", 55 | "XmlParser", 56 | "ExcelParser", 57 | "OldExcelParser", 58 | "BaseMapper", 59 | "DictMapper", 60 
| "TupleMapper", 61 | "TimeSeriesMapper", 62 | "make_date_mapper", 63 | "make_iter", 64 | "load_file", 65 | "load_url", 66 | "load_string", 67 | "guess_type", 68 | "flattened", 69 | "VERSION", 70 | "CsvFileIter", 71 | "CsvNetIter", 72 | "CsvStringIter", 73 | "JsonFileIter", 74 | "JsonNetIter", 75 | "JsonStringIter", 76 | "XmlFileIter", 77 | "XmlNetIter", 78 | "XmlStringIter", 79 | "OldExcelFileIter", 80 | "ExcelFileIter", 81 | ) 82 | 83 | # Some useful pre-mixed classes 84 | CsvFileIter = make_iter(FileLoader, CsvParser) 85 | CsvNetIter = make_iter(NetLoader, CsvParser) 86 | CsvStringIter = make_iter(StringLoader, CsvParser) 87 | 88 | JsonFileIter = make_iter(FileLoader, JsonParser) 89 | JsonNetIter = make_iter(NetLoader, JsonParser) 90 | JsonStringIter = make_iter(StringLoader, JsonParser) 91 | 92 | XmlFileIter = make_iter(FileLoader, XmlParser) 93 | XmlNetIter = make_iter(NetLoader, XmlParser) 94 | XmlStringIter = make_iter(StringLoader, XmlParser) 95 | 96 | OldExcelFileIter = make_iter(FileLoader, OldExcelParser) 97 | ExcelFileIter = make_iter(FileLoader, ExcelParser) 98 | OldExcelNetIter = make_iter(NetLoader, OldExcelParser) 99 | ExcelNetIter = make_iter(NetLoader, ExcelParser) 100 | 101 | try: 102 | from .gis import GisIter, ShapeIter, WktIter 103 | 104 | __all__ += ( 105 | "GisIter", 106 | "ShapeIter", 107 | "WktIter", 108 | ) 109 | except ImportError: 110 | pass 111 | -------------------------------------------------------------------------------- /itertable/__main__.py: -------------------------------------------------------------------------------- 1 | from .commands import cat 2 | 3 | 4 | if __name__ == "__main__": 5 | cat() 6 | -------------------------------------------------------------------------------- /itertable/base.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableMapping, MutableSequence 2 | 3 | 4 | class BaseIter(MutableMapping, MutableSequence): 5 | "itertable.BaseIter: 
Base class for generic resource management" 6 | 7 | tabular = False 8 | nested = False 9 | binary = False 10 | loaded = False 11 | parsed = False 12 | 13 | def __init__(self, **kwargs): 14 | self.__dict__.update(kwargs) 15 | self.refresh() 16 | 17 | def refresh(self): 18 | if not self.loaded: 19 | self.load() 20 | self.loaded = True 21 | 22 | if self.parsed: 23 | return 24 | 25 | if getattr(self, "empty_file", False): 26 | self.data = [] 27 | else: 28 | self.parse() 29 | if hasattr(self, "file"): 30 | f = self.file 31 | if hasattr(f, "close") and not getattr(f, "closed", False): 32 | f.close() 33 | 34 | self.parsed = True 35 | 36 | def load(self): 37 | "Open a resource (defined by loader mixins)" 38 | # self.file = ... 39 | pass 40 | 41 | def parse(self): 42 | """ 43 | Parse a resource (defined by parser mixins). 44 | Result should be an iterable of dicts. 45 | """ 46 | # self.data = some_parse_method(self.file) 47 | pass 48 | 49 | def dump(self, file=None): 50 | """""" 51 | if file is None: 52 | file = self.file 53 | file.write(str(self.data)) 54 | 55 | def save(self): 56 | """""" 57 | self.dump(self.file) 58 | 59 | field_names = None 60 | scan_fields = False 61 | _auto_field_names = None 62 | 63 | def get_field_names(self): 64 | "Returns a list of raw fields to expect (defined by parser mixins)" 65 | if self.field_names is not None: 66 | # Support specifying field_names as string (like namedtuple does) 67 | if isinstance(self.field_names, str): 68 | return self.field_names.replace(",", " ").split() 69 | else: 70 | return self.field_names 71 | 72 | # If no defined field names, try to retrieve from data 73 | if not getattr(self, "data", None): 74 | return None 75 | 76 | if self._auto_field_names: 77 | return self._auto_field_names 78 | 79 | if self.scan_fields: 80 | # Scan all rows for field names 81 | field_names = set() 82 | for row in self.data: 83 | field_names.update(row.keys()) 84 | field_names = list(field_names) 85 | else: 86 | # Assume first row contains 
same keys as all other rows 87 | field_names = list(self.data[0].keys()) 88 | 89 | self._auto_field_names = field_names 90 | return field_names 91 | 92 | @property 93 | def key_field(self): 94 | "Assign a key_field to use the resource as a Map" 95 | return None 96 | 97 | def get_key_field(self): 98 | return self.key_field 99 | 100 | def usable_item(self, item): 101 | "Hook to allow items to be transformed" 102 | return item 103 | 104 | def parse_usable_item(self, uitem): 105 | "Hook to allow items to be untransformed" 106 | return uitem 107 | 108 | def compute_index(self, recompute=False): 109 | key_field = self.get_key_field() 110 | if key_field is None: 111 | return None 112 | 113 | if getattr(self, "_index_cache", None) is not None and not recompute: 114 | return self._index_cache 115 | 116 | index = {} 117 | for i, item in enumerate(self.data): 118 | uitem = self.usable_item(item) 119 | if isinstance(uitem, dict): 120 | key = uitem.get(key_field, None) 121 | else: 122 | key = getattr(uitem, key_field, None) 123 | if key is not None: 124 | index[key] = i 125 | 126 | self._index_cache = index 127 | return index 128 | 129 | def find_index(self, key): 130 | index = self.compute_index() 131 | if index is not None: 132 | return index.get(key, None) 133 | else: 134 | return key 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | 139 | def __getitem__(self, key): 140 | index = self.find_index(key) 141 | if index is None: 142 | raise KeyError 143 | return self.usable_item(self.data[index]) 144 | 145 | def __setitem__(self, key, uitem): 146 | item = self.parse_usable_item(uitem) 147 | index = self.find_index(key) 148 | if index is not None: 149 | self.data[index] = item 150 | else: 151 | self.data.append(item) 152 | self.compute_index(True) 153 | 154 | def __delitem__(self, key): 155 | index = self.find_index(key) 156 | if index is None: 157 | raise KeyError 158 | del self.data[index] 159 | self.compute_index(True) 160 | 161 | def insert(self, index, uitem): 
162 | item = self.parse_usable_item(uitem) 163 | self.data.insert(index, item) 164 | self.compute_index(True) 165 | 166 | def __iter__(self): 167 | for item in self.data: 168 | uitem = self.usable_item(item) 169 | if uitem is None: 170 | return 171 | pk = self.get_key_field() 172 | if pk is None: 173 | yield uitem 174 | elif isinstance(uitem, dict): 175 | yield uitem.get(pk, None) 176 | else: 177 | yield getattr(uitem, pk, None) 178 | 179 | def sync(self, other, save=True): 180 | if self.get_key_field() is None or other.get_key_field() is None: 181 | raise Exception("Key field required to sync!") 182 | for key in self: 183 | other[key] = self[key] 184 | if save: 185 | other.save() 186 | 187 | def copy(self, other, save=True): 188 | del other.data[:] 189 | for item in self.data: 190 | uitem = self.usable_item(item) 191 | other.append(uitem) 192 | if save: 193 | other.save() 194 | 195 | # Slots to track things that can't be pickled 196 | # (need a separate slot for each expected mixin class, since they don't 197 | # extend BaseIter) 198 | no_pickle = [] 199 | no_pickle_loader = [] 200 | no_pickle_mapper = [] 201 | no_pickle_parser = [] 202 | 203 | def get_no_pickle(self): 204 | return ( 205 | self.no_pickle 206 | + self.no_pickle_loader 207 | + self.no_pickle_mapper 208 | + self.no_pickle_parser 209 | ) 210 | 211 | def __getstate__(self): 212 | """ 213 | Don't include auto-created and unpicklable properties in state. 
214 | """ 215 | state = self.__dict__.copy() 216 | for name in self.get_no_pickle(): 217 | state.pop(name, None) 218 | return state 219 | 220 | def item_dict(self, item): 221 | return item 222 | 223 | def as_dataframe(self): 224 | from pandas import DataFrame 225 | 226 | key = self.get_key_field() 227 | if key: 228 | data = [self.item_dict(row) for row in self.values()] 229 | else: 230 | data = [self.item_dict(row) for row in self] 231 | df = DataFrame(data) 232 | if key: 233 | df.set_index(key, inplace=True) 234 | return df 235 | -------------------------------------------------------------------------------- /itertable/commands.py: -------------------------------------------------------------------------------- 1 | from . import load_file, load_url, flattened, JsonStringIter, CsvStringIter 2 | from .exceptions import IterException 3 | import click 4 | import os 5 | import importlib 6 | 7 | 8 | @click.command() 9 | @click.argument("source") 10 | @click.argument("source_options", required=False) 11 | @click.option("--format", "-f", default="csv", help="Output format") 12 | def cat(source, source_options, format): 13 | """ 14 | Display contents of a file or IterTable class. SOURCE can be either a 15 | filename or a Python path. SOURCE_OPTIONS is an optional string 16 | specifying init options in "name=value" format, separated by commas. 17 | 18 | The data will be printed to the terminal in CSV form, unless the format is 19 | set to JSON. 
20 | 21 | Examples: 22 | 23 | \b 24 | python3 -m itertable example.json # JSON to CSV 25 | python3 -m itertable -f json example.csv # CSV to JSON 26 | python3 -m itertable example.xlsx "start_row=5" 27 | python3 -m itertable http://example.com/example.csv 28 | python3 -m itertable itertable.CsvNetIter "url=http://example.com/example.csv" 29 | """ # noqa 30 | 31 | # Parse option string 32 | options = {} 33 | if source_options: 34 | for opt in source_options.split(","): 35 | key, val = opt.split("=") 36 | if val.isdigit(): 37 | val = int(val) 38 | options[key] = val 39 | 40 | if os.path.exists(source): 41 | try: 42 | input = load_file(source, options=options) 43 | except IterException as e: 44 | raise click.ClickException(str(e)) 45 | elif "http" in source and "://" in source: 46 | try: 47 | input = load_url(source, options=options) 48 | except IterException as e: 49 | raise click.ClickException(str(e)) 50 | else: 51 | parts = source.split(".") 52 | class_name = parts[-1] 53 | module_name = ".".join(parts[:-1]) 54 | try: 55 | module = importlib.import_module(module_name) 56 | Iter = getattr(module, class_name) 57 | input = flattened(Iter, **options) 58 | except (ImportError, ValueError, AttributeError, IterException) as e: 59 | raise click.ClickException(str(e)) 60 | 61 | if format == "json": 62 | OutputIter = JsonStringIter 63 | init = "[]" 64 | else: 65 | OutputIter = CsvStringIter 66 | init = "" 67 | output = OutputIter(data=input.data, string=init) 68 | output.data = input.data 69 | output.save() 70 | result = output.string 71 | if output.binary: 72 | result = result.decode("utf-8") 73 | print(result) 74 | -------------------------------------------------------------------------------- /itertable/exceptions.py: -------------------------------------------------------------------------------- 1 | try: 2 | from bs4 import BeautifulSoup 3 | except ImportError: 4 | BeautifulSoup = None 5 | 6 | 7 | class IterException(Exception): 8 | def __str__(self): 9 | if self.args 
and self.args[0]: 10 | return self.args[0] 11 | return self.__doc__ 12 | 13 | 14 | class LoadFailed(IterException): 15 | """Error loading data!""" 16 | 17 | def __init__(self, message, path=None, code=None): 18 | super(LoadFailed, self).__init__(message) 19 | self.path = path 20 | self.code = code 21 | 22 | def __str__(self): 23 | if self.args and self.args[0]: 24 | text = self.args[0] 25 | has_html = False 26 | for tag in " 1 and self.layer_id is None: 44 | cls = type(self) 45 | self.data = [ 46 | { 47 | "id": id, 48 | "name": name, 49 | "data": cls(filename=self.filename, layer_id=id), 50 | } 51 | for id, name in enumerate(self.layers) 52 | ] 53 | else: 54 | # One layer, load & parse GIS data 55 | with fiona.open(self.filename, layer=self.layer_id) as f: 56 | self.meta = f.meta 57 | if "id" in f.meta.get("schema", {}).get("properties", {}): 58 | # TODO: Is this correct? 59 | del f.meta["schema"]["properties"]["id"] 60 | self.data = list(map(self.parse_feature, f)) 61 | 62 | def parse_feature(self, f): 63 | # Flatten Fiona's GeoJSON-style representation into something more 64 | # amenable to namedtuple-ing 65 | feat = {key: value for key, value in f["properties"].items()} 66 | if "id" not in feat and "ID" not in feat: 67 | feat["id"] = f["id"] 68 | feat["geometry"] = f["geometry"] 69 | return feat 70 | 71 | def dump_feature(self, feat, i): 72 | # Undo aforementioned flattening 73 | return { 74 | "id": feat.get("id", feat.get("ID", i)), 75 | "geometry": feat["geometry"], 76 | "properties": { 77 | key: value 78 | for key, value in feat.items() 79 | if key 80 | not in ( 81 | "geometry", 82 | "id", 83 | ) 84 | }, 85 | } 86 | 87 | def dump(self): 88 | # Dump and save the dataset at the same time via Fiona 89 | pass 90 | 91 | def save(self): 92 | with fiona.open(self.filename, "w", **self.meta) as f: 93 | for i, feat in enumerate(self.data): 94 | f.write(self.dump_feature(feat, i)) 95 | 96 | 97 | class GisMapper(TupleMapper): 98 | """ 99 | GIS-aware tuple mapper 100 | 
""" 101 | 102 | def as_dataframe(self): 103 | # Mimic BaseIter.as_dataframe() but with GeoDataFrame 104 | # (also, key_field is always set) 105 | from geopandas import GeoDataFrame 106 | 107 | key = self.get_key_field() 108 | data = [self.item_dict(row) for row in self.values()] 109 | df = GeoDataFrame(data) 110 | df.set_index(key, inplace=True) 111 | return df 112 | 113 | def item_dict(self, uitem): 114 | # Turn usable item into GeoDataFrame-friendly dict 115 | data = uitem._asdict() 116 | data["geometry"] = geometry.shape(data["geometry"]) 117 | return data 118 | 119 | 120 | class ShapeMapper(GisMapper): 121 | """ 122 | Map Fiona's GeoJSON-style geometries to and from Shapely shapes 123 | """ 124 | 125 | def map_value(self, field, value): 126 | value = super(ShapeMapper, self).map_value(field, value) 127 | if field == "geometry": 128 | value = geometry.shape(value) 129 | return value 130 | 131 | def unmap_value(self, field, value): 132 | if field == "geometry": 133 | value = geometry.mapping(value) 134 | return super(ShapeMapper, self).unmap_value(field, value) 135 | 136 | def item_dict(self, uitem): 137 | return uitem._asdict() 138 | 139 | 140 | class WktMapper(ShapeMapper): 141 | """ 142 | Map geometries to and from WKT (good for Django integration) 143 | """ 144 | 145 | def map_value(self, field, value): 146 | value = super(WktMapper, self).map_value(field, value) 147 | if field == "geometry": 148 | value = wkt.dumps(value) 149 | return value 150 | 151 | def unmap_value(self, field, value): 152 | if field == "geometry": 153 | value = wkt.loads(value) 154 | return super(WktMapper, self).unmap_value(field, value) 155 | 156 | def item_dict(self, uitem): 157 | data = uitem._asdict() 158 | data["geometry"] = wkt.loads(data["geometry"]) 159 | return data 160 | 161 | 162 | def guess_driver(filename): 163 | if filename.endswith(".shp"): 164 | return "ESRI Shapefile" 165 | else: 166 | return "GeoJSON" 167 | 
-------------------------------------------------------------------------------- /itertable/loaders.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import requests 3 | from io import StringIO, BytesIO 4 | from .exceptions import LoadFailed 5 | from zipfile import ZipFile 6 | 7 | try: 8 | from .version import VERSION 9 | except ImportError: 10 | VERSION = "0.0.0" 11 | 12 | 13 | class BaseLoader(object): 14 | no_pickle_loader = ["file"] 15 | empty_file = None 16 | 17 | def load(self): 18 | raise NotImplementedError 19 | 20 | 21 | class FileLoader(BaseLoader): 22 | filename = None 23 | require_existing = True 24 | 25 | @property 26 | def read_mode(self): 27 | return "rb" if self.binary else "r" 28 | 29 | @property 30 | def write_mode(self): 31 | return "wb+" if self.binary else "w+" 32 | 33 | def load(self): 34 | try: 35 | self.file = open(self.filename, self.read_mode) 36 | self.empty_file = False 37 | except OSError as e: 38 | if self.require_existing: 39 | raise LoadFailed( 40 | e.strerror, 41 | path=self.filename, 42 | code=e.errno, 43 | ) 44 | elif self.binary: 45 | self.file = BytesIO() 46 | else: 47 | self.file = StringIO() 48 | self.empty_file = True 49 | 50 | def save(self): 51 | file = open(self.filename, self.write_mode) 52 | self.dump(file) 53 | file.close() 54 | 55 | 56 | class Zipper(object): 57 | inner_filename = None 58 | inner_binary = False 59 | 60 | def unzip_file(self): 61 | zipfile = ZipFile(self.file) 62 | inner_file = zipfile.read(self.get_inner_filename(zipfile)) 63 | if self.inner_binary: 64 | self.file = BytesIO(inner_file) 65 | else: 66 | self.file = StringIO(inner_file.decode("utf-8")) 67 | zipfile.fp.close() 68 | zipfile.close() 69 | 70 | def get_inner_filename(self, zipfile): 71 | if self.inner_filename: 72 | return self.inner_filename 73 | names = zipfile.namelist() 74 | if len(names) == 1: 75 | return names[0] 76 | 77 | zipfile.fp.close() 78 | 
zipfile.close() 79 | raise LoadFailed("Multiple Inner Files!") 80 | 81 | 82 | class ZipFileLoader(Zipper, FileLoader): 83 | binary = True 84 | 85 | def load(self): 86 | super(ZipFileLoader, self).load() 87 | self.unzip_file() 88 | 89 | 90 | class StringLoader(BaseLoader): 91 | string = "" 92 | 93 | @property 94 | def _io_class(self): 95 | return BytesIO if self.binary else StringIO 96 | 97 | def load(self): 98 | if self.binary and not self.string: 99 | self.string = b"" 100 | self.file = self._io_class(self.string) 101 | 102 | def save(self): 103 | file = self._io_class() 104 | self.dump(file) 105 | self.string = file.getvalue() 106 | file.close() 107 | 108 | 109 | class NetLoader(StringLoader): 110 | "NetLoader: opens HTTP/REST resources for use in IterTable" 111 | 112 | username = None 113 | password = None 114 | debug = False 115 | url = None 116 | client = requests 117 | 118 | @property 119 | def user_agent(self): 120 | return "IterTable/%s (%s)" % ( 121 | VERSION, 122 | requests.utils.default_user_agent(), 123 | ) 124 | 125 | @property 126 | def headers(self): 127 | return { 128 | "User-Agent": self.user_agent, 129 | } 130 | 131 | def load(self, **kwargs): 132 | result = self.GET() 133 | self.file = self._io_class(result) 134 | 135 | def req(self, url=None, method=None, params=None, body=None, headers={}): 136 | if url is None: 137 | url = self.url 138 | if url is None: 139 | raise LoadFailed("No URL provided") 140 | 141 | if params is None: 142 | params = getattr(self, "params", None) 143 | 144 | if isinstance(params, str): 145 | url += "?" + params 146 | params = None 147 | 148 | if self.debug: 149 | if params: 150 | from requests.compat import urlencode 151 | 152 | debug_url = url + "?" 
+ urlencode(params, doseq=True) 153 | else: 154 | debug_url = url 155 | self.debug_string = "%s: %s" % (method, debug_url) 156 | print(self.debug_string) 157 | 158 | if self.username is not None and self.password is not None: 159 | auth = (self.username, self.password) 160 | else: 161 | auth = None 162 | 163 | all_headers = self.headers.copy() 164 | all_headers.update(headers) 165 | 166 | resp = self.client.request( 167 | method, 168 | url, 169 | params=params, 170 | headers=all_headers, 171 | auth=auth, 172 | data=body, 173 | ) 174 | resp.connection.close() 175 | 176 | if resp.status_code < 200 or resp.status_code > 299: 177 | raise LoadFailed( 178 | resp.text, 179 | path=url, 180 | code=resp.status_code, 181 | ) 182 | 183 | if self.binary: 184 | return resp.content 185 | else: 186 | return resp.text 187 | 188 | def GET(self, **kwargs): 189 | return self.req(method="GET", **kwargs) 190 | 191 | def POST(self, **kwargs): 192 | return self.req(method="POST", **kwargs) 193 | 194 | def PUT(self, **kwargs): 195 | return self.req(method="PUT", **kwargs) 196 | 197 | def DELETE(self, **kwargs): 198 | return self.req(method="DELETE", **kwargs) 199 | 200 | 201 | class ZipNetLoader(Zipper, NetLoader): 202 | binary = True 203 | 204 | def load(self): 205 | super(ZipNetLoader, self).load() 206 | self.unzip_file() 207 | -------------------------------------------------------------------------------- /itertable/mappers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple, OrderedDict 2 | import re 3 | from datetime import datetime 4 | from .exceptions import NoData, MappingFailed 5 | from unicodedata import normalize 6 | 7 | 8 | class BaseMapper(object): 9 | def get_key_field(self): 10 | return self.map_field(self.key_field) 11 | 12 | def map_field(self, field): 13 | return field 14 | 15 | def map_value(self, field, value): 16 | return value 17 | 18 | def unmap_field(self, field): 19 | return field 20 | 21 | 
class DictMapper(BaseMapper):
    """Mapper that translates field names and string values via static
    lookup tables (field_map / value_map)."""

    # Keys are raw (file-side) names/values; values are the mapped
    # (usable-side) equivalents.  Unmapped entries pass through unchanged.
    field_map = {}
    value_map = {}

    def map_field(self, field):
        field = self.field_map[field] if field in self.field_map else field
        return field

    def map_value(self, field, value):
        # Only string values are translated; other types pass through.
        if not isinstance(value, str):
            return value
        value = self.value_map[value] if value in self.value_map else value
        return value

    def unmap_field(self, field):
        # Reverse lookup over field_map; first match wins.
        for f in self.field_map:
            if self.field_map[f] == field:
                return f
        return field

    def unmap_value(self, field, value):
        if not isinstance(value, str):
            return value
        # Reverse lookup over value_map; first match wins.
        for v in self.value_map:
            if self.value_map[v] == value:
                return v
        return value


class TupleMapper(DictMapper):
    """Mapper that exposes each row as a namedtuple, with field names
    sanitized into valid Python identifiers."""

    # Cached attributes that must not be pickled (regenerated on demand).
    no_pickle_mapper = ["_tuple_class", "_tuple_prototype"]

    @property
    def field_map(self):
        # Lazily build (and cache) raw-name -> identifier mapping.
        field_names = self.get_field_names()
        if not field_names and not getattr(self, "data", None):
            raise NoData

        # FIXME: check for duplicates
        if not hasattr(self, "_field_map"):
            items = [
                (field, self.tuple_field_name(field)) for field in field_names
            ]
            self._field_map = OrderedDict(items)
        return self._field_map

    def tuple_field_name(self, field):
        """Sanitize a raw field name into a namedtuple-safe identifier."""
        field = self.clean_field_name(field)
        field = re.sub(r"\W", "", field.lower())
        # normalize identifiers for consistency with namedtuple
        # http://bugs.python.org/issue23091
        field = normalize("NFKC", field)
        return field

    def clean_field_name(self, field):
        # Hook for subclasses to pre-process names before sanitization.
        return field

    @property
    def tuple_class(self):
        "Returns a class to use for individual items"

        if not hasattr(self, "_tuple_class"):
            cls = namedtuple(
                self.__class__.__name__ + "Tuple",
                list(self.field_map.values()),
            )
            self._tuple_class = cls

        return self._tuple_class

    @property
    def tuple_prototype(self):
        # All-None instance used as a template; rows are built via _replace.
        if not hasattr(self, "_tuple_prototype"):
            vals = {field: None for field in self.field_map.values()}
            self._tuple_prototype = self.tuple_class(**vals)
        return self._tuple_prototype

    def usable_item(self, item):
        """Convert a raw dict row into a namedtuple instance."""
        mapped = super(TupleMapper, self).usable_item(item)
        try:
            return self.tuple_prototype._replace(**mapped)
        except ValueError as e:
            # _replace raises ValueError on unexpected field names.
            raise MappingFailed(str(e))

    def parse_usable_item(self, uitem):
        """Convert a namedtuple row back into a raw dict."""
        mapped = {key: getattr(uitem, key) for key in self.field_map.values()}
        return super(TupleMapper, self).parse_usable_item(mapped)

    def item_dict(self, uitem):
        return uitem._asdict()

    def create(self, **kwargs):
        """Build a new row tuple, defaulting unspecified fields to None."""
        return self.tuple_prototype._replace(**kwargs)
def make_date_mapper(fmt):
    """
    Generate a function that maps strings to dates/times using *fmt*.

    ``fmt`` is either a ``strptime`` format string or the literal
    ``"iso8601"``.  Formats without a year component (``%Y``/``%y``)
    yield ``time`` objects; all others yield ``datetime`` objects.
    """
    # The original evaluated these checks on every call of the returned
    # mapper; they only depend on fmt, so decide once up front.
    if fmt == "iso8601":
        return parse_iso8601

    has_date = "%Y" in fmt or "%y" in fmt

    def mapper(val):
        val = datetime.strptime(val, fmt)
        # No year in the format means the caller wants a time-of-day.
        return val if has_date else val.time()

    return mapper
class SkipPreludeReader(csv.DictReader):
    """
    A specialized version of DictReader that attempts to find where the "real"
    CSV data is in a file that may contain a prelude of non-CSV text.
    """

    # Maximum number of rows scanned when looking for the header row.
    max_header_row = 20

    def __init__(
        self,
        f,
        fieldnames=None,
        restkey=None,
        restval=None,
        dialect="excel",
        *args,
        **kwds
    ):
        # Preserve file since we're going to start reading it
        self._file = f

        # Preserve reader options since we'll need to make another one
        readeropts = [f, dialect]
        readeropts.extend(args)
        self._readeropts = (readeropts, kwds)
        super().__init__(
            f, fieldnames, restkey, restval, dialect, *args, **kwds
        )

    @property
    def fieldnames(self):
        """Column names from the detected header row (computed lazily)."""
        if self._fieldnames is not None:
            return self._fieldnames

        # Create a new reader just to figure out which row is the header
        args, kwds = self._readeropts
        data = csv.reader(*args, **kwds)
        rows = []
        for _ in range(self.max_header_row):
            try:
                rows.append(next(data))
            except StopIteration:
                # FIX: was `pass`, which kept polling the exhausted reader
                # for the remaining iterations; stop at end of input.
                break
        header_row, field_names = self.choose_header(rows)

        # Reset file and advance reader so it starts in the right spot
        if hasattr(self._file, "seek"):
            self._file.seek(0)
        for _ in range(header_row + 1):
            try:
                next(self.reader)
            except StopIteration:
                break  # FIX: same as above — no point advancing past EOF

        self._fieldnames = field_names
        self._header_row = header_row
        return field_names

    @property
    def header_row(self):
        """Index of the detected header row."""
        self.fieldnames  # used for side effect (sets self._header_row)
        return self._header_row

    def choose_header(self, rows):
        """
        Determine which row contains column headers from the provided set.
        Default is to assume that the first longest row is the header.
        """
        header_row = 0
        field_names = []

        # Select header from available rows
        for i, row in enumerate(rows):
            if len(row) > len(field_names):
                header_row = i
                field_names = row
        return header_row, field_names
class JsonParser(BaseParser):
    """Parser for JSON documents.

    ``namespace`` is an optional dotted path locating the list of items
    inside the document; ``indent`` controls output formatting on dump.
    """

    indent = None
    namespace = None
    binary = False

    def parse(self):
        """Load JSON from self.file into self.data (a list of items)."""
        try:
            document = json.load(self.file)
            if self.namespace:
                # Descend through the dotted namespace to the item list.
                for part in self.namespace.split("."):
                    document = document[part]
            self.data = [self.parse_item(entry) for entry in document]
        except ValueError:
            raise ParseFailed

    def parse_item(self, item):
        """Hook for subclasses to transform each incoming item."""
        return item

    def dump(self, file=None):
        """Serialize self.data (re-wrapped in namespace, if any) to file."""
        if file is None:
            file = self.file
        document = [self.dump_item(entry) for entry in self.data]
        if self.namespace:
            # Rebuild the nesting removed during parse(), innermost first.
            for part in reversed(self.namespace.split(".")):
                document = {part: document}
        json.dump(document, file, indent=self.indent)

    def dump_item(self, item):
        """Hook for subclasses to transform each outgoing item."""
        return item
    def parse(self):
        """Parse the loaded workbook into self.data.

        With sheet_name=None, yields one record per sheet, each holding a
        nested parser instance; otherwise parses the selected sheet,
        auto-detecting the header row when not configured.
        """
        if not self.workbook:
            self.parse_workbook()

        if self.sheet_name is None:
            # Multi-sheet mode: one nested parser per worksheet, sharing
            # the already-loaded workbook.
            SpreadsheetIter = type(self)
            self.data = [
                {
                    "name": name,
                    "data": SpreadsheetIter(
                        loaded=True,
                        workbook=self.workbook,
                        sheet_name=name,
                    ),
                }
                for name in self.sheet_names
            ]
            return

        # An integer sheet_name is a positional index into sheet_names.
        sheet_name = self.sheet_name
        if isinstance(self.sheet_name, int):
            sheet_name = self.sheet_names[sheet_name]

        self.parse_worksheet(sheet_name)

        if self.header_row is None:
            if self.start_row is not None:
                # Data start is known; assume the header directly precedes.
                self.header_row = self.start_row - 1
            else:
                # Auto-detect: scanning bottom-up with >=, pick the
                # earliest row (within max_header_row) that has the most
                # non-empty cells.
                self.column_count = 0

                def checkval(cell):
                    if cell.value is not None and cell.value != "":
                        return True
                    return False

                search_rows = min(len(self.worksheet) - 1, self.max_header_row)
                for row in range(search_rows, -1, -1):
                    count = len(list(filter(checkval, self.worksheet[row])))
                    if count >= self.column_count:
                        self.column_count = count
                        self.header_row = row

        if self.header_row is None:
            # Empty worksheet: nothing to parse.
            return

        if self.start_row is None:
            self.start_row = self.header_row + 1

        if self.field_names is None:
            # Header may span multiple rows; join the pieces with newlines.
            rows = self.worksheet[self.header_row : self.start_row]
            self.field_names = [
                str(c.value) or "c%s" % i for i, c in enumerate(rows[0])
            ]
            for row in rows[1:]:
                for i, c in enumerate(row):
                    self.field_names[i] += "\n" + str(c.value)

            # De-duplicate names by appending the column index.
            seen_fields = set()
            for i, field in enumerate(self.field_names):
                if field in seen_fields:
                    field += str(i)
                    self.field_names[i] = field
                seen_fields.add(field)

        self.data = list(map(self.parse_row, self.worksheet[self.start_row :]))

        # Preserve any non-empty cells found above the header row.
        self.extra_data = {}
        if self.header_row > 0:
            for r in range(0, self.header_row):
                for c, cell in enumerate(self.worksheet[r]):
                    val = self.get_value(cell)
                    if val is not None and val != "":
                        self.extra_data.setdefault(r, {})
                        self.extra_data[r][c] = val
    def parse_row(self, row):
        """Build a dict for one worksheet row, keyed by field name.

        Rows shorter than the header simply omit the trailing fields.
        """
        return {
            name: self.get_value(row[i])
            for i, name in enumerate(self.get_field_names())
            if i < len(row)
        }

    def get_value(self, cell):
        # Subclasses convert a backend-specific cell object to a value.
        raise NotImplementedError

    def dump(self, file=None):
        """Write field names (row 0) and data rows via the subclass's
        (write, close) worksheet writer pair."""
        if file is None:
            file = self.file
        write, close = self.open_worksheet(file)
        for i, field in enumerate(self.field_names):
            write(0, i, field)
        for r, row in enumerate(self.data):
            for c, field in enumerate(self.field_names):
                write(r + 1, c, row[field])
        close()

    def calc_width(self, val):
        """Approximate the display width of a cell value by weighting
        characters by typical glyph width.

        The 1.4 scale suits openpyxl's column units; OldExcelParser
        overrides this without the scale factor for xlwt.
        """
        val = str(val) if val is not None else ""
        size = 0
        for c in val:
            if c in ".,;:'\"iIlt1":
                size += 0.5
            elif c in "MW":
                size += 1.3
            elif c.isupper():
                size += 1.2
            elif c.islower():
                size += 1
            else:
                size += 1.1
        return size * 1.4
class ExcelParser(WorkbookParser):
    """Parser for .xlsx workbooks, backed by openpyxl."""

    def parse_workbook(self):
        import openpyxl

        # data_only=True reads cached formula results instead of formulas.
        self.workbook = openpyxl.open(self.file, data_only=True)

    @property
    def sheet_names(self):
        return self.workbook.sheetnames

    def get_sheet_by_name(self, name):
        return self.workbook[name]

    def parse_worksheet(self, name):
        """Materialize the named sheet as a list of cell rows."""
        worksheet = self.get_sheet_by_name(name)
        self.worksheet = [row for row in worksheet.rows]

    def get_value(self, cell):
        """Extract a cell value, collapsing midnight datetimes to dates."""
        value = cell.internal_value
        if isinstance(value, datetime.datetime):
            if value.time() == datetime.time(0, 0):
                return value.date()
        return value

    def open_worksheet(self, file):
        """Return (write, close) callables for dumping to *file*."""
        from openpyxl import Workbook, styles, utils

        workbook = Workbook()
        worksheet = workbook.active

        formats = {
            datetime.date: styles.NamedStyle(
                name="date",
                number_format=self.date_format,
            ),
            datetime.time: styles.NamedStyle(
                name="time",
                number_format=self.time_format,
            ),
            datetime.datetime: styles.NamedStyle(
                name="datetime",
                number_format=self.datetime_format,
            ),
            "header": styles.NamedStyle(
                name="header",
                font=styles.Font(bold=True),
                border=styles.Border(bottom=styles.Side(style="thick")),
            ),
        }
        widths = {}

        def write(r, c, val):
            # Track per-column width so close() can size the columns.
            widths.setdefault(c, 0)
            widths[c] = max(widths[c], self.calc_width(val))
            cell = worksheet.cell(r + 1, c + 1, val)

            fmt = formats.get(type(val))
            if fmt:
                cell.style = fmt
            elif r == 0:
                cell.style = formats["header"]

        def close():
            for c, width in widths.items():
                col = utils.get_column_letter(c + 1)
                worksheet.column_dimensions[col].width = width
            # FIX: save to the file object that was passed to
            # open_worksheet() (matching OldExcelParser), not
            # self.filename — dump() may be called with an in-memory
            # buffer that has no corresponding filename.
            workbook.save(file)

        return write, close
def guess_type(filename, buffer=None):
    """Guess the MIME type of *filename*.

    Tries the extension first (via mimetypes); if that fails and
    python-magic is installed, falls back to content sniffing using
    *buffer* (the first bytes of the file) or the file itself.
    Returns None when the type cannot be determined.
    """
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is not None:
        return mimetype

    try:
        import magic
    except ImportError:
        # Content sniffing is optional; give up quietly without magic.
        return None

    if buffer:
        mimetype = magic.from_buffer(buffer, mime=True)
        if mimetype == "text/plain":
            # FIX: load_file() may pass a bytes buffer, and
            # bytes.startswith("{") raises TypeError; normalize to text
            # before inspecting the leading characters.
            if isinstance(buffer, bytes):
                text = buffer.decode("utf-8", "replace")
            else:
                text = buffer
            if text.startswith(("{", "[")):
                mimetype = "application/json"
            elif text.startswith("<"):
                mimetype = "application/xml"
            elif "," in text:
                mimetype = "text/csv"
    else:
        mimetype = magic.from_file(filename, mime=True)
    return mimetype
def load_url(url, mapper=TupleMapper, options=None):
    """Create an Iter for the resource at *url*, choosing the parser from
    the URL's apparent MIME type.

    Raises ParseFailed when no parser matches.
    """
    # FIX: options previously defaulted to a shared mutable {}; use None
    # for consistency with load_file() and to avoid cross-call mutation.
    if options is None:
        options = {}
    mimetype = guess_type(url)
    if mimetype not in PARSERS:
        raise ParseFailed("Could not determine parser for %s" % mimetype)
    parser = PARSERS[mimetype]
    loader = NetLoader
    Iter = make_iter(loader, parser, mapper)
    return Iter(url=url, **options)


def load_string(string, mapper=TupleMapper, options=None):
    """Create an Iter for an in-memory *string*, sniffing XML/JSON/CSV
    from its leading characters.

    Raises Exception when the format cannot be determined.
    """
    # FIX: same shared-mutable-default pitfall as load_url above.
    if options is None:
        options = {}
    if string.startswith("<"):
        parser = XmlParser
    elif string.startswith("[") or (
        string.startswith("{") and "namespace" in options
    ):
        parser = JsonParser
    elif "," in string:
        parser = CsvParser
    else:
        raise Exception("Could not determine parser for string!")

    loader = StringLoader
    Iter = make_iter(loader, parser, mapper)
    if Iter.binary:
        string = string.encode("utf-8")
    return Iter(string=string, **options)
def flattened(iter_class, *args, **kwargs):
    """Instantiate *iter_class*, wrapping it in a FlatIter when the class
    reports itself as nested (so callers always get a flat iterable)."""
    if not iter_class.nested:
        return iter_class(*args, **kwargs)
    return FlatIter(*args, iter_class=iter_class, **kwargs)
11 | readme = "README.md" 12 | requires-python = ">=3.7" 13 | license = {text = "MIT" } 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.7", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Intended Audience :: Science/Research", 25 | "Intended Audience :: Developers", 26 | "Topic :: Text Processing :: Markup :: XML", 27 | "Topic :: Scientific/Engineering :: GIS", 28 | "Topic :: Utilities", 29 | ] 30 | dependencies = [ 31 | "requests", 32 | "openpyxl", 33 | "click" 34 | ] 35 | 36 | [project.urls] 37 | Homepage = "https://django-data-wizard.wq.io/itertable/" 38 | Documentation = "https://django-data-wizard.wq.io/itertable/" 39 | Source = "https://github.com/wq/itertable/" 40 | "Release Notes" = "https://github.com/wq/itertable/releases" 41 | Issues = "https://github.com/wq/itertable/issues" 42 | CI = "https://github.com/wq/itertable/actions/workflows/test.yml" 43 | 44 | [project.optional-dependencies] 45 | gis = ["Fiona", "geopandas"] 46 | pandas = ["pandas"] 47 | oldexel = ["xlrd", "xlwt"] 48 | 49 | [tool.setuptools] 50 | packages = ["itertable", "itertable.parsers", "itertable.gis"] 51 | 52 | [tool.setuptools_scm] 53 | write_to = "itertable/version.py" 54 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/__init__.py -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | from os.path 
class IterTestCase(unittest.TestCase):
    """Shared fixtures and assertion helpers for the itertable test suite."""

    # Canonical two-row dataset that most fixture files mirror.
    data = [
        {
            "one": 1,
            "two": 2,
            "three": 3,
        },
        {
            "one": 4,
            "two": 5,
            "three": 6,
        },
    ]

    def get_filename(self, filename, ext, remove_existing=False):
        """Return tests/files/<filename>.<ext>, optionally deleting it first."""
        path = join(dirname(__file__), "files", "%s.%s" % (filename, ext))
        if remove_existing:
            try:
                unlink(path)
            except OSError:
                # Missing file is fine; we only want a clean slate.
                pass
        return path

    def check_instance(self, instance):
        """Assert that *instance* yields exactly the rows in self.data.

        Values read back from text formats may be strings (or floats), so
        each value is coerced through int(float(...)) before comparison
        when possible.
        """
        self.assertEqual(len(instance), len(self.data))

        for row, expected in zip(instance, self.data):
            for key, want in expected.items():
                value = getattr(row, key)
                try:
                    value = int(float(value))
                except ValueError:
                    pass
                self.assertEqual(value, want)
3 | "data": { 4 | "info": "Items Array", 5 | "items": [ 6 | { 7 | "one": 1, 8 | "two": 2, 9 | "three": 3 10 | }, 11 | { 12 | "one": 4, 13 | "two": 5, 14 | "three": 6, 15 | "four": "extra" 16 | } 17 | ] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /tests/files/extra.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/extra.xlsx -------------------------------------------------------------------------------- /tests/files/nodata.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/nodata.csv -------------------------------------------------------------------------------- /tests/files/nodata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/nodata.xlsx -------------------------------------------------------------------------------- /tests/files/noextra.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/noextra.xlsx -------------------------------------------------------------------------------- /tests/files/test.csv: -------------------------------------------------------------------------------- 1 | one,two,three 2 | 1,2,3 3 | 4,5,6 4 | -------------------------------------------------------------------------------- /tests/files/test.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.dbf 
-------------------------------------------------------------------------------- /tests/files/test.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "id": "one", 7 | "properties": { 8 | "one": 1, 9 | "two": 2, 10 | "three": 3 11 | }, 12 | "geometry": { 13 | "type": "Polygon", 14 | "coordinates": [ 15 | [ 16 | [ 17 | -93.28044891357422, 18 | 44.9771852553236 19 | ], 20 | [ 21 | -93.28611373901367, 22 | 44.972084916104706 23 | ], 24 | [ 25 | -93.27701568603516, 26 | 44.9715991458543 27 | ], 28 | [ 29 | -93.27220916748047, 30 | 44.9810709235921 31 | ], 32 | [ 33 | -93.27924728393555, 34 | 44.983985001986305 35 | ], 36 | [ 37 | -93.28044891357422, 38 | 44.9771852553236 39 | ] 40 | ] 41 | ] 42 | } 43 | }, 44 | { 45 | "type": "Feature", 46 | "id": "two", 47 | "properties": { 48 | "one": 4, 49 | "two": 5, 50 | "three": 6 51 | }, 52 | "geometry": { 53 | "type": "Polygon", 54 | "coordinates": [ 55 | [ 56 | [ 57 | -93.25349807739258, 58 | 44.968927335931234 59 | ], 60 | [ 61 | -93.25349807739258, 62 | 44.977670978257756 63 | ], 64 | [ 65 | -93.24045181274414, 66 | 44.977670978257756 67 | ], 68 | [ 69 | -93.24045181274414, 70 | 44.968927335931234 71 | ], 72 | [ 73 | -93.25349807739258, 74 | 44.968927335931234 75 | ] 76 | ] 77 | ] 78 | } 79 | } 80 | ] 81 | } 82 | -------------------------------------------------------------------------------- /tests/files/test.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "one": 1, 4 | "two": 2, 5 | "three": 3 6 | }, 7 | { 8 | "one": 4, 9 | "two": 5, 10 | "three": 6 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /tests/files/test.prj: -------------------------------------------------------------------------------- 1 | 
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]] -------------------------------------------------------------------------------- /tests/files/test.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.shp -------------------------------------------------------------------------------- /tests/files/test.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.shx -------------------------------------------------------------------------------- /tests/files/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.xls -------------------------------------------------------------------------------- /tests/files/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wq/itertable/c996d18c533b727b2ca8dd33915e1eed7820280f/tests/files/test.xlsx -------------------------------------------------------------------------------- /tests/files/test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1 5 | 2 6 | 3 7 | 8 | 9 | 4 10 | 5 11 | 6 12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/files/test2.csv: -------------------------------------------------------------------------------- 1 | Non-CSV Header, to test SkipPreludeReader 2 | Name: Test 3 | 4 | one,two,three 5 | 1,2,3 6 | 4,5,6 7 | -------------------------------------------------------------------------------- /tests/files/test3.csv: 
class CustomJsonFileIter(JsonFileIter):
    # Read records from the nested "data" -> "items" array instead of the
    # JSON document root (see tests/files/custom.json).
    namespace = "data.items"


class ExtraJsonFileIter(CustomJsonFileIter):
    # Scan every record for field names, so rows that introduce extra keys
    # (e.g. "four" in custom2.json) still map cleanly onto the namedtuple.
    scan_fields = True


class CustomXmlFileIter(XmlFileIter):
    # Read <item> elements nested under an <items> element
    # (see tests/files/custom.xml).
    root_tag = "items"
    item_tag = "item"
class LoadFileTestCase(IterTestCase):
    """Exercise as_dataframe() on in-memory and CSV-backed Iters."""

    def setUp(self):
        self.csv_data = "one,two,three\n1,2,3\n4,5,6"

    def test_base_dataframe(self):
        # Default (unkeyed) Iter: dataframe keeps integer positions.
        frame = BaseIter(data=self.data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame[frame.two == 2].three[0], 3)

    def test_index_dataframe(self):
        # key_field should become the dataframe index.
        class KeyIter(BaseIter):
            key_field = "one"

        frame = KeyIter(data=self.data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame.loc[4].three, 6)

    def test_csv_dataframe(self):
        # CSV values stay strings unless explicitly converted.
        frame = load_string(self.csv_data).as_dataframe()
        self.assertEqual(len(frame), 2)
        self.assertEqual(frame[frame.two == "2"].three[0], "3")
class LoadFileTestCase(IterTestCase):
    """Verify that rows above start_row are exposed as extra_data."""

    def test_extra_data(self):
        instance = ExtraDataIter(filename=self.get_filename("extra", "xlsx"))
        self.check_instance(instance)

        # The prelude rows (name/type/date header) end up in extra_data.
        extra = instance.extra_data
        self.assertEqual(extra[0][0], "Name")
        self.assertEqual(extra[0][1], "Test")
        self.assertEqual(extra[1][0], "Type")
        self.assertEqual(extra[1][1], "Test")
        self.assertEqual(extra[0][3], "Date")
        self.assertEqual(extra[0][4], date(2014, 12, 12))

    def test_no_extra_data(self):
        # Loading the file that *has* a prelude first ensures no state leaks
        # between instances.
        ExtraDataIter(filename=self.get_filename("extra", "xlsx"))
        instance = ExtraDataIter(filename=self.get_filename("noextra", "xlsx"))
        self.check_instance(instance)
        self.assertFalse(instance.extra_data)

    def check_instance(self, instance):
        """Like the base check, but coerce digit strings from Excel to int."""
        self.assertEqual(len(instance), len(self.data))

        for row, expected in zip(instance, self.data):
            for key, want in expected.items():
                value = getattr(row, key)
                if isinstance(value, str) and value.isdigit():
                    value = int(value)
                self.assertEqual(value, want)
    def check_instance(self, instance):
        # Override the base check: in addition to the attribute values,
        # verify each row's geometry against a known reference point.
        self.assertEqual(len(instance), len(self.data))

        for row, data, point in zip(instance.values(), self.data, self.points):
            for key in data:
                val = getattr(row, key)
                try:
                    # Shapefile/GeoJSON attributes may come back as strings;
                    # coerce to int where possible before comparing.
                    val = int(val)
                except ValueError:
                    pass
                self.assertEqual(val, data[key])
            # Each test polygon should contain its expected reference point.
            self.assertTrue(row.geometry.contains(point))
self.assertGreater(df.geometry.area.sum(), 0) 20 | -------------------------------------------------------------------------------- /tests/test_load_file.py: -------------------------------------------------------------------------------- 1 | from itertable import load_file 2 | from itertable.exceptions import LoadFailed, NoData 3 | from .base import IterTestCase 4 | import unittest 5 | import pickle 6 | import io 7 | 8 | try: 9 | import magic 10 | except ImportError: 11 | magic = None 12 | 13 | 14 | class LoadFileTestCase(IterTestCase): 15 | def setUp(self): 16 | self.types = ("csv", "json", "xml", "xls", "xlsx") 17 | 18 | def test_load_file(self): 19 | for ext in self.types: 20 | filename = self.get_filename("test", ext) 21 | instance = load_file(filename) 22 | self.check_instance(instance) 23 | 24 | def test_load_file_object(self): 25 | for ext in self.types: 26 | filename = self.get_filename("test", ext) 27 | if ext in ("xls", "xlsx"): 28 | mode = "rb" 29 | else: 30 | mode = "r" 31 | with open(filename, mode) as f: 32 | instance = load_file(f) 33 | self.check_instance(instance) 34 | 35 | def test_load_file_object_binary(self): 36 | for ext in self.types: 37 | filename = self.get_filename("test", ext) 38 | with open(filename, "rb") as f: 39 | instance = load_file(f) 40 | self.check_instance(instance) 41 | 42 | @unittest.skipUnless(magic, "magic required for buffer-based detection") 43 | def test_load_file_object_no_name(self): 44 | for ext in self.types: 45 | filename = self.get_filename("test", ext) 46 | if ext in ("xls", "xlsx"): 47 | mode = "rb" 48 | IO = io.BytesIO 49 | else: 50 | mode = "r" 51 | IO = io.StringIO 52 | 53 | with open(filename, mode) as f: 54 | obj = IO(f.read()) 55 | 56 | instance = load_file(obj) 57 | self.check_instance(instance) 58 | 59 | def test_load_file_like(self): 60 | class FileLike: 61 | name = "test.csv" 62 | 63 | def read(self, *args, **kwargs): 64 | return "one,two,three\n1,2,3\n4,5,6" 65 | 66 | def __iter__(self): 67 | yield 
    def test_load_csv_unicode(self):
        # Load a CSV whose header includes a non-ASCII column name.
        filename = self.get_filename("test3", "csv")
        instance = load_file(filename)
        self.check_instance(instance)
        # NOTE(review): the fixture header appears to use U+00B5 (micro sign)
        # while this lookup uses U+03BC (Greek mu); this presumably relies on
        # NFKC normalization of generated identifiers -- confirm.
        self.assertTrue(hasattr(instance[0], "μ"))
        self.assertEqual(instance[0].μ, "test")
self.assertRaises(LoadFailed) as cm: 122 | load_file(filename) 123 | self.assertEqual(str(cm.exception), "No such file or directory") 124 | 125 | def test_load_init_empty(self): 126 | filename = self.get_filename("nonexisting", "csv") 127 | instance = load_file(filename, options={"require_existing": False}) 128 | with self.assertRaises(NoData) as cm: 129 | instance[0] 130 | self.assertEqual(str(cm.exception), "No data returned!") 131 | 132 | def test_pickle(self): 133 | for ext in self.types: 134 | filename = self.get_filename("test", ext) 135 | instance = load_file(filename) 136 | instance = pickle.loads(pickle.dumps(instance)) 137 | self.check_instance(instance) 138 | 139 | def test_auto_pickle(self): 140 | for ext in self.types: 141 | filename = self.get_filename("test", ext) 142 | instance = load_file(filename) 143 | # Run through the io once to ensure auto-generated data is present 144 | self.check_instance(instance) 145 | instance = pickle.loads(pickle.dumps(instance)) 146 | self.check_instance(instance) 147 | -------------------------------------------------------------------------------- /tests/test_netloader.py: -------------------------------------------------------------------------------- 1 | import httpretty 2 | from itertable import CsvNetIter, load_url 3 | from itertable.exceptions import LoadFailed 4 | import pickle 5 | from .base import IterTestCase 6 | 7 | 8 | class TestIter(CsvNetIter): 9 | url = "http://example.com/test.csv" 10 | 11 | 12 | class NetLoaderTestCase(IterTestCase): 13 | def setUp(self): 14 | httpretty.enable() 15 | 16 | httpretty.register_uri( 17 | httpretty.GET, 18 | "http://example.com/test.csv", 19 | body="one,two,three\n1,2,3\n4,5,6", 20 | content_type="text/csv", 21 | ) 22 | httpretty.register_uri( 23 | httpretty.GET, 24 | "http://example.com/fail.txt", 25 | body="Not Found", 26 | content_type="text/plain", 27 | status=404, 28 | ) 29 | httpretty.register_uri( 30 | httpretty.GET, 31 | "http://example.com/fail.html", 32 | 
body="Not Found", 33 | content_type="text/html", 34 | status=404, 35 | ) 36 | 37 | def tearDown(self): 38 | httpretty.disable() 39 | httpretty.reset() 40 | 41 | def test_load_csv(self): 42 | self.check_instance(TestIter()) 43 | 44 | def test_load_url(self): 45 | self.check_instance(load_url("http://example.com/test.csv")) 46 | 47 | def test_load_csv_params(self): 48 | self.check_instance(TestIter(params={"test": 1})) 49 | qs = httpretty.last_request().querystring 50 | self.assertEqual(qs, {"test": ["1"]}) 51 | 52 | self.check_instance(TestIter(params="test=1")) 53 | qs = httpretty.last_request().querystring 54 | self.assertEqual(qs, {"test": ["1"]}) 55 | 56 | self.check_instance(TestIter(params=None)) 57 | qs = httpretty.last_request().querystring 58 | self.assertEqual(qs, {}) 59 | 60 | def test_debug_string(self): 61 | instance = TestIter(debug=True) 62 | self.assertEqual( 63 | instance.debug_string, "GET: http://example.com/test.csv" 64 | ) 65 | instance = TestIter(params={"test": 1}, debug=True) 66 | self.assertEqual( 67 | instance.debug_string, "GET: http://example.com/test.csv?test=1" 68 | ) 69 | 70 | def test_load_csv_auth(self): 71 | class AuthTestIter(CsvNetIter): 72 | url = "http://example.com/test.csv" 73 | username = "user" 74 | password = "pass" 75 | 76 | self.check_instance(AuthTestIter()) 77 | headers = httpretty.last_request().headers 78 | auth = "Basic dXNlcjpwYXNz" # b64encode("user:pass") 79 | self.assertEqual(headers.get("Authorization", None), auth) 80 | 81 | def test_load_csv_pickle(self): 82 | instance = TestIter() 83 | self.check_instance(instance) 84 | instance = pickle.loads(pickle.dumps(instance)) 85 | self.check_instance(instance) 86 | 87 | def test_load_fail(self): 88 | class TestIter(CsvNetIter): 89 | url = "http://example.com/fail.txt" 90 | 91 | with self.assertRaises(LoadFailed) as cm: 92 | TestIter() 93 | self.assertEqual(str(cm.exception), "Not Found") 94 | 95 | def test_load_fail_html(self): 96 | class TestIter(CsvNetIter): 97 | 
url = "http://example.com/fail.html" 98 | 99 | with self.assertRaises(LoadFailed) as cm: 100 | TestIter() 101 | self.assertEqual(str(cm.exception), "Not Found") 102 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | from itertable import load_file 2 | from itertable import ( 3 | CsvFileIter, 4 | JsonFileIter, 5 | XmlFileIter, 6 | OldExcelFileIter, 7 | ExcelFileIter, 8 | ) 9 | from .base import IterTestCase 10 | 11 | 12 | class LoadFileTestCase(IterTestCase): 13 | def setUp(self): 14 | self.data = [ 15 | { 16 | "one": 1, 17 | "two": 2, 18 | "three": 3, 19 | }, 20 | { 21 | "one": 4, 22 | "two": 5, 23 | "three": 6, 24 | }, 25 | ] 26 | self.types = ("csv", "json", "xml", "xls", "xlsx") 27 | self.classes = ( 28 | CsvFileIter, 29 | JsonFileIter, 30 | XmlFileIter, 31 | OldExcelFileIter, 32 | ExcelFileIter, 33 | ) 34 | 35 | def test_write_file(self): 36 | """ 37 | Test BaseIter.save() when starting from an empty Iter instance 38 | """ 39 | for ext, cls in zip(self.types, self.classes): 40 | filename = self.get_filename("output", ext, True) 41 | 42 | # Create an empty instance of the class 43 | instance = cls( 44 | filename=filename, 45 | require_existing=False, 46 | field_names=["one", "two", "three"], 47 | # These only apply to XmlFileIter, will be ignored by others 48 | root_tag="root", 49 | item_tag="item", 50 | ) 51 | 52 | # Add rows to the instance using list-style BaseIter.append() 53 | for row in self.data: 54 | instance.append(instance.create(**row)) 55 | 56 | # Save the instance, which should write to output.[ext] 57 | instance.save() 58 | 59 | # The contents of the saved file should match the original data 60 | self.check_instance(load_file(filename)) 61 | 62 | def duplicate(self, mode, xform): 63 | """ 64 | Test BaseIter.copy/sync() (and implicit save()) between combinations of 65 | the default Iter classes. 
66 | """ 67 | for source_ext, source_cls in zip(self.types, self.classes): 68 | for dest_ext, dest_cls in zip(self.types, self.classes): 69 | source_file = self.get_filename("test", source_ext) 70 | dest_file = self.get_filename(mode, dest_ext, True) 71 | 72 | # Sync requires key_field to be set on both classes 73 | source_cls = xform(source_cls) 74 | dest_cls = xform(dest_cls) 75 | 76 | # Load source data into Iter instance 77 | source_instance = source_cls(filename=source_file) 78 | 79 | # Create empty instance of the destination Iter class 80 | dest_instance = dest_cls( 81 | filename=dest_file, 82 | require_existing=False, 83 | field_names=["one", "two", "three"], 84 | root_tag="root", 85 | item_tag="item", 86 | ) 87 | 88 | # The Sync 89 | getattr(source_instance, mode)(dest_instance) 90 | 91 | # Load the destination file again and check contents 92 | self.check_instance(load_file(dest_file)) 93 | 94 | def test_copy_io(self): 95 | self.duplicate("copy", lambda d: d) 96 | 97 | def test_sync_io(self): 98 | self.duplicate("sync", self.with_key_field) 99 | 100 | def with_key_field(self, cls): 101 | class new_class(cls): 102 | key_field = "one" 103 | 104 | new_class.__name__ = "Dict" + cls.__name__ 105 | return new_class 106 | -------------------------------------------------------------------------------- /tests/test_zip.py: -------------------------------------------------------------------------------- 1 | from itertable import ( 2 | ZipFileLoader, 3 | ZipNetLoader, 4 | CsvParser, 5 | ExcelParser, 6 | TupleMapper, 7 | BaseIter, 8 | ) 9 | from .base import IterTestCase 10 | from itertable.exceptions import LoadFailed 11 | import httpretty 12 | 13 | 14 | class CsvZipFileIter(ZipFileLoader, CsvParser, TupleMapper, BaseIter): 15 | inner_binary = False 16 | 17 | 18 | class ExcelZipFileIter(ZipFileLoader, ExcelParser, TupleMapper, BaseIter): 19 | inner_binary = True 20 | 21 | 22 | class CsvZipNetIter(ZipNetLoader, CsvParser, TupleMapper, BaseIter): 23 | url = 
"http://example.com/testcsv.zip" 24 | inner_binary = False 25 | 26 | 27 | class ExcelZipNetIter(ZipNetLoader, ExcelParser, TupleMapper, BaseIter): 28 | url = "http://example.com/testxlsx.zip" 29 | inner_binary = True 30 | 31 | 32 | class ZipFileTestCase(IterTestCase): 33 | def test_csv_zip(self): 34 | filename = self.get_filename("testcsv", "zip") 35 | instance = CsvZipFileIter(filename=filename) 36 | self.check_instance(instance) 37 | 38 | def test_xlsx_zip(self): 39 | filename = self.get_filename("testxlsx", "zip") 40 | instance = ExcelZipFileIter(filename=filename) 41 | self.check_instance(instance) 42 | 43 | def test_multi_zip(self): 44 | filename = self.get_filename("testmulti", "zip") 45 | with self.assertRaises(LoadFailed) as cm: 46 | CsvZipFileIter(filename=filename) 47 | self.assertEqual(str(cm.exception), "Multiple Inner Files!") 48 | 49 | def test_multi_zip_name(self): 50 | filename = self.get_filename("testmulti", "zip") 51 | instance = CsvZipFileIter(filename=filename, inner_filename="test.csv") 52 | self.check_instance(instance) 53 | 54 | 55 | class NetZipFileTestCase(IterTestCase): 56 | def setUp(self): 57 | httpretty.enable() 58 | self.register_url("testcsv") 59 | self.register_url("testxlsx") 60 | 61 | def register_url(self, name): 62 | filename = self.get_filename(name, "zip") 63 | zipfile = open(filename, "rb") 64 | zipdata = zipfile.read() 65 | zipfile.close() 66 | httpretty.register_uri( 67 | httpretty.GET, 68 | "http://example.com/%s.zip" % name, 69 | body=zipdata, 70 | content_type="application/zip", 71 | ) 72 | 73 | def tearDown(self): 74 | httpretty.disable() 75 | httpretty.reset() 76 | 77 | def test_load_zip(self): 78 | self.check_instance(CsvZipNetIter()) 79 | 80 | def test_xlsx_zip(self): 81 | self.check_instance(ExcelZipNetIter()) 82 | --------------------------------------------------------------------------------