├── .coveragerc ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── dev-requirements.txt ├── docs ├── Makefile ├── _static │ └── theme_override.css ├── api.rst ├── conf.py ├── genindex.rst ├── index.rst ├── make.bat ├── numerical.rst ├── requirements.txt ├── testing.rst └── versions.rst ├── setup.cfg ├── setup.py ├── tafra ├── __init__.py ├── base.py ├── csvreader.py ├── formatter.py ├── group.py ├── protocol.py ├── py.typed └── version.py └── test ├── __init__.py ├── ex1.csv ├── ex2.csv ├── ex3.csv ├── ex4.csv ├── ex5.csv ├── ex6.csv ├── test.bat ├── test.sh └── test_tafra.py /.coveragerc: -------------------------------------------------------------------------------- 1 | 2 | # .coveragerc to control coverage.py 3 | [run] 4 | # branch = True 5 | 6 | [report] 7 | # Regexes for lines to exclude from consideration 8 | exclude_lines = 9 | # Have to re-enable the standard pragma 10 | pragma: no cover 11 | pass 12 | 13 | # Don't complain about missing debug-only code: 14 | def __repr__ 15 | if self\.debug 16 | 17 | # Don't complain if tests don't hit defensive assertion code: 18 | raise AssertionError 19 | raise NotImplementedError 20 | 21 | # Don't complain if non-runnable code isn't run: 22 | if 0: 23 | if __name__ == .__main__.: 24 | 25 | ignore_errors = True 26 | 27 | [html] 28 | directory = test/htmlcov 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Other things 2 | *.ipynb 3 | test/test*.csv 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # VSCode 14 | .vscode/ 15 | *.code-workspace 16 | 17 | # Zips 18 | *.zip 19 | 20 | # Spotfire 21 | *.dxp 22 | 23 | # Vim 24 | *.swp 25 | *.swo 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | 
eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | *._py 124 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation 
with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | formats: 18 | - pdf 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.7 23 | system_packages: true 24 | install: 25 | - requirements: docs/requirements.txt 26 | - method: pip 27 | path: . 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: "python" 2 | 3 | python: 4 | - "3.7" 5 | - "3.8" 6 | 7 | install: 8 | - "pip install -U flake8 mypy numpy pandas typing_extensions pytest pytest-cov \"attrs>=19.2.0\" hypothesis coveralls sphinx sphinx_rtd_theme" 9 | - "pip install -U git+https://github.com/numpy/numpy-stubs.git" 10 | - "pip install ." 11 | 12 | script: 13 | - 'flake8 tafra' 14 | - "mypy tafra" 15 | - "pytest" 16 | - "sphinx-build -W -b html docs docs/_build/html" 17 | 18 | notifications: 19 | - email: false 20 | 21 | after_success: 22 | - "coveralls" 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include docs/*.rst 4 | include docs/Makefile 5 | include docs/make.bat 6 | include docs/conf.py 7 | include docs/_static/* 8 | include docs/img/* 9 | include docs/requirements.txt 10 | include test/*.py 11 | include test/ex*.csv 12 | include test/test.bat 13 | include test/test.sh 14 | include .coveragerc 15 | include dev-requirements.txt 16 | include setup.cfg 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Tafra: a minimalist dataframe 3 | ============================= 4 | 5 | .. image:: https://img.shields.io/pypi/v/tafra.svg 6 | :target: https://pypi.org/project/tafra/ 7 | 8 | .. image:: https://travis-ci.org/petbox-dev/tafra.svg?branch=master 9 | :target: https://travis-ci.org/petbox-dev/tafra 10 | 11 | .. image:: https://readthedocs.org/projects/tafra/badge/?version=latest 12 | :target: https://tafra.readthedocs.io/en/latest/?badge=latest 13 | :alt: Documentation Status 14 | 15 | .. 
image:: https://coveralls.io/repos/github/petbox-dev/tafra/badge.svg 16 | :target: https://coveralls.io/github/petbox-dev/tafra 17 | :alt: Coverage Status 18 | 19 | 20 | The ``tafra`` began life as a thought experiment: how could we reduce the idea 21 | of a da\ *tafra*\ me (as expressed in libraries like ``pandas`` or languages 22 | like R) to its useful essence, while carving away the cruft? 23 | The `original proof of concept `_ 24 | stopped at "group by". 25 | 26 | .. `original proof of concept`_ 27 | 28 | This library expands on the proof of concept to produce a practically 29 | useful ``tafra``, which we hope you may find to be a helpful lightweight 30 | substitute for certain uses of ``pandas``. 31 | 32 | A ``tafra`` is, more-or-less, a set of named *columns* or *dimensions*. 33 | Each of these is a typed ``numpy`` array of consistent length, representing 34 | the values for each column by *rows*. 35 | 36 | The library provides lightweight syntax for manipulating rows and columns, 37 | support for managing data types, iterators for rows and sub-frames, 38 | `pandas`-like "transform" support and conversion from `pandas` Dataframes, 39 | and SQL-style "group by" and join operations. 
40 | 41 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 42 | | Tafra | `Tafra `_ | 43 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 44 | | Aggregations | `Union `_, | 45 | | | `GroupBy `_, | 46 | | | `Transform `_, | 47 | | | `IterateBy `_, | 48 | | | `InnerJoin `_, | 49 | | | `LeftJoin `_, | 50 | | | `CrossJoin `_ | 51 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 52 | | Aggregation Helpers | `union `__, | 53 | | | `union_inplace `_, | 54 | | | `group_by `_, | 55 | | | `transform `__, | 56 | | | `iterate_by `_, | 57 | | | `inner_join `_, | 58 | | | `left_join `_, | 59 | | | `cross_join `_ | 60 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 61 | | Constructors | `as_tafra `_, | 62 | | | `from_dataframe `_, | 63 | | | `from_series `_, | 64 | | | `from_records `_ | 65 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 66 | | SQL Readers | `read_sql `_, | 67 | | | `read_sql_chunks `_ | 68 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 69 | | Destructors | `to_records `_, | 70 | | | `to_list `_, | 71 | | | `to_tuple `_, | 72 | | | `to_array `_, | 73 | | | `to_pandas `_ | 74 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 75 | | Properties | `rows `_, | 76 | | | `columns `_, | 77 | | | `data `_, | 78 | | 
| `dtypes `_, | 79 | | | `size `_, | 80 | | | `ndim `_, | 81 | | | `shape `_ | 82 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 83 | | Iter Methods | `iterrows `_, | 84 | | | `itertuples `_, | 85 | | | `itercols `_ | 86 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 87 | | Functional Methods | `row_map `_, | 88 | | | `tuple_map `_, | 89 | | | `col_map `_, | 90 | | | `pipe `_ | 91 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 92 | | Dict-like Methods | `keys `_, | 93 | | | `values `_, | 94 | | | `items `_, | 95 | | | `get `_, | 96 | | | `update `_, | 97 | | | `update_inplace `_, | 98 | | | `update_dtypes `_, | 99 | | | `update_dtypes_inplace `_ | 100 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 101 | | Other Helper Methods | `select `_, | 102 | | | `head `_, | 103 | | | `copy `_, | 104 | | | `rename `_, | 105 | | | `rename_inplace `_, | 106 | | | `coalesce `_, | 107 | | | `coalesce_inplace `_, | 108 | | | `_coalesce_dtypes `_, | 109 | | | `delete `_, | 110 | | | `delete_inplace `_ | 111 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 112 | | Printer Methods | `pprint `_, | 113 | | | `pformat `_, | 114 | | | `to_html `_ | 115 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 116 | | Indexing Methods | `_slice `_, | 117 | | | `_index `_, | 118 | | | `_ndindex `_ | 119 | 
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 120 | 121 | Getting Started 122 | =============== 123 | 124 | Install the library with `pip `_: 125 | 126 | .. code-block:: shell 127 | 128 | pip install tafra 129 | 130 | 131 | A short example 132 | --------------- 133 | 134 | .. code-block:: python 135 | 136 | >>> from tafra import Tafra 137 | 138 | >>> t = Tafra({ 139 | ... 'x': np.array([1, 2, 3, 4]), 140 | ... 'y': np.array(['one', 'two', 'one', 'two'], dtype='object'), 141 | ... }) 142 | 143 | >>> t.pformat() 144 | Tafra(data = { 145 | 'x': array([1, 2, 3, 4]), 146 | 'y': array(['one', 'two', 'one', 'two'])}, 147 | dtypes = { 148 | 'x': 'int', 'y': 'object'}, 149 | rows = 4) 150 | 151 | >>> print('List:', '\n', t.to_list()) 152 | List: 153 | [array([1, 2, 3, 4]), array(['one', 'two', 'one', 'two'], dtype=object)] 154 | 155 | >>> print('Records:', '\n', tuple(t.to_records())) 156 | Records: 157 | ((1, 'one'), (2, 'two'), (3, 'one'), (4, 'two')) 158 | 159 | >>> gb = t.group_by( 160 | ... ['y'], {'x': sum} 161 | ... ) 162 | 163 | >>> print('Group By:', '\n', gb.pformat()) 164 | Group By: 165 | Tafra(data = { 166 | 'x': array([4, 6]), 'y': array(['one', 'two'])}, 167 | dtypes = { 168 | 'x': 'int', 'y': 'object'}, 169 | rows = 2) 170 | 171 | 172 | Flexibility 173 | ----------- 174 | 175 | Have some code that works with ``pandas``, or just a way of doing things 176 | that you prefer? ``tafra`` is flexible: 177 | 178 | .. code-block:: python 179 | 180 | >>> df = pd.DataFrame(np.c_[ 181 | ... np.array([1, 2, 3, 4]), 182 | ... np.array(['one', 'two', 'one', 'two']) 183 | ... ], columns=['x', 'y']) 184 | 185 | >>> t = Tafra.from_dataframe(df) 186 | 187 | 188 | And going back is just as simple: 189 | 190 | .. 
code-block:: python 191 | 192 | >>> df = pd.DataFrame(t.data) 193 | 194 | 195 | Timings 196 | ======= 197 | 198 | In this case, lightweight also means performant. Beyond any additional 199 | features added to the library, ``tafra`` should provide the necessary 200 | base for organizing data structures for numerical processing. One of the 201 | most important aspects is fast access to the data itself. By minimizing 202 | abstraction to access the underlying ``numpy`` arrays, ``tafra`` provides 203 | an order of magnitude increase in performance. 204 | 205 | - **Important note** If you assign directly to the ``Tafra.data`` or 206 | ``Tafra._data`` attributes, you *must* call ``Tafra._coalesce_dtypes`` 207 | afterwards in order to ensure the typing is consistent. 208 | 209 | Construct a ``Tafra`` and a ``DataFrame``: 210 | 211 | .. code-block:: python 212 | 213 | >>> tf = Tafra({ 214 | ... 'x': np.array([1., 2., 3., 4., 5., 6.]), 215 | ... 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 216 | ... 'z': np.array([0, 0, 0, 1, 1, 1]) 217 | ... }) 218 | 219 | >>> df = pd.DataFrame(t.data) 220 | 221 | Read Operations 222 | --------------- 223 | 224 | Direct access: 225 | 226 | .. code-block:: python 227 | 228 | >>> %timeit x = t._data['x'] 229 | 55.3 ns ± 5.64 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each) 230 | 231 | 232 | Indirect with some penalty to support ``Tafra`` slicing and ``numpy``'s 233 | advanced indexing: 234 | 235 | .. code-block:: python 236 | 237 | >>> %timeit x = t['x'] 238 | 219 ns ± 71.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) 239 | 240 | 241 | ``pandas`` timing: 242 | 243 | .. code-block:: python 244 | 245 | >>> %timeit x = df['x'] 246 | 1.55 µs ± 105 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) 247 | 248 | 249 | This is the fastest method for accessing the numpy array among alternatives of 250 | ``df.values()``, ``df.to_numpy()``, and ``df.loc[]``. 
251 | 252 | 253 | Assignment Operations 254 | --------------------- 255 | 256 | Direct access is not recommended as it avoids the validation steps, but it 257 | does provide fast access to the data attribute: 258 | 259 | .. code-block:: python 260 | 261 | >>> x = np.arange(6) 262 | 263 | >>> %timeit tf._data['x'] = x 264 | 65 ns ± 5.55 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each) 265 | 266 | 267 | Indirect access has a performance penalty due to the validation checks to 268 | ensure consistency of the ``tafra``: 269 | 270 | .. code-block:: python 271 | 272 | >>> %timeit tf['x'] = x 273 | 7.39 µs ± 950 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) 274 | 275 | Even so, there is considerable performance improvement over ``pandas``. 276 | 277 | ``pandas`` timing: 278 | 279 | .. code-block:: python 280 | 281 | >>> %timeit df['x'] = x 282 | 47.8 µs ± 3.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 283 | 284 | 285 | Grouping Operations 286 | ------------------- 287 | 288 | ``tafra`` also excels at aggregation methods, the primary of which are a 289 | SQL-like ``GROUP BY`` and the split-apply-combine equivalent to a SQL-like 290 | ``GROUP BY`` followed by a ``LEFT JOIN`` back to the original table. 291 | 292 | .. code-block:: python 293 | 294 | >>> %timeit tf.group_by(['y', 'z'], {'x': sum}) 295 | 138 µs ± 4.03 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 296 | 297 | >>> %timeit tf.transform(['y', 'z'], {'sum_x': (sum, 'x')}) 298 | 161 µs ± 2.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 299 | 300 | The equivalent ``pandas`` functions are given below. They require a chain 301 | of several object methods to perform the same role, and the transform requires 302 | a copy operation and assignment into the copied ``DataFrame`` in order to 303 | preserve immutability. 304 | 305 | .. 
code-block:: python 306 | 307 | >>> %timeit df.groupby(['y','z']).agg({'x': 'sum'}).reset_index() 308 | 2.5 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 309 | 310 | >>> %%timeit 311 | ... tdf = df.copy() 312 | ... tdf['x'] = df.groupby(['y', 'z'])[['x']].transform(sum) 313 | 2.81 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 314 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | attrs>=19.3.0 2 | coverage>=5.1 3 | coveralls>=2.0.0 4 | flake8>=3.8.2 5 | hypothesis>=5.16.0 6 | mypy>=0.770 7 | numpy>=1.18.4 8 | numpy-stubs>=0.0.1 9 | pytest>=5.4.2 10 | pytest-cov>=2.9.0 11 | Sphinx>=3.0.4 12 | sphinx-rtd-theme>=0.4.3 13 | typing_extensions>=3.7.4.1 14 | wheel>=0.34.2 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/theme_override.css: -------------------------------------------------------------------------------- 1 | /* https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html */ 2 | /* override table width restrictions */ 3 | @media screen and (min-width: 767px) { 4 | 5 | .wy-table-responsive table td { 6 | /* !important prevents the common CSS stylesheets from overriding 7 | this as on RTD they are loaded after this stylesheet */ 8 | white-space: normal !important; 9 | } 10 | 11 | .wy-table-responsive { 12 | overflow: visible !important; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | Summary 6 | ======= 7 | 8 | Tafra 9 | ----- 10 | 11 | .. currentmodule:: tafra.base 12 | 13 | .. autosummary:: 14 | 15 | Tafra 16 | 17 | 18 | Aggregations 19 | ------------ 20 | 21 | .. currentmodule:: tafra.group 22 | 23 | .. autosummary:: 24 | 25 | Union 26 | GroupBy 27 | Transform 28 | IterateBy 29 | InnerJoin 30 | LeftJoin 31 | CrossJoin 32 | 33 | 34 | Methods 35 | ------- 36 | 37 | .. currentmodule:: tafra.base.Tafra 38 | 39 | .. 
autosummary:: 40 | 41 | from_records 42 | from_dataframe 43 | from_series 44 | read_sql 45 | read_sql_chunks 46 | read_csv 47 | as_tafra 48 | to_records 49 | to_list 50 | to_tuple 51 | to_array 52 | to_pandas 53 | to_csv 54 | rows 55 | columns 56 | data 57 | dtypes 58 | size 59 | ndim 60 | shape 61 | head 62 | keys 63 | values 64 | items 65 | get 66 | iterrows 67 | itertuples 68 | itercols 69 | row_map 70 | tuple_map 71 | col_map 72 | key_map 73 | pipe 74 | select 75 | copy 76 | update 77 | update_inplace 78 | update_dtypes 79 | update_dtypes_inplace 80 | parse_object_dtypes 81 | parse_object_dtypes_inplace 82 | rename 83 | rename_inplace 84 | coalesce 85 | coalesce_inplace 86 | _coalesce_dtypes 87 | delete 88 | delete_inplace 89 | pprint 90 | pformat 91 | to_html 92 | _slice 93 | _iindex 94 | _aindex 95 | _ndindex 96 | 97 | 98 | Helper Methods 99 | -------------- 100 | 101 | .. currentmodule:: tafra.base.Tafra 102 | 103 | .. autosummary:: 104 | 105 | union 106 | union_inplace 107 | group_by 108 | transform 109 | iterate_by 110 | inner_join 111 | left_join 112 | cross_join 113 | 114 | 115 | Object Formatter 116 | ---------------- 117 | 118 | .. currentmodule:: tafra.formatter 119 | 120 | .. autosummary:: 121 | 122 | ObjectFormatter 123 | 124 | 125 | Detailed Reference 126 | ================== 127 | 128 | 129 | Tafra 130 | ----- 131 | 132 | .. currentmodule:: tafra.base 133 | 134 | 135 | Methods 136 | ~~~~~~~ 137 | 138 | .. autoclass:: Tafra 139 | 140 | .. automethod:: from_dataframe 141 | .. automethod:: from_series 142 | .. automethod:: from_records 143 | .. automethod:: read_sql 144 | .. automethod:: read_sql_chunks 145 | .. automethod:: read_csv 146 | .. automethod:: as_tafra 147 | .. automethod:: to_records 148 | .. automethod:: to_list 149 | .. automethod:: to_tuple 150 | .. automethod:: to_array 151 | .. automethod:: to_pandas 152 | .. automethod:: to_csv 153 | .. autoattribute:: rows 154 | .. autoattribute:: columns 155 | .. autoattribute:: data 156 | .. 
autoattribute:: dtypes 157 | .. autoattribute:: size 158 | .. autoattribute:: ndim 159 | .. autoattribute:: shape 160 | .. automethod:: head 161 | .. automethod:: keys 162 | .. automethod:: values 163 | .. automethod:: items 164 | .. automethod:: get 165 | .. automethod:: iterrows 166 | .. automethod:: itertuples 167 | .. automethod:: itercols 168 | .. automethod:: row_map 169 | .. automethod:: tuple_map 170 | .. automethod:: col_map 171 | .. automethod:: key_map 172 | .. automethod:: pipe 173 | .. automethod:: __rshift__ 174 | .. automethod:: select 175 | .. automethod:: copy 176 | .. automethod:: update 177 | .. automethod:: update_inplace 178 | .. automethod:: update_dtypes 179 | .. automethod:: update_dtypes_inplace 180 | .. automethod:: parse_object_dtypes 181 | .. automethod:: parse_object_dtypes_inplace 182 | .. automethod:: rename 183 | .. automethod:: rename_inplace 184 | .. automethod:: coalesce 185 | .. automethod:: coalesce_inplace 186 | .. automethod:: _coalesce_dtypes 187 | .. automethod:: delete 188 | .. automethod:: delete_inplace 189 | .. automethod:: pprint 190 | .. automethod:: pformat 191 | .. automethod:: to_html 192 | .. automethod:: _slice 193 | .. automethod:: _iindex 194 | .. automethod:: _aindex 195 | .. automethod:: _ndindex 196 | 197 | 198 | Helper Methods 199 | ~~~~~~~~~~~~~~ 200 | 201 | .. class:: Tafra 202 | :noindex: 203 | 204 | .. automethod:: union 205 | .. automethod:: union_inplace 206 | .. automethod:: group_by 207 | .. automethod:: transform 208 | .. automethod:: iterate_by 209 | .. automethod:: inner_join 210 | .. automethod:: left_join 211 | .. automethod:: cross_join 212 | 213 | 214 | Aggregations 215 | ------------ 216 | 217 | .. currentmodule:: tafra.group 218 | 219 | .. autoclass:: Union 220 | 221 | .. automethod:: apply 222 | .. automethod:: apply_inplace 223 | 224 | .. autoclass:: GroupBy 225 | 226 | .. automethod:: apply 227 | 228 | .. autoclass:: Transform 229 | 230 | .. automethod:: apply 231 | 232 | .. 
autoclass:: IterateBy 233 | 234 | .. automethod:: apply 235 | 236 | .. autoclass:: InnerJoin 237 | 238 | .. automethod:: apply 239 | 240 | .. autoclass:: LeftJoin 241 | 242 | .. automethod:: apply 243 | 244 | .. autoclass:: CrossJoin 245 | 246 | .. automethod:: apply 247 | 248 | 249 | Object Formatter 250 | ---------------- 251 | 252 | .. currentmodule:: tafra.formatter 253 | 254 | .. autoclass:: ObjectFormatter 255 | 256 | .. automethod:: __getitem__ 257 | .. automethod:: __setitem__ 258 | .. automethod:: __delitem__ 259 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath('..')) 17 | import tafra 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'tafra' 23 | copyright = '2020, David S. Fulford' 24 | author = 'David S. Fulford' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = tafra.__version__ 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.viewcode', 38 | 'sphinx.ext.napoleon', 39 | 'sphinx.ext.coverage', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix of source filenames. 46 | source_suffix = '.rst' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # List of patterns, relative to source directory, that match files and 52 | # directories to ignore when looking for source files. 53 | # This pattern also affects html_static_path and html_extra_path. 54 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 55 | 56 | 57 | # -- Options for HTML output ------------------------------------------------- 58 | 59 | # The theme to use for HTML and HTML Help pages. See the documentation for 60 | # a list of builtin themes. 61 | # 62 | html_theme = 'sphinx_rtd_theme' 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | 69 | html_context = { 70 | # https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html 71 | 'css_files': ['_static/theme_override.css'], 72 | } 73 | -------------------------------------------------------------------------------- /docs/genindex.rst: -------------------------------------------------------------------------------- 1 | Index 2 | ===== 3 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | Contents 4 | ======== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | README 10 | api 11 | numerical 12 | 13 | .. 
toctree:: 14 | :maxdepth: 1 15 | 16 | testing 17 | versions 18 | genindex 19 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/numerical.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Numerical Performance 3 | ===================== 4 | 5 | Summary 6 | ======= 7 | 8 | One of the goals of ``tafra`` is to provide a fast-as-possible data structure 9 | for numerical computing. To achieve this, all function returns are written 10 | as `generator expressions `_ wherever 11 | possible. 12 | 13 | Additionally, because the :attr:`data` contains values of ndarrays, the 14 | ``map`` functions may also take functions that operate on ndarrays. This means 15 | that they are able to take `numba `_ ``@jit``'ed 16 | functions as well. 
17 | 18 | ``pandas`` is essentially a standard package for anyone performing data science 19 | with Python, and it provides a wide variety of useful features. However, it's 20 | not particularly aimed at maximizing performance. Let's use an example of a 21 | dataframe of function arguments, and a function that maps scalar arguments into 22 | a vector result. Any function of time serves this purpose, so let's use a 23 | hyperbolic function. 24 | 25 | First, let's randomly generate some function arguments and construct both a 26 | ``Tafra`` and a ``pandas.DataFrame``: 27 | 28 | .. code-block:: python 29 | 30 | >>> from tafra import Tafra 31 | >>> import pandas as pd 32 | >>> import numpy as np 33 | 34 | >>> from typing import Tuple, Union, Any 35 | 36 | >>> tf = Tafra({ 37 | ... 'wellid': np.arange(0, 100), 38 | ... 'qi': np.random.lognormal(np.log(2000.), np.log(3000. / 1000.) / (2 * 1.28), 100), 39 | ... 'Di': np.random.uniform(.5, .9, 100), 40 | ... 'bi': np.random.normal(1.0, .2, 100) 41 | ... }) 42 | 43 | >>> df = pd.DataFrame(tf.data) 44 | 45 | >>> tf.head(5) 46 | 47 | ====== ====== ======= ======= ======= 48 | index wellid qi Di bi 49 | ====== ====== ======= ======= ======= 50 | dtype int32 float64 float64 float64 51 | 0 0 2665.82 0.54095 1.07538 52 | 1 1 1245.85 0.81711 0.48448 53 | 2 2 1306.56 0.61570 0.54587 54 | 3 3 2950.33 0.81956 0.66440 55 | 4 4 1963.93 0.56918 0.74165 56 | ====== ====== ======= ======= ======= 57 | 58 | 59 | Next, we define our hyperbolic function and the time array to evaluate: 60 | 61 | .. code-block:: python 62 | 63 | >>> import math 64 | 65 | >>> def tan_to_nominal(D: float) -> float: 66 | ... return -math.log1p(-D) 67 | 68 | >>> def sec_to_nominal(D: float, b: float) -> float: 69 | ... if b <= 1e-4: 70 | ... return tan_to_nominal(D) 71 | ... 72 | ... return ((1.0 - D) ** -b - 1.0) / b 73 | 74 | >>> def hyp(qi: float, Di: float, bi: float, t: np.ndarray) -> np.ndarray: 75 | ... Dn = sec_to_nominal(Di, bi) 76 | ... 77 | ... 
if bi <= 1e-4: 78 | ... return qi * np.exp(-Dn * t) 79 | ... 80 | ... return qi / (1.0 + Dn * bi * t) ** (1.0 / bi) 81 | 82 | >>> t = 10 ** np.linspace(0, 4, 101) 83 | 84 | 85 | And let's build a generic ``mapper`` function to map over the named columns: 86 | 87 | .. code-block:: python 88 | 89 | >>> def mapper(tf: Union[Tafra, pd.DataFrame]) -> Tuple[int, np.ndarray]: 90 | ... return tf['wellid'], hyp(tf['qi'], tf['Di'], tf['bi'], t) 91 | 92 | 93 | We can call this with the following style. The ``pandas`` syntax is a bit 94 | verbose, but :meth:`pandas.DataFrame.from_items()` is deprecated in newer 95 | versions, so this is the recommended way. Let's time each approach: 96 | 97 | .. code-block:: python 98 | 99 | >>> %timeit tdcs = Tafra(tf.row_map(mapper)) 100 | 3.38 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 101 | 102 | 103 | >>> %timeit pdcs = pd.DataFrame(dict(df.apply(mapper, axis=1).to_list())) 104 | 6.86 ms ± 408 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 105 | 106 | 107 | We see ``Tafra`` is about twice as fast. Mapping a function this way is 108 | convenient, but there is some indirection occurring that we can do away with to 109 | obtain direct access to the data of the ``Tafra``, and there is a faster 110 | method for ``pandas`` as well as opposed to :meth:`pandas.DataFrame.apply`. 111 | Instead of constructing a new ``Tafra`` or ``pd.DataFrame`` for each row, we 112 | can instead return a :class:`NamedTuple`, which is faster to construct. Doing so: 113 | 114 | .. code-block:: python 115 | 116 | >>> def tuple_mapper(tf: Tuple[Any, ...]) -> Tuple[int, np.ndarray]: 117 | ... return tf.wellid, hyp(tf.qi, tf.Di, tf.bi, t) 118 | 119 | >>> %timeit Tafra(tf.tuple_map(tuple_mapper)) 120 | 1.68 ms ± 84.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 121 | 122 | >>> %timeit pd.DataFrame(dict((tuple_mapper(row)) for row in df.itertuples())) 123 | 3.14 ms ± 121 µs per loop (mean ± std. dev.
of 7 runs, 100 loops each) 124 | 125 | 126 | And once again, ``Tafra`` is about twice as fast. 127 | 128 | One of the upcoming features of ``pandas`` is the ability to apply ``numba`` 129 | ``@jit``'ed functions to :meth:`pandas.DataFrame.apply`. The performance 130 | improvement should be significant, especially for long-running functions, 131 | but there will still be overhead in the abstraction before the function is 132 | called. We can demonstrate this by ``@jit``'ing our hyperbolic function and 133 | mapping it over the dataframes, and get an idea of how much improvement is 134 | possible: 135 | 136 | .. code-block:: python 137 | 138 | >>> from numba import jit 139 | >>> jit_kw = {'fastmath': True} 140 | 141 | >>> @jit(**jit_kw) 142 | ... def tan_to_nominal(D: float) -> float: 143 | ... return -math.log1p(-D) 144 | 145 | >>> @jit(**jit_kw) 146 | ... def sec_to_nominal(D: float, b: float) -> float: 147 | ... if b <= 1e-4: 148 | ... return tan_to_nominal(D) 149 | ... 150 | ... return ((1.0 - D) ** -b - 1.0) / b 151 | 152 | >>> @jit(**jit_kw) 153 | ... def hyp(qi: float, Di: float, bi: float, t: np.ndarray) -> np.ndarray: 154 | ... Dn = sec_to_nominal(Di, bi) 155 | ... 156 | ... if bi <= 1e-4: 157 | ... return qi * np.exp(-Dn * t) 158 | ... 159 | ... return qi / (1.0 + Dn * bi * t) ** (1.0 / bi) 160 | 161 | >>> %timeit Tafra(tf.tuple_map(tuple_mapper)) 162 | 884 µs ± 41.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 163 | 164 | >>> %timeit pd.DataFrame(dict((tuple_mapper(row)) for row in df.itertuples())) 165 | 3.09 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 166 | 167 | 168 | Interestingly, we see that ``pandas`` does not get much benefit from this, as 169 | the limit occurs not in the performance of the functions but in the performance 170 | of ``pandas`` itself. We can validate this by skipping the dataframe 171 | construction step: 172 | 173 | .. 
code-block:: python 174 | 175 | >>> %timeit [tf.tuple_map(tuple_mapper)] 176 | 81.9 µs ± 2.91 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 177 | 178 | >>> %timeit [(tuple_mapper(row)) for row in df.itertuples()] 179 | 614 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 180 | 181 | 182 | Last, we might ask the question "If ``pandas`` is incurring some performance 183 | penalty, what is the performance penalty of ``Tafra``?" We'll write a function 184 | that operates on the :class:`numpy.ndarray`\s themselves rather than using the 185 | helper :meth:`Tafra.tuple_map()`. We can also use ``numpy``'s built in apply 186 | function, :meth:`numpy.apply_along_axis()`, but it is considerably slower than 187 | a ``@jit``'ed function. 188 | 189 | .. code-block:: python 190 | 191 | >>> @jit(**jit_kw) 192 | ... def ndarray_map(qi: np.ndarray, Di: np.ndarray, bi: np.ndarray, t: np.ndarray) -> np.ndarray: 193 | ... out = np.zeros((qi.shape[0], t.shape[0])) 194 | ... for i in range(qi.shape[0]): 195 | ... out[i, :] = hyp(qi[i], Di[i], bi[i], t) 196 | ... 197 | ... return out 198 | 81.2 µs ± 9.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each) 199 | 200 | 201 | And the timing is negligible, meaning ``Tafra``'s :meth:`Tafra.tuple_map()` is 202 | essentially as performant as we are able to achieve in Python.
203 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.0 2 | scipy>=1.4.0 3 | Sphinx>=3.0.0 4 | sphinx-rtd-theme>=0.4.0 5 | typing_extensions>=3.7.4.1 6 | -------------------------------------------------------------------------------- /docs/testing.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Testing 3 | ======= 4 | 5 | Testing is set to evaluate: 6 | 7 | - style with `flake8 `_, 8 | - typing with `mypy `_, 9 | - valid function return values and behaviors with `hypothesis `_, and 10 | - test coverage using `coverage `_. 11 | 12 | Windows 13 | ------- 14 | 15 | Run ``test.bat`` in the ``test`` directory. 16 | 17 | 18 | Linux 19 | ----- 20 | 21 | Run ``test.sh`` in the ``test`` directory. 22 | -------------------------------------------------------------------------------- /docs/versions.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Version History 3 | =============== 4 | 5 | .. automodule:: tafra 6 | :noindex: 7 | 8 | 1.0.10 9 | ------ 10 | 11 | * Add ``pipe`` and overload ``>>`` operator for Tafra objects 12 | 13 | 1.0.9 14 | ----- 15 | 16 | * Add test files to build 17 | 18 | 1.0.8 19 | ----- 20 | 21 | * Check rows in constructor to ensure equal data length 22 | 23 | 1.0.7 24 | ----- 25 | 26 | * Handle missing or NULL values in ``read_csv()``. 27 | * Cast empty elements to None when updating dtypes to avoid failure of ``np.astype()``. 28 | * Update some typing, minor refactoring for performance 29 | 30 | 31 | 1.0.6 32 | ----- 33 | 34 | * Additional validations in constructor, primary to evaluate Iterables of values 35 | * Split ``col_map`` to ``col_map`` and ``key_map`` as the original function's return signature depending upon an argument. 
36 | * Fix some documentation typos 37 | 38 | 39 | 1.0.5 40 | ----- 41 | 42 | * Add ``tuple_map`` method 43 | * Refactor all iterators and ``..._map`` functions to improve performance 44 | * Unpack ``np.ndarray`` if given as keys to constructor 45 | * Add ``validate=False`` in ``__post_init__`` if inputs are **known** to be valid to improve performance 46 | 47 | 48 | 1.0.4 49 | ----- 50 | 51 | * Add ``read_csv``, ``to_csv`` 52 | * Various refactoring and improvement in data validation 53 | * Add ``typing_extensions`` to dependencies 54 | * Change method of ``dtype`` storage, extract ``str`` representation from ``np.dtype()`` 55 | 56 | 57 | 1.0.3 58 | ----- 59 | 60 | * Add ``read_sql`` and ``read_sql_chunks`` 61 | * Add ``to_tuple`` and ``to_pandas`` 62 | * Cleanup constructor data validation 63 | 64 | 65 | 1.0.2 66 | ----- 67 | 68 | * Add object_formatter to expose user formatting for dtype=object 69 | * Improvements to indexing and slicing 70 | 71 | 72 | 1.0.1 73 | ----- 74 | 75 | * Add iter functions 76 | * Add map functions 77 | * Various constructor improvements 78 | 79 | 80 | 1.0.0 81 | ----- 82 | 83 | * Initial Release 84 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = 4 | F401, 5 | F841, 6 | E116, 7 | E251, 8 | E261, 9 | E265, 10 | E266, 11 | E302, 12 | E305, 13 | E402, 14 | E722, 15 | E741, 16 | W503, 17 | W605 18 | exclude = 19 | .git, 20 | __pycache__, 21 | docs/conf.py, 22 | docs/source/conf.py, 23 | old, 24 | build, 25 | dist 26 | max-complexity = 20 27 | # output-file = src\test\flake8_run.txt 28 | 29 | [mypy] 30 | check_untyped_defs = true 31 | disallow_any_generics = true 32 | disallow_incomplete_defs = true 33 | disallow_subclassing_any = true 34 | disallow_untyped_calls = true 35 | disallow_untyped_decorators = true 36 | disallow_untyped_defs = true 37 | # 
ignore_missing_imports = true 38 | no_implicit_optional = true 39 | show_error_codes = true 40 | strict_equality = true 41 | warn_redundant_casts = true 42 | # warn_return_any = true 43 | warn_unreachable = true 44 | warn_unused_configs = true 45 | warn_unused_ignores = true 46 | 47 | [tool:pytest] 48 | addopts = --cov=tafra --cov-report=term-missing --hypothesis-show-statistics -v test 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | 16 | import os 17 | import sys 18 | import re 19 | 20 | try: 21 | from setuptools import setup 22 | except ImportError: 23 | from distutils.core import setup 24 | 25 | 26 | def find_version() -> str: 27 | v = {} 28 | with open('tafra/version.py', 'r') as f: 29 | exec(f.read(), globals(), v) 30 | 31 | return v['__version__'] 32 | 33 | 34 | def get_long_description() -> str: 35 | # Fix display issues on PyPI caused by RST markup 36 | with open('README.rst', 'r') as f: 37 | readme = f.read() 38 | 39 | replacements = [ 40 | '.. 
automodule:: tafra', 41 | ':noindex:', 42 | ] 43 | 44 | subs = [ 45 | r':func:`([a-zA-Z0-9._]+)`', 46 | r':meth:`([a-zA-Z0-9._]+)`', 47 | ] 48 | 49 | def replace(s: str) -> str: 50 | for r in replacements: 51 | s = s.replace(r, '') 52 | return s 53 | 54 | lines = [] 55 | with open('docs/versions.rst', 'r') as f: 56 | iter_f = iter(f) 57 | _ = next(f) 58 | for line in f: 59 | if any(r in line for r in replacements): 60 | continue 61 | lines.append(line) 62 | 63 | version_history = ''.join(lines) 64 | for sub in subs: 65 | version_history = re.sub(sub, r'\1', version_history) 66 | 67 | return readme + '\n\n' + version_history 68 | 69 | 70 | version = find_version() 71 | 72 | if sys.argv[-1] == 'build': 73 | print(f'\nBuilding version {version}...\n') 74 | os.system('rm -r dist\\') # clean out dist/ 75 | os.system('python setup.py sdist bdist_wheel') 76 | sys.exit() 77 | 78 | 79 | setup( 80 | name='tafra', 81 | version=version, 82 | description='Tafra: innards of a dataframe', 83 | long_description=get_long_description(), 84 | long_description_content_type="text/x-rst", 85 | url='https://github.com/petbox-dev/tafra', 86 | author='David S. 
Fulford', 87 | author_email='petbox.dev@gmail.com', 88 | install_requires=['numpy>=1.17', 'typing_extensions'], 89 | zip_safe=False, 90 | packages=['tafra'], 91 | package_data={ 92 | 'tafra': ['py.typed'] 93 | }, 94 | python_requires='>=3.7', 95 | classifiers=[ 96 | 'Development Status :: 5 - Production/Stable', 97 | 'Intended Audience :: Science/Research', 98 | 'Intended Audience :: Education', 99 | 'Intended Audience :: Developers', 100 | 'Natural Language :: English', 101 | 'License :: OSI Approved :: MIT License', 102 | 'Programming Language :: Python :: 3.7', 103 | 'Programming Language :: Python :: 3.8', 104 | 'Programming Language :: Python :: Implementation :: CPython', 105 | 'Topic :: Scientific/Engineering', 106 | 'Topic :: Scientific/Engineering :: Mathematics', 107 | 'Topic :: Software Development :: Libraries', 108 | 'Typing :: Typed' 109 | ], 110 | keywords=[ 111 | 'tafra', 'dataframe', 'sql', 'group-by', 'aggregation', 112 | 'performance', 'minimalist' 113 | ], 114 | ) 115 | -------------------------------------------------------------------------------- /tafra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | 16 | from .version import __version__ 17 | 18 | from .base import Tafra, object_formatter 19 | from .group import GroupBy, Transform, IterateBy, InnerJoin, LeftJoin 20 | 21 | read_sql = Tafra.read_sql 22 | read_sql_chunks = Tafra.read_sql_chunks 23 | read_csv = Tafra.read_csv 24 | as_tafra = Tafra.as_tafra 25 | -------------------------------------------------------------------------------- /tafra/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | __all__ = ['Tafra'] 16 | 17 | from pathlib import Path 18 | import re 19 | import warnings 20 | import csv 21 | import pprint as pprint 22 | from datetime import date, datetime 23 | from itertools import chain, islice 24 | from collections import namedtuple 25 | import dataclasses as dc 26 | 27 | import numpy as np 28 | from .protocol import Series, DataFrame, Cursor # just for mypy... 
29 | 30 | from typing import (Any, Callable, Dict, Mapping, List, Tuple, Optional, Union as _Union, Sequence, 31 | Sized, Iterable, Iterator, Type, KeysView, ValuesView, ItemsView, 32 | IO) 33 | from typing_extensions import Concatenate, ParamSpec 34 | from typing import cast 35 | from io import TextIOWrapper 36 | 37 | from .formatter import ObjectFormatter 38 | from .csvreader import CSVReader 39 | 40 | 41 | P = ParamSpec('P') 42 | 43 | 44 | # default object formats 45 | object_formatter = ObjectFormatter() 46 | object_formatter['Decimal'] = lambda x: x.astype(float) 47 | 48 | 49 | NAMEDTUPLE_TYPE: Dict[str, Type[Any]] = { 50 | 'int': int, 51 | 'float': float, 52 | 'bool': bool, 53 | 'str': str, 54 | 'date': date, 55 | 'datetime': datetime, 56 | 'object': str, 57 | } 58 | 59 | RECORD_TYPE: Dict[str, Callable[[Any], Any]] = { 60 | 'int': int, 61 | 'float': float, 62 | 'bool': bool, 63 | 'str': str, 64 | 'date': lambda x: x.isoformat(), 65 | 'datetime': lambda x: x.isoformat(), 66 | 'object': str, 67 | } 68 | 69 | 70 | Scalar = _Union[str, int, float, bool] 71 | _Mapping = _Union[ 72 | Mapping[str, Any], 73 | Mapping[int, Any], 74 | Mapping[float, Any], 75 | Mapping[bool, Any], 76 | ] 77 | _Element = _Union[Tuple[_Union[str, int, float, np.ndarray], Any], List[Any], _Mapping] 78 | InitVar = _Union[ 79 | Tuple[str, Any], 80 | _Mapping, 81 | Sequence[_Element], 82 | Iterable[_Element], 83 | Iterator[_Element], 84 | enumerate 85 | ] 86 | 87 | 88 | @dc.dataclass(repr=False, eq=False) 89 | class Tafra: 90 | """ 91 | A minimalist dataframe. 92 | 93 | Constructs a :class:`Tafra` from :class:`dict` of data and (optionally) 94 | dtypes. Types on parameters are the types of the constructed :class:`Tafra`, 95 | but attempts are made to parse anything that "looks" like the correct data 96 | structure, including :class:`Iterable`, :class:`Iterator`, :class:`Sequence`, 97 | and :class:`Mapping` and various combinations. 
98 | 99 | Parameters are given as an ``InitVar``, defined as: 100 | 101 | ``InitVar = Union[Tuple[str, Any], _Mapping, Sequence[_Element], Iterable[_Element],`` 102 | ``Iterator[_Element], enumerate]`` 103 | 104 | ``_Mapping = Union[Mapping[str, Any], Mapping[int, Any], Mapping[float, Any],`` 105 | ``Mapping[bool, Any]`` 106 | 107 | ``_Element = Union[Tuple[Union[str, int, float, np.ndarray], Any], List[Any], Mapping]`` 108 | 109 | Parameters 110 | ---------- 111 | data: InitVar 112 | The data of the Tafra. 113 | 114 | dtypes: InitVar 115 | The dtypes of the columns. 116 | 117 | validate: bool = True 118 | Run validation checks of the data. False will improve performance, but `data` and `dtypes` 119 | will not be validated for conformance to expected data structures. 120 | 121 | check_rows: bool = True 122 | Run row count checks. False will allow columns of differing lengths, which may break several 123 | methods. 124 | 125 | Returns 126 | ------- 127 | tafra: Tafra 128 | The constructed :class:`Tafra`. 129 | 130 | """ 131 | data: dc.InitVar[InitVar] 132 | dtypes: dc.InitVar[Optional[InitVar]] = None 133 | validate: dc.InitVar[bool] = True 134 | check_rows: bool = True 135 | 136 | _data: Dict[str, np.ndarray] = dc.field(init=False) 137 | _dtypes: Dict[str, str] = dc.field(init=False) 138 | 139 | def __post_init__(self, data: InitVar, dtypes: Optional[InitVar], validate: bool) -> None: 140 | # TODO: enable this? 
141 | # if isinstance(self._data, DataFrame): 142 | # tf = self.from_dataframe(df=self._data) 143 | # self._data = tf._data 144 | # self._dtypes = tf._dtypes 145 | # self._rows = tf._rows 146 | # return 147 | 148 | rows: Optional[int] = None 149 | 150 | if validate: 151 | # check that the structure is actually a dict 152 | self._data = self._check_initvar(data) 153 | if dtypes is None or isinstance(dtypes, property): 154 | self._dtypes = {} 155 | else: 156 | self._dtypes = cast(Dict[str, str], self._check_initvar(dtypes)) 157 | 158 | # check that the values are properly formed np.ndarray 159 | for column, value in self._data.items(): 160 | self._ensure_valid(column, value, check_rows=False) 161 | 162 | n_rows = len(self._data[column]) 163 | if rows is None: 164 | rows = n_rows 165 | 166 | if self.check_rows and rows != n_rows: 167 | raise ValueError('`Tafra` must have consistent row counts.') 168 | elif rows < n_rows: # pragma: no cover 169 | rows = n_rows 170 | 171 | if rows is None: 172 | raise ValueError('No data provided in constructor statement.') 173 | 174 | self.update_dtypes_inplace(self._dtypes) 175 | # must coalesce all dtypes immediately, other functions assume a 176 | # proper structure of the Tafra 177 | self._coalesce_dtypes() 178 | 179 | else: 180 | self._data = cast(Dict[str, np.ndarray], data) 181 | if dtypes is None or isinstance(dtypes, property): 182 | self._dtypes = {} 183 | self._coalesce_dtypes() 184 | else: 185 | self._dtypes = cast(Dict[str, str], dtypes) 186 | 187 | self._update_rows() 188 | 189 | def _check_initvar(self, values: InitVar) -> Dict[str, Any]: 190 | """ 191 | Pre-process an :class:`InitVar` into a :class:`Dict`. 
192 | """ 193 | _values: Dict[Any, Any] 194 | 195 | if isinstance(values, (Mapping, dict)): 196 | _values = cast(Dict[str, Any], values) 197 | 198 | elif isinstance(values, Sequence): 199 | _values = self._parse_sequence(values) 200 | 201 | elif isinstance(values, (Iterator, enumerate)): 202 | _values = self._parse_iterator(cast(Iterator[_Element], values)) 203 | 204 | elif isinstance(values, Iterable): 205 | _values = self._parse_iterable(cast(Iterable[_Element], values)) 206 | 207 | else: 208 | # last ditch attempt 209 | _values = cast(Dict[Any, Any], values) 210 | 211 | if not isinstance(_values, Dict): 212 | raise TypeError('Must contain `Dict`, `Mapping`, `Sequence`, Iterable, or Iterator, ' 213 | f'got `{type(_values)}`') 214 | 215 | # cast all keys to strings if they are not 216 | # must copy first as mutating the dict changes next(iterator) 217 | columns = [c for c in _values.keys() if not isinstance(c, str)] 218 | for column in columns: 219 | _values[str(column)] = _values.pop(column) 220 | 221 | return _values 222 | 223 | def _parse_sequence(self, values: Sequence[_Element]) -> Dict[Any, Any]: 224 | """ 225 | Pre-Process a :class:`Sequence` :class:`InitVar` into a :class:`Dict`. 226 | """ 227 | head = values[0] 228 | if isinstance(head, Dict): 229 | for _dict in values: 230 | head.update(cast(Dict[Any, Any], _dict)) 231 | _values = head 232 | 233 | # maybe a Sequence of 2-tuples or 2-lists? Cast and try it. 234 | elif isinstance(head, Sequence) and len(head) == 2: 235 | # is the key an ndarray? 
turn it into a scalar 236 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 237 | # mypy doesn't get that we've checked the head of values as an ndarray 238 | _values = {key.item(): value for key, value in 239 | cast(Iterable[Tuple[np.ndarray, Any]], values)} 240 | else: 241 | _values = dict(cast(Iterable[Tuple[Any, Any]], values)) 242 | 243 | else: 244 | raise TypeError('Sequence must contain `Dict`, `Mapping`, or `Sequence`, ' 245 | f'got `{type(head)}`') 246 | 247 | return _values 248 | 249 | def _parse_iterable(self, values: Iterable[_Element]) -> Dict[Any, Any]: 250 | """ 251 | Pre-Process a :class:`Iterable` :class:`InitVar` into a :class:`Dict`. 252 | """ 253 | iter_values = iter(values) 254 | head = next(iter_values) 255 | if isinstance(head, Dict): 256 | for _dict in iter_values: 257 | head.update(cast(Dict[Any, Any], _dict)) 258 | _values = head 259 | 260 | # maybe an Iterable of 2-tuples or 2-lists? Cast and try it. 261 | elif isinstance(head, Sequence) and len(head) == 2: 262 | # is the key an ndarray? turn it into a scalar 263 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 264 | # mypy doesn't get that we've checked the head of values as an ndarray 265 | _values = _values = {key.item(): value for key, value in chain( 266 | cast(Iterable[Tuple[np.ndarray, Any]], [head]), 267 | cast(Iterator[Tuple[np.ndarray, Any]], values))} 268 | else: 269 | _values = dict(chain( 270 | cast(Iterable[Tuple[Any, Any]], [head]), 271 | cast(Iterator[Tuple[Any, Any]], values))) 272 | 273 | else: 274 | raise TypeError('Iterable must contain `Dict`, `Mapping`, or `Sequence`, ' 275 | f'got `{type(head)}`') 276 | 277 | return _values 278 | 279 | def _parse_iterator(self, values: Iterator[_Element]) -> Dict[Any, Any]: 280 | """ 281 | Pre-Process a :class:`Iterator` :class:`InitVar` into a :class:`Dict`. 
282 | """ 283 | head = next(values) 284 | 285 | if isinstance(head, Dict): 286 | # consume the iterator if its a dict 287 | for _dict in values: 288 | head.update(cast(Dict[Any, Any], _dict)) 289 | _values = head 290 | 291 | # maybe an Iterator of 2-tuples or 2-lists? Cast and try it. 292 | elif isinstance(head, Sequence) and len(head) == 2: 293 | # is the key an ndarray? turn it into a scalar 294 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 295 | # mypy doesn't get that we've checked the head of values as an ndarray 296 | _values = {key.item(): value for key, value in chain( 297 | cast(Iterable[Tuple[np.ndarray, Any]], [head]), 298 | cast(Iterator[Tuple[np.ndarray, Any]], values))} 299 | else: 300 | _values = dict(chain( 301 | cast(Iterable[Tuple[Any, Any]], [head]), 302 | cast(Iterator[Tuple[Any, Any]], values))) 303 | 304 | else: 305 | raise TypeError('Iterator must contain `Dict`, `Mapping`, or `Sequence`, ' 306 | f'got `{type(head)}`') 307 | 308 | return _values 309 | 310 | def __getitem__( 311 | self, 312 | item: _Union[str, int, slice, Sequence[_Union[str, int, bool]], np.ndarray]) -> Any: 313 | # return type is actually Union[np.ndarray, 'Tafra'] but mypy requires user to type check 314 | # in either case, what we return is a "slice" of the :class:`Tafra` 315 | if isinstance(item, str): 316 | return self._data[item] 317 | 318 | elif isinstance(item, int): 319 | return self._iindex(item) 320 | 321 | elif isinstance(item, slice): 322 | return self._slice(item) 323 | 324 | elif isinstance(item, np.ndarray): 325 | return self._ndindex(item) 326 | 327 | elif isinstance(item, Sequence): 328 | if isinstance(item[0], str): 329 | return self.select(cast(Sequence[str], item)) 330 | else: 331 | return self._aindex(cast(Sequence[_Union[int, bool]], item)) 332 | 333 | else: 334 | raise TypeError(f'Type {type(item)} not supported.') 335 | 336 | def __setitem__(self, item: str, value: _Union[np.ndarray, Sequence[Any], Any]) -> None: 337 | 
self._ensure_valid(item, value, set_item=True) 338 | 339 | def __repr__(self) -> str: 340 | if not hasattr(self, '_rows'): 341 | return f'Tafra(data={self._data}, dtypes={self._dtypes}, rows=n/a)' 342 | return f'Tafra(data={self._data}, dtypes={self._dtypes}, rows={self._rows})' 343 | 344 | def __str__(self) -> str: 345 | return self.__repr__() 346 | 347 | def __len__(self) -> int: 348 | assert self._data is not None, \ 349 | 'Interal error: Cannot construct a Tafra with no data.' 350 | return self._rows 351 | 352 | def __iter__(self) -> Iterator['Tafra']: 353 | return (self._iindex(i) for i in range(self._rows)) 354 | 355 | def __rshift__(self, other: Callable[['Tafra'], 'Tafra']) -> 'Tafra': 356 | return self.pipe(other) 357 | 358 | def iterrows(self) -> Iterator['Tafra']: 359 | """ 360 | Yield rows as :class:`Tafra`. Use :meth:`itertuples` for better performance. 361 | 362 | Returns 363 | ------- 364 | tafras: Iterator[Tafra] 365 | An iterator of :class:`Tafra`. 366 | """ 367 | yield from self.__iter__() 368 | 369 | def itertuples(self, name: Optional[str] = 'Tafra') -> Iterator[Tuple[Any, ...]]: 370 | """ 371 | Yield rows as :class:`NamedTuple`, or if ``name`` is ``None``, yield 372 | rows as :class:`tuple`. 373 | 374 | Parameters 375 | ---------- 376 | name: Optional[str] = 'Tafra' 377 | The name for the :class:`NamedTuple`. If ``None``, construct a 378 | :class:`Tuple` instead. 379 | 380 | Returns 381 | ------- 382 | tuples: Iterator[NamedTuple[Any, ...]] 383 | An iterator of :class:`NamedTuple`. 384 | """ 385 | if name is None: 386 | return (tuple(values) for values in zip(*self._data.values())) 387 | 388 | TafraNT = namedtuple(name, self._data.keys()) # type: ignore 389 | return map(TafraNT._make, zip(*self._data.values())) 390 | 391 | def itercols(self) -> Iterator[Tuple[str, np.ndarray]]: 392 | """ 393 | Yield columns as :class:`Tuple[str, np.ndarray]`, where the ``str`` is the column 394 | name. 
395 | 396 | Returns 397 | ------- 398 | tuples: Iterator[Tuple[str, np.ndarray]] 399 | An iterator of :class:`Tafra`. 400 | """ 401 | return map(tuple, self.data.items()) # type: ignore 402 | 403 | def _update_rows(self) -> None: 404 | """ 405 | Updates :attr:`_rows`. User should call this if they have directly assigned to 406 | :attr:_data and need to validate the :class:`Tafra`. 407 | """ 408 | iter_values = iter(self._data.values()) 409 | self._rows = len(next(iter_values)) 410 | if self.check_rows and not all(len(v) == self._rows for v in iter_values): 411 | raise TypeError('Uneven length of data.') 412 | 413 | def _slice(self, _slice: slice) -> 'Tafra': 414 | """ 415 | Use a :class:`slice` to slice the :class:`Tafra`. 416 | 417 | Parameters 418 | ---------- 419 | _slice: slice 420 | The ``slice`` object. 421 | 422 | Returns 423 | ------- 424 | tafra: Tafra 425 | The sliced :class:`Tafra`. 426 | """ 427 | return Tafra( 428 | {column: np.atleast_1d(value[_slice]) 429 | for column, value in self._data.items()}, 430 | self._dtypes, 431 | validate=False 432 | ) 433 | 434 | def _iindex(self, index: int) -> 'Tafra': 435 | """ 436 | Use a :class`int` to slice the :class:`Tafra`. 437 | 438 | Parameters 439 | ---------- 440 | index: int 441 | 442 | Returns 443 | ------- 444 | tafra: Tafra 445 | The sliced :class:`Tafra`. 446 | """ 447 | return Tafra( 448 | {column: value[[index]] 449 | for column, value in self._data.items()}, 450 | self._dtypes, 451 | validate=False 452 | ) 453 | 454 | def _aindex(self, index: Sequence[_Union[int, bool]]) -> 'Tafra': 455 | """ 456 | Use numpy advanced indexing to slice the :class:`Tafra`. 457 | 458 | Parameters 459 | ---------- 460 | index: Sequence[Union[int, bool]] 461 | 462 | Returns 463 | ------- 464 | tafra: Tafra 465 | The sliced :class:`Tafra`. 
466 | """ 467 | return Tafra( 468 | {column: value[index] 469 | for column, value in self._data.items()}, 470 | self._dtypes, 471 | validate=False 472 | ) 473 | 474 | def _ndindex(self, index: np.ndarray) -> 'Tafra': 475 | """ 476 | Use :class:`numpy.ndarray` indexing to slice the :class:`Tafra`. 477 | 478 | Parameters 479 | ---------- 480 | index: np.ndarray 481 | 482 | Returns 483 | ------- 484 | tafra: Tafra 485 | The sliced :class:`Tafra`. 486 | """ 487 | if index.ndim != 1: 488 | raise IndexError(f'Indexing np.ndarray must ndim == 1, got ndim == {index.ndim}') 489 | 490 | return Tafra( 491 | {column: value[index] 492 | for column, value in self._data.items()}, 493 | self._dtypes, 494 | validate=False 495 | ) 496 | 497 | def _repr_pretty_(self, p: 'IPython.lib.pretty.RepresentationPrinter', # type: ignore # noqa 498 | cycle: bool) -> None: 499 | """ 500 | A dunder method for IPython to pretty print. 501 | 502 | Parameters 503 | ---------- 504 | p: IPython.lib.pretty.RepresentationPrinter 505 | IPython provides this class to handle the object representation. 506 | 507 | cycle: bool 508 | IPython has detected an infinite loop. Print an alternative represenation 509 | and return. 510 | 511 | Returns 512 | ------- 513 | None 514 | Calls p.text and returns. 515 | """ 516 | if cycle: 517 | p.text('Tafra(...)') 518 | else: 519 | p.text(self._pretty_format(lambda s: ' ' + pprint.pformat(s, indent=1)[1:].strip())) 520 | 521 | def _repr_html_(self) -> str: 522 | """ 523 | a dunder method for Jupyter Notebook to print HTML. 524 | """ 525 | return self.to_html() 526 | 527 | def _pretty_format(self, formatter: Callable[[object], str]) -> str: 528 | """ 529 | Format _data and _dtypes for pretty printing. 530 | 531 | Parameters 532 | ---------- 533 | formatter: Callable[[object], str] 534 | A formatter that operates on the _data and _dtypes :class:`dict`. 535 | 536 | Returns 537 | ------- 538 | string: str 539 | The formatted string for printing. 
540 | """ 541 | PATTERN = r'(, dtype=[a-z]+)(?=\))' 542 | 543 | return '\n'.join([ 544 | 'Tafra(data = {', 545 | f'{re.sub(PATTERN, "", formatter(self._data))},', 546 | 'dtypes = {', 547 | f'{re.sub(PATTERN, "", formatter(self._dtypes))},', 548 | f'rows = {self._rows})' 549 | ]) 550 | 551 | def pformat(self, indent: int = 1, width: int = 80, depth: Optional[int] = None, 552 | compact: bool = False) -> str: 553 | """ 554 | Format for pretty printing. Parameters are passed to 555 | :class:`pprint.PrettyPrinter`. 556 | 557 | Parameters 558 | ---------- 559 | indent: int 560 | Number of spaces to indent for each level of nesting. 561 | 562 | width: int 563 | Attempted maximum number of columns in the output. 564 | 565 | depth: Optional[int] 566 | The maximum depth to print out nested structures. 567 | 568 | compact: bool 569 | If true, several items will be combined in one line. 570 | 571 | Returns 572 | ------- 573 | formatted string: str 574 | A formatted string for pretty printing. 575 | """ 576 | return self._pretty_format( 577 | lambda s: indent * ' ' + pprint.pformat( 578 | s, indent, width, depth, compact=compact)[1:].strip()) 579 | 580 | def pprint(self, indent: int = 1, width: int = 80, depth: Optional[int] = None, 581 | compact: bool = False) -> None: 582 | """ 583 | Pretty print. Parameters are passed to :class:`pprint.PrettyPrinter`. 584 | 585 | Parameters 586 | ---------- 587 | indent: int 588 | Number of spaces to indent for each level of nesting. 589 | 590 | width: int 591 | Attempted maximum number of columns in the output. 592 | 593 | depth: Optional[int] 594 | The maximum depth to print out nested structures. 595 | 596 | compact: bool 597 | If true, several items will be combined in one line. 
598 | 599 | Returns 600 | ------- 601 | None: None 602 | """ 603 | print(self.pformat(indent, width, depth, compact=compact)) 604 | 605 | @staticmethod 606 | def _html_thead(columns: Iterable[Any]) -> str: 607 | """ 608 | Construct the table head of the HTML representation. 609 | 610 | Parameters 611 | ---------- 612 | columns: Iterable[Any] 613 | An iterable of items with defined func:`__repr__` methods. 614 | 615 | Returns 616 | ------- 617 | HTML: str 618 | The HTML table head. 619 | """ 620 | return '\n\n{th}\n\n' \ 621 | .format(th='\n'.join(f'{c}' for c in columns)) 622 | 623 | @staticmethod 624 | def _html_tr(row: Iterable[Any]) -> str: 625 | """ 626 | Construct each table row of the HTML representation. 627 | 628 | Parameters 629 | ---------- 630 | row: Iterable[Any] 631 | An iterable of items with defined func:`__repr__` methods. 632 | 633 | Returns 634 | ------- 635 | HTML: str 636 | The HTML table row. 637 | """ 638 | return '\n{td}\n' \ 639 | .format(td='\n'.join(f'{td}' for td in row)) 640 | 641 | @staticmethod 642 | def _html_tbody(tr: Iterable[str]) -> str: 643 | """ 644 | Construct the table body of the HTML representation. 645 | 646 | Parameters 647 | ---------- 648 | tr: Iterable[str] 649 | An iterable of HTML table rows. 650 | 651 | Returns 652 | ------- 653 | HTML: str 654 | The HTML table body. 655 | """ 656 | return '\n{tr}\n' \ 657 | .format(tr='\n'.join(tr)) 658 | 659 | @staticmethod 660 | def _html_table(thead: str, tbody: str) -> str: 661 | """ 662 | Construct the final table of the HTML representation. 663 | 664 | Parameters 665 | ---------- 666 | thead: str 667 | An HTML representation of the table head. 668 | 669 | tbody: str 670 | An HTML representation of the table body. 671 | 672 | Returns 673 | ------- 674 | HTML: str 675 | The HTML table. 676 | """ 677 | return f'\n{thead}\n{tbody}\n
' 678 | 679 | def to_html(self, n: int = 20) -> str: 680 | """ 681 | Construct an HTML table representation of the :class:`Tafra` data. 682 | 683 | Parameters 684 | ---------- 685 | n: int = 20 686 | Number of items to print. 687 | 688 | Returns 689 | ------- 690 | HTML: str 691 | The HTML table representation. 692 | """ 693 | thead = self._html_thead(chain([''], self._data.keys())) 694 | tr = chain( 695 | [self._html_tr(chain( 696 | ['dtype'], 697 | (self._dtypes[column] for column in self._data.keys()) 698 | ))], 699 | (self._html_tr(chain( 700 | [i], 701 | (v[i] for v in self._data.values()) 702 | )) 703 | for i in range(min(n, self._rows))) 704 | ) 705 | tbody = self._html_tbody(tr) 706 | return self._html_table(thead, tbody) 707 | 708 | def _ensure_valid(self, column: str, value: _Union[np.ndarray, Sequence[Any], Any], 709 | check_rows: bool = True, set_item: bool = False) -> None: 710 | """ 711 | Validate values as an :class:`np.ndarray` of equal length to :attr:`rows` before 712 | assignment. Will attempt to create a :class:`np.ndarray` if ``value`` is not one 713 | already, and will check that :attr`np.ndarray.ndim` ``== 1``. If 714 | :attr:`np.ndarray.ndim` ``> 1`` it will attempt :meth:`np.squeeze` on ``value``. 715 | 716 | Parameters 717 | ---------- 718 | column: str 719 | The column to assign to. 720 | 721 | value: Union[np.ndarray, Sequence[Any], Any] 722 | The value to be assigned. 
    def _ensure_valid(self, column: str, value: _Union[np.ndarray, Sequence[Any], Any],
                      check_rows: bool = True, set_item: bool = False) -> None:
        """
        Validate values as an :class:`np.ndarray` of equal length to :attr:`rows` before
        assignment. Will attempt to create a :class:`np.ndarray` if ``value`` is not one
        already, and will check that :attr:`np.ndarray.ndim` ``== 1``. If
        :attr:`np.ndarray.ndim` ``> 1`` it will attempt :meth:`np.squeeze` on ``value``.

        Parameters
        ----------
        column: str
            The column to assign to.

        value: Union[np.ndarray, Sequence[Any], Any]
            The value to be assigned.

        check_rows: bool = True
            If True, require ``len(value)`` to equal :attr:`rows`; if False,
            treat the value as a single-row assignment.

        set_item: bool = False
            If True, always write the (possibly coerced) value back into
            :attr:`_data`, even when no coercion occurred.

        Returns
        -------
        None: None
        """
        _type = type(value).__name__
        # remember the original object identity: a changed id after coercion
        # means the value must be written back into _data at the end
        id_value = id(value)
        rows = self._rows if check_rows else 1

        # coercion cascade: order matters -- str is Iterable, and Iterator is
        # Iterable, so the more specific checks must come first
        if value is None:
            # broadcast None over all rows
            value = np.full(rows, value)

        elif isinstance(value, np.ndarray):
            if value.ndim == 0:
                # 0-d scalar array: broadcast its item over all rows
                value = np.full(rows, value.item())
            elif value.ndim == 1 and value.shape[0] == 1 and rows > 1:
                # single-element array: broadcast to the full row count
                value = np.full(rows, value)

        elif isinstance(value, str):
            # a str is Iterable but must be treated as a scalar, not char-split
            value = np.full(rows, value)

        elif isinstance(value, Iterator):
            # materialize one-shot iterators before handing them to numpy
            value = np.asarray(tuple(value))

        elif isinstance(value, Iterable):
            value = np.asarray(value)

        elif not isinstance(value, Sized):
            # unsized scalar (int, float, ...): broadcast over all rows
            value = np.full(rows, value)

        assert isinstance(value, np.ndarray), \
            'Internal error: `Tafra` only supports assigning `ndarray`.'

        if value.ndim > 1:
            # try to collapse singleton dimensions down to 1-d
            sq_value = value.squeeze()
            if sq_value.ndim > 1:
                raise ValueError('`ndarray` or `np.squeeze(ndarray)` must have ndim == 1.')
            elif sq_value.ndim == 1:
                # if value was a single item, squeeze returns zero length item
                warnings.warn('`np.squeeze(ndarray)` applied to set ndim == 1.')
                warnings.resetwarnings()
                value = sq_value

        assert value.ndim >= 1, \
            'Interal error: `Tafra` only supports assigning ndim == 1.'

        if check_rows and len(value) != rows:
            raise ValueError(
                '`Tafra` must have consistent row counts.\n'
                f'This `Tafra` has {rows} rows. Assigned {_type} has {len(value)} rows.')

        # special parsing of various object types
        parsed_value = object_formatter.parse_dtype(value)
        if parsed_value is not None:
            value = parsed_value

        # have we modified value?
        if set_item or id(value) != id_value:
            self._data[column] = value
            self._dtypes[column] = self._format_dtype(value.dtype)

    def parse_object_dtypes(self) -> 'Tafra':
        """
        Parse the object dtypes using the :class:`ObjectFormatter` instance.

        Returns
        -------
        tafra: Tafra
            A copy of this :class:`Tafra` with object dtypes parsed.
        """
        tafra = self.copy()
        tafra.parse_object_dtypes_inplace()
        return tafra

    def parse_object_dtypes_inplace(self) -> None:
        """
        Inplace version.

        Parse the object dtypes using the :class:`ObjectFormatter` instance.

        Returns
        -------
        None: None
        """
        for column, value in self._data.items():
            # parse_dtype returns None when the column needs no conversion
            parsed_value = object_formatter.parse_dtype(value)
            if parsed_value is not None:
                self._data[column] = parsed_value
                self._dtypes[column] = self._format_dtype(parsed_value.dtype)

    def _validate_columns(self, columns: Iterable[str]) -> None:
        """
        Validate that the column name(s) exists in :attr:`_data`.

        Parameters
        ----------
        columns: Iterable[str]
            The column names to validate.

        Returns
        -------
        None: None
        """
        for column in columns:
            if column not in self._data.keys():
                raise ValueError(f'Column {column} does not exist in `tafra`.')

    def _validate_dtypes(self, dtypes: Dict[str, Any]) -> Dict[str, str]:
        """
        Validate that the dtypes are internally used names and that the columns exist in
        :attr:`_data`.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to validate.

        Returns
        -------
        dtypes: Dict[str, str]
            The validated types.
        """

        self._validate_columns(dtypes.keys())
        return {column: self._format_dtype(dtype) for column, dtype in dtypes.items()}
844 | Otherwise, pass through and let numpy raise error if it is not a valid dtype. 845 | 846 | Parameters 847 | ---------- 848 | dtype: Any 849 | The dtype to parse. 850 | 851 | Returns 852 | ------- 853 | dtype: str 854 | The parsed dtype. 855 | """ 856 | _dtype = np.dtype(dtype) 857 | name = _dtype.type.__name__ 858 | if 'str' in name: 859 | return 'str' 860 | 861 | return name.replace('_', '') 862 | 863 | @staticmethod 864 | def _reduce_dtype(dtype: Any) -> str: 865 | """ 866 | Parse a dtype to the base type. 867 | 868 | Parameters 869 | ---------- 870 | dtype: Any 871 | The dtype to parse. 872 | 873 | Returns 874 | ------- 875 | dtype: str 876 | The parsed dtype. 877 | """ 878 | name = np.dtype(dtype).type.__name__ 879 | m = re.search(r'([a-z]+)', name) 880 | if m: 881 | return m.group(1) 882 | 883 | # are there any dtypes without text names? 884 | return name # pragma: no cover 885 | 886 | @classmethod 887 | def from_records(cls, records: Iterable[Iterable[Any]], columns: Iterable[str], 888 | dtypes: Optional[Iterable[Any]] = None, **kwargs: Any) -> 'Tafra': 889 | """ 890 | Construct a :class:`Tafra` from an Iterator of records, e.g. from a SQL query. The 891 | records should be a nested Iterable, but can also be fed a cursor method such as 892 | ``cur.fetchmany()`` or ``cur.fetchall()``. 893 | 894 | Parameters 895 | ---------- 896 | records: ITerable[Iteralble[str]] 897 | The records to turn into a :class:`Tafra`. 898 | 899 | columns: Iterable[str] 900 | The column names to use. 901 | 902 | dtypes: Optional[Iterable[Any]] = None 903 | The dtypes of the columns. 904 | 905 | Returns 906 | ------- 907 | tafra: Tafra 908 | The constructed :class:`Tafra`. 
909 | """ 910 | if dtypes is None: 911 | return Tafra({column: value for column, value in zip(columns, zip(*records))}, **kwargs) 912 | 913 | return Tafra( 914 | {column: value for column, value in zip(columns, zip(*records))}, 915 | {column: value for column, value in zip(columns, dtypes)}, 916 | **kwargs 917 | ) 918 | 919 | @classmethod 920 | def from_series(cls, s: Series, dtype: Optional[str] = None, **kwargs: Any) -> 'Tafra': 921 | """ 922 | Construct a :class:`Tafra` from a :class:`pandas.Series`. If ``dtype`` is not 923 | given, take from :attr:`pandas.Series.dtype`. 924 | 925 | Parameters 926 | ---------- 927 | df: pandas.Series 928 | The series used to build the :class:`Tafra`. 929 | 930 | dtype: Optional[str] = None 931 | The dtypes of the column. 932 | 933 | Returns 934 | ------- 935 | tafra: Tafra 936 | The constructed :class:`Tafra`. 937 | """ 938 | if dtype is None: 939 | dtype = s.dtype 940 | dtypes = {s.name: cls._format_dtype(dtype)} 941 | 942 | return cls( 943 | {s.name: s.values.astype(dtypes[s.name])}, 944 | dtypes, 945 | **kwargs 946 | ) 947 | 948 | @classmethod 949 | def from_dataframe(cls, df: DataFrame, dtypes: Optional[Dict[str, Any]] = None, 950 | **kwargs: Any) -> 'Tafra': 951 | """ 952 | Construct a :class:`Tafra` from a :class:`pandas.DataFrame`. If ``dtypes`` are not 953 | given, take from :attr:`pandas.DataFrame.dtypes`. 954 | 955 | Parameters 956 | ---------- 957 | df: pandas.DataFrame 958 | The dataframe used to build the :class:`Tafra`. 959 | 960 | dtypes: Optional[Dict[str, Any]] = None 961 | The dtypes of the columns. 962 | 963 | Returns 964 | ------- 965 | tafra: Tafra 966 | The constructed :class:`Tafra`. 
967 | """ 968 | if dtypes is None: 969 | dtypes = {c: t for c, t in zip(df.columns, df.dtypes)} 970 | dtypes = {c: cls._format_dtype(t) for c, t in dtypes.items()} 971 | 972 | return cls( 973 | {c: df[c].values.astype(dtypes[c]) for c in df.columns}, 974 | {c: dtypes[c] for c in df.columns}, 975 | **kwargs 976 | ) 977 | 978 | @classmethod 979 | def read_sql(cls, query: str, cur: Cursor) -> 'Tafra': 980 | """ 981 | Execute a SQL SELECT statement using a :class:`pyodbc.Cursor` and return a Tuple 982 | of column names and an Iterator of records. 983 | 984 | Parameters 985 | ---------- 986 | query: str 987 | The SQL query. 988 | 989 | cur: pyodbc.Cursor 990 | The ``pyodbc`` cursor. 991 | 992 | Returns 993 | ------- 994 | tafra: Tafra 995 | The constructed :class:`Tafra`. 996 | """ 997 | cur.execute(query) 998 | 999 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 1000 | 1001 | head = cur.fetchone() 1002 | if head is None: 1003 | return Tafra({column: () for column in columns}) 1004 | 1005 | return Tafra.from_records(chain([head], cur.fetchall()), columns, dtypes) 1006 | 1007 | @classmethod 1008 | def read_sql_chunks(cls, query: str, cur: Cursor, chunksize: int = 100) -> Iterator['Tafra']: 1009 | """ 1010 | Execute a SQL SELECT statement using a :class:`pyodbc.Cursor` and return a Tuple 1011 | of column names and an Iterator of records. 1012 | 1013 | Parameters 1014 | ---------- 1015 | query: str 1016 | The SQL query. 1017 | 1018 | cur: pyodbc.Cursor 1019 | The ``pyodbc`` cursor. 1020 | 1021 | Returns 1022 | ------- 1023 | tafra: Tafra 1024 | The constructed :class:`Tafra`. 
1025 | """ 1026 | cur.execute(query) 1027 | 1028 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 1029 | 1030 | head = cur.fetchone() 1031 | if head is None: 1032 | yield Tafra({column: () for column in columns}) 1033 | return 1034 | 1035 | def chunks(iterable: Iterable[Any], chunksize: int = 1000) -> Iterator[Iterable[Any]]: 1036 | for f in iterable: 1037 | yield list(chain([f], islice(iterable, chunksize - 1))) 1038 | 1039 | for chunk in chunks(chain([head], cur), chunksize): 1040 | yield Tafra.from_records(chunk, columns, dtypes) 1041 | 1042 | @classmethod 1043 | def read_csv(cls, csv_file: _Union[str, Path, TextIOWrapper, IO[str]], guess_rows: int = 5, 1044 | missing: Optional[str] = '', dtypes: Optional[Dict[str, Any]] = None, 1045 | **csvkw: Dict[str, Any] 1046 | ) -> 'Tafra': 1047 | """ 1048 | Read a CSV file with a header row, infer the types of each column, 1049 | and return a Tafra containing the file's contents. 1050 | 1051 | Parameters 1052 | ---------- 1053 | csv_file: Union[str, TextIOWrapper] 1054 | The path to the CSV file, or an open file-like object. 1055 | 1056 | guess_rows: int 1057 | The number of rows to use when guessing column types. 1058 | 1059 | dtypes: Optional[Dict[str, str]] 1060 | dtypes by column name; by default, all dtypes will be inferred 1061 | from the file contents. 1062 | 1063 | **csvkw: Dict[str, Any] 1064 | Additional keyword arguments passed to csv.reader. 1065 | 1066 | Returns 1067 | ------- 1068 | tafra: Tafra 1069 | The constructed :class:`Tafra`. 
1070 | """ 1071 | reader = CSVReader(cast(_Union[str, Path, TextIOWrapper], csv_file), 1072 | guess_rows, missing, **csvkw) 1073 | return Tafra(reader.read(), dtypes=dtypes) 1074 | 1075 | @classmethod 1076 | def as_tafra(cls, maybe_tafra: _Union['Tafra', DataFrame, Series, Dict[str, Any], Any] 1077 | ) -> Optional['Tafra']: 1078 | """ 1079 | Returns the unmodified `tafra`` if already a :class:`Tafra`, else construct a 1080 | :class:`Tafra` from known types or subtypes of :class:`DataFrame` or `dict`. 1081 | Structural subtypes of :class:`DataFrame` or :class:`Series` are also valid, 1082 | as are classes that have ``cls.__name__ == 'DataFrame'`` or 1083 | ``cls.__name__ == 'Series'``. 1084 | 1085 | Parameters 1086 | ---------- 1087 | maybe_tafra: Union['tafra', DataFrame] 1088 | The object to ensure is a :class:`Tafra`. 1089 | 1090 | Returns 1091 | ------- 1092 | tafra: Optional[Tafra] 1093 | The :class:`Tafra`, or None is ``maybe_tafra`` is an unknown 1094 | type. 1095 | """ 1096 | if isinstance(maybe_tafra, Tafra): 1097 | return maybe_tafra 1098 | 1099 | elif isinstance(maybe_tafra, Series): # pragma: no cover 1100 | return cls.from_series(maybe_tafra) 1101 | 1102 | elif type(maybe_tafra).__name__ == 'Series': # pragma: no cover 1103 | return cls.from_series(cast(Series, maybe_tafra)) 1104 | 1105 | elif isinstance(maybe_tafra, DataFrame): # pragma: no cover 1106 | return cls.from_dataframe(maybe_tafra) 1107 | 1108 | elif type(maybe_tafra).__name__ == 'DataFrame': # pragma: no cover 1109 | return cls.from_dataframe(cast(DataFrame, maybe_tafra)) 1110 | 1111 | elif isinstance(maybe_tafra, dict): 1112 | return cls(maybe_tafra) 1113 | 1114 | raise TypeError(f'Unknown type `{type(maybe_tafra)}` for conversion to `Tafra`') 1115 | 1116 | @property 1117 | def columns(self) -> Tuple[str, ...]: 1118 | """ 1119 | The names of the columns. Equivalent to `Tafra`.keys(). 1120 | 1121 | Returns 1122 | ------- 1123 | columns: Tuple[str, ...] 1124 | The column names. 
1125 | """ 1126 | return tuple(self._data.keys()) 1127 | 1128 | @columns.setter 1129 | def columns(self, value: Any) -> None: 1130 | raise ValueError('Assignment to `columns` is forbidden.') 1131 | 1132 | @property 1133 | def rows(self) -> int: 1134 | """ 1135 | The number of rows of the first item in :attr:`data`. The :func:`len()` 1136 | of all items have been previously validated. 1137 | 1138 | Returns 1139 | ------- 1140 | rows: int 1141 | The number of rows of the :class:`Tafra`. 1142 | """ 1143 | return self.__len__() 1144 | 1145 | @rows.setter 1146 | def rows(self, value: Any) -> None: 1147 | raise ValueError('Assignment to `rows` is forbidden.') 1148 | 1149 | @property # type: ignore 1150 | def data(self) -> Dict[str, np.ndarray]: 1151 | """ 1152 | The :class:`Tafra` data. 1153 | 1154 | Returns 1155 | ------- 1156 | data: Dict[str, np.ndarray] 1157 | The data. 1158 | """ 1159 | return self._data 1160 | 1161 | @data.setter 1162 | def data(self, value: Any) -> None: 1163 | raise ValueError('Assignment to `data` is forbidden.') 1164 | 1165 | @property # type: ignore 1166 | def dtypes(self) -> Dict[str, str]: 1167 | """ 1168 | The :class:`Tafra` dtypes. 1169 | 1170 | Returns 1171 | ------- 1172 | dtypes: Dict[str, str] 1173 | The dtypes. 1174 | """ 1175 | return self._dtypes 1176 | 1177 | @dtypes.setter 1178 | def dtypes(self, value: Any) -> None: 1179 | raise ValueError('Assignment to `dtypes` is forbidden.') 1180 | 1181 | @property 1182 | def size(self) -> int: 1183 | """ 1184 | The :class:`Tafra` size. 1185 | 1186 | Returns 1187 | ------- 1188 | size: int 1189 | The size. 1190 | """ 1191 | return self.rows * len(self.columns) 1192 | 1193 | @size.setter 1194 | def size(self, value: Any) -> None: 1195 | raise ValueError('Assignment to `size` is forbidden.') 1196 | 1197 | @property 1198 | def ndim(self) -> int: 1199 | """ 1200 | The :class:`Tafra` number of dimensions. 1201 | 1202 | Returns 1203 | ------- 1204 | ndim: int 1205 | The number of dimensions. 
1206 | """ 1207 | return max(2, len(self.columns)) 1208 | 1209 | @ndim.setter 1210 | def ndim(self, value: Any) -> None: 1211 | raise ValueError('Assignment to `ndim` is forbidden.') 1212 | 1213 | @property 1214 | def shape(self) -> Tuple[int, int]: 1215 | """ 1216 | The :class:`Tafra` shape. 1217 | 1218 | Returns 1219 | ------- 1220 | shape: int 1221 | The shape. 1222 | """ 1223 | return self.rows, len(self.columns) 1224 | 1225 | @shape.setter 1226 | def shape(self, value: Any) -> None: 1227 | raise ValueError('Assignment to `shape` is forbidden.') 1228 | 1229 | def row_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1230 | """ 1231 | Map a function over rows. To apply to specific columns, use :meth:`select` 1232 | first. The function must operate on :class:`Tafra`. 1233 | 1234 | Parameters 1235 | ---------- 1236 | fn: Callable[..., Any] 1237 | The function to map. 1238 | 1239 | *args: Any 1240 | Additional positional arguments to ``fn``. 1241 | 1242 | **kwargs: Any 1243 | Additional keyword arguments to ``fn``. 1244 | 1245 | Returns 1246 | ------- 1247 | iter_tf: Iterator[Any] 1248 | An iterator to map the function. 1249 | """ 1250 | return (fn(tf, *args, **kwargs) for tf in self.__iter__()) 1251 | 1252 | def tuple_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1253 | """ 1254 | Map a function over rows. This is faster than :meth:`row_map`. To apply to 1255 | specific columns, use :meth:`select` first. The function must operate on 1256 | :class:`NamedTuple` from :meth:`itertuples`. 1257 | 1258 | Parameters 1259 | ---------- 1260 | fn: Callable[..., Any] 1261 | The function to map. 1262 | 1263 | name: Optional[str] = 'Tafra' 1264 | The name for the :class:`NamedTuple`. If ``None``, construct a 1265 | :class:`Tuple` instead. Must be given as a keyword argument. 1266 | 1267 | *args: Any 1268 | Additional positional arguments to ``fn``. 
1269 | 1270 | **kwargs: Any 1271 | Additional keyword arguments to ``fn``. 1272 | 1273 | Returns 1274 | ------- 1275 | iter_tf: Iterator[Any] 1276 | An iterator to map the function. 1277 | """ 1278 | name = kwargs.pop('name', 'Tafra') 1279 | return (fn(tf, *args, **kwargs) for tf in self.itertuples(name)) 1280 | 1281 | def col_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1282 | """ 1283 | Map a function over columns. To apply to specific columns, use :meth:`select` 1284 | first. The function must operate on :class:`Tuple[str, np.ndarray]`. 1285 | 1286 | Parameters 1287 | ---------- 1288 | fn: Callable[..., Any] 1289 | The function to map. 1290 | 1291 | *args: Any 1292 | Additional positional arguments to ``fn``. 1293 | 1294 | **kwargs: Any 1295 | Additional keyword arguments to ``fn``. 1296 | 1297 | Returns 1298 | ------- 1299 | iter_tf: Iterator[Any] 1300 | An iterator to map the function. 1301 | """ 1302 | 1303 | return (fn(value, *args, **kwargs) for column, value in self.itercols()) 1304 | 1305 | def key_map(self, fn: Callable[..., Any], 1306 | *args: Any, **kwargs: Any) -> Iterator[Tuple[str, Any]]: 1307 | """ 1308 | Map a function over columns like :meth:col_map, but return :class:`Tuple` of the 1309 | key with the function result. To apply to specific columns, use :meth:`select` 1310 | first. The function must operate on :class:`Tuple[str, np.ndarray]`. 1311 | 1312 | Parameters 1313 | ---------- 1314 | fn: Callable[..., Any] 1315 | The function to map. 1316 | 1317 | *args: Any 1318 | Additional positional arguments to ``fn``. 1319 | 1320 | **kwargs: Any 1321 | Additional keyword arguments to ``fn``. 1322 | 1323 | Returns 1324 | ------- 1325 | iter_tf: Iterator[Any] 1326 | An iterator to map the function. 
1327 | """ 1328 | return ((column, fn(value, *args, **kwargs)) for column, value in self.itercols()) 1329 | 1330 | def pipe(self, fn: Callable[Concatenate['Tafra', P], 'Tafra'], 1331 | *args: Any, **kwargs: Any) -> 'Tafra': 1332 | """ 1333 | Apply a function to the :class:`Tafra` and return the resulting :class:`Tafra`. Primarily 1334 | used to build a tranformer pipeline. 1335 | 1336 | Parameters 1337 | ---------- 1338 | fn: Callable[[], 'Tafra'] 1339 | The function to apply. 1340 | 1341 | *args: Any 1342 | Additional positional arguments to ``fn``. 1343 | 1344 | **kwargs: Any 1345 | Additional keyword arguments to ``fn``. 1346 | 1347 | Returns 1348 | ------- 1349 | tafra: Tafra 1350 | A new :class:`Tafra` result of the function. 1351 | """ 1352 | return fn(self, *args, **kwargs) 1353 | 1354 | def select(self, columns: Iterable[str]) -> 'Tafra': 1355 | """ 1356 | Use column names to slice the :class:`Tafra` columns analogous to SQL SELECT. 1357 | This does not copy the data. Call :meth:`copy` to obtain a copy of the sliced 1358 | data. 1359 | 1360 | Parameters 1361 | ---------- 1362 | columns: Iterable[str] 1363 | The column names to slice from the :class:`Tafra`. 1364 | 1365 | Returns 1366 | ------- 1367 | tafra: Tafra 1368 | the :class:`Tafra` with the sliced columns. 1369 | """ 1370 | if isinstance(columns, str): 1371 | columns = [columns] 1372 | self._validate_columns(columns) 1373 | 1374 | return Tafra( 1375 | {column: self._data[column] for column in columns}, 1376 | {column: self._dtypes[column] for column in columns}, 1377 | validate=False 1378 | ) 1379 | 1380 | def head(self, n: int = 5) -> 'Tafra': 1381 | """ 1382 | Display the head of the :class:`Tafra`. 1383 | 1384 | Parameters 1385 | ---------- 1386 | n: int = 5 1387 | The number of rows to display. 1388 | 1389 | Returns 1390 | ------- 1391 | None: None 1392 | """ 1393 | return self._slice(slice(n)) 1394 | 1395 | def keys(self) -> KeysView[str]: 1396 | """ 1397 | Return the keys of :attr:`data`, i.e. 
like :meth:`dict.keys()`. 1398 | 1399 | Returns 1400 | ------- 1401 | data keys: KeysView[str] 1402 | The keys of the data property. 1403 | """ 1404 | return self._data.keys() 1405 | 1406 | def values(self) -> ValuesView[np.ndarray]: 1407 | """ 1408 | Return the values of :attr:`data`, i.e. like :meth:`dict.values()`. 1409 | 1410 | Returns 1411 | ------- 1412 | data values: ValuesView[np.ndarray] 1413 | The values of the data property. 1414 | """ 1415 | return self._data.values() 1416 | 1417 | def items(self) -> ItemsView[str, np.ndarray]: 1418 | """ 1419 | Return the items of :attr:`data`, i.e. like :meth:`dict.items()`. 1420 | 1421 | Returns 1422 | ------- 1423 | items: ItemsView[str, np.ndarray] 1424 | The data items. 1425 | """ 1426 | return self._data.items() 1427 | 1428 | def get(self, key: str, default: Any = None) -> Any: 1429 | """ 1430 | Return from the :meth:`get` function of :attr:`data`, i.e. like 1431 | :meth:`dict.get()`. 1432 | 1433 | Parameters 1434 | ---------- 1435 | key: str 1436 | The key value in the data property. 1437 | 1438 | default: Any 1439 | The default to return if the key does not exist. 1440 | 1441 | Returns 1442 | ------- 1443 | value: Any 1444 | The value for the key, or the default if the key does not 1445 | exist. 1446 | """ 1447 | return self._data.get(key, default) 1448 | 1449 | def update(self, other: 'Tafra') -> 'Tafra': 1450 | """ 1451 | Update the data and dtypes of this :class:`Tafra` with another :class:`Tafra`. 1452 | Length of rows must match, while data of different ``dtype`` will overwrite. 1453 | 1454 | Parameters 1455 | ---------- 1456 | other: Tafra 1457 | The other :class:`Tafra` from which to update. 1458 | 1459 | Returns 1460 | ------- 1461 | None: None 1462 | """ 1463 | tafra = self.copy() 1464 | tafra.update_inplace(other) 1465 | return tafra 1466 | 1467 | def update_inplace(self, other: 'Tafra') -> None: 1468 | """ 1469 | Inplace version. 
    def _coalesce_dtypes(self) -> None:
        """
        Update :attr:`dtypes` with missing keys that exist in :attr:`data`.

        **Must be called if :attr:`data` or :attr:`dtypes` is directly modified!**

        Returns
        -------
        None: None
        """
        for column in self._data.keys():
            if column not in self._dtypes:
                # derive the missing dtype entry from the stored ndarray
                self._dtypes[column] = self._format_dtype(self._data[column].dtype)

    def update_dtypes(self, dtypes: Dict[str, Any]) -> 'Tafra':
        """
        Apply new dtypes.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to update. If ``None``, create from entries in :attr:`data`.

        Returns
        -------
        tafra: Tafra
            The updated :class:`Tafra`.
        """
        tafra = self.copy()
        tafra.update_dtypes_inplace(dtypes)
        return tafra

    def update_dtypes_inplace(self, dtypes: Dict[str, Any]) -> None:
        """
        Inplace version.

        Apply new dtypes.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to update. If ``None``, create from entries in :attr:`data`.

        Returns
        -------
        None: None
        """
        # normalize dtype spellings and check the columns exist
        dtypes = self._validate_dtypes(dtypes)
        self._dtypes.update(dtypes)

        for column in dtypes.keys():
            # only cast columns whose stored dtype actually differs
            if self._format_dtype(self._data[column].dtype) != self._dtypes[column]:
                try:
                    self._data[column] = self._data[column].astype(self._dtypes[column])
                except ValueError:
                    # fall back: replace sentinel values (currently only the
                    # empty string) with None, then retry the cast
                    REPL_VALS = ['', ]
                    for repl_val in REPL_VALS:
                        where_repl = np.equal(self._data[column], repl_val)
                        self._data[column][where_repl] = None
                    # NOTE(review): whether this retry belongs inside or after the
                    # loop is ambiguous in this copy of the source; with a single
                    # REPL_VALS entry the behavior is identical -- confirm upstream
                    self._data[column] = self._data[column].astype(self._dtypes[column])

    def rename(self, renames: Dict[str, str]) -> 'Tafra':
        """
        Rename columns in the :class:`Tafra` from a :class:`dict`.

        Parameters
        ----------
        renames: Dict[str, str]
            The map from current names to new names.

        Returns
        -------
        tafra: Tafra
            The :class:`Tafra` with updated names.
        """

        tafra = self.copy()
        tafra.rename_inplace(renames)
        return tafra

    def rename_inplace(self, renames: Dict[str, str]) -> None:
        """
        In-place version.

        Rename columns in the :class:`Tafra` from a :class:`dict`.

        Parameters
        ----------
        renames: Dict[str, str]
            The map from current names to new names.

        Returns
        -------
        None: None
        """
        self._validate_columns(renames.keys())

        for cur, new in renames.items():
            # pop + reinsert keeps _data and _dtypes keys in sync
            self._data[new] = self._data.pop(cur)
            self._dtypes[new] = self._dtypes.pop(cur)
        return None
1594 | """ 1595 | self._validate_columns(renames.keys()) 1596 | 1597 | for cur, new in renames.items(): 1598 | self._data[new] = self._data.pop(cur) 1599 | self._dtypes[new] = self._dtypes.pop(cur) 1600 | return None 1601 | 1602 | def delete(self, columns: Iterable[str]) -> 'Tafra': 1603 | """ 1604 | Remove a column from :attr:`data` and :attr:`dtypes`. 1605 | 1606 | Parameters 1607 | ---------- 1608 | column: str 1609 | The column to remove. 1610 | 1611 | Returns 1612 | ------- 1613 | tafra: Optional[Tafra] 1614 | The :class:`Tafra` with the deleted column. 1615 | """ 1616 | if isinstance(columns, str): 1617 | columns = [columns] 1618 | self._validate_columns(columns) 1619 | 1620 | return Tafra( 1621 | {column: value.copy() for column, value in self._data.items() 1622 | if column not in columns}, 1623 | {column: value for column, value in self._dtypes.items() 1624 | if column not in columns}, 1625 | validate=False 1626 | ) 1627 | 1628 | def delete_inplace(self, columns: Iterable[str]) -> None: 1629 | """ 1630 | In-place version. 1631 | 1632 | Remove a column from :attr:`data` and :attr:`dtypes`. 1633 | 1634 | Parameters 1635 | ---------- 1636 | column: str 1637 | The column to remove. 1638 | 1639 | Returns 1640 | ------- 1641 | tafra: Optional[Tafra] 1642 | The :class:`Tafra` with the deleted column. 1643 | """ 1644 | if isinstance(columns, str): 1645 | columns = [columns] 1646 | self._validate_columns(columns) 1647 | 1648 | for column in columns: 1649 | _ = self._data.pop(column, None) 1650 | _ = self._dtypes.pop(column, None) 1651 | 1652 | def copy(self, order: str = 'C') -> 'Tafra': 1653 | """ 1654 | Create a copy of a :class:`Tafra`. 1655 | 1656 | Parameters 1657 | ---------- 1658 | order: str = 'C' {‘C’, ‘F’, ‘A’, ‘K’} 1659 | Controls the memory layout of the copy. ‘C’ means C-order, ‘F’ means 1660 | F-order, ‘A’ means ‘F’ if a is Fortran contiguous, ‘C’ otherwise. ‘K’ 1661 | means match the layout of a as closely as possible. 
1662 | 1663 | Returns 1664 | ------- 1665 | tafra: Tafra 1666 | A copied :class:`Tafra`. 1667 | """ 1668 | return Tafra( 1669 | {column: value.copy(order=order) 1670 | for column, value in self._data.items()}, 1671 | self._dtypes.copy(), 1672 | validate=False 1673 | ) 1674 | 1675 | def coalesce(self, column: str, fills: Iterable[ 1676 | Iterable[_Union[None, str, int, float, bool, np.ndarray]] 1677 | ]) -> np.ndarray: 1678 | """ 1679 | Fill ``None`` values from ``fills``. Analogous to ``SQL COALESCE`` or 1680 | :meth:`pandas.fillna`. 1681 | 1682 | Parameters 1683 | ---------- 1684 | column: str 1685 | The column to coalesce. 1686 | 1687 | fills: Iterable[Union[str, int, float, bool, np.ndarray]: 1688 | 1689 | Returns 1690 | ------- 1691 | data: np.ndarray 1692 | The coalesced data. 1693 | """ 1694 | # TODO: handle dtype? 1695 | iter_fills = iter(fills) 1696 | head = next(iter_fills) 1697 | 1698 | if column in self._data.keys(): 1699 | value = self._data[column].copy() 1700 | else: 1701 | value = np.empty(self._rows, np.asarray(head).dtype) 1702 | 1703 | for _fill in chain([head], iter_fills): 1704 | fill = np.atleast_1d(_fill) 1705 | where_na = np.full(self._rows, False) 1706 | where_na |= value == np.array([None]) 1707 | try: 1708 | where_na |= np.isnan(value) 1709 | except: 1710 | pass 1711 | 1712 | if len(fill) == 1: 1713 | value[where_na] = fill 1714 | else: 1715 | value[where_na] = fill[where_na] 1716 | 1717 | return value 1718 | 1719 | def coalesce_inplace(self, column: str, fills: Iterable[ 1720 | Iterable[_Union[None, str, int, float, bool, np.ndarray]] 1721 | ]) -> None: 1722 | """ 1723 | In-place version. 1724 | 1725 | Fill ``None`` values from ``fills``. Analogous to ``SQL COALESCE`` or 1726 | :meth:`pandas.fillna`. 1727 | 1728 | Parameters 1729 | ---------- 1730 | column: str 1731 | The column to coalesce. 
1732 | 1733 | fills: Iterable[Union[str, int, float, bool, np.ndarray]: 1734 | 1735 | Returns 1736 | ------- 1737 | data: np.ndarray 1738 | The coalesced data. 1739 | """ 1740 | self._data[column] = self.coalesce(column, fills) 1741 | self.update_dtypes_inplace({column: self._data[column].dtype}) 1742 | 1743 | def _cast_record(self, dtype: str, data: np.ndarray, cast_null: bool) -> Optional[float]: 1744 | """ 1745 | Casts needed to generate records for database insert. 1746 | 1747 | Will cast ``np.nan`` to ``None``. Requires changing ``dtype`` to 1748 | ``object``. 1749 | 1750 | Parameters 1751 | ---------- 1752 | dtype: str 1753 | The dtype of the data value. 1754 | 1755 | data: np.ndarray 1756 | The data to have its values cast. 1757 | 1758 | cast_null: bool 1759 | Perform the cast for ``np.nan`` 1760 | 1761 | Returns 1762 | ------- 1763 | value: Any 1764 | The cast value. 1765 | """ 1766 | _dtype = self._reduce_dtype(dtype) 1767 | value: Any = RECORD_TYPE[_dtype](data.item()) 1768 | if cast_null and _dtype == 'float' and np.isnan(data.item()): 1769 | return None 1770 | return value 1771 | 1772 | def to_records(self, columns: Optional[Iterable[str]] = None, 1773 | cast_null: bool = True) -> Iterator[Tuple[Any, ...]]: 1774 | """ 1775 | Return a :class:`Iterator` of :class:`Tuple`, each being a record (i.e. row) and 1776 | allowing heterogeneous typing. Useful for e.g. sending records back to a 1777 | database. 1778 | 1779 | Parameters 1780 | ---------- 1781 | columns: Optional[Iterable[str]] = None 1782 | The columns to extract. If ``None``, extract all columns. 1783 | 1784 | cast_null: bool 1785 | Cast ``np.nan`` to None. 
Necessary for :mod:``pyodbc`` 1786 | 1787 | Returns 1788 | ------- 1789 | records: Iterator[Tuple[Any, ...]] 1790 | """ 1791 | if columns is None: 1792 | columns = self.columns 1793 | else: 1794 | if isinstance(columns, str): 1795 | columns = [columns] 1796 | self._validate_columns(columns) 1797 | 1798 | return (tuple( 1799 | None if len(self._data[c]) <= row else self._cast_record( 1800 | self._dtypes[c], self._data[c][[row]], 1801 | cast_null 1802 | ) 1803 | for c in columns) 1804 | for row in range(self._rows)) 1805 | 1806 | def to_list(self, columns: Optional[Iterable[str]] = None, 1807 | inner: bool = False) -> _Union[List[np.ndarray], List[List[Any]]]: 1808 | """ 1809 | Return a list of homogeneously typed columns (as :class:`numpy.ndarray`). If a 1810 | generator is needed, use :meth:`to_records`. If ``inner == True`` each column 1811 | will be cast from :class:`numpy.ndarray` to a :class:`List`. 1812 | 1813 | Parameters 1814 | ---------- 1815 | columns: Optional[Iterable[str]] = None 1816 | The columns to extract. If ``None``, extract all columns. 1817 | 1818 | inner: bool = False 1819 | Cast all :class:`np.ndarray` to :class`List`. 1820 | 1821 | Returns 1822 | ------- 1823 | list: Union[List[np.ndarray], List[List[Any]]] 1824 | """ 1825 | if columns is None: 1826 | columns = self.columns 1827 | else: 1828 | if isinstance(columns, str): 1829 | columns = [columns] 1830 | self._validate_columns(columns) 1831 | 1832 | if inner: 1833 | return [list(self._data[c]) for c in columns] 1834 | return [self._data[c] for c in columns] 1835 | 1836 | def to_tuple(self, columns: Optional[Iterable[str]] = None, name: Optional[str] = 'Tafra', 1837 | inner: bool = False) -> _Union[Tuple[np.ndarray], Tuple[Tuple[Any, ...]]]: 1838 | """ 1839 | Return a :class:`NamedTuple` or :class:`Tuple`. If a generator is needed, use 1840 | :meth:`to_records`. If ``inner == True`` each column will be cast from 1841 | :class:`np.ndarray` to a :class:`Tuple`. 
If `name` is `None`, returns a 1842 | :class:`Tuple` instead. 1843 | 1844 | Parameters 1845 | ---------- 1846 | columns: Optional[Iterable[str]] = None 1847 | The columns to extract. If ``None``, extract all columns. 1848 | 1849 | name: Optional[str] = 'Tafra' 1850 | The name for the :class:`NamedTuple`. If ``None``, construct a 1851 | :class:`Tuple` instead. 1852 | 1853 | inner: bool = False 1854 | Cast all :class:`np.ndarray` to :class`List`. 1855 | 1856 | Returns 1857 | ------- 1858 | list: Union[Tuple[np.ndarray], Tuple[Tuple[Any, ...]]] 1859 | """ 1860 | if columns is None: 1861 | columns = self.columns 1862 | else: 1863 | if isinstance(columns, str): 1864 | columns = [columns] 1865 | self._validate_columns(columns) 1866 | 1867 | if name is None: 1868 | if inner: 1869 | return tuple(tuple(self._data[c]) for c in columns) # type: ignore 1870 | return tuple(self._data[c] for c in columns) # type: ignore 1871 | 1872 | TafraNT = namedtuple(name, columns, rename=True) # type: ignore 1873 | 1874 | if inner: 1875 | return TafraNT._make((tuple(self._data[c]) for c in columns)) # type: ignore 1876 | return TafraNT._make((self._data[c] for c in columns)) # type: ignore 1877 | 1878 | def to_array(self, columns: Optional[Iterable[str]] = None) -> np.ndarray: 1879 | """ 1880 | Return an object array. 1881 | 1882 | Parameters 1883 | ---------- 1884 | columns: Optional[Iterable[str]] = None 1885 | The columns to extract. If ``None``, extract all columns. 1886 | 1887 | Returns 1888 | ------- 1889 | array: np.ndarray 1890 | """ 1891 | if columns is None: 1892 | columns = self.columns 1893 | else: 1894 | if isinstance(columns, str): 1895 | columns = [columns] 1896 | self._validate_columns(columns) 1897 | 1898 | return np.array([self._data[c] for c in columns], dtype=object).T 1899 | 1900 | def to_pandas(self, columns: Optional[Iterable[str]] = None) -> DataFrame: 1901 | """ 1902 | Construct a :class:`pandas.DataFrame`. 
1903 | 1904 | Parameters 1905 | ---------- 1906 | columns: Iterable[str] 1907 | The columns to write. IF ``None``, write all columns. 1908 | 1909 | Returns 1910 | ------- 1911 | dataframe: :class:`pandas.DataFrame` 1912 | """ 1913 | try: 1914 | import pandas as pd # type: ignore 1915 | except ImportError as e: # pragma: no cover 1916 | raise ImportError('`pandas` does not appear to be installed.') 1917 | 1918 | if columns is None: 1919 | columns = self.columns 1920 | else: 1921 | if isinstance(columns, str): 1922 | columns = [columns] 1923 | self._validate_columns(columns) 1924 | 1925 | return pd.DataFrame({ 1926 | column: pd.Series(value) for column, value in self._data.items() 1927 | if column in columns 1928 | }) 1929 | 1930 | def to_csv(self, filename: _Union[str, Path, TextIOWrapper, IO[str]], 1931 | columns: Optional[Iterable[str]] = None) -> None: 1932 | """ 1933 | Write the :class:`Tafra` to a CSV. 1934 | 1935 | Parameters 1936 | ---------- 1937 | filename: Union[str, Path] 1938 | The path of the filename to write. 1939 | 1940 | columns: Iterable[str] 1941 | The columns to write. IF ``None``, write all columns. 
1942 | """ 1943 | if columns is None: 1944 | columns = self.columns 1945 | else: 1946 | if isinstance(columns, str): 1947 | columns = [columns] 1948 | self._validate_columns(columns) 1949 | 1950 | if isinstance(filename, (str, Path)): 1951 | f = open(filename, 'w', newline='') 1952 | should_close = True 1953 | 1954 | elif isinstance(filename, TextIOWrapper): 1955 | if 'w' not in filename.mode: 1956 | raise ValueError(f'file must be opened in write mode, got {filename.mode}') 1957 | f = filename 1958 | should_close = False 1959 | 1960 | f.reconfigure(newline='') 1961 | 1962 | writer = csv.writer(f, delimiter=',', quotechar='"') 1963 | writer.writerow((column for column in self._data.keys() if column in columns)) 1964 | writer.writerows(self.to_records(columns)) 1965 | 1966 | if should_close: 1967 | f.close() 1968 | 1969 | def union(self, other: 'Tafra') -> 'Tafra': 1970 | """ 1971 | Helper function to implement :meth:`tafra.group.Union.apply`. 1972 | 1973 | Union two :class:`Tafra` together. Analogy to SQL UNION or `pandas.append`. All 1974 | column names and dtypes must match. 1975 | 1976 | Parameters 1977 | ---------- 1978 | other: Tafra 1979 | The other tafra to union. 1980 | 1981 | Returns 1982 | ------- 1983 | tafra: Tafra 1984 | A new tafra with the unioned data. 1985 | """ 1986 | return Union().apply(self, other) 1987 | 1988 | def union_inplace(self, other: 'Tafra') -> None: 1989 | """ 1990 | Inplace version. 1991 | 1992 | 1993 | Helper function to implement :meth:`tafra.group.Union.apply_inplace`. 1994 | 1995 | Union two :class:`Tafra` together. Analogy to SQL UNION or `pandas.append`. All 1996 | column names and dtypes must match. 1997 | 1998 | Parameters 1999 | ---------- 2000 | other: Tafra 2001 | The other tafra to union. 
2002 | 2003 | Returns 2004 | ------- 2005 | None: None 2006 | """ 2007 | Union().apply_inplace(self, other) 2008 | 2009 | def group_by(self, columns: Iterable[str], aggregation: 'InitAggregation' = {}, 2010 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] = dict()) -> 'Tafra': 2011 | """ 2012 | Helper function to implement :meth:`tafra.group.GroupBy.apply`. 2013 | 2014 | Aggregation by a set of unique values. 2015 | 2016 | Analogy to SQL ``GROUP BY``, not :meth:`pandas.DataFrame.groupby()`. 2017 | 2018 | Parameters 2019 | ---------- 2020 | columns: Iterable[str] 2021 | The column names to group by. 2022 | 2023 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 2024 | Tuple[Callable[[np.ndarray], Any], str]]] 2025 | Optional. A mapping for columns and aggregation functions. Should be 2026 | given as {'column': fn} or {'new_column': (fn, 'column')}. 2027 | 2028 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 2029 | Optional. A mapping for new columns names to the function to apply to 2030 | the enumeration. Should be given as {'new_column': fn}. 2031 | 2032 | Returns 2033 | ------- 2034 | tafra: Tafra 2035 | The aggregated :class:`Tafra`. 2036 | """ 2037 | return GroupBy(columns, aggregation, iter_fn).apply(self) 2038 | 2039 | def transform(self, columns: Iterable[str], aggregation: 'InitAggregation' = {}, 2040 | iter_fn: Dict[str, Callable[[np.ndarray], Any]] = dict()) -> 'Tafra': 2041 | """ 2042 | Helper function to implement :meth:`tafra.group.Transform.apply`. 2043 | 2044 | Apply a function to each unique set of values and join to the original table. 2045 | Analogy to :meth:`pandas.DataFrame.groupby().transform()`, 2046 | i.e. a SQL ``GROUP BY`` and ``LEFT JOIN`` back to the original table. 2047 | 2048 | Parameters 2049 | ---------- 2050 | group_by: Iterable[str] 2051 | The column names to group by. 2052 | 2053 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 2054 | Tuple[Callable[[np.ndarray], Any], str]]] 2055 | Optional. 
A mapping for columns and aggregation functions. Should be 2056 | given as {'column': fn} or {'new_column': (fn, 'column')}. 2057 | 2058 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 2059 | Optional. A mapping for new columns names to the function to apply to 2060 | the enumeration. Should be given as {'new_column': fn}. 2061 | 2062 | Returns 2063 | ------- 2064 | tafra: Tafra 2065 | The transformed :class:`Tafra`. 2066 | """ 2067 | return Transform(columns, aggregation, iter_fn).apply(self) 2068 | 2069 | def iterate_by(self, columns: Iterable[str]) -> Iterator['GroupDescription']: 2070 | """ 2071 | Helper function to implement :meth:`tafra.group.IterateBy.apply`. 2072 | 2073 | A generator that yields a :class:`Tafra` for each set of unique values. Analogy 2074 | to `pandas.DataFrame.groupby()`, i.e. an :class:`Iterator` of :class:`Tafra`. 2075 | 2076 | Yields tuples of ((unique grouping values, ...), row indices array, subset 2077 | tafra) 2078 | 2079 | Parameters 2080 | ---------- 2081 | group_by: Iterable[str] 2082 | The column names to group by. 2083 | 2084 | Returns 2085 | ------- 2086 | tafras: Iterator[GroupDescription] 2087 | An iterator over the grouped :class:`Tafra`. 2088 | """ 2089 | yield from IterateBy(columns).apply(self) 2090 | 2091 | def inner_join(self, right: 'Tafra', on: Iterable[Tuple[str, str, str]], 2092 | select: Iterable[str] = list()) -> 'Tafra': 2093 | """ 2094 | Helper function to implement :meth:`tafra.group.InnerJoin.apply`. 2095 | 2096 | An inner join. 2097 | 2098 | Analogy to SQL INNER JOIN, or `pandas.merge(..., how='inner')`, 2099 | 2100 | Parameters 2101 | ---------- 2102 | right: Tafra 2103 | The right-side :class:`Tafra` to join. 2104 | 2105 | on: Iterable[Tuple[str, str, str]] 2106 | The columns and operator to join on. 
Should be given as 2107 | ('left column', 'right column', 'op') Valid ops are: 2108 | 2109 | '==' : equal to 2110 | '!=' : not equal to 2111 | '<' : less than 2112 | '<=' : less than or equal to 2113 | '>' : greater than 2114 | '>=' : greater than or equal to 2115 | 2116 | select: Iterable[str] = [] 2117 | The columns to return. If not given, all unique columns names are 2118 | returned. If the column exists in both :class`Tafra`, prefers the left 2119 | over the right. 2120 | 2121 | Returns 2122 | ------- 2123 | tafra: Tafra 2124 | The joined :class:`Tafra`. 2125 | """ 2126 | return InnerJoin(on, select).apply(self, right) 2127 | 2128 | def left_join(self, right: 'Tafra', on: Iterable[Tuple[str, str, str]], 2129 | select: Iterable[str] = list()) -> 'Tafra': 2130 | """ 2131 | Helper function to implement :meth:`tafra.group.LeftJoin.apply`. 2132 | 2133 | A left join. 2134 | 2135 | Analogy to SQL LEFT JOIN, or `pandas.merge(..., how='left')`, 2136 | 2137 | Parameters 2138 | ---------- 2139 | right: Tafra 2140 | The right-side :class:`Tafra` to join. 2141 | 2142 | on: Iterable[Tuple[str, str, str]] 2143 | The columns and operator to join on. Should be given as 2144 | ('left column', 'right column', 'op') Valid ops are: 2145 | 2146 | '==' : equal to 2147 | '!=' : not equal to 2148 | '<' : less than 2149 | '<=' : less than or equal to 2150 | '>' : greater than 2151 | '>=' : greater than or equal to 2152 | 2153 | select: Iterable[str] = [] 2154 | The columns to return. If not given, all unique columns names are 2155 | returned. If the column exists in both :class`Tafra`, prefers the left 2156 | over the right. 2157 | 2158 | Returns 2159 | ------- 2160 | tafra: Tafra 2161 | The joined :class:`Tafra`. 2162 | """ 2163 | return LeftJoin(on, select).apply(self, right) 2164 | 2165 | def cross_join(self, right: 'Tafra', 2166 | select: Iterable[str] = list()) -> 'Tafra': 2167 | """ 2168 | Helper function to implement :meth:`tafra.group.CrossJoin.apply`. 
2169 | 2170 | A cross join. 2171 | 2172 | Analogy to SQL CROSS JOIN, or `pandas.merge(..., how='outer') using temporary 2173 | columns of static value to intersect all rows`. 2174 | 2175 | Parameters 2176 | ---------- 2177 | right: Tafra 2178 | The right-side :class:`Tafra` to join. 2179 | 2180 | select: Iterable[str] = [] 2181 | The columns to return. If not given, all unique columns names are 2182 | returned. If the column exists in both :class`Tafra`, prefers the left 2183 | over the right. 2184 | 2185 | Returns 2186 | ------- 2187 | tafra: Tafra 2188 | The joined :class:`Tafra`. 2189 | """ 2190 | return CrossJoin([], select).apply(self, right) 2191 | 2192 | def to_field_name(maybe_text: _Union[str, int, float]) -> str: # pragma: no cover 2193 | text = str(maybe_text) 2194 | 2195 | # Remove invalid characters 2196 | mid_text = re.sub('[^0-9a-zA-Z]', '', text) 2197 | 2198 | # Remove leading characters until we find a letter 2199 | final_text = re.sub('^[^a-zA-Z]+', '', mid_text) 2200 | 2201 | if final_text == '': 2202 | final_text = 'field_' + mid_text 2203 | 2204 | return final_text 2205 | 2206 | def _in_notebook() -> bool: # pragma: no cover 2207 | """ 2208 | Checks if running in a Jupyter Notebook. 2209 | 2210 | Returns 2211 | ------- 2212 | in_notebook: bool 2213 | """ 2214 | try: 2215 | from IPython import get_ipython # type: ignore 2216 | if 'IPKernelApp' in get_ipython().config: 2217 | return True 2218 | except Exception as e: 2219 | pass 2220 | return False 2221 | 2222 | # Import here to resolve circular dependency 2223 | from .group import (GroupBy, Transform, IterateBy, InnerJoin, LeftJoin, CrossJoin, Union, 2224 | InitAggregation, GroupDescription) 2225 | -------------------------------------------------------------------------------- /tafra/csvreader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. 
Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | from pathlib import Path 16 | import csv 17 | import dataclasses as dc 18 | 19 | import numpy as np 20 | 21 | from enum import Enum, auto 22 | from io import TextIOWrapper 23 | from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type 24 | from typing import Union, cast 25 | 26 | # this doesn't type well in Python 27 | @dc.dataclass(frozen=True) 28 | class ReadableType: 29 | dtype: Type[Any] 30 | parse: Callable[[str], Any] 31 | 32 | def _parse_bool(val: str) -> bool: 33 | folded = val.casefold() 34 | if folded in ('false', 'no', 'f'): 35 | return False 36 | if folded in ('true', 'yes', 't'): 37 | return True 38 | raise ValueError('not a boolean') 39 | 40 | # numpy-stubs is a lie about many of these, hence the type: ignore spam 41 | _TYPE_PRECEDENCE: List[ReadableType] = [ 42 | ReadableType(np.int32, cast(Callable[[str], Any], np.int32)), 43 | ReadableType(np.int64, cast(Callable[[str], Any], np.int64)), 44 | # np.float32, # nervous about ever inferring this 45 | ReadableType(np.float64, cast(Callable[[str], Any], np.float64)), 46 | ReadableType(bool, _parse_bool), 47 | # TODO: date, 48 | # TODO: datetime, 49 | ] 50 | 51 | _TYPE_OBJECT: ReadableType = ReadableType(object, lambda x: x) 52 | 53 | class ReaderState(Enum): 54 | AWAIT_GUESSABLE = auto() 55 | EARLY_EOF = auto() 56 | GUESS = auto() 57 | READ = auto() 58 | EOF = auto() 59 | DONE = auto() 60 | 61 | class CSVReader: 62 | def __init__(self, source: Union[str, Path, TextIOWrapper], 63 | guess_rows: int = 5, missing: Optional[str] = '', 64 | **csvkw: Dict[str, Any]): 65 | if isinstance(source, (str, Path)): 66 | self._stream = open(source, newline='') 67 | self._should_close = True 68 | elif isinstance(source, TextIOWrapper): 69 | source.reconfigure(newline='') 70 | self._stream = source 71 | self._should_close = False 72 | reader = 
csv.reader(self._stream, dialect='excel', **csvkw) 73 | self._header = _unique_header(next(reader)) 74 | self._reader = (self._decode_missing(t) for t in reader) 75 | self._guess_types = { 76 | col: _TYPE_PRECEDENCE[0] for col in self._header 77 | } 78 | self._guess_data: Dict[str, List[Any]] = { 79 | col: list() for col in self._header 80 | } 81 | self._data: Dict[str, List[Any]] = dict() 82 | self._guess_rows = guess_rows 83 | self._missing = missing 84 | self._rows = 0 85 | self._state = ReaderState.AWAIT_GUESSABLE 86 | 87 | def read(self) -> Dict[str, np.ndarray]: 88 | while self._state != ReaderState.DONE: 89 | self._step() 90 | return self._finalize() 91 | 92 | def _step(self) -> None: 93 | if self._state == ReaderState.AWAIT_GUESSABLE: 94 | self.state_await_guessable() 95 | return 96 | 97 | if self._state == ReaderState.GUESS: 98 | self.state_guess() 99 | return 100 | 101 | if self._state == ReaderState.READ: 102 | self.state_read() 103 | return 104 | 105 | if self._state == ReaderState.EARLY_EOF: 106 | self.state_early_eof() 107 | return 108 | 109 | if self._state == ReaderState.EOF: 110 | self.state_eof() 111 | return 112 | 113 | if self._state == ReaderState.DONE: # pragma: no cover 114 | return 115 | 116 | def state_await_guessable(self) -> None: 117 | try: 118 | row = next(self._reader) 119 | except StopIteration: 120 | self._state = ReaderState.EARLY_EOF 121 | return 122 | 123 | self._rows += 1 124 | if len(row) != len(self._header): 125 | raise ValueError(f'length of row #{self._rows}' 126 | ' does not match header length') 127 | 128 | for col, val in zip(self._header, row): 129 | self._guess_data[col].append(val) 130 | 131 | if self._rows == self._guess_rows: 132 | self._state = ReaderState.GUESS 133 | 134 | def state_guess(self) -> None: 135 | for col in self._header: 136 | ty, parsed = _guess_column(_TYPE_PRECEDENCE, 137 | self._guess_data[col]) 138 | self._guess_types[col] = ty 139 | self._data[col] = parsed 140 | self._state = ReaderState.READ 
141 | 142 | def state_read(self) -> None: 143 | try: 144 | row = next(self._reader) 145 | except StopIteration: 146 | self._state = ReaderState.EOF 147 | return 148 | 149 | self._rows += 1 150 | if len(row) != len(self._header): 151 | raise ValueError(f'length of row #{self._rows}' 152 | ' does not match header length') 153 | 154 | for col, val in zip(self._header, row): 155 | try: 156 | self._data[col].append(self._guess_types[col].parse(val)) # type: ignore 157 | except: 158 | self._promote(col, val) 159 | 160 | def state_early_eof(self) -> None: 161 | if self._should_close: 162 | self._stream.close() 163 | 164 | for col in self._header: 165 | ty, parsed = _guess_column(_TYPE_PRECEDENCE, 166 | self._guess_data[col]) 167 | self._guess_types[col] = ty 168 | self._data[col] = parsed 169 | 170 | self._state = ReaderState.DONE 171 | 172 | def state_eof(self) -> None: 173 | if self._should_close: 174 | self._stream.close() 175 | self._state = ReaderState.DONE 176 | 177 | def _promote(self, col: str, val: Optional[str]) -> None: 178 | ty_ix = _TYPE_PRECEDENCE.index(self._guess_types[col]) 179 | try_next = _TYPE_PRECEDENCE[ty_ix + 1:] 180 | stringized = self._encode_missing(self._data[col]) 181 | stringized.append(val) 182 | ty, parsed = _guess_column(try_next, stringized) 183 | self._guess_types[col] = ty 184 | self._data[col] = parsed 185 | 186 | def _finalize(self) -> Dict[str, np.ndarray]: 187 | assert self._state == ReaderState.DONE, 'CSVReader is not in DONE state.' 
188 | return { 189 | col: np.array(self._data[col], dtype=self._guess_types[col].dtype) 190 | for col in self._header 191 | } 192 | 193 | def _decode_missing(self, row: List[str]) -> Sequence[Optional[str]]: 194 | if self._missing is None: 195 | return row 196 | return [v if v != self._missing else None for v in row] 197 | 198 | def _encode_missing(self, row: Sequence[Optional[Any]]) -> List[Optional[str]]: 199 | return [str(v) if v is not None else self._missing for v in row] 200 | 201 | def _unique_header(header: List[str]) -> List[str]: 202 | uniq: List[str] = list() 203 | for col in header: 204 | col_unique = col 205 | i = 2 206 | while col_unique in uniq: 207 | col_unique = f'{col} ({i})' 208 | i += 1 209 | uniq.append(col_unique) 210 | return uniq 211 | 212 | # the "real" return type is a dependent pair (t: ReadableType ** List[t.dtype]) 213 | def _guess_column(precedence: List[ReadableType], vals: List[Optional[str]] 214 | ) -> Tuple[ReadableType, List[Any]]: 215 | for ty in precedence: 216 | try: 217 | # mypy doesn't really get that the thing we're mapping is not a method 218 | # on `ty` but a data member 219 | typed = list(map(ty.parse, vals)) # type: ignore 220 | return ty, typed 221 | except: 222 | next 223 | return _TYPE_OBJECT, vals 224 | -------------------------------------------------------------------------------- /tafra/formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | from typing import Callable, Dict, Tuple, Any, Iterator, MutableMapping, Type, Optional 16 | 17 | import numpy as np 18 | 19 | 20 | class ObjectFormatter(Dict[str, Callable[[np.ndarray], np.ndarray]], 21 | MutableMapping[str, Callable[[np.ndarray], np.ndarray]]): 22 | """ 23 | A dictionary that contains mappings for formatting objects. Some numpy objects 24 | should be cast to other types, e.g. the :class:`decimal.Decimal` type cannot 25 | operate with :class:`np.float`. These mappings are defined in this class. 26 | 27 | Each mapping must define a function that takes a :class:`np.ndarray` and 28 | returns a :class:`np.ndarray`. 29 | 30 | The key for each mapping is the name of the type of the actual value, 31 | looked up from the first element of the :class:`np.ndarray`, i.e. 32 | ``type(array[0]).__name__``. 33 | """ 34 | test_array = np.arange(4) 35 | 36 | def __setitem__(self, dtype: str, value: Callable[[np.ndarray], np.ndarray]) -> None: 37 | """ 38 | Set the dtype formatter. 39 | """ 40 | try: 41 | if not isinstance(value(self.test_array), np.ndarray): 42 | raise ValueError( 43 | 'Must provide a function that takes an ``np.ndarray`` and returns ' 44 | 'an np.ndarray.') 45 | except Exception as e: 46 | raise ValueError( 47 | 'Must provide a function that takes an ``np.ndarray`` and returns ' 48 | 'an np.ndarray.') 49 | 50 | dict.__setitem__(self, dtype, value) 51 | 52 | def __getitem__(self, dtype: str) -> Callable[[np.ndarray], np.ndarray]: 53 | """ 54 | Get the dtype formatter. 55 | """ 56 | return dict.__getitem__(self, dtype) 57 | 58 | def __delitem__(self, dtype: str) -> None: 59 | """ 60 | Delete the dtype formatter. 
61 | """ 62 | dict.__delitem__(self, dtype) 63 | 64 | def __repr__(self) -> str: 65 | return self.__str__() 66 | 67 | def __str__(self) -> str: 68 | if self.__len__() < 1: 69 | return r'{}' 70 | return '{' + '\n'.join(f'{c}: {v}' for c, v in self.items()) + '}' 71 | 72 | def __iter__(self) -> Iterator[Any]: 73 | yield from dict.__iter__(self) 74 | 75 | def __len__(self) -> int: 76 | return dict.__len__(self) 77 | 78 | def copy(self) -> Dict[str, Any]: 79 | return {k: dict.__getitem__(self, k) for k in self} 80 | 81 | def parse_dtype(self, value: np.ndarray) -> Optional[np.ndarray]: 82 | """ 83 | Parse an object dtype. 84 | 85 | Parameters 86 | ---------- 87 | value: np.ndarray 88 | The :class:`np.ndarray` to be parsed. 89 | 90 | Returns 91 | ------- 92 | value, modified: Tuple(np.ndarray, bool) 93 | The :class:`np.ndarray` and whether it was modified or not. 94 | """ 95 | if value.dtype != np.dtype(object): 96 | return None 97 | 98 | type_name = type(value[0]).__name__ 99 | if type_name in self.keys(): 100 | value = self[type_name](value) 101 | return value 102 | 103 | return None 104 | -------------------------------------------------------------------------------- /tafra/group.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | __all__ = ['GroupBy', 'Transform', 'IterateBy', 'InnerJoin', 'LeftJoin'] 16 | 17 | import operator 18 | from collections import OrderedDict 19 | from itertools import chain 20 | import dataclasses as dc 21 | 22 | import numpy as np 23 | 24 | from typing import (Any, Callable, Dict, Mapping, List, Tuple, Optional, Union as _Union, Sequence, 25 | Iterable, Iterator) 26 | from typing import cast 27 | 28 | 29 | JOIN_OPS: Dict[str, Callable[[Any, Any], Any]] = { 30 | '==': operator.eq, 31 | '!=': operator.ne, 32 | '<': operator.lt, 33 | '<=': operator.le, 34 | '>': operator.gt, 35 | '>=': operator.ge 36 | } 37 | 38 | # for the passed argument to an aggregation 39 | InitAggregation = Mapping[ 40 | str, 41 | _Union[ 42 | Callable[[np.ndarray], Any], 43 | Tuple[Callable[[np.ndarray], Any], str] 44 | ] 45 | ] 46 | 47 | 48 | # for the result type of IterateBy 49 | GroupDescription = Tuple[ 50 | Tuple[Any, ...], # tuple of unique values from group-by columns 51 | np.ndarray, # int array of row indices into original tafra for this group 52 | 'Tafra' # sub-tafra for the group 53 | ] 54 | 55 | 56 | class Union: 57 | """ 58 | Union two :class:`Tafra` together. Analogy to SQL UNION or 59 | `pandas.append`. All column names and dtypes must match. 60 | """ 61 | @staticmethod 62 | def _validate(left: 'Tafra', right: 'Tafra') -> None: 63 | """ 64 | Validate the :class:`Tafra` before applying. 
65 | """ 66 | # These should be unreachable unless attributes were directly modified 67 | if len(left._data) != len(left._dtypes): 68 | assert 0, 'This `Tafra` length of data and dtypes do not match' 69 | if len(right._data) != len(right._dtypes): 70 | assert 0, 'right `Tafra` length of data and dtypes do not match' 71 | 72 | # ensure same number of columns 73 | if len(left._data) != len(right._data) or len(left._dtypes) != len(right._dtypes): 74 | raise ValueError( 75 | 'This `Tafra` column count does not match right `Tafra` column count.') 76 | 77 | # ensure all columns in this `Tafra` exist in right `Tafra` 78 | # if len() is same AND all columns in this exist in right, 79 | # do not need to check right `Tafra` columns in this `Tafra`. 80 | for (data_column, value), (dtype_column, dtype) \ 81 | in zip(left._data.items(), left._dtypes.items()): 82 | 83 | if data_column not in right._data or dtype_column not in right._dtypes: 84 | raise TypeError( 85 | f'This `Tafra` column `{data_column}` does not exist in right `Tafra`.') 86 | 87 | elif value.dtype != right._data[data_column].dtype: 88 | raise TypeError( 89 | f'This `Tafra` column `{data_column}` dtype `{value.dtype}` ' 90 | f'does not match right `Tafra` dtype `{right._data[data_column].dtype}`.') 91 | 92 | # should not happen unless dtypes manually changed, but let's check it 93 | elif dtype != right._dtypes[dtype_column]: 94 | raise TypeError( 95 | f'This `Tafra` column `{data_column}` dtype `{dtype}` ' 96 | f'does not match right `Tafra` dtype `{right._dtypes[dtype_column]}`.') 97 | 98 | def apply(self, left: 'Tafra', right: 'Tafra') -> 'Tafra': 99 | """ 100 | Apply the :class:`Union_` to the :class:`Tafra`. 101 | 102 | Parameters 103 | ---------- 104 | left: Tafra 105 | The left :class:`Tafra` to union. 106 | 107 | right: Tafra 108 | The right :class:`Tafra` to union. 109 | 110 | Returns 111 | ------- 112 | tafra: Tafra 113 | The unioned :class`Tafra`. 
114 | """ 115 | self._validate(left, right) 116 | 117 | return Tafra( 118 | {column: np.append(value, right._data[column]) for column, value in left._data.items()}, 119 | left._dtypes.copy() 120 | ) 121 | 122 | def apply_inplace(self, left: 'Tafra', right: 'Tafra') -> None: 123 | """ 124 | In-place version. 125 | 126 | Apply the :class:`Union_` to the :class:`Tafra`. 127 | 128 | Parameters 129 | ---------- 130 | left: Tafra 131 | The left :class:`Tafra` to union. 132 | 133 | right: Tafra 134 | The right :class:`Tafra` to union. 135 | 136 | Returns 137 | ------- 138 | tafra: Tafra 139 | The unioned :class`Tafra`. 140 | """ 141 | self._validate(left, right) 142 | 143 | for column, value in left._data.items(): 144 | left._data[column] = np.append(value, right._data[column]) 145 | left._update_rows() 146 | 147 | @dc.dataclass 148 | class GroupSet: 149 | """ 150 | A `GroupSet` is the set of columns by which we construct our groups. 151 | """ 152 | 153 | @staticmethod 154 | def _unique_groups(tafra: 'Tafra', columns: Iterable[str]) -> List[Any]: 155 | """ 156 | Construct a unique set of grouped values. 157 | Uses :class:``OrderedDict`` rather than :class:``set`` to maintain order. 158 | """ 159 | return list(OrderedDict.fromkeys(zip(*(tafra._data[col] for col in columns)))) 160 | 161 | @staticmethod 162 | def _validate(tafra: 'Tafra', columns: Iterable[str]) -> None: # pragma: no cover 163 | """ 164 | Validate the :class:`Tafra` before applying. 165 | """ 166 | assert tafra._rows >= 1, 'No rows exist in `tafra`.' 167 | tafra._validate_columns(columns) 168 | 169 | 170 | @dc.dataclass 171 | class AggMethod(GroupSet): 172 | """ 173 | Basic methods for aggregations over a data table. 
174 | """ 175 | group_by_cols: Iterable[str] 176 | aggregation: dc.InitVar[InitAggregation] 177 | _aggregation: Mapping[str, Tuple[Callable[[np.ndarray], Any], str]] = dc.field(init=False) 178 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 179 | 180 | def __post_init__(self, aggregation: InitAggregation) -> None: 181 | self._aggregation = dict() 182 | for rename, agg in aggregation.items(): 183 | if callable(agg): 184 | self._aggregation[rename] = (agg, rename) 185 | elif (isinstance(agg, Sequence) and len(agg) == 2 186 | and callable(cast(Tuple[Callable[[np.ndarray], Any], str], agg)[0])): 187 | self._aggregation[rename] = agg 188 | else: 189 | raise ValueError(f'{rename}: {agg} is not a valid aggregation argument') 190 | 191 | for rename, agg in self.iter_fn.items(): 192 | if not callable(agg): 193 | raise ValueError(f'{rename}: {agg} is not a valid aggregation argument') 194 | 195 | def result_factory(self, fn: Callable[[str, str], np.ndarray]) -> Dict[str, np.ndarray]: 196 | """ 197 | Factory function to generate the dict for the results set. 198 | A function to take the new column name and source column name 199 | and return an empty `np.ndarray` should be given. 200 | """ 201 | return { 202 | rename: fn(rename, col) for rename, col in ( 203 | *((col, col) for col in self.group_by_cols), 204 | *((rename, agg[1]) for rename, agg in self._aggregation.items()) 205 | ) 206 | } 207 | 208 | def iter_fn_factory(self, fn: Callable[[], np.ndarray]) -> Dict[str, np.ndarray]: 209 | return {rename: fn() for rename in self.iter_fn.keys()} 210 | 211 | def apply(self, tafra: 'Tafra') -> 'Tafra': 212 | raise NotImplementedError 213 | 214 | 215 | class GroupBy(AggMethod): 216 | """ 217 | Aggregation by a set of unique values. 218 | 219 | Analogy to SQL ``GROUP BY``, not :meth:`pandas.DataFrame.groupby()`. 220 | 221 | Parameters 222 | ---------- 223 | columns: Iterable[str] 224 | The column names to group by. 
225 | 226 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 227 | Optional. Tuple[Callable[[np.ndarray], Any], str]]] 228 | A mapping for columns and aggregation functions. Should be 229 | given as {'column': fn} or {'new_column': (fn, 'column')}. 230 | 231 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 232 | Optional. A mapping for new columns names to the function to apply to 233 | the enumeration. Should be given as {'new_column': fn}. 234 | """ 235 | 236 | def apply(self, tafra: 'Tafra') -> 'Tafra': 237 | """ 238 | Apply the :class:`GroupBy` to the :class:`Tafra`. 239 | 240 | Parameters 241 | ---------- 242 | tafra: Tafra 243 | The tafra to apply the operation to. 244 | 245 | Returns 246 | ------- 247 | tafra: Tafra 248 | The aggregated :class:`Tafra`. 249 | """ 250 | self._validate(tafra, ( 251 | *self.group_by_cols, 252 | *(col for (_, col) in self._aggregation.values()) 253 | )) 254 | unique = self._unique_groups(tafra, self.group_by_cols) 255 | result = self.result_factory( 256 | lambda rename, col: np.empty(len(unique), dtype=tafra._data[col].dtype)) 257 | iter_fn = self.iter_fn_factory(lambda: np.ones(len(unique), dtype=int)) 258 | ones = np.ones(tafra._rows, dtype=int) 259 | 260 | for i, u in enumerate(unique): 261 | which_rows = np.full(tafra._rows, True) 262 | 263 | for val, col in zip(u, self.group_by_cols): 264 | which_rows &= tafra._data[col] == val 265 | result[col][i] = val 266 | 267 | for rename, (fn, col) in self._aggregation.items(): 268 | result[rename][i] = fn(tafra._data[col][which_rows]) 269 | 270 | for rename, fn in self.iter_fn.items(): 271 | iter_fn[rename][i] = fn(i * ones[which_rows]) 272 | 273 | result.update(iter_fn) 274 | return Tafra(result) 275 | 276 | 277 | class Transform(AggMethod): 278 | """ 279 | Apply a function to each unique set of values and join to the original table. 280 | 281 | Analogy to :meth:`pandas.DataFrame.groupby().transform()`, 282 | i.e. 
a SQL ``GROUP BY`` and ``LEFT JOIN`` back to the original table. 283 | 284 | Parameters 285 | ---------- 286 | group_by: Iterable[str] 287 | The column names to group by. 288 | 289 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 290 | Tuple[Callable[[np.ndarray], Any], str]]] 291 | Optional. A mapping for columns and aggregation functions. Should be 292 | given as {'column': fn} or {'new_column': (fn, 'column')}. 293 | 294 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 295 | Optional. A mapping for new columns names to the function to apply to 296 | the enumeration. Should be given as {'new_column': fn}. 297 | """ 298 | 299 | def apply(self, tafra: 'Tafra') -> 'Tafra': 300 | """ 301 | Apply the :class:`Transform` to the :class:`Tafra`. 302 | 303 | Parameters 304 | ---------- 305 | tafra: Tafra 306 | The tafra to apply the operation to. 307 | 308 | Returns 309 | ------- 310 | tafra: Tafra 311 | The transformed :class:`Tafra`. 312 | """ 313 | self._validate(tafra, ( 314 | *self.group_by_cols, 315 | *(col for (_, col) in self._aggregation.values()) 316 | )) 317 | unique = self._unique_groups(tafra, self.group_by_cols) 318 | result = self.result_factory( 319 | lambda rename, col: np.empty_like(tafra._data[col])) 320 | iter_fn = self.iter_fn_factory(lambda: np.ones(tafra._rows, dtype=int)) 321 | ones = np.ones(tafra._rows, dtype=int) 322 | 323 | for i, u in enumerate(unique): 324 | which_rows = np.full(tafra._rows, True) 325 | 326 | for val, col in zip(u, self.group_by_cols): 327 | which_rows &= tafra._data[col] == val 328 | result[col][which_rows] = tafra._data[col][which_rows] 329 | 330 | for rename, agg in self._aggregation.items(): 331 | fn, col = agg 332 | result[rename][which_rows] = fn(tafra._data[col][which_rows]) 333 | 334 | for rename, fn in self.iter_fn.items(): 335 | iter_fn[rename][which_rows] = fn(i * ones[which_rows]) 336 | 337 | result.update(iter_fn) 338 | return Tafra(result) 339 | 340 | 341 | @dc.dataclass 342 | class 
IterateBy(GroupSet): 343 | """ 344 | A generator that yields a :class:`Tafra` for each set of unique values. 345 | 346 | Analogy to `pandas.DataFrame.groupby()`, i.e. an Sequence of `Tafra` objects. 347 | Yields tuples of ((unique grouping values, ...), row indices array, subset tafra) 348 | 349 | Parameters 350 | ---------- 351 | group_by: Iterable[str] 352 | The column names to group by. 353 | """ 354 | group_by_cols: Iterable[str] 355 | 356 | def apply(self, tafra: 'Tafra') -> Iterator[GroupDescription]: 357 | """ 358 | Apply the :class:`IterateBy` to the :class:`Tafra`. 359 | 360 | Parameters 361 | ---------- 362 | tafra: Tafra 363 | The tafra to apply the operation to. 364 | 365 | Returns 366 | ------- 367 | tafras: Iterator[GroupDescription] 368 | An iterator over the grouped :class:`Tafra`. 369 | """ 370 | self._validate(tafra, self.group_by_cols) 371 | unique = self._unique_groups(tafra, self.group_by_cols) 372 | 373 | for u in unique: 374 | which_rows = np.full(tafra._rows, True) 375 | for val, col in zip(u, self.group_by_cols): 376 | which_rows &= tafra._data[col] == val 377 | 378 | if len(u) == 1: 379 | u = u[0] 380 | 381 | yield (u, which_rows, tafra._ndindex(which_rows)) 382 | 383 | 384 | @dc.dataclass 385 | class Join(GroupSet): 386 | """ 387 | Base class for SQL-like JOINs. 
388 | """ 389 | on: Iterable[Tuple[str, str, str]] 390 | select: Iterable[str] 391 | 392 | def _validate_dtypes(self, l_table: 'Tafra', r_table: 'Tafra') -> None: 393 | for l_column, r_column, _ in self.on: 394 | l_value = l_table._data[l_column] 395 | r_value = r_table._data[r_column] 396 | l_dtype = l_table._dtypes[l_column] 397 | r_dtype = r_table._dtypes[r_column] 398 | 399 | if l_value.dtype != r_value.dtype: 400 | raise TypeError( 401 | f'This `Tafra` column `{l_column}` dtype `{l_value.dtype}` ' 402 | f'does not match other `Tafra` dtype `{r_value.dtype}`.') 403 | 404 | # should not happen unless dtypes manually changed, but let's check it 405 | elif l_dtype != r_dtype: 406 | raise TypeError( 407 | f'This `Tafra` column `{l_column}` dtype `{l_dtype}` ' 408 | f'does not match other `Tafra` dtype `{r_dtype}`.') 409 | 410 | @staticmethod 411 | def _validate_ops(ops: Iterable[str]) -> None: 412 | for op in ops: 413 | _op = JOIN_OPS.get(op, None) 414 | if _op is None: 415 | raise TypeError(f'The operator {op} is not valid.') 416 | 417 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 418 | raise NotImplementedError 419 | 420 | 421 | class InnerJoin(Join): 422 | """ 423 | An inner join. 424 | 425 | Analogy to SQL INNER JOIN, or `pandas.merge(..., how='inner')`, 426 | 427 | Parameters 428 | ---------- 429 | right: Tafra 430 | The right-side :class:`Tafra` to join. 431 | 432 | on: Iterable[Tuple[str, str, str]] 433 | The columns and operator to join on. Should be given as 434 | ('left column', 'right column', 'op') Valid ops are: 435 | 436 | '==' : equal to 437 | '!=' : not equal to 438 | '<' : less than 439 | '<=' : less than or equal to 440 | '>' : greater than 441 | '>=' : greater than or equal to 442 | 443 | select: Iterable[str] = [] 444 | The columns to return. If not given, all unique columns names 445 | are returned. If the column exists in both :class`Tafra`, 446 | prefers the left over the right. 
447 | """ 448 | 449 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 450 | """ 451 | Apply the :class:`InnerJoin` to the :class:`Tafra`. 452 | 453 | Parameters 454 | ---------- 455 | left_t: Tafra 456 | The left tafra to join. 457 | 458 | right_t: Tafra 459 | The right tafra to join. 460 | 461 | Returns 462 | ------- 463 | tafra: Tafra 464 | The joined :class:`Tafra`. 465 | """ 466 | left_cols, right_cols, ops = list(zip(*self.on)) 467 | self._validate(left_t, left_cols) 468 | self._validate(right_t, right_cols) 469 | self._validate_dtypes(left_t, right_t) 470 | self._validate_ops(ops) 471 | 472 | _on = tuple((left_col, right_col, JOIN_OPS[op]) for left_col, right_col, op in self.on) 473 | 474 | join: Dict[str, List[Any]] = {column: list() for column in chain( 475 | left_t._data.keys(), 476 | right_t._data.keys() 477 | ) if not self.select 478 | or (self.select and column in self.select)} 479 | 480 | # right-to-left so left dtypes overwrite 481 | dtypes: Dict[str, str] = {column: dtype for column, dtype in chain( 482 | right_t._dtypes.items(), 483 | left_t._dtypes.items() 484 | ) if column in join.keys()} 485 | 486 | for i in range(left_t._rows): 487 | right_rows = np.full(right_t._rows, True) 488 | 489 | for left_col, right_col, op in _on: 490 | right_rows &= op(left_t[left_col][i], right_t[right_col]) 491 | 492 | right_count = np.sum(right_rows) 493 | 494 | # this is the only difference from the LeftJoin 495 | if right_count <= 0: 496 | continue 497 | 498 | for column in join.keys(): 499 | if column in left_t._data: 500 | join[column].extend(max(1, right_count) * [left_t[column][i]]) 501 | 502 | elif column in right_t._data: 503 | join[column].extend(right_t[column][right_rows]) 504 | 505 | return Tafra( 506 | {column: np.array(value) 507 | for column, value in join.items()}, 508 | dtypes 509 | ) 510 | 511 | 512 | class LeftJoin(Join): 513 | """ 514 | A left join. 
515 | 516 | Analogy to SQL LEFT JOIN, or `pandas.merge(..., how='left')`, 517 | 518 | Parameters 519 | ---------- 520 | right: Tafra 521 | The right-side :class:`Tafra` to join. 522 | 523 | on: Iterable[Tuple[str, str, str]] 524 | The columns and operator to join on. Should be given as 525 | ('left column', 'right column', 'op') Valid ops are: 526 | 527 | '==' : equal to 528 | '!=' : not equal to 529 | '<' : less than 530 | '<=' : less than or equal to 531 | '>' : greater than 532 | '>=' : greater than or equal to 533 | 534 | select: Iterable[str] = [] 535 | The columns to return. If not given, all unique columns names 536 | are returned. If the column exists in both :class`Tafra`, 537 | prefers the left over the right. 538 | """ 539 | 540 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 541 | """ 542 | Apply the :class:`LeftJoin` to the :class:`Tafra`. 543 | 544 | Parameters 545 | ---------- 546 | left_t: Tafra 547 | The left tafra to join. 548 | 549 | right_t: Tafra 550 | The right tafra to join. 551 | 552 | Returns 553 | ------- 554 | tafra: Tafra 555 | The joined :class:`Tafra`. 
556 | """ 557 | left_cols, right_cols, ops = list(zip(*self.on)) 558 | self._validate(left_t, left_cols) 559 | self._validate(right_t, right_cols) 560 | self._validate_dtypes(left_t, right_t) 561 | self._validate_ops(ops) 562 | 563 | _on = tuple((left_col, right_col, JOIN_OPS[op]) for left_col, right_col, op in self.on) 564 | 565 | join: Dict[str, List[Any]] = {column: list() for column in chain( 566 | left_t._data.keys(), 567 | right_t._data.keys() 568 | ) if not self.select 569 | or (self.select and column in self.select)} 570 | 571 | dtypes: Dict[str, str] = {column: dtype for column, dtype in chain( 572 | left_t._dtypes.items(), 573 | right_t._dtypes.items() 574 | ) if column in join.keys()} 575 | 576 | for i in range(left_t._rows): 577 | right_rows = np.full(right_t._rows, True) 578 | 579 | for left_col, right_col, op in _on: 580 | right_rows &= op(left_t[left_col][i], right_t[right_col]) 581 | 582 | right_count = np.sum(right_rows) 583 | 584 | for column in join.keys(): 585 | if column in left_t._data: 586 | join[column].extend(max(1, right_count) * [left_t[column][i]]) 587 | 588 | elif column in right_t._data: 589 | if right_count <= 0: 590 | join[column].append(None) 591 | if dtypes[column] != 'object': 592 | dtypes[column] = 'object' 593 | else: 594 | join[column].extend(right_t[column][right_rows]) 595 | 596 | return Tafra( 597 | {column: np.array(value) 598 | for column, value in join.items()}, 599 | dtypes 600 | ) 601 | 602 | 603 | @dc.dataclass 604 | class CrossJoin(Join): 605 | """ 606 | A cross join. 607 | 608 | Analogy to SQL CROSS JOIN, or `pandas.merge(..., how='outer') 609 | using temporary columns of static value to intersect all rows`. 610 | 611 | Parameters 612 | ---------- 613 | right: Tafra 614 | The right-side :class:`Tafra` to join. 615 | 616 | select: Iterable[str] = [] 617 | The columns to return. If not given, all unique columns names 618 | are returned. If the column exists in both :class`Tafra`, 619 | prefers the left over the right. 
620 | """ 621 | 622 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 623 | """ 624 | Apply the :class:`CrossJoin` to the :class:`Tafra`. 625 | 626 | Parameters 627 | ---------- 628 | left_t: Tafra 629 | The left tafra to join. 630 | 631 | right_t: Tafra 632 | The right tafra to join. 633 | 634 | Returns 635 | ------- 636 | tafra: Tafra 637 | The joined :class:`Tafra`. 638 | """ 639 | self._validate_dtypes(left_t, right_t) 640 | 641 | left_rows = left_t._rows 642 | right_rows = right_t._rows 643 | 644 | select = set(self.select) 645 | if len(select) > 0: 646 | left_cols = list(select.intersection(list(left_t._data.keys()))) 647 | right_cols = list(select.intersection(list(right_t._data.keys()))) 648 | 649 | if len(left_cols) == 0: 650 | raise IndexError('No columns given to select from left `Tafra`.') 651 | if len(right_cols) == 0: 652 | raise IndexError('No columns given to select from right `Tafra`.') 653 | 654 | else: 655 | left_cols = list(left_t._data.keys()) 656 | right_cols = list(right_t._data.keys()) 657 | 658 | left_new = Tafra(left_t[left_cols].key_map(np.tile, reps=right_rows)) 659 | right_new = Tafra(right_t[right_cols].key_map(np.tile, reps=left_rows)) 660 | 661 | left_new.update_inplace(right_new) 662 | 663 | return left_new 664 | 665 | 666 | # Import here to resolve circular dependency 667 | from .base import Tafra 668 | -------------------------------------------------------------------------------- /tafra/protocol.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | import numpy as np 16 | from typing import Dict, List, Tuple, Any, Optional, Type, Iterable, Iterator 17 | from typing_extensions import Protocol, runtime_checkable 18 | 19 | 20 | @runtime_checkable 21 | class Series(Protocol): 22 | name: str 23 | values: np.ndarray 24 | dtype: str 25 | 26 | 27 | @runtime_checkable 28 | class DataFrame(Protocol): 29 | """ 30 | A fake class to satisfy typing of a ``pandas.DataFrame`` without a dependency. 31 | """ 32 | _data: Dict[str, Series] 33 | columns: List[str] 34 | dtypes: List[str] 35 | 36 | def __getitem__(self, column: str) -> Series: 37 | raise NotImplementedError 38 | 39 | def __setitem__(self, column: str, value: np.ndarray) -> None: 40 | raise NotImplementedError 41 | 42 | @runtime_checkable 43 | class Cursor(Protocol): 44 | """ 45 | A fake class to satisfy typing of a ``pyodbc.Cursor`` without a dependency. 46 | """ 47 | description: Tuple[Tuple[str, Type[Any], Optional[int], int, int, int, bool]] 48 | 49 | def __iter__(self) -> Iterator[Tuple[Any, ...]]: 50 | raise NotImplementedError 51 | 52 | def __next__(self) -> Tuple[Any, ...]: 53 | raise NotImplementedError 54 | 55 | def execute(self, sql: str) -> None: 56 | raise NotImplementedError 57 | 58 | def fetchone(self) -> Optional[Tuple[Any, ...]]: 59 | raise NotImplementedError 60 | 61 | def fetchmany(self, size: int) -> List[Tuple[Any, ...]]: 62 | raise NotImplementedError 63 | 64 | def fetchall(self) -> List[Tuple[Any, ...]]: 65 | raise NotImplementedError 66 | -------------------------------------------------------------------------------- /tafra/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petbox-dev/tafra/c8bd5452f314e498fc7a7dbc9a30d1f6efde4174/tafra/py.typed -------------------------------------------------------------------------------- /tafra/version.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '1.0.10' 2 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petbox-dev/tafra/c8bd5452f314e498fc7a7dbc9a30d1f6efde4174/test/__init__.py -------------------------------------------------------------------------------- /test/ex1.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,true,9.6 7 | 6,false,-10.1 8 | -------------------------------------------------------------------------------- /test/ex2.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | -------------------------------------------------------------------------------- /test/ex3.csv: -------------------------------------------------------------------------------- 1 | a,b,b 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,jimmy,9.6 7 | 6,false,-10.1 8 | -------------------------------------------------------------------------------- /test/ex4.csv: -------------------------------------------------------------------------------- 1 | a,b,b 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,false,9.6 7 | 6,jimmy,-10.1 8 | -------------------------------------------------------------------------------- /test/ex5.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,2.1 6 | 5,false,9.6 7 | 6,true,-10.1 8 | 7.3,false,2.3 9 | -------------------------------------------------------------------------------- /test/ex6.csv: -------------------------------------------------------------------------------- 1 | t,te,dp,dp_prime,dp_prime_te 2 | 
0.00417,0.00417,0.57,, 3 | 8.33E-03,8.33E-03,3.81,6.75,6.76 4 | 1.25E-02,1.25E-02,6.55,9.87,9.88 5 | 1.67E-02,1.67E-02,10.03,13.98,13.99 6 | 2.08E-02,2.08E-02,13.27,17.29,17.32 7 | 2.50E-02,2.50E-02,16.77,20.08,20.12 8 | -------------------------------------------------------------------------------- /test/test.bat: -------------------------------------------------------------------------------- 1 | :: Run tests and generate report 2 | 3 | flake8 %~dp0..\tafra 4 | mypy %~dp0..\tafra 5 | 6 | sphinx-build -W -b html -a %~dp0..\docs %~dp0..\docs\_build\html 7 | 8 | pytest 9 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | 6 | echo flake8 ../tafra 7 | flake8 $DIR/../tafra 8 | echo 9 | 10 | echo mypy ../tafra 11 | mypy $DIR/../tafra 12 | echo 13 | 14 | echo sphinx-build -W -b html -a ../docs ../docs/_build/html 15 | sphinx-build -W -b html -a $DIR/..docs $DIR/..docs/_build/html 16 | echo 17 | 18 | echo pytest 19 | pytest 20 | -------------------------------------------------------------------------------- /test/test_tafra.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import platform 3 | import warnings 4 | from decimal import Decimal 5 | from datetime import date, datetime 6 | 7 | import numpy as np 8 | from tafra import Tafra, object_formatter 9 | import pandas as pd # type: ignore 10 | from itertools import islice 11 | 12 | from typing import Dict, List, Any, Iterator, Iterable, Sequence, Tuple, Optional, Type 13 | 14 | import pytest # type: ignore 15 | from unittest.mock import MagicMock 16 | 17 | 18 | class TestClass: 19 | ... 
20 | 21 | 22 | class Series: 23 | name: str = 'x' 24 | values: np.ndarray = np.arange(5) 25 | dtype: str = 'int' 26 | 27 | 28 | class DataFrame: 29 | _data: Dict[str, Series] = {'x': Series(), 'y': Series()} 30 | columns: List[str] = ['x', 'y'] 31 | dtypes: List[str] = ['int', 'int'] 32 | 33 | def __getitem__(self, column: str) -> Series: 34 | return self._data[column] 35 | 36 | def __setitem__(self, column: str, value: np.ndarray) -> None: 37 | self._data[column].values = value 38 | 39 | 40 | class Cursor: 41 | description = ( 42 | ('Fruit', str, None, 1, 1, 1, True), 43 | ('Amount', int, None, 1, 1, 1, True), 44 | ('Price', float, None, 1, 1, 1, True) 45 | ) 46 | _iter = [ 47 | ('Apples', 5, .95), 48 | ('Pears', 2, .80) 49 | ] 50 | idx = 0 51 | 52 | def __iter__(self) -> Iterator[Tuple[Any, ...]]: 53 | return self 54 | 55 | def __next__(self) -> Tuple[Any, ...]: 56 | try: 57 | item = self._iter[self.idx] 58 | except IndexError: 59 | raise StopIteration() 60 | self.idx += 1 61 | return item 62 | 63 | def execute(self, sql: str) -> None: 64 | ... 
65 | 66 | def fetchone(self) -> Optional[Tuple[Any, ...]]: 67 | try: 68 | return next(self) 69 | except: 70 | return None 71 | 72 | def fetchmany(self, size: int) -> List[Tuple[Any, ...]]: 73 | return list(islice(self, size)) 74 | 75 | def fetchall(self) -> List[Tuple[Any, ...]]: 76 | return [rec for rec in self] 77 | 78 | 79 | def build_tafra() -> Tafra: 80 | return Tafra({ 81 | 'x': np.array([1, 2, 3, 4, 5, 6]), 82 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 83 | 'z': np.array([0, 0, 0, 1, 1, 1]) 84 | }) 85 | 86 | 87 | def check_tafra(t: Tafra, check_rows: bool = True) -> bool: 88 | assert len(t._data) == len(t._dtypes) 89 | for c in t.columns: 90 | assert isinstance(t[c], np.ndarray) 91 | assert isinstance(t.data[c], np.ndarray) 92 | assert isinstance(t._data[c], np.ndarray) 93 | assert isinstance(t.dtypes[c], str) 94 | assert isinstance(t._dtypes[c], str) 95 | if check_rows: 96 | assert t._rows == len(t._data[c]) 97 | pd.Series(t._data[c]) 98 | 99 | columns = [c for c in t.columns][:-1] 100 | 101 | _ = t.to_records() 102 | _ = t.to_records(columns=columns) 103 | _ = t.to_list() 104 | _ = t.to_list(columns=columns) 105 | _ = t.to_list(inner=True) 106 | _ = t.to_list(columns=columns, inner=True) 107 | _ = t.to_tuple() 108 | _ = t.to_tuple(columns=columns) 109 | _ = t.to_tuple(name=None) 110 | _ = t.to_tuple(name='tf') 111 | _ = t.to_tuple(columns=columns, name=None) 112 | _ = t.to_tuple(columns=columns, name='tf') 113 | _ = t.to_tuple(inner=True) 114 | _ = t.to_tuple(inner=True, name=None) 115 | _ = t.to_tuple(inner=True, name='tf') 116 | _ = t.to_tuple(columns=columns, inner=True) 117 | _ = t.to_tuple(columns=columns, inner=True, name=None) 118 | _ = t.to_tuple(columns=columns, inner=True, name='tf') 119 | _ = t.to_array() 120 | _ = t.to_array(columns=columns) 121 | df = t.to_pandas() 122 | df = t.to_pandas(columns=columns) 123 | assert isinstance(df, pd.DataFrame) 124 | write_path = Path('test/test_to_csv.csv') 125 | 
t.to_csv(write_path) 126 | # t.to_csv(write_path, columns=columns) 127 | 128 | return True 129 | 130 | def test_constructions() -> None: 131 | t = build_tafra() 132 | check_tafra(t) 133 | 134 | t = Tafra({ 135 | 'x': np.array([1, 2, 3, 4, 5, 6]), 136 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 137 | 'z': np.array([0, 0, 0, 1, 1, 1]) 138 | }, validate=False) 139 | check_tafra(t) 140 | 141 | t = Tafra({ 142 | 'x': np.array([1, 2, 3, 4, 5, 6]), 143 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 144 | 'z': np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) 145 | }, validate=False, check_rows=False) 146 | check_tafra(t, check_rows=False) 147 | 148 | with pytest.raises(TypeError) as e: 149 | t = Tafra() # type: ignore # noqa 150 | 151 | with pytest.raises(ValueError) as e: 152 | t = Tafra({}) 153 | 154 | t = Tafra({'x': None}) 155 | with warnings.catch_warnings(record=True) as w: 156 | check_tafra(t) 157 | 158 | t = Tafra({'x': Decimal('1.23456')}) 159 | check_tafra(t) 160 | 161 | t = Tafra({'x': np.array(1)}) 162 | check_tafra(t) 163 | 164 | t = Tafra({'x': np.array([1])}) 165 | check_tafra(t) 166 | 167 | t = Tafra({'x': [True, False]}) 168 | check_tafra(t) 169 | 170 | t = Tafra({'x': 'test'}) 171 | check_tafra(t) 172 | 173 | t = Tafra((('x', np.arange(6)),)) 174 | check_tafra(t) 175 | 176 | t = Tafra([('x', np.arange(6))]) 177 | check_tafra(t) 178 | 179 | t = Tafra([['x', np.arange(6)]]) 180 | check_tafra(t) 181 | 182 | t = Tafra([(np.array('x'), np.arange(6))]) 183 | check_tafra(t) 184 | 185 | t = Tafra([(np.array(['x']), np.arange(6))]) 186 | check_tafra(t) 187 | 188 | t = Tafra([('x', np.arange(6)), ('y', np.linspace(0, 1, 6))]) 189 | check_tafra(t) 190 | 191 | t = Tafra([['x', np.arange(6)], ('y', np.linspace(0, 1, 6))]) 192 | check_tafra(t) 193 | 194 | t = Tafra([('x', np.arange(6)), ['y', np.linspace(0, 1, 6)]]) 195 | check_tafra(t) 196 | 197 | t = Tafra([['x', np.arange(6)], ['y', np.linspace(0, 1, 6)]]) 198 | 
check_tafra(t) 199 | 200 | t = Tafra([{'x': np.arange(6)}, {'y': np.linspace(0, 1, 6)}]) 201 | check_tafra(t) 202 | 203 | t = Tafra(iter([{'x': np.arange(6)}, {'y': np.linspace(0, 1, 6)}])) 204 | check_tafra(t) 205 | 206 | def iterator() -> Iterator[Dict[str, np.ndarray]]: 207 | yield {'x': np.array([1, 2, 3, 4, 5, 6])} 208 | yield {'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')} 209 | yield {'z': np.array([0, 0, 0, 1, 1, 1])} 210 | 211 | t = Tafra(iterator()) 212 | check_tafra(t) 213 | 214 | class DictIterable: 215 | def __iter__(self) -> Iterator[Dict[str, np.ndarray]]: 216 | yield {'x': np.array([1, 2, 3, 4, 5, 6])} 217 | yield {'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')} 218 | yield {'z': np.array([0, 0, 0, 1, 1, 1])} 219 | 220 | t = Tafra(DictIterable()) 221 | check_tafra(t) 222 | 223 | t = Tafra(iter(DictIterable())) 224 | check_tafra(t) 225 | 226 | class SequenceIterable: 227 | def __iter__(self) -> Iterator[Any]: 228 | yield ('x', np.array([1, 2, 3, 4, 5, 6])) 229 | yield ['y', np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')] 230 | yield ('z', np.array([0, 0, 0, 1, 1, 1])) 231 | 232 | t = Tafra(SequenceIterable()) 233 | check_tafra(t) 234 | 235 | class SequenceIterable2: 236 | def __iter__(self) -> Iterator[Any]: 237 | yield (np.array(['x']), np.array([1, 2, 3, 4, 5, 6])) 238 | yield [np.array(['y']), np.array(['one', 'two', 'one', 'two', 'one', 'two'], 239 | dtype='object')] 240 | yield (np.array(['z']), np.array([0, 0, 0, 1, 1, 1])) 241 | 242 | t = Tafra(SequenceIterable2()) 243 | check_tafra(t) 244 | 245 | t = Tafra(iter(SequenceIterable2())) 246 | check_tafra(t) 247 | 248 | t = Tafra(enumerate(np.arange(6))) 249 | check_tafra(t) 250 | 251 | t = build_tafra() 252 | df = pd.DataFrame(t.data) 253 | _ = Tafra.from_series(df['x']) 254 | check_tafra(_) 255 | 256 | _ = Tafra.from_dataframe(df) 257 | check_tafra(_) 258 | 259 | _ = Tafra.as_tafra(df) 260 | check_tafra(_) 261 | 262 | _ 
= Tafra.as_tafra(df['x']) 263 | check_tafra(_) 264 | 265 | _ = Tafra.as_tafra(t) 266 | check_tafra(_) 267 | 268 | _ = Tafra.as_tafra({'x': np.array(1)}) 269 | check_tafra(_) 270 | 271 | _ = Tafra.from_series(Series()) 272 | check_tafra(_) 273 | 274 | _ = Tafra.as_tafra(Series()) 275 | check_tafra(_) 276 | 277 | _ = Tafra.from_dataframe(DataFrame()) # type: ignore 278 | check_tafra(_) 279 | 280 | _ = Tafra.as_tafra(DataFrame()) 281 | check_tafra(_) 282 | 283 | with pytest.raises(TypeError) as e: 284 | t = Tafra([{1, 2}]) # type: ignore 285 | 286 | class BadIterable: 287 | def __iter__(self) -> Iterator[Any]: 288 | yield {1, 2} 289 | yield {3.1412159, .5772156} 290 | 291 | with pytest.raises(TypeError) as e: 292 | t = Tafra(BadIterable()) 293 | 294 | with pytest.raises(TypeError) as e: 295 | t = Tafra(iter(BadIterable())) 296 | 297 | with pytest.raises(TypeError) as e: 298 | _ = Tafra(np.arange(6)) 299 | 300 | with pytest.raises(TypeError) as e: 301 | _ = Tafra.as_tafra(np.arange(6)) 302 | 303 | with pytest.raises(ValueError) as e: 304 | t = Tafra({'x': np.array([1, 2]), 'y': np.array([3., 4., 5.])}) 305 | 306 | def test_read_sql() -> None: 307 | 308 | cur = Cursor() 309 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 310 | records = cur.fetchall() 311 | t = Tafra.from_records(records, columns) 312 | check_tafra(t) 313 | 314 | t = Tafra.from_records(records, columns, dtypes) 315 | check_tafra(t) 316 | 317 | cur = Cursor() 318 | t = Tafra.read_sql('SELECT * FROM [Table]', cur) # type: ignore 319 | check_tafra(t) 320 | 321 | cur = Cursor() 322 | cur._iter = [] 323 | t = Tafra.read_sql('SELECT * FROM [Table]', cur) # type: ignore 324 | check_tafra(t) 325 | 326 | cur = Cursor() 327 | for t in Tafra.read_sql_chunks('SELECT * FROM [Table]', cur): # type: ignore 328 | check_tafra(t) 329 | 330 | cur = Cursor() 331 | cur._iter = [] 332 | for t in Tafra.read_sql_chunks('SELECT * FROM [Table]', cur): # type: ignore 333 | check_tafra(t) 334 | 335 | 336 | def 
test_destructors() -> None: 337 | def gen_values() -> Iterator[Dict[str, np.ndarray]]: 338 | yield {'x': np.arange(6)} 339 | yield {'y': np.arange(6)} 340 | 341 | t = Tafra(gen_values()) 342 | check_tafra(t) 343 | 344 | t = build_tafra() 345 | t = t.update_dtypes({'x': 'float'}) 346 | t.data['x'][2] = np.nan 347 | check_tafra(t) 348 | 349 | _ = tuple(t.to_records()) 350 | _ = tuple(t.to_records(columns='x')) 351 | _ = tuple(t.to_records(columns=['x'])) 352 | _ = tuple(t.to_records(columns=['x', 'y'])) 353 | _ = tuple(t.to_records(cast_null=False)) 354 | _ = tuple(t.to_records(columns='x', cast_null=False)) 355 | _ = tuple(t.to_records(columns=['x'], cast_null=False)) 356 | _ = tuple(t.to_records(columns=['x', 'y'], cast_null=False)) 357 | 358 | _ = t.to_list() 359 | _ = t.to_list(columns='x') 360 | _ = t.to_list(columns=['x']) 361 | _ = t.to_list(columns=['x', 'y']) 362 | 363 | _ = t.to_list(inner=True) 364 | _ = t.to_list(columns='x', inner=True) 365 | _ = t.to_list(columns=['x'], inner=True) 366 | _ = t.to_list(columns=['x', 'y'], inner=True) 367 | 368 | _ = t.to_tuple() 369 | _ = t.to_tuple(columns='x') 370 | _ = t.to_tuple(columns=['x']) 371 | _ = t.to_tuple(columns=['x', 'y']) 372 | 373 | _ = t.to_tuple(inner=True) 374 | _ = t.to_tuple(columns='x', inner=True) 375 | _ = t.to_tuple(columns=['x'], inner=True) 376 | _ = t.to_tuple(columns=['x', 'y'], inner=True) 377 | 378 | _ = t.to_array() 379 | _ = t.to_array(columns='x') 380 | _ = t.to_array(columns=['x']) 381 | _ = t.to_array(columns=['x', 'y']) 382 | 383 | _ = t.to_pandas() 384 | _ = t.to_pandas(columns='x') 385 | _ = t.to_pandas(columns=['x']) 386 | _ = t.to_pandas(columns=['x', 'y']) 387 | 388 | filepath = Path('test/test_to_csv.csv') 389 | t.to_csv(filepath) 390 | t.to_csv(filepath, columns='x') 391 | t.to_csv(filepath, columns=['x']) 392 | t.to_csv(filepath, columns=['x', 'y']) 393 | 394 | 395 | def test_properties() -> None: 396 | t = build_tafra() 397 | _ = t.columns 398 | _ = t.rows 399 | _ = t.data 
400 | _ = t.dtypes 401 | _ = t.size 402 | _ = t.ndim 403 | _ = t.shape 404 | 405 | with pytest.raises(ValueError) as e: 406 | t.columns = ['x', 'a'] # type: ignore 407 | 408 | with pytest.raises(ValueError) as e: 409 | t.rows = 3 410 | 411 | with pytest.raises(ValueError) as e: 412 | t.data = {'x': np.arange(6)} 413 | 414 | with pytest.raises(ValueError) as e: 415 | t.dtypes = {'x': 'int'} 416 | 417 | with pytest.raises(ValueError) as e: 418 | t.size = 3 419 | 420 | with pytest.raises(ValueError) as e: 421 | t.ndim = 3 422 | 423 | with pytest.raises(ValueError) as e: 424 | t.shape = (10, 2) 425 | 426 | def test_views() -> None: 427 | t = build_tafra() 428 | _ = t.keys() 429 | _ = t.values() 430 | _ = t.items() 431 | _ = t.get('x') 432 | 433 | def test_assignment() -> None: 434 | t = build_tafra() 435 | t['x'] = np.arange(6) 436 | t['x'] = 3 437 | t['x'] = 6 438 | t['x'] = 'test' 439 | t['x'] = list(range(6)) 440 | t['x'] = np.array(6) 441 | t['x'] = np.array([6]) 442 | t['x'] = iter([1, 2, 3, 4, 5, 6]) 443 | t['x'] = range(6) 444 | check_tafra(t) 445 | 446 | with pytest.raises(ValueError) as e: 447 | t['x'] = np.arange(3) 448 | 449 | def test_dtype_update() -> None: 450 | t = build_tafra() 451 | assert t._data['x'].dtype != np.dtype(object) 452 | t.update_dtypes_inplace({'x': 'O'}) 453 | assert t._data['x'].dtype == np.dtype(object) 454 | check_tafra(t) 455 | 456 | t = build_tafra() 457 | assert t._data['x'].dtype != np.dtype(object) 458 | _ = t.update_dtypes({'x': 'O'}) 459 | assert _._data['x'].dtype == np.dtype(object) 460 | check_tafra(_) 461 | 462 | 463 | def test_select() -> None: 464 | t = build_tafra() 465 | _ = t.select('x') 466 | _ = t.select(['x']) 467 | _ = t.select(['x', 'y']) 468 | 469 | with pytest.raises(ValueError) as e: 470 | _ = t.select('a') 471 | 472 | def test_formatter() -> None: 473 | _ = str(object_formatter) 474 | 475 | t = Tafra({'x': Decimal(1.2345)}) 476 | assert t._dtypes['x'] == 'float64' 477 | assert t['x'].dtype == np.dtype(float) 
def test_prints() -> None:
    # Smoke-test all display paths: pformat/pprint/head plus the IPython
    # and Jupyter repr hooks.
    t = build_tafra()
    _ = t.pformat()
    t.pprint()
    t.head(5)

    # _repr_pretty_ expects an IPython printer; emulate it with a mock
    # whose .text() simply prints
    mock = MagicMock()
    mock.text = print
    t._repr_pretty_(mock, True)
    t._repr_pretty_(mock, False)

    _ = t._repr_html_()


def test_dunder() -> None:
    # len() and str() must both succeed.
    t = build_tafra()
    l = len(t)
    s = str(t)


def test_update() -> None:
    t = build_tafra()
    t2 = build_tafra()
    _ = t2.update(t2)
    check_tafra(_)

    t.update_inplace(t2)
    check_tafra(t)

    # update() also accepts a plain dict of columns
    _ = t.update(t2._data)  # type: ignore
    check_tafra(_)


def test_coalesce_dtypes() -> None:
    # A column written directly into _data has no dtype entry until
    # _coalesce_dtypes() reconciles the two mappings.
    t = build_tafra()
    t._data['a'] = np.arange(6)
    assert 'a' not in t._dtypes

    t._coalesce_dtypes()
    assert 'a' in t._dtypes
    check_tafra(t)


def test_update_dtypes() -> None:
    # dtype updates accept Python types as well as dtype strings; check
    # both the in-place and the copying variant.
    t = build_tafra()
    t.update_dtypes_inplace({'x': float})
    check_tafra(t)
    assert t['x'].dtype == 'float'
    assert isinstance(t['x'][0], np.float64)

    t = build_tafra()
    _ = t.update_dtypes({'x': float})
    check_tafra(_)
    assert _['x'].dtype == 'float'
    assert isinstance(_['x'][0], np.float64)


def test_rename() -> None:
    # In-place rename mutates t...
    t = build_tafra()
    t.rename_inplace({'x': 'a'})
    assert 'a' in t.data
    assert 'a' in t.dtypes
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    # ...while rename() returns a new tafra.
    t = build_tafra()
    _ = t.rename({'x': 'a'})
    assert 'a' in _.data
    assert 'a' in _.dtypes
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(_)
def test_delete() -> None:
    # delete_inplace accepts a single name or a list of names and removes
    # both the data column and its dtype entry.
    t = build_tafra()
    t.delete_inplace('x')
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    t = build_tafra()
    t.delete_inplace(['x'])
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    t = build_tafra()
    t.delete_inplace(['x', 'y'])
    assert 'x' not in t.data
    assert 'y' not in t.dtypes
    assert 'x' not in t.data
    assert 'y' not in t.dtypes
    check_tafra(t)

    # delete() returns a new tafra and leaves the original intact
    t = build_tafra()
    _ = t.delete('x')
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(t)
    check_tafra(_)

    t = build_tafra()
    _ = t.delete(['x'])
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(t)
    check_tafra(_)

    t = build_tafra()
    _ = t.delete(['x', 'y'])
    assert 'x' not in _.data
    assert 'y' not in _.dtypes
    assert 'x' not in _.data
    assert 'y' not in _.dtypes
    check_tafra(t)
    check_tafra(_)


def test_iter_methods() -> None:
    # Exhaust every iterator flavour offered by the tafra.
    t = build_tafra()
    for _ in t:
        pass

    for _ in t.iterrows():
        pass

    for _ in t.itercols():
        pass

    for _ in t.itertuples():
        pass

    for _ in t.itertuples(name='test'):
        pass

    for _ in t.itertuples(name=None):
        pass


def test_groupby() -> None:
    t = build_tafra()
    gb = t.group_by(
        ['y', 'z'], {'x': sum}, {'count': len}
    )
    check_tafra(gb)


def test_groupby_iter_fn() -> None:
    # an aggregation value may also be a (fn, source_column) pair that
    # writes the result under a new column name
    t = build_tafra()
    gb = t.group_by(
        ['y', 'z'], {
            'x': sum,
            'new_x': (sum, 'x')
        }, {'count': len}
    )
    check_tafra(gb)
def test_transform() -> None:
    # transform() applies group-by aggregations back onto a tafra of the
    # original (full) length.
    t = build_tafra()
    tr = t.transform(
        ['y', 'z'], {'x': sum}, {'id': max}
    )
    check_tafra(tr)


def test_iterate_by_attr() -> None:
    # Columns may be addressed either as attributes or by key; both views
    # write through to the same underlying arrays.
    t = build_tafra()
    t.id = np.empty(t.rows, dtype=int)  # type: ignore
    t['id'] = np.empty(t.rows, dtype=int)
    for i, (u, ix, grouped) in enumerate(t.iterate_by(['y', 'z'])):
        t['x'][ix] = sum(grouped['x'])
        t.id[ix] = len(grouped['x'])  # type: ignore
        t['id'][ix] = max(grouped['x'])
    check_tafra(t)


def test_iterate_by() -> None:
    # iterate_by yields (unique_value, index, group) with the group
    # itself being a Tafra.
    t = build_tafra()
    for u, ix, grouped in t.iterate_by(['y']):
        assert isinstance(grouped, Tafra)


def test_group_by_in_iterate_by() -> None:
    # FIX: renamed from `group_by_in_iterate_by` -- without the `test_`
    # prefix pytest never collected this test, so it silently never ran.
    t = build_tafra()
    for u, ix, grouped in t.iterate_by(['y']):
        assert isinstance(grouped.group_by(['z'], {'x': sum}), Tafra)


# backward-compatible alias for the old (uncollected) name
group_by_in_iterate_by = test_group_by_in_iterate_by


def test_update_transform() -> None:
    t = build_tafra()
    t.update(t.transform(['y'], {}, {'id': max}))

    # demean x within each y-group in place
    for u, ix, it in t.iterate_by(['y']):
        t['x'][ix] = it['x'] - np.mean(it['x'])
    check_tafra(t)


def test_transform_assignment() -> None:
    # Writing into a grouped slice must not corrupt either tafra.
    t = build_tafra()
    for u, ix, it in t.iterate_by(['y']):
        it['x'][0] = 9
    check_tafra(t)
    check_tafra(it)


def test_invalid_agg() -> None:
    # Aggregation mappings must be {column: function}; the reversed
    # {function: column} orientation is rejected.
    t = build_tafra()
    with pytest.raises(ValueError) as e:
        gb = t.group_by(
            ['y', 'z'], {sum: 'x'}  # type: ignore
        )

    with pytest.raises(ValueError) as e:
        gb = t.group_by(
            ['y', 'z'], {}, {len: 'count'}  # type: ignore
        )


def test_map() -> None:
    # row_map/tuple_map/col_map forward extra args to the mapped
    # function; key_map maps over (key, column) pairs.
    t = build_tafra()

    # FIX: annotation previously claimed `-> Tafra` but the function
    # returns a list of tafras; string annotation avoids requiring a
    # typing import at runtime.
    def repeat(tf: Tafra, repeats: int) -> 'List[Tafra]':
        return [tf for _ in range(repeats)]

    _ = list(t.row_map(repeat, 6))
    _ = list(t.tuple_map(repeat, 6))
    _ = list(t.col_map(repeat, repeats=6))
    _ = Tafra(t.key_map(np.repeat, repeats=6))
def test_pipe() -> None:
    # pipe() and the >> operator both thread a tafra through a callable;
    # extra positional/keyword args are forwarded by pipe().
    def fn1(t: Tafra) -> Tafra:
        return t[t['y'] == 'one']
    def fn2(t: Tafra) -> Tafra:
        return t[t['z'] == 0]

    t = build_tafra()
    check_tafra(t.pipe(fn1))
    check_tafra(t >> fn1)
    check_tafra(t.pipe(fn1).pipe(fn2))
    check_tafra(t >> fn1 >> fn2)

    def fn3(t: Tafra, i: int) -> Tafra:
        return t[t['x'] == i]

    check_tafra(t.pipe(fn3, 1))
    check_tafra(t.pipe(fn3, i=1))
    check_tafra(t >> (lambda t: fn3(t, i=1)))


def test_union() -> None:
    # Happy path: union of two identical schemas doubles the row count.
    t = build_tafra()
    t2 = build_tafra()

    _ = t2.union(t)
    check_tafra(_)
    assert len(_) == len(t) + len(t2)

    t2.union_inplace(t)
    check_tafra(t2)
    assert len(t2) == 2 * len(t)

    # dtype entry present on only one side -> error
    t = build_tafra()
    t2 = build_tafra()
    t._dtypes['a'] = 'int'
    with pytest.raises(Exception) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2._dtypes['a'] = 'int'
    with pytest.raises(Exception) as e:
        t.union_inplace(t2)

    # extra column present on only one side -> ValueError
    t = build_tafra()
    t2 = build_tafra()
    t['a'] = np.arange(6)
    with pytest.raises(ValueError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2['a'] = np.arange(6)
    with pytest.raises(ValueError) as e:
        t.union_inplace(t2)

    # mismatched column names -> TypeError
    t = build_tafra()
    t2 = build_tafra()
    t.rename_inplace({'x': 'a'})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2.rename_inplace({'x': 'a'})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    # mismatched dtypes -> TypeError (either via a real dtype change or
    # by poking the dtype mapping directly)
    t = build_tafra()
    t2 = build_tafra()
    t.update_dtypes_inplace({'x': float})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2._dtypes['x'] = 'float'
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)
def test_slice() -> None:
    # Slicing produces a view: writes through the slice hit the parent.
    t = build_tafra()
    _ = t[:3]
    _['x'][0] = 0
    check_tafra(_)

    t = build_tafra()
    _ = t[slice(0, 3)]
    _['x'][0] = 7
    check_tafra(_)
    check_tafra(t)

    # .copy() detaches the slice from the parent
    t = build_tafra()
    _ = t[:3].copy()
    _['x'][0] = 9
    check_tafra(_)
    check_tafra(t)

    # boolean-mask indexing
    t = build_tafra()
    _ = t[t['x'] <= 4]
    _['x'][1] = 15
    check_tafra(_)
    check_tafra(t)

    # every supported indexer flavour: int, int list/array, bool
    # list/array, column-name list/tuple, mixed bool-int list
    t = build_tafra()
    _ = t[2]
    _ = t[[1, 3]]
    _ = t[np.array([2, 4])]
    _ = t[[True, False, True, True, False, True]]
    _ = t[np.array([True, False, True, True, False, True])]
    _ = t[['x', 'y']]
    _ = t[('x', 'y')]
    _ = t[[True, 2]]
    check_tafra(_)
    check_tafra(t)

    # invalid indexers: 2-D arrays, wrong-length masks, int tuples,
    # floats, mixed str-int lists, dicts, arbitrary objects
    with pytest.raises(IndexError) as e:
        _ = t[np.array([[1, 2]])]

    with pytest.raises(IndexError) as e:
        _ = t[[True, False]]

    with pytest.raises(IndexError) as e:
        _ = t[np.array([True, False])]

    with pytest.raises(IndexError) as e:
        _ = t[(1, 2)]  # noqa

    with pytest.raises(IndexError) as e:
        _ = t[(1, 2.)]  # type: ignore # noqa

    with pytest.raises(ValueError) as e:
        _ = t[['x', 2]]

    with pytest.raises(TypeError) as e:
        _ = t[{'x': [1, 2]}]  # type: ignore

    with pytest.raises(TypeError) as e:
        _ = t[TestClass()]  # type: ignore # noqa

    with pytest.raises(IndexError) as e:
        _ = t[[1, 2.]]  # type: ignore

    with pytest.raises(IndexError) as e:
        _ = t[np.array([1, 2.])]


def test_invalid_dtypes() -> None:
    # unknown dtype strings are rejected
    t = build_tafra()
    with pytest.raises(Exception) as e:
        t.update_dtypes({'x': 'flot', 'y': 'st'})


def test_invalid_assignment() -> None:
    t = build_tafra()
    _ = build_tafra()
    # inject a row-count mismatch directly into the backing dict
    _._data['x'] = np.arange(5)

    with pytest.raises(Exception) as e:
        _._update_rows()

    with pytest.raises(Exception) as e:
        _ = t.update(_)

    with pytest.raises(Exception) as e:
        t.update_inplace(_)

    # 2-D inputs of shape (n, 1) or (1, n) are squeezed to 1-D with a
    # warning whose message text is pinned here
    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.arange(6)[:, None]
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6))
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6)).T
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6))
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    # genuinely 2-D data cannot be squeezed and must raise
    with pytest.raises(Exception) as e:
        t['x'] = np.repeat(np.arange(6)[:, None], repeats=2, axis=1)
def test_datetime() -> None:
    # NOTE(review): the middle of this function was garbled in the source
    # dump (`{'d': ' None:` -- the text from the dtype string through the
    # next def line was eaten). Reconstructed as '<M8[D]' (numpy
    # day-resolution datetime64 spec) from the surrounding lines --
    # confirm against the upstream test file.
    t = build_tafra()
    t['d'] = np.array([np.datetime64(_, 'D') for _ in range(6)])
    t.update_dtypes({'d': '<M8[D]'})
    check_tafra(t)


def test_datetime_object() -> None:
    # datetime objects land as dtype=object until a formatter parses them
    t = build_tafra()
    t['d'] = np.array([datetime.fromisoformat(f'2020-0{_+1}-01') for _ in range(6)])
    assert t._dtypes['d'] == 'object'
    check_tafra(t)

    # registering a datetime formatter lets parse_object_dtypes() cast
    # the object column to datetime64[D]
    object_formatter['datetime'] = lambda x: x.astype('datetime64[D]')
    t2 = t.parse_object_dtypes()
    assert t2['d'].dtype == np.dtype('datetime64[D]')
    check_tafra(t2)

    t.parse_object_dtypes_inplace()
    assert t['d'].dtype == np.dtype('datetime64[D]')
    check_tafra(t)


def test_coalesce() -> None:
    # coalesce() fills None entries from successive fallback sequences;
    # coalescing onto a missing column ('y') creates it. The second
    # fallback never supplies index 3, so t['y'][3] stays None.
    t = Tafra({'x': np.array([1, 2, None, 4, None])})
    t['x'] = t.coalesce('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    t['y'] = t.coalesce('y', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    assert np.all(t['x'] != np.array(None))
    assert t['y'][3] == np.array(None)
    check_tafra(t)

    t = Tafra({'x': np.array([1, 2, None, 4, None])})
    t.coalesce_inplace('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    t.coalesce_inplace('y', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    assert np.all(t['x'] != np.array(None))
    assert t['y'][3] == np.array(None)
    check_tafra(t)

    # single-row edge case
    t = Tafra({'x': np.array([None])})
    t.coalesce('x', [[1], [None]])
    check_tafra(t)


def test_left_join_equi() -> None:
    # Left join over single- and multi-predicate equi joins plus one
    # inequality predicate, with varying right-hand key multiplicity.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([2, 2, 2, 3, 3, 3])
    })
    t = l.left_join(r, [('x', 'a', '=='), ('z', 'c', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        '_a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '=='), ('x', '_a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '<')], ['x', 'y', 'a', 'b'])
    check_tafra(t)
def test_inner_join() -> None:
    # Inner join over an equi predicate and one inequality predicate,
    # with varying key multiplicity on the right-hand side.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    # unique right-hand keys
    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # duplicated right-hand keys (pairs)
    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # duplicated right-hand keys (triples)
    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # inequality predicate
    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    t = l.inner_join(r, [('x', 'a', '<=')], ['x', 'y', 'a', 'b'])
    check_tafra(t)
def test_cross_join() -> None:
    # Cartesian product of two tafras; `select` restricts the output
    # columns but must name columns from both sides.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    t = l.cross_join(r, select=['x', 'z', 'a', 'c'])
    check_tafra(t)

    # select drawing from only one side raises
    with pytest.raises(IndexError) as e:
        t = l.cross_join(r, select=['x', 'z'])

    with pytest.raises(IndexError) as e:
        t = l.cross_join(r, select=['a', 'c'])


def test_left_join_invalid() -> None:
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    # unknown join operator
    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '===')], ['x', 'y', 'a', 'b'])

    # dtype mismatch between join keys (int vs float)
    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6], dtype='float'),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    # dtype mapping poked out of sync with the actual data also raises
    l._dtypes['x'] = 'float'
    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
def test_csv() -> None:
    # End-to-end CSV round-trips against the fixture files test/ex1..6.csv.
    write_path = 'test/test_to_csv.csv'

    def write_reread(t: Tafra) -> None:
        # write t out and read it back with its own dtypes; data and
        # dtypes must round-trip exactly
        t.to_csv(write_path)
        t2 = Tafra.read_csv(write_path, dtypes=t.dtypes)

        for c1, c2 in zip(t.columns, t2.columns):
            assert np.array_equal(t.data[c1], t2.data[c2])
            assert np.array_equal(t.dtypes[c1], t2.dtypes[c2])

    # straightforward CSV - inference heuristic works
    path = Path('test/ex1.csv')
    t = Tafra.read_csv(path)
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # test again with TextIOWrapper
    with open('test/ex1.csv', 'r') as f:
        t = Tafra.read_csv(f)
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # writing to an already-open handle works in 'w' mode but raises on a
    # read-mode handle
    with open(write_path, 'w') as f:
        t.to_csv(f)
    with pytest.raises(ValueError) as e:
        with open(write_path) as f:
            t.to_csv(f)

    # short CSV - ends during inference period
    t = Tafra.read_csv('test/ex2.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 2
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # harder CSV - promote to object during inference period,
    # duplicate column name
    t = Tafra.read_csv('test/ex3.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # as above, but with a promotion required after inference period
    # (heuristic fails)
    t = Tafra.read_csv('test/ex4.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # bad CSV - missing column on row #4
    with pytest.raises(ValueError) as e:
        t = Tafra.read_csv('test/ex5.csv')

    # bad CSV - missing column on row #4 - after guess rows
    with pytest.raises(ValueError) as e:
        t = Tafra.read_csv('test/ex5.csv', guess_rows=2)

    # missing column - but numpy will automatically convert missing (None) to nan
    t = Tafra.read_csv('test/ex6.csv')
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float64'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # missing column - do not automatically cast
    t = Tafra.read_csv('test/ex6.csv', missing=None)
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'object'
    assert t.dtypes['dp_prime_te'] == 'object'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # object columns can be cast to float afterwards
    t.update_dtypes_inplace({'dp_prime': float, 'dp_prime_te': 'float64'})
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float64'
    check_tafra(t)

    # force dtypes on missing columns
    t = Tafra.read_csv('test/ex6.csv', missing=None, dtypes={'dp_prime': np.float64, 'dp_prime_te': np.float32})
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float32'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # override a column type
    t = Tafra.read_csv('test/ex4.csv', dtypes={'a': 'float32'})
    assert t.dtypes['a'] == 'float32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)