├── .coveragerc ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── dev-requirements.txt ├── docs ├── Makefile ├── _static │ └── theme_override.css ├── api.rst ├── conf.py ├── genindex.rst ├── index.rst ├── make.bat ├── numerical.rst ├── requirements.txt ├── testing.rst └── versions.rst ├── setup.cfg ├── setup.py ├── tafra ├── __init__.py ├── base.py ├── csvreader.py ├── formatter.py ├── group.py ├── protocol.py ├── py.typed └── version.py └── test ├── __init__.py ├── ex1.csv ├── ex2.csv ├── ex3.csv ├── ex4.csv ├── ex5.csv ├── ex6.csv ├── test.bat ├── test.sh └── test_tafra.py /.coveragerc: -------------------------------------------------------------------------------- 1 | 2 | # .coveragerc to control coverage.py 3 | [run] 4 | # branch = True 5 | 6 | [report] 7 | # Regexes for lines to exclude from consideration 8 | exclude_lines = 9 | # Have to re-enable the standard pragma 10 | pragma: no cover 11 | pass 12 | 13 | # Don't complain about missing debug-only code: 14 | def __repr__ 15 | if self\.debug 16 | 17 | # Don't complain if tests don't hit defensive assertion code: 18 | raise AssertionError 19 | raise NotImplementedError 20 | 21 | # Don't complain if non-runnable code isn't run: 22 | if 0: 23 | if __name__ == .__main__.: 24 | 25 | ignore_errors = True 26 | 27 | [html] 28 | directory = test/htmlcov 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Other things 2 | *.ipynb 3 | test/test*.csv 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # VSCode 14 | .vscode/ 15 | *.code-workspace 16 | 17 | # Zips 18 | *.zip 19 | 20 | # Spotfire 21 | *.dxp 22 | 23 | # Vim 24 | *.swp 25 | *.swo 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | 
eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | *._py 124 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation 
with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | formats: 18 | - pdf 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.7 23 | system_packages: true 24 | install: 25 | - requirements: docs/requirements.txt 26 | - method: pip 27 | path: . 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: "python" 2 | 3 | python: 4 | - "3.7" 5 | - "3.8" 6 | 7 | install: 8 | - "pip install -U flake8 mypy numpy pandas typing_extensions pytest pytest-cov \"attrs>=19.2.0\" hypothesis coveralls sphinx sphinx_rtd_theme" 9 | - "pip install -U git+https://github.com/numpy/numpy-stubs.git" 10 | - "pip install ." 11 | 12 | script: 13 | - 'flake8 tafra' 14 | - "mypy tafra" 15 | - "pytest" 16 | - "sphinx-build -W -b html docs docs/_build/html" 17 | 18 | notifications: 19 | - email: false 20 | 21 | after_success: 22 | - "coveralls" 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include docs/*.rst 4 | include docs/Makefile 5 | include docs/make.bat 6 | include docs/conf.py 7 | include docs/_static/* 8 | include docs/img/* 9 | include docs/requirements.txt 10 | include test/*.py 11 | include test/ex*.csv 12 | include test/test.bat 13 | include test/test.sh 14 | include .coveragerc 15 | include dev-requirements.txt 16 | include setup.cfg 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Tafra: a minimalist dataframe 3 | ============================= 4 | 5 | .. image:: https://img.shields.io/pypi/v/tafra.svg 6 | :target: https://pypi.org/project/tafra/ 7 | 8 | .. image:: https://travis-ci.org/petbox-dev/tafra.svg?branch=master 9 | :target: https://travis-ci.org/petbox-dev/tafra 10 | 11 | .. image:: https://readthedocs.org/projects/tafra/badge/?version=latest 12 | :target: https://tafra.readthedocs.io/en/latest/?badge=latest 13 | :alt: Documentation Status 14 | 15 | .. 
image:: https://coveralls.io/repos/github/petbox-dev/tafra/badge.svg 16 | :target: https://coveralls.io/github/petbox-dev/tafra 17 | :alt: Coverage Status 18 | 19 | 20 | The ``tafra`` began life as a thought experiment: how could we reduce the idea 21 | of a da\ *tafra*\ me (as expressed in libraries like ``pandas`` or languages 22 | like R) to its useful essence, while carving away the cruft? 23 | The `original proof of concept `_ 24 | stopped at "group by". 25 | 26 | .. `original proof of concept`_ 27 | 28 | This library expands on the proof of concept to produce a practically 29 | useful ``tafra``, which we hope you may find to be a helpful lightweight 30 | substitute for certain uses of ``pandas``. 31 | 32 | A ``tafra`` is, more-or-less, a set of named *columns* or *dimensions*. 33 | Each of these is a typed ``numpy`` array of consistent length, representing 34 | the values for each column by *rows*. 35 | 36 | The library provides lightweight syntax for manipulating rows and columns, 37 | support for managing data types, iterators for rows and sub-frames, 38 | `pandas`-like "transform" support and conversion from `pandas` Dataframes, 39 | and SQL-style "group by" and join operations. 
40 | 41 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 42 | | Tafra | `Tafra `_ | 43 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 44 | | Aggregations | `Union `_, | 45 | | | `GroupBy `_, | 46 | | | `Transform `_, | 47 | | | `IterateBy `_, | 48 | | | `InnerJoin `_, | 49 | | | `LeftJoin `_, | 50 | | | `CrossJoin `_ | 51 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 52 | | Aggregation Helpers | `union `__, | 53 | | | `union_inplace `_, | 54 | | | `group_by `_, | 55 | | | `transform `__, | 56 | | | `iterate_by `_, | 57 | | | `inner_join `_, | 58 | | | `left_join `_, | 59 | | | `cross_join `_ | 60 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 61 | | Constructors | `as_tafra `_, | 62 | | | `from_dataframe `_, | 63 | | | `from_series `_, | 64 | | | `from_records `_ | 65 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 66 | | SQL Readers | `read_sql `_, | 67 | | | `read_sql_chunks `_ | 68 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 69 | | Destructors | `to_records `_, | 70 | | | `to_list `_, | 71 | | | `to_tuple `_, | 72 | | | `to_array `_, | 73 | | | `to_pandas `_ | 74 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 75 | | Properties | `rows `_, | 76 | | | `columns `_, | 77 | | | `data `_, | 78 | | 
| `dtypes `_, | 79 | | | `size `_, | 80 | | | `ndim `_, | 81 | | | `shape `_ | 82 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 83 | | Iter Methods | `iterrows `_, | 84 | | | `itertuples `_, | 85 | | | `itercols `_ | 86 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 87 | | Functional Methods | `row_map `_, | 88 | | | `tuple_map `_, | 89 | | | `col_map `_, | 90 | | | `pipe `_ | 91 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 92 | | Dict-like Methods | `keys `_, | 93 | | | `values `_, | 94 | | | `items `_, | 95 | | | `get `_, | 96 | | | `update `_, | 97 | | | `update_inplace `_, | 98 | | | `update_dtypes `_, | 99 | | | `update_dtypes_inplace `_ | 100 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 101 | | Other Helper Methods | `select `_, | 102 | | | `head `_, | 103 | | | `copy `_, | 104 | | | `rename `_, | 105 | | | `rename_inplace `_, | 106 | | | `coalesce `_, | 107 | | | `coalesce_inplace `_, | 108 | | | `_coalesce_dtypes `_, | 109 | | | `delete `_, | 110 | | | `delete_inplace `_ | 111 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 112 | | Printer Methods | `pprint `_, | 113 | | | `pformat `_, | 114 | | | `to_html `_ | 115 | +----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 116 | | Indexing Methods | `_slice `_, | 117 | | | `_index `_, | 118 | | | `_ndindex `_ | 119 | 
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------+ 120 | 121 | Getting Started 122 | =============== 123 | 124 | Install the library with `pip `_: 125 | 126 | .. code-block:: shell 127 | 128 | pip install tafra 129 | 130 | 131 | A short example 132 | --------------- 133 | 134 | .. code-block:: python 135 | 136 | >>> from tafra import Tafra 137 | 138 | >>> t = Tafra({ 139 | ... 'x': np.array([1, 2, 3, 4]), 140 | ... 'y': np.array(['one', 'two', 'one', 'two'], dtype='object'), 141 | ... }) 142 | 143 | >>> t.pformat() 144 | Tafra(data = { 145 | 'x': array([1, 2, 3, 4]), 146 | 'y': array(['one', 'two', 'one', 'two'])}, 147 | dtypes = { 148 | 'x': 'int', 'y': 'object'}, 149 | rows = 4) 150 | 151 | >>> print('List:', '\n', t.to_list()) 152 | List: 153 | [array([1, 2, 3, 4]), array(['one', 'two', 'one', 'two'], dtype=object)] 154 | 155 | >>> print('Records:', '\n', tuple(t.to_records())) 156 | Records: 157 | ((1, 'one'), (2, 'two'), (3, 'one'), (4, 'two')) 158 | 159 | >>> gb = t.group_by( 160 | ... ['y'], {'x': sum} 161 | ... ) 162 | 163 | >>> print('Group By:', '\n', gb.pformat()) 164 | Group By: 165 | Tafra(data = { 166 | 'x': array([4, 6]), 'y': array(['one', 'two'])}, 167 | dtypes = { 168 | 'x': 'int', 'y': 'object'}, 169 | rows = 2) 170 | 171 | 172 | Flexibility 173 | ----------- 174 | 175 | Have some code that works with ``pandas``, or just a way of doing things 176 | that you prefer? ``tafra`` is flexible: 177 | 178 | .. code-block:: python 179 | 180 | >>> df = pd.DataFrame(np.c_[ 181 | ... np.array([1, 2, 3, 4]), 182 | ... np.array(['one', 'two', 'one', 'two']) 183 | ... ], columns=['x', 'y']) 184 | 185 | >>> t = Tafra.from_dataframe(df) 186 | 187 | 188 | And going back is just as simple: 189 | 190 | .. 
code-block:: python 191 | 192 | >>> df = pd.DataFrame(t.data) 193 | 194 | 195 | Timings 196 | ======= 197 | 198 | In this case, lightweight also means performant. Beyond any additional 199 | features added to the library, ``tafra`` should provide the necessary 200 | base for organizing data structures for numerical processing. One of the 201 | most important aspects is fast access to the data itself. By minimizing 202 | abstraction to access the underlying ``numpy`` arrays, ``tafra`` provides 203 | an order of magnitude increase in performance. 204 | 205 | - **Important note** If you assign directly to the ``Tafra.data`` or 206 | ``Tafra._data`` attributes, you *must* call ``Tafra._coalesce_dtypes`` 207 | afterwards in order to ensure the typing is consistent. 208 | 209 | Construct a ``Tafra`` and a ``DataFrame``: 210 | 211 | .. code-block:: python 212 | 213 | >>> tf = Tafra({ 214 | ... 'x': np.array([1., 2., 3., 4., 5., 6.]), 215 | ... 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 216 | ... 'z': np.array([0, 0, 0, 1, 1, 1]) 217 | ... }) 218 | 219 | >>> df = pd.DataFrame(t.data) 220 | 221 | Read Operations 222 | --------------- 223 | 224 | Direct access: 225 | 226 | .. code-block:: python 227 | 228 | >>> %timeit x = t._data['x'] 229 | 55.3 ns ± 5.64 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each) 230 | 231 | 232 | Indirect with some penalty to support ``Tafra`` slicing and ``numpy``'s 233 | advanced indexing: 234 | 235 | .. code-block:: python 236 | 237 | >>> %timeit x = t['x'] 238 | 219 ns ± 71.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) 239 | 240 | 241 | ``pandas`` timing: 242 | 243 | .. code-block:: python 244 | 245 | >>> %timeit x = df['x'] 246 | 1.55 µs ± 105 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) 247 | 248 | 249 | This is the fastest method for accessing the numpy array among alternatives of 250 | ``df.values()``, ``df.to_numpy()``, and ``df.loc[]``. 
251 | 252 | 253 | Assignment Operations 254 | --------------------- 255 | 256 | Direct access is not recommended as it avoids the validation steps, but it 257 | does provide fast access to the data attribute: 258 | 259 | .. code-block:: python 260 | 261 | >>> x = np.arange(6) 262 | 263 | >>> %timeit tf._data['x'] = x 264 | 65 ns ± 5.55 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each) 265 | 266 | 267 | Indirect access has a performance penalty due to the validation checks to 268 | ensure consistency of the ``tafra``: 269 | 270 | .. code-block:: python 271 | 272 | >>> %timeit tf['x'] = x 273 | 7.39 µs ± 950 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) 274 | 275 | Even so, there is considerable performance improvement over ``pandas``. 276 | 277 | ``pandas`` timing: 278 | 279 | .. code-block:: python 280 | 281 | >>> %timeit df['x'] = x 282 | 47.8 µs ± 3.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 283 | 284 | 285 | Grouping Operations 286 | ------------------- 287 | 288 | ``tafra`` also excels at aggregation methods, the primary of which are a 289 | SQL-like ``GROUP BY`` and the split-apply-combine equivalent to a SQL-like 290 | ``GROUP BY`` followed by a ``LEFT JOIN`` back to the original table. 291 | 292 | .. code-block:: python 293 | 294 | >>> %timeit tf.group_by(['y', 'z'], {'x': sum}) 295 | 138 µs ± 4.03 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 296 | 297 | >>> %timeit tf.transform(['y', 'z'], {'sum_x': (sum, 'x')}) 298 | 161 µs ± 2.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 299 | 300 | The equivalent ``pandas`` functions are given below. They require a chain 301 | of several object methods to perform the same role, and the transform requires 302 | a copy operation and assignment into the copied ``DataFrame`` in order to 303 | preserve immutability. 304 | 305 | .. 
code-block:: python 306 | 307 | >>> %timeit df.groupby(['y','z']).agg({'x': 'sum'}).reset_index() 308 | 2.5 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 309 | 310 | >>> %%timeit 311 | ... tdf = df.copy() 312 | ... tdf['x'] = df.groupby(['y', 'z'])[['x']].transform(sum) 313 | 2.81 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 314 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | attrs>=19.3.0 2 | coverage>=5.1 3 | coveralls>=2.0.0 4 | flake8>=3.8.2 5 | hypothesis>=5.16.0 6 | mypy>=0.770 7 | numpy>=1.18.4 8 | numpy-stubs>=0.0.1 9 | pytest>=5.4.2 10 | pytest-cov>=2.9.0 11 | Sphinx>=3.0.4 12 | sphinx-rtd-theme>=0.4.3 13 | typing_extensions>=3.7.4.1 14 | wheel>=0.34.2 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/theme_override.css: -------------------------------------------------------------------------------- 1 | /* https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html */ 2 | /* override table width restrictions */ 3 | @media screen and (min-width: 767px) { 4 | 5 | .wy-table-responsive table td { 6 | /* !important prevents the common CSS stylesheets from overriding 7 | this as on RTD they are loaded after this stylesheet */ 8 | white-space: normal !important; 9 | } 10 | 11 | .wy-table-responsive { 12 | overflow: visible !important; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | Summary 6 | ======= 7 | 8 | Tafra 9 | ----- 10 | 11 | .. currentmodule:: tafra.base 12 | 13 | .. autosummary:: 14 | 15 | Tafra 16 | 17 | 18 | Aggregations 19 | ------------ 20 | 21 | .. currentmodule:: tafra.group 22 | 23 | .. autosummary:: 24 | 25 | Union 26 | GroupBy 27 | Transform 28 | IterateBy 29 | InnerJoin 30 | LeftJoin 31 | CrossJoin 32 | 33 | 34 | Methods 35 | ------- 36 | 37 | .. currentmodule:: tafra.base.Tafra 38 | 39 | .. 
autosummary:: 40 | 41 | from_records 42 | from_dataframe 43 | from_series 44 | read_sql 45 | read_sql_chunks 46 | read_csv 47 | as_tafra 48 | to_records 49 | to_list 50 | to_tuple 51 | to_array 52 | to_pandas 53 | to_csv 54 | rows 55 | columns 56 | data 57 | dtypes 58 | size 59 | ndim 60 | shape 61 | head 62 | keys 63 | values 64 | items 65 | get 66 | iterrows 67 | itertuples 68 | itercols 69 | row_map 70 | tuple_map 71 | col_map 72 | key_map 73 | pipe 74 | select 75 | copy 76 | update 77 | update_inplace 78 | update_dtypes 79 | update_dtypes_inplace 80 | parse_object_dtypes 81 | parse_object_dtypes_inplace 82 | rename 83 | rename_inplace 84 | coalesce 85 | coalesce_inplace 86 | _coalesce_dtypes 87 | delete 88 | delete_inplace 89 | pprint 90 | pformat 91 | to_html 92 | _slice 93 | _iindex 94 | _aindex 95 | _ndindex 96 | 97 | 98 | Helper Methods 99 | -------------- 100 | 101 | .. currentmodule:: tafra.base.Tafra 102 | 103 | .. autosummary:: 104 | 105 | union 106 | union_inplace 107 | group_by 108 | transform 109 | iterate_by 110 | inner_join 111 | left_join 112 | cross_join 113 | 114 | 115 | Object Formatter 116 | ---------------- 117 | 118 | .. currentmodule:: tafra.formatter 119 | 120 | .. autosummary:: 121 | 122 | ObjectFormatter 123 | 124 | 125 | Detailed Reference 126 | ================== 127 | 128 | 129 | Tafra 130 | ----- 131 | 132 | .. currentmodule:: tafra.base 133 | 134 | 135 | Methods 136 | ~~~~~~~ 137 | 138 | .. autoclass:: Tafra 139 | 140 | .. automethod:: from_dataframe 141 | .. automethod:: from_series 142 | .. automethod:: from_records 143 | .. automethod:: read_sql 144 | .. automethod:: read_sql_chunks 145 | .. automethod:: read_csv 146 | .. automethod:: as_tafra 147 | .. automethod:: to_records 148 | .. automethod:: to_list 149 | .. automethod:: to_tuple 150 | .. automethod:: to_array 151 | .. automethod:: to_pandas 152 | .. automethod:: to_csv 153 | .. autoattribute:: rows 154 | .. autoattribute:: columns 155 | .. autoattribute:: data 156 | .. 
autoattribute:: dtypes 157 | .. autoattribute:: size 158 | .. autoattribute:: ndim 159 | .. autoattribute:: shape 160 | .. automethod:: head 161 | .. automethod:: keys 162 | .. automethod:: values 163 | .. automethod:: items 164 | .. automethod:: get 165 | .. automethod:: iterrows 166 | .. automethod:: itertuples 167 | .. automethod:: itercols 168 | .. automethod:: row_map 169 | .. automethod:: tuple_map 170 | .. automethod:: col_map 171 | .. automethod:: key_map 172 | .. automethod:: pipe 173 | .. automethod:: __rshift__ 174 | .. automethod:: select 175 | .. automethod:: copy 176 | .. automethod:: update 177 | .. automethod:: update_inplace 178 | .. automethod:: update_dtypes 179 | .. automethod:: update_dtypes_inplace 180 | .. automethod:: parse_object_dtypes 181 | .. automethod:: parse_object_dtypes_inplace 182 | .. automethod:: rename 183 | .. automethod:: rename_inplace 184 | .. automethod:: coalesce 185 | .. automethod:: coalesce_inplace 186 | .. automethod:: _coalesce_dtypes 187 | .. automethod:: delete 188 | .. automethod:: delete_inplace 189 | .. automethod:: pprint 190 | .. automethod:: pformat 191 | .. automethod:: to_html 192 | .. automethod:: _slice 193 | .. automethod:: _iindex 194 | .. automethod:: _aindex 195 | .. automethod:: _ndindex 196 | 197 | 198 | Helper Methods 199 | ~~~~~~~~~~~~~~ 200 | 201 | .. class:: Tafra 202 | :noindex: 203 | 204 | .. automethod:: union 205 | .. automethod:: union_inplace 206 | .. automethod:: group_by 207 | .. automethod:: transform 208 | .. automethod:: iterate_by 209 | .. automethod:: inner_join 210 | .. automethod:: left_join 211 | .. automethod:: cross_join 212 | 213 | 214 | Aggregations 215 | ------------ 216 | 217 | .. currentmodule:: tafra.group 218 | 219 | .. autoclass:: Union 220 | 221 | .. automethod:: apply 222 | .. automethod:: apply_inplace 223 | 224 | .. autoclass:: GroupBy 225 | 226 | .. automethod:: apply 227 | 228 | .. autoclass:: Transform 229 | 230 | .. automethod:: apply 231 | 232 | .. 
autoclass:: IterateBy 233 | 234 | .. automethod:: apply 235 | 236 | .. autoclass:: InnerJoin 237 | 238 | .. automethod:: apply 239 | 240 | .. autoclass:: LeftJoin 241 | 242 | .. automethod:: apply 243 | 244 | .. autoclass:: CrossJoin 245 | 246 | .. automethod:: apply 247 | 248 | 249 | Object Formatter 250 | ---------------- 251 | 252 | .. currentmodule:: tafra.formatter 253 | 254 | .. autoclass:: ObjectFormatter 255 | 256 | .. automethod:: __getitem__ 257 | .. automethod:: __setitem__ 258 | .. automethod:: __delitem__ 259 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath('..')) 17 | import tafra 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'tafra' 23 | copyright = '2020, David S. Fulford' 24 | author = 'David S. Fulford' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = tafra.__version__ 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 
34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.viewcode', 38 | 'sphinx.ext.napoleon', 39 | 'sphinx.ext.coverage', 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # The suffix of source filenames. 46 | source_suffix = '.rst' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # List of patterns, relative to source directory, that match files and 52 | # directories to ignore when looking for source files. 53 | # This pattern also affects html_static_path and html_extra_path. 54 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 55 | 56 | 57 | # -- Options for HTML output ------------------------------------------------- 58 | 59 | # The theme to use for HTML and HTML Help pages. See the documentation for 60 | # a list of builtin themes. 61 | # 62 | html_theme = 'sphinx_rtd_theme' 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | 69 | html_context = { 70 | # https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html 71 | 'css_files': ['_static/theme_override.css'], 72 | } 73 | -------------------------------------------------------------------------------- /docs/genindex.rst: -------------------------------------------------------------------------------- 1 | Index 2 | ===== 3 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | Contents 4 | ======== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | README 10 | api 11 | numerical 12 | 13 | .. 
toctree:: 14 | :maxdepth: 1 15 | 16 | testing 17 | versions 18 | genindex 19 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/numerical.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Numerical Performance 3 | ===================== 4 | 5 | Summary 6 | ======= 7 | 8 | One of the goals of ``tafra`` is to provide a fast-as-possible data structure 9 | for numerical computing. To achieve this, all function returns are written 10 | as `generator expressions `_ wherever 11 | possible. 12 | 13 | Additionally, because the :attr:`data` contains values of ndarrays, the 14 | ``map`` functions may also take functions that operate on ndarrays. This means 15 | that they are able to take `numba `_ ``@jit``'ed 16 | functions as well. 
17 | 18 | ``pandas`` is essentially a standard package for anyone performing data science 19 | with Python, and it provides a wide variety of useful features. However, it's 20 | not particularly aimed at maximizing performance. Let's use an example of a 21 | dataframe of function arguments, and a function that maps scalar arguments into 22 | a vector result. Any function of time serves this purpose, so let's use a 23 | hyperbolic function. 24 | 25 | First, let's randomly generate some function arguments and construct both a 26 | ``Tafra`` and a ``pandas.DataFrame``: 27 | 28 | .. code-block:: python 29 | 30 | >>> from tafra import Tafra 31 | >>> import pandas as pd 32 | >>> import numpy as np 33 | 34 | >>> from typing import Tuple, Union, Any 35 | 36 | >>> tf = Tafra({ 37 | ... 'wellid': np.arange(0, 100), 38 | ... 'qi': np.random.lognormal(np.log(2000.), np.log(3000. / 1000.) / (2 * 1.28), 100), 39 | ... 'Di': np.random.uniform(.5, .9, 100), 40 | ... 'bi': np.random.normal(1.0, .2, 100) 41 | ... }) 42 | 43 | >>> df = pd.DataFrame(tf.data) 44 | 45 | >>> tf.head(5) 46 | 47 | ====== ====== ======= ======= ======= 48 | index wellid qi Di bi 49 | ====== ====== ======= ======= ======= 50 | dtype int32 float64 float64 float64 51 | 0 0 2665.82 0.54095 1.07538 52 | 1 1 1245.85 0.81711 0.48448 53 | 2 2 1306.56 0.61570 0.54587 54 | 3 3 2950.33 0.81956 0.66440 55 | 4 4 1963.93 0.56918 0.74165 56 | ====== ====== ======= ======= ======= 57 | 58 | 59 | Next, we define our hyperbolic function and the time array to evaluate: 60 | 61 | .. code-block:: python 62 | 63 | >>> import math 64 | 65 | >>> def tan_to_nominal(D: float) -> float: 66 | ... return -math.log1p(-D) 67 | 68 | >>> def sec_to_nominal(D: float, b: float) -> float: 69 | ... if b <= 1e-4: 70 | ... return tan_to_nominal(D) 71 | ... 72 | ... return ((1.0 - D) ** -b - 1.0) / b 73 | 74 | >>> def hyp(qi: float, Di: float, bi: float, t: np.ndarray) -> np.ndarray: 75 | ... Dn = sec_to_nominal(Di, bi) 76 | ... 77 | ... 
if bi <= 1e-4: 78 | ... return qi * np.exp(-Dn * t) 79 | ... 80 | ... return qi / (1.0 + Dn * bi * t) ** (1.0 / bi) 81 | 82 | >>> t = 10 ** np.linspace(0, 4, 101) 83 | 84 | 85 | And let's build a generic ``mapper`` function to map over the named columns: 86 | 87 | .. code-block:: python 88 | 89 | >>> def mapper(tf: Union[Tafra, pd.DataFrame]) -> Tuple[int, np.ndarray]: 90 | ... return tf['wellid'], hyp(tf['qi'], tf['Di'], tf['bi'], t) 91 | 92 | 93 | We can call this with the following style. The ``pandas`` syntax is a bit 94 | verbose, but :meth:`pandas.DataFrame.from_items()` is deprecated in newer 95 | versions, so this is the recommended way. Let's time each approach: 96 | 97 | .. code-block:: python 98 | 99 | >>> %timeit tdcs = Tafra(tf.row_map(mapper)) 100 | 3.38 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 101 | 102 | 103 | >>> %timeit pdcs = pd.DataFrame(dict(df.apply(mapper, axis=1).to_list())) 104 | 6.86 ms ± 408 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 105 | 106 | 107 | We see ``Tafra`` is about twice as fast. Mapping a function this way is 108 | convenient, but there is some indirection occurring that we can do away with to 109 | obtain direct access to the data of the ``Tafra``, and there is a faster 110 | method for ``pandas`` as well as opposed to :meth:`pandas.DataFrame.apply`. 111 | Instead of constructing a new ``Tafra`` or ``pd.DataFrame`` for each row, we 112 | can instead return a :class:`NamedTuple`, which is faster to construct. Doing so: 113 | 114 | .. code-block:: python 115 | 116 | >>> def tuple_mapper(tf: Tuple[Any, ...]) -> Tuple[int, np.ndarray]: 117 | ... return tf.wellid, hyp(tf.qi, tf.Di, tf.bi, t) 118 | 119 | >>> %timeit Tafra(tf.tuple_map(tuple_mapper)) 120 | 1.68 ms ± 84.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 121 | 122 | >>> %timeit pd.DataFrame(dict((tuple_mapper(row)) for row in df.itertuples())) 123 | 3.14 ms ± 121 µs per loop (mean ± std. dev.
of 7 runs, 100 loops each) 124 | 125 | 126 | And once again, ``Tafra`` is about twice as fast. 127 | 128 | One of the upcoming features of ``pandas`` is the ability to apply ``numba`` 129 | ``@jit``'ed functions to :meth:`pandas.DataFrame.apply`. The performance 130 | improvement should be significant, especially for long-running functions, 131 | but there will still be overhead in the abstraction before the function is 132 | called. We can demonstrate this by ``@jit``'ing our hyperbolic function and 133 | mapping it over the dataframes, and get an idea of how much improvement is 134 | possible: 135 | 136 | .. code-block:: python 137 | 138 | >>> from numba import jit 139 | >>> jit_kw = {'fastmath': True} 140 | 141 | >>> @jit(**jit_kw) 142 | ... def tan_to_nominal(D: float) -> float: 143 | ... return -math.log1p(-D) 144 | 145 | >>> @jit(**jit_kw) 146 | ... def sec_to_nominal(D: float, b: float) -> float: 147 | ... if b <= 1e-4: 148 | ... return tan_to_nominal(D) 149 | ... 150 | ... return ((1.0 - D) ** -b - 1.0) / b 151 | 152 | >>> @jit(**jit_kw) 153 | ... def hyp(qi: float, Di: float, bi: float, t: np.ndarray) -> np.ndarray: 154 | ... Dn = sec_to_nominal(Di, bi) 155 | ... 156 | ... if bi <= 1e-4: 157 | ... return qi * np.exp(-Dn * t) 158 | ... 159 | ... return qi / (1.0 + Dn * bi * t) ** (1.0 / bi) 160 | 161 | >>> %timeit Tafra(tf.tuple_map(tuple_mapper)) 162 | 884 µs ± 41.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 163 | 164 | >>> %timeit pd.DataFrame(dict((tuple_mapper(row)) for row in df.itertuples())) 165 | 3.09 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 166 | 167 | 168 | Interestingly, we see that ``pandas`` does not get much benefit from this, as 169 | the limit occurs not in the performance of the functions but in the performance 170 | of ``pandas`` itself. We can validate this by skipping the dataframe 171 | construction step: 172 | 173 | .. 
code-block:: python 174 | 175 | >>> %timeit [tf.tuple_map(tuple_mapper)] 176 | 81.9 µs ± 2.91 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 177 | 178 | >>> %timeit [(tuple_mapper(row)) for row in df.itertuples()] 179 | 614 µs ± 14.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 180 | 181 | 182 | Last, we might ask the question "If ``pandas`` is incurring some performance 183 | penalty, what is the performance penalty of ``Tafra``?" We'll write a function 184 | that operates on the :class:`numpy.ndarray`\s themselves rather than using the 185 | helper :meth:`Tafra.tuple_map()`. We can also use ``numpy``'s built in apply 186 | function, :meth:`numpy.apply_along_axis()`, but it is considerably slower than 187 | a ``@jit``'ed function. 188 | 189 | .. code-block:: python 190 | 191 | >>> @jit(**jit_kw) 192 | ... def ndarray_map(qi: np.ndarray, Di: np.ndarray, bi: np.ndarray, t: np.ndarray) -> np.ndarray: 193 | ... out = np.zeros((qi.shape[0], t.shape[0])) 194 | ... for i in range(qi.shape[0]): 195 | ... out[i, :] = hyp(qi[i], Di[i], bi[i], t) 196 | ... 197 | ... return out 198 | 81.2 µs ± 9.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each) 199 | 200 | 201 | And the timing is negligible, meaning ``Tafra``'s :meth:`Tafra.tuple_map()` is 202 | essentially as performant as we are able to achieve in Python.
203 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.0 2 | scipy>=1.4.0 3 | Sphinx>=3.0.0 4 | sphinx-rtd-theme>=0.4.0 5 | typing_extensions>=3.7.4.1 6 | -------------------------------------------------------------------------------- /docs/testing.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Testing 3 | ======= 4 | 5 | Testing is set to evaluate: 6 | 7 | - style with `flake8 `_, 8 | - typing with `mypy `_, 9 | - valid function return values and behaviors with `hypothesis `_, and 10 | - test coverage using `coverage `_. 11 | 12 | Windows 13 | ------- 14 | 15 | Run ``test.bat`` in the ``test`` directory. 16 | 17 | 18 | Linux 19 | ----- 20 | 21 | Run ``test.sh`` in the ``test`` directory. 22 | -------------------------------------------------------------------------------- /docs/versions.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Version History 3 | =============== 4 | 5 | .. automodule:: tafra 6 | :noindex: 7 | 8 | 1.0.10 9 | ------ 10 | 11 | * Add ``pipe`` and overload ``>>`` operator for Tafra objects 12 | 13 | 1.0.9 14 | ----- 15 | 16 | * Add test files to build 17 | 18 | 1.0.8 19 | ----- 20 | 21 | * Check rows in constructor to ensure equal data length 22 | 23 | 1.0.7 24 | ----- 25 | 26 | * Handle missing or NULL values in ``read_csv()``. 27 | * Cast empty elements to None when updating dtypes to avoid failure of ``np.astype()``. 28 | * Update some typing, minor refactoring for performance 29 | 30 | 31 | 1.0.6 32 | ----- 33 | 34 | * Additional validations in constructor, primary to evaluate Iterables of values 35 | * Split ``col_map`` to ``col_map`` and ``key_map`` as the original function's return signature depending upon an argument. 
36 | * Fix some documentation typos 37 | 38 | 39 | 1.0.5 40 | ----- 41 | 42 | * Add ``tuple_map`` method 43 | * Refactor all iterators and ``..._map`` functions to improve performance 44 | * Unpack ``np.ndarray`` if given as keys to constructor 45 | * Add ``validate=False`` in ``__post_init__`` if inputs are **known** to be valid to improve performance 46 | 47 | 48 | 1.0.4 49 | ----- 50 | 51 | * Add ``read_csv``, ``to_csv`` 52 | * Various refactoring and improvement in data validation 53 | * Add ``typing_extensions`` to dependencies 54 | * Change method of ``dtype`` storage, extract ``str`` representation from ``np.dtype()`` 55 | 56 | 57 | 1.0.3 58 | ----- 59 | 60 | * Add ``read_sql`` and ``read_sql_chunks`` 61 | * Add ``to_tuple`` and ``to_pandas`` 62 | * Cleanup constructor data validation 63 | 64 | 65 | 1.0.2 66 | ----- 67 | 68 | * Add object_formatter to expose user formatting for dtype=object 69 | * Improvements to indexing and slicing 70 | 71 | 72 | 1.0.1 73 | ----- 74 | 75 | * Add iter functions 76 | * Add map functions 77 | * Various constructor improvements 78 | 79 | 80 | 1.0.0 81 | ----- 82 | 83 | * Initial Release 84 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = 4 | F401, 5 | F841, 6 | E116, 7 | E251, 8 | E261, 9 | E265, 10 | E266, 11 | E302, 12 | E305, 13 | E402, 14 | E722, 15 | E741, 16 | W503, 17 | W605 18 | exclude = 19 | .git, 20 | __pycache__, 21 | docs/conf.py, 22 | docs/source/conf.py, 23 | old, 24 | build, 25 | dist 26 | max-complexity = 20 27 | # output-file = src\test\flake8_run.txt 28 | 29 | [mypy] 30 | check_untyped_defs = true 31 | disallow_any_generics = true 32 | disallow_incomplete_defs = true 33 | disallow_subclassing_any = true 34 | disallow_untyped_calls = true 35 | disallow_untyped_decorators = true 36 | disallow_untyped_defs = true 37 | # 
ignore_missing_imports = true 38 | no_implicit_optional = true 39 | show_error_codes = true 40 | strict_equality = true 41 | warn_redundant_casts = true 42 | # warn_return_any = true 43 | warn_unreachable = true 44 | warn_unused_configs = true 45 | warn_unused_ignores = true 46 | 47 | [tool:pytest] 48 | addopts = --cov=tafra --cov-report=term-missing --hypothesis-show-statistics -v test 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | 16 | import os 17 | import sys 18 | import re 19 | 20 | try: 21 | from setuptools import setup 22 | except ImportError: 23 | from distutils.core import setup 24 | 25 | 26 | def find_version() -> str: 27 | v = {} 28 | with open('tafra/version.py', 'r') as f: 29 | exec(f.read(), globals(), v) 30 | 31 | return v['__version__'] 32 | 33 | 34 | def get_long_description() -> str: 35 | # Fix display issues on PyPI caused by RST markup 36 | with open('README.rst', 'r') as f: 37 | readme = f.read() 38 | 39 | replacements = [ 40 | '.. 
automodule:: tafra', 41 | ':noindex:', 42 | ] 43 | 44 | subs = [ 45 | r':func:`([a-zA-Z0-9._]+)`', 46 | r':meth:`([a-zA-Z0-9._]+)`', 47 | ] 48 | 49 | def replace(s: str) -> str: 50 | for r in replacements: 51 | s = s.replace(r, '') 52 | return s 53 | 54 | lines = [] 55 | with open('docs/versions.rst', 'r') as f: 56 | iter_f = iter(f) 57 | _ = next(f) 58 | for line in f: 59 | if any(r in line for r in replacements): 60 | continue 61 | lines.append(line) 62 | 63 | version_history = ''.join(lines) 64 | for sub in subs: 65 | version_history = re.sub(sub, r'\1', version_history) 66 | 67 | return readme + '\n\n' + version_history 68 | 69 | 70 | version = find_version() 71 | 72 | if sys.argv[-1] == 'build': 73 | print(f'\nBuilding version {version}...\n') 74 | os.system('rm -r dist\\') # clean out dist/ 75 | os.system('python setup.py sdist bdist_wheel') 76 | sys.exit() 77 | 78 | 79 | setup( 80 | name='tafra', 81 | version=version, 82 | description='Tafra: innards of a dataframe', 83 | long_description=get_long_description(), 84 | long_description_content_type="text/x-rst", 85 | url='https://github.com/petbox-dev/tafra', 86 | author='David S. 
Fulford', 87 | author_email='petbox.dev@gmail.com', 88 | install_requires=['numpy>=1.17', 'typing_extensions'], 89 | zip_safe=False, 90 | packages=['tafra'], 91 | package_data={ 92 | 'tafra': ['py.typed'] 93 | }, 94 | python_requires='>=3.7', 95 | classifiers=[ 96 | 'Development Status :: 5 - Production/Stable', 97 | 'Intended Audience :: Science/Research', 98 | 'Intended Audience :: Education', 99 | 'Intended Audience :: Developers', 100 | 'Natural Language :: English', 101 | 'License :: OSI Approved :: MIT License', 102 | 'Programming Language :: Python :: 3.7', 103 | 'Programming Language :: Python :: 3.8', 104 | 'Programming Language :: Python :: Implementation :: CPython', 105 | 'Topic :: Scientific/Engineering', 106 | 'Topic :: Scientific/Engineering :: Mathematics', 107 | 'Topic :: Software Development :: Libraries', 108 | 'Typing :: Typed' 109 | ], 110 | keywords=[ 111 | 'tafra', 'dataframe', 'sql', 'group-by', 'aggregation', 112 | 'performance', 'minimalist' 113 | ], 114 | ) 115 | -------------------------------------------------------------------------------- /tafra/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | 16 | from .version import __version__ 17 | 18 | from .base import Tafra, object_formatter 19 | from .group import GroupBy, Transform, IterateBy, InnerJoin, LeftJoin 20 | 21 | read_sql = Tafra.read_sql 22 | read_sql_chunks = Tafra.read_sql_chunks 23 | read_csv = Tafra.read_csv 24 | as_tafra = Tafra.as_tafra 25 | -------------------------------------------------------------------------------- /tafra/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | __all__ = ['Tafra'] 16 | 17 | from pathlib import Path 18 | import re 19 | import warnings 20 | import csv 21 | import pprint as pprint 22 | from datetime import date, datetime 23 | from itertools import chain, islice 24 | from collections import namedtuple 25 | import dataclasses as dc 26 | 27 | import numpy as np 28 | from .protocol import Series, DataFrame, Cursor # just for mypy... 
29 | 30 | from typing import (Any, Callable, Dict, Mapping, List, Tuple, Optional, Union as _Union, Sequence, 31 | Sized, Iterable, Iterator, Type, KeysView, ValuesView, ItemsView, 32 | IO) 33 | from typing_extensions import Concatenate, ParamSpec 34 | from typing import cast 35 | from io import TextIOWrapper 36 | 37 | from .formatter import ObjectFormatter 38 | from .csvreader import CSVReader 39 | 40 | 41 | P = ParamSpec('P') 42 | 43 | 44 | # default object formats 45 | object_formatter = ObjectFormatter() 46 | object_formatter['Decimal'] = lambda x: x.astype(float) 47 | 48 | 49 | NAMEDTUPLE_TYPE: Dict[str, Type[Any]] = { 50 | 'int': int, 51 | 'float': float, 52 | 'bool': bool, 53 | 'str': str, 54 | 'date': date, 55 | 'datetime': datetime, 56 | 'object': str, 57 | } 58 | 59 | RECORD_TYPE: Dict[str, Callable[[Any], Any]] = { 60 | 'int': int, 61 | 'float': float, 62 | 'bool': bool, 63 | 'str': str, 64 | 'date': lambda x: x.isoformat(), 65 | 'datetime': lambda x: x.isoformat(), 66 | 'object': str, 67 | } 68 | 69 | 70 | Scalar = _Union[str, int, float, bool] 71 | _Mapping = _Union[ 72 | Mapping[str, Any], 73 | Mapping[int, Any], 74 | Mapping[float, Any], 75 | Mapping[bool, Any], 76 | ] 77 | _Element = _Union[Tuple[_Union[str, int, float, np.ndarray], Any], List[Any], _Mapping] 78 | InitVar = _Union[ 79 | Tuple[str, Any], 80 | _Mapping, 81 | Sequence[_Element], 82 | Iterable[_Element], 83 | Iterator[_Element], 84 | enumerate 85 | ] 86 | 87 | 88 | @dc.dataclass(repr=False, eq=False) 89 | class Tafra: 90 | """ 91 | A minimalist dataframe. 92 | 93 | Constructs a :class:`Tafra` from :class:`dict` of data and (optionally) 94 | dtypes. Types on parameters are the types of the constructed :class:`Tafra`, 95 | but attempts are made to parse anything that "looks" like the correct data 96 | structure, including :class:`Iterable`, :class:`Iterator`, :class:`Sequence`, 97 | and :class:`Mapping` and various combinations. 
98 | 99 | Parameters are given as an ``InitVar``, defined as: 100 | 101 | ``InitVar = Union[Tuple[str, Any], _Mapping, Sequence[_Element], Iterable[_Element],`` 102 | ``Iterator[_Element], enumerate]`` 103 | 104 | ``_Mapping = Union[Mapping[str, Any], Mapping[int, Any], Mapping[float, Any],`` 105 | ``Mapping[bool, Any]`` 106 | 107 | ``_Element = Union[Tuple[Union[str, int, float, np.ndarray], Any], List[Any], Mapping]`` 108 | 109 | Parameters 110 | ---------- 111 | data: InitVar 112 | The data of the Tafra. 113 | 114 | dtypes: InitVar 115 | The dtypes of the columns. 116 | 117 | validate: bool = True 118 | Run validation checks of the data. False will improve performance, but `data` and `dtypes` 119 | will not be validated for conformance to expected data structures. 120 | 121 | check_rows: bool = True 122 | Run row count checks. False will allow columns of differing lengths, which may break several 123 | methods. 124 | 125 | Returns 126 | ------- 127 | tafra: Tafra 128 | The constructed :class:`Tafra`. 129 | 130 | """ 131 | data: dc.InitVar[InitVar] 132 | dtypes: dc.InitVar[Optional[InitVar]] = None 133 | validate: dc.InitVar[bool] = True 134 | check_rows: bool = True 135 | 136 | _data: Dict[str, np.ndarray] = dc.field(init=False) 137 | _dtypes: Dict[str, str] = dc.field(init=False) 138 | 139 | def __post_init__(self, data: InitVar, dtypes: Optional[InitVar], validate: bool) -> None: 140 | # TODO: enable this? 
141 | # if isinstance(self._data, DataFrame): 142 | # tf = self.from_dataframe(df=self._data) 143 | # self._data = tf._data 144 | # self._dtypes = tf._dtypes 145 | # self._rows = tf._rows 146 | # return 147 | 148 | rows: Optional[int] = None 149 | 150 | if validate: 151 | # check that the structure is actually a dict 152 | self._data = self._check_initvar(data) 153 | if dtypes is None or isinstance(dtypes, property): 154 | self._dtypes = {} 155 | else: 156 | self._dtypes = cast(Dict[str, str], self._check_initvar(dtypes)) 157 | 158 | # check that the values are properly formed np.ndarray 159 | for column, value in self._data.items(): 160 | self._ensure_valid(column, value, check_rows=False) 161 | 162 | n_rows = len(self._data[column]) 163 | if rows is None: 164 | rows = n_rows 165 | 166 | if self.check_rows and rows != n_rows: 167 | raise ValueError('`Tafra` must have consistent row counts.') 168 | elif rows < n_rows: # pragma: no cover 169 | rows = n_rows 170 | 171 | if rows is None: 172 | raise ValueError('No data provided in constructor statement.') 173 | 174 | self.update_dtypes_inplace(self._dtypes) 175 | # must coalesce all dtypes immediately, other functions assume a 176 | # proper structure of the Tafra 177 | self._coalesce_dtypes() 178 | 179 | else: 180 | self._data = cast(Dict[str, np.ndarray], data) 181 | if dtypes is None or isinstance(dtypes, property): 182 | self._dtypes = {} 183 | self._coalesce_dtypes() 184 | else: 185 | self._dtypes = cast(Dict[str, str], dtypes) 186 | 187 | self._update_rows() 188 | 189 | def _check_initvar(self, values: InitVar) -> Dict[str, Any]: 190 | """ 191 | Pre-process an :class:`InitVar` into a :class:`Dict`. 
192 | """ 193 | _values: Dict[Any, Any] 194 | 195 | if isinstance(values, (Mapping, dict)): 196 | _values = cast(Dict[str, Any], values) 197 | 198 | elif isinstance(values, Sequence): 199 | _values = self._parse_sequence(values) 200 | 201 | elif isinstance(values, (Iterator, enumerate)): 202 | _values = self._parse_iterator(cast(Iterator[_Element], values)) 203 | 204 | elif isinstance(values, Iterable): 205 | _values = self._parse_iterable(cast(Iterable[_Element], values)) 206 | 207 | else: 208 | # last ditch attempt 209 | _values = cast(Dict[Any, Any], values) 210 | 211 | if not isinstance(_values, Dict): 212 | raise TypeError('Must contain `Dict`, `Mapping`, `Sequence`, Iterable, or Iterator, ' 213 | f'got `{type(_values)}`') 214 | 215 | # cast all keys to strings if they are not 216 | # must copy first as mutating the dict changes next(iterator) 217 | columns = [c for c in _values.keys() if not isinstance(c, str)] 218 | for column in columns: 219 | _values[str(column)] = _values.pop(column) 220 | 221 | return _values 222 | 223 | def _parse_sequence(self, values: Sequence[_Element]) -> Dict[Any, Any]: 224 | """ 225 | Pre-Process a :class:`Sequence` :class:`InitVar` into a :class:`Dict`. 226 | """ 227 | head = values[0] 228 | if isinstance(head, Dict): 229 | for _dict in values: 230 | head.update(cast(Dict[Any, Any], _dict)) 231 | _values = head 232 | 233 | # maybe a Sequence of 2-tuples or 2-lists? Cast and try it. 234 | elif isinstance(head, Sequence) and len(head) == 2: 235 | # is the key an ndarray? 
turn it into a scalar 236 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 237 | # mypy doesn't get that we've checked the head of values as an ndarray 238 | _values = {key.item(): value for key, value in 239 | cast(Iterable[Tuple[np.ndarray, Any]], values)} 240 | else: 241 | _values = dict(cast(Iterable[Tuple[Any, Any]], values)) 242 | 243 | else: 244 | raise TypeError('Sequence must contain `Dict`, `Mapping`, or `Sequence`, ' 245 | f'got `{type(head)}`') 246 | 247 | return _values 248 | 249 | def _parse_iterable(self, values: Iterable[_Element]) -> Dict[Any, Any]: 250 | """ 251 | Pre-Process a :class:`Iterable` :class:`InitVar` into a :class:`Dict`. 252 | """ 253 | iter_values = iter(values) 254 | head = next(iter_values) 255 | if isinstance(head, Dict): 256 | for _dict in iter_values: 257 | head.update(cast(Dict[Any, Any], _dict)) 258 | _values = head 259 | 260 | # maybe an Iterable of 2-tuples or 2-lists? Cast and try it. 261 | elif isinstance(head, Sequence) and len(head) == 2: 262 | # is the key an ndarray? turn it into a scalar 263 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 264 | # mypy doesn't get that we've checked the head of values as an ndarray 265 | _values = _values = {key.item(): value for key, value in chain( 266 | cast(Iterable[Tuple[np.ndarray, Any]], [head]), 267 | cast(Iterator[Tuple[np.ndarray, Any]], values))} 268 | else: 269 | _values = dict(chain( 270 | cast(Iterable[Tuple[Any, Any]], [head]), 271 | cast(Iterator[Tuple[Any, Any]], values))) 272 | 273 | else: 274 | raise TypeError('Iterable must contain `Dict`, `Mapping`, or `Sequence`, ' 275 | f'got `{type(head)}`') 276 | 277 | return _values 278 | 279 | def _parse_iterator(self, values: Iterator[_Element]) -> Dict[Any, Any]: 280 | """ 281 | Pre-Process a :class:`Iterator` :class:`InitVar` into a :class:`Dict`. 
282 | """ 283 | head = next(values) 284 | 285 | if isinstance(head, Dict): 286 | # consume the iterator if its a dict 287 | for _dict in values: 288 | head.update(cast(Dict[Any, Any], _dict)) 289 | _values = head 290 | 291 | # maybe an Iterator of 2-tuples or 2-lists? Cast and try it. 292 | elif isinstance(head, Sequence) and len(head) == 2: 293 | # is the key an ndarray? turn it into a scalar 294 | if isinstance(head[0], np.ndarray) and len(np.atleast_1d(head[0])) == 1: 295 | # mypy doesn't get that we've checked the head of values as an ndarray 296 | _values = {key.item(): value for key, value in chain( 297 | cast(Iterable[Tuple[np.ndarray, Any]], [head]), 298 | cast(Iterator[Tuple[np.ndarray, Any]], values))} 299 | else: 300 | _values = dict(chain( 301 | cast(Iterable[Tuple[Any, Any]], [head]), 302 | cast(Iterator[Tuple[Any, Any]], values))) 303 | 304 | else: 305 | raise TypeError('Iterator must contain `Dict`, `Mapping`, or `Sequence`, ' 306 | f'got `{type(head)}`') 307 | 308 | return _values 309 | 310 | def __getitem__( 311 | self, 312 | item: _Union[str, int, slice, Sequence[_Union[str, int, bool]], np.ndarray]) -> Any: 313 | # return type is actually Union[np.ndarray, 'Tafra'] but mypy requires user to type check 314 | # in either case, what we return is a "slice" of the :class:`Tafra` 315 | if isinstance(item, str): 316 | return self._data[item] 317 | 318 | elif isinstance(item, int): 319 | return self._iindex(item) 320 | 321 | elif isinstance(item, slice): 322 | return self._slice(item) 323 | 324 | elif isinstance(item, np.ndarray): 325 | return self._ndindex(item) 326 | 327 | elif isinstance(item, Sequence): 328 | if isinstance(item[0], str): 329 | return self.select(cast(Sequence[str], item)) 330 | else: 331 | return self._aindex(cast(Sequence[_Union[int, bool]], item)) 332 | 333 | else: 334 | raise TypeError(f'Type {type(item)} not supported.') 335 | 336 | def __setitem__(self, item: str, value: _Union[np.ndarray, Sequence[Any], Any]) -> None: 337 | 
self._ensure_valid(item, value, set_item=True) 338 | 339 | def __repr__(self) -> str: 340 | if not hasattr(self, '_rows'): 341 | return f'Tafra(data={self._data}, dtypes={self._dtypes}, rows=n/a)' 342 | return f'Tafra(data={self._data}, dtypes={self._dtypes}, rows={self._rows})' 343 | 344 | def __str__(self) -> str: 345 | return self.__repr__() 346 | 347 | def __len__(self) -> int: 348 | assert self._data is not None, \ 349 | 'Interal error: Cannot construct a Tafra with no data.' 350 | return self._rows 351 | 352 | def __iter__(self) -> Iterator['Tafra']: 353 | return (self._iindex(i) for i in range(self._rows)) 354 | 355 | def __rshift__(self, other: Callable[['Tafra'], 'Tafra']) -> 'Tafra': 356 | return self.pipe(other) 357 | 358 | def iterrows(self) -> Iterator['Tafra']: 359 | """ 360 | Yield rows as :class:`Tafra`. Use :meth:`itertuples` for better performance. 361 | 362 | Returns 363 | ------- 364 | tafras: Iterator[Tafra] 365 | An iterator of :class:`Tafra`. 366 | """ 367 | yield from self.__iter__() 368 | 369 | def itertuples(self, name: Optional[str] = 'Tafra') -> Iterator[Tuple[Any, ...]]: 370 | """ 371 | Yield rows as :class:`NamedTuple`, or if ``name`` is ``None``, yield 372 | rows as :class:`tuple`. 373 | 374 | Parameters 375 | ---------- 376 | name: Optional[str] = 'Tafra' 377 | The name for the :class:`NamedTuple`. If ``None``, construct a 378 | :class:`Tuple` instead. 379 | 380 | Returns 381 | ------- 382 | tuples: Iterator[NamedTuple[Any, ...]] 383 | An iterator of :class:`NamedTuple`. 384 | """ 385 | if name is None: 386 | return (tuple(values) for values in zip(*self._data.values())) 387 | 388 | TafraNT = namedtuple(name, self._data.keys()) # type: ignore 389 | return map(TafraNT._make, zip(*self._data.values())) 390 | 391 | def itercols(self) -> Iterator[Tuple[str, np.ndarray]]: 392 | """ 393 | Yield columns as :class:`Tuple[str, np.ndarray]`, where the ``str`` is the column 394 | name. 
395 | 396 | Returns 397 | ------- 398 | tuples: Iterator[Tuple[str, np.ndarray]] 399 | An iterator of :class:`Tafra`. 400 | """ 401 | return map(tuple, self.data.items()) # type: ignore 402 | 403 | def _update_rows(self) -> None: 404 | """ 405 | Updates :attr:`_rows`. User should call this if they have directly assigned to 406 | :attr:_data and need to validate the :class:`Tafra`. 407 | """ 408 | iter_values = iter(self._data.values()) 409 | self._rows = len(next(iter_values)) 410 | if self.check_rows and not all(len(v) == self._rows for v in iter_values): 411 | raise TypeError('Uneven length of data.') 412 | 413 | def _slice(self, _slice: slice) -> 'Tafra': 414 | """ 415 | Use a :class:`slice` to slice the :class:`Tafra`. 416 | 417 | Parameters 418 | ---------- 419 | _slice: slice 420 | The ``slice`` object. 421 | 422 | Returns 423 | ------- 424 | tafra: Tafra 425 | The sliced :class:`Tafra`. 426 | """ 427 | return Tafra( 428 | {column: np.atleast_1d(value[_slice]) 429 | for column, value in self._data.items()}, 430 | self._dtypes, 431 | validate=False 432 | ) 433 | 434 | def _iindex(self, index: int) -> 'Tafra': 435 | """ 436 | Use a :class`int` to slice the :class:`Tafra`. 437 | 438 | Parameters 439 | ---------- 440 | index: int 441 | 442 | Returns 443 | ------- 444 | tafra: Tafra 445 | The sliced :class:`Tafra`. 446 | """ 447 | return Tafra( 448 | {column: value[[index]] 449 | for column, value in self._data.items()}, 450 | self._dtypes, 451 | validate=False 452 | ) 453 | 454 | def _aindex(self, index: Sequence[_Union[int, bool]]) -> 'Tafra': 455 | """ 456 | Use numpy advanced indexing to slice the :class:`Tafra`. 457 | 458 | Parameters 459 | ---------- 460 | index: Sequence[Union[int, bool]] 461 | 462 | Returns 463 | ------- 464 | tafra: Tafra 465 | The sliced :class:`Tafra`. 
466 | """ 467 | return Tafra( 468 | {column: value[index] 469 | for column, value in self._data.items()}, 470 | self._dtypes, 471 | validate=False 472 | ) 473 | 474 | def _ndindex(self, index: np.ndarray) -> 'Tafra': 475 | """ 476 | Use :class:`numpy.ndarray` indexing to slice the :class:`Tafra`. 477 | 478 | Parameters 479 | ---------- 480 | index: np.ndarray 481 | 482 | Returns 483 | ------- 484 | tafra: Tafra 485 | The sliced :class:`Tafra`. 486 | """ 487 | if index.ndim != 1: 488 | raise IndexError(f'Indexing np.ndarray must ndim == 1, got ndim == {index.ndim}') 489 | 490 | return Tafra( 491 | {column: value[index] 492 | for column, value in self._data.items()}, 493 | self._dtypes, 494 | validate=False 495 | ) 496 | 497 | def _repr_pretty_(self, p: 'IPython.lib.pretty.RepresentationPrinter', # type: ignore # noqa 498 | cycle: bool) -> None: 499 | """ 500 | A dunder method for IPython to pretty print. 501 | 502 | Parameters 503 | ---------- 504 | p: IPython.lib.pretty.RepresentationPrinter 505 | IPython provides this class to handle the object representation. 506 | 507 | cycle: bool 508 | IPython has detected an infinite loop. Print an alternative represenation 509 | and return. 510 | 511 | Returns 512 | ------- 513 | None 514 | Calls p.text and returns. 515 | """ 516 | if cycle: 517 | p.text('Tafra(...)') 518 | else: 519 | p.text(self._pretty_format(lambda s: ' ' + pprint.pformat(s, indent=1)[1:].strip())) 520 | 521 | def _repr_html_(self) -> str: 522 | """ 523 | a dunder method for Jupyter Notebook to print HTML. 524 | """ 525 | return self.to_html() 526 | 527 | def _pretty_format(self, formatter: Callable[[object], str]) -> str: 528 | """ 529 | Format _data and _dtypes for pretty printing. 530 | 531 | Parameters 532 | ---------- 533 | formatter: Callable[[object], str] 534 | A formatter that operates on the _data and _dtypes :class:`dict`. 535 | 536 | Returns 537 | ------- 538 | string: str 539 | The formatted string for printing. 
540 | """ 541 | PATTERN = r'(, dtype=[a-z]+)(?=\))' 542 | 543 | return '\n'.join([ 544 | 'Tafra(data = {', 545 | f'{re.sub(PATTERN, "", formatter(self._data))},', 546 | 'dtypes = {', 547 | f'{re.sub(PATTERN, "", formatter(self._dtypes))},', 548 | f'rows = {self._rows})' 549 | ]) 550 | 551 | def pformat(self, indent: int = 1, width: int = 80, depth: Optional[int] = None, 552 | compact: bool = False) -> str: 553 | """ 554 | Format for pretty printing. Parameters are passed to 555 | :class:`pprint.PrettyPrinter`. 556 | 557 | Parameters 558 | ---------- 559 | indent: int 560 | Number of spaces to indent for each level of nesting. 561 | 562 | width: int 563 | Attempted maximum number of columns in the output. 564 | 565 | depth: Optional[int] 566 | The maximum depth to print out nested structures. 567 | 568 | compact: bool 569 | If true, several items will be combined in one line. 570 | 571 | Returns 572 | ------- 573 | formatted string: str 574 | A formatted string for pretty printing. 575 | """ 576 | return self._pretty_format( 577 | lambda s: indent * ' ' + pprint.pformat( 578 | s, indent, width, depth, compact=compact)[1:].strip()) 579 | 580 | def pprint(self, indent: int = 1, width: int = 80, depth: Optional[int] = None, 581 | compact: bool = False) -> None: 582 | """ 583 | Pretty print. Parameters are passed to :class:`pprint.PrettyPrinter`. 584 | 585 | Parameters 586 | ---------- 587 | indent: int 588 | Number of spaces to indent for each level of nesting. 589 | 590 | width: int 591 | Attempted maximum number of columns in the output. 592 | 593 | depth: Optional[int] 594 | The maximum depth to print out nested structures. 595 | 596 | compact: bool 597 | If true, several items will be combined in one line. 
598 | 599 | Returns 600 | ------- 601 | None: None 602 | """ 603 | print(self.pformat(indent, width, depth, compact=compact)) 604 | 605 | @staticmethod 606 | def _html_thead(columns: Iterable[Any]) -> str: 607 | """ 608 | Construct the table head of the HTML representation. 609 | 610 | Parameters 611 | ---------- 612 | columns: Iterable[Any] 613 | An iterable of items with defined func:`__repr__` methods. 614 | 615 | Returns 616 | ------- 617 | HTML: str 618 | The HTML table head. 619 | """ 620 | return '\n\n{th}\n\n' \ 621 | .format(th='\n'.join(f'{c}' for c in columns)) 622 | 623 | @staticmethod 624 | def _html_tr(row: Iterable[Any]) -> str: 625 | """ 626 | Construct each table row of the HTML representation. 627 | 628 | Parameters 629 | ---------- 630 | row: Iterable[Any] 631 | An iterable of items with defined func:`__repr__` methods. 632 | 633 | Returns 634 | ------- 635 | HTML: str 636 | The HTML table row. 637 | """ 638 | return '\n{td}\n' \ 639 | .format(td='\n'.join(f'{td}' for td in row)) 640 | 641 | @staticmethod 642 | def _html_tbody(tr: Iterable[str]) -> str: 643 | """ 644 | Construct the table body of the HTML representation. 645 | 646 | Parameters 647 | ---------- 648 | tr: Iterable[str] 649 | An iterable of HTML table rows. 650 | 651 | Returns 652 | ------- 653 | HTML: str 654 | The HTML table body. 655 | """ 656 | return '\n{tr}\n' \ 657 | .format(tr='\n'.join(tr)) 658 | 659 | @staticmethod 660 | def _html_table(thead: str, tbody: str) -> str: 661 | """ 662 | Construct the final table of the HTML representation. 663 | 664 | Parameters 665 | ---------- 666 | thead: str 667 | An HTML representation of the table head. 668 | 669 | tbody: str 670 | An HTML representation of the table body. 671 | 672 | Returns 673 | ------- 674 | HTML: str 675 | The HTML table. 676 | """ 677 | return f'\n{thead}\n{tbody}\n
' 678 | 679 | def to_html(self, n: int = 20) -> str: 680 | """ 681 | Construct an HTML table representation of the :class:`Tafra` data. 682 | 683 | Parameters 684 | ---------- 685 | n: int = 20 686 | Number of items to print. 687 | 688 | Returns 689 | ------- 690 | HTML: str 691 | The HTML table representation. 692 | """ 693 | thead = self._html_thead(chain([''], self._data.keys())) 694 | tr = chain( 695 | [self._html_tr(chain( 696 | ['dtype'], 697 | (self._dtypes[column] for column in self._data.keys()) 698 | ))], 699 | (self._html_tr(chain( 700 | [i], 701 | (v[i] for v in self._data.values()) 702 | )) 703 | for i in range(min(n, self._rows))) 704 | ) 705 | tbody = self._html_tbody(tr) 706 | return self._html_table(thead, tbody) 707 | 708 | def _ensure_valid(self, column: str, value: _Union[np.ndarray, Sequence[Any], Any], 709 | check_rows: bool = True, set_item: bool = False) -> None: 710 | """ 711 | Validate values as an :class:`np.ndarray` of equal length to :attr:`rows` before 712 | assignment. Will attempt to create a :class:`np.ndarray` if ``value`` is not one 713 | already, and will check that :attr`np.ndarray.ndim` ``== 1``. If 714 | :attr:`np.ndarray.ndim` ``> 1`` it will attempt :meth:`np.squeeze` on ``value``. 715 | 716 | Parameters 717 | ---------- 718 | column: str 719 | The column to assign to. 720 | 721 | value: Union[np.ndarray, Sequence[Any], Any] 722 | The value to be assigned. 
    def _ensure_valid(self, column: str, value: _Union[np.ndarray, Sequence[Any], Any],
                      check_rows: bool = True, set_item: bool = False) -> None:
        """
        Validate values as an :class:`np.ndarray` of equal length to :attr:`rows` before
        assignment. Will attempt to create a :class:`np.ndarray` if ``value`` is not one
        already, and will check that :attr:`np.ndarray.ndim` ``== 1``. If
        :attr:`np.ndarray.ndim` ``> 1`` it will attempt :meth:`np.squeeze` on ``value``.

        Parameters
        ----------
        column: str
            The column to assign to.

        value: Union[np.ndarray, Sequence[Any], Any]
            The value to be assigned.

        check_rows: bool = True
            If True, require ``len(value)`` to equal :attr:`rows`; if False,
            treat the value as a single-row assignment.

        set_item: bool = False
            If True, always write the (possibly coerced) value back into
            :attr:`_data`, even when no coercion occurred.

        Returns
        -------
        None: None
        """
        _type = type(value).__name__
        # remember the original object identity: a changed id after coercion
        # means the value must be written back into _data at the end
        id_value = id(value)
        rows = self._rows if check_rows else 1

        # coercion cascade: order matters -- str is Iterable, and Iterator is
        # Iterable, so the more specific checks must come first
        if value is None:
            # broadcast None over all rows
            value = np.full(rows, value)

        elif isinstance(value, np.ndarray):
            if value.ndim == 0:
                # 0-d scalar array: broadcast its item over all rows
                value = np.full(rows, value.item())
            elif value.ndim == 1 and value.shape[0] == 1 and rows > 1:
                # single-element array: broadcast to the full row count
                value = np.full(rows, value)

        elif isinstance(value, str):
            # a str is Iterable but must be treated as a scalar, not char-split
            value = np.full(rows, value)

        elif isinstance(value, Iterator):
            # materialize one-shot iterators before handing them to numpy
            value = np.asarray(tuple(value))

        elif isinstance(value, Iterable):
            value = np.asarray(value)

        elif not isinstance(value, Sized):
            # unsized scalar (int, float, ...): broadcast over all rows
            value = np.full(rows, value)

        assert isinstance(value, np.ndarray), \
            'Internal error: `Tafra` only supports assigning `ndarray`.'

        if value.ndim > 1:
            # try to collapse singleton dimensions down to 1-d
            sq_value = value.squeeze()
            if sq_value.ndim > 1:
                raise ValueError('`ndarray` or `np.squeeze(ndarray)` must have ndim == 1.')
            elif sq_value.ndim == 1:
                # if value was a single item, squeeze returns zero length item
                warnings.warn('`np.squeeze(ndarray)` applied to set ndim == 1.')
                warnings.resetwarnings()
                value = sq_value

        assert value.ndim >= 1, \
            'Interal error: `Tafra` only supports assigning ndim == 1.'

        if check_rows and len(value) != rows:
            raise ValueError(
                '`Tafra` must have consistent row counts.\n'
                f'This `Tafra` has {rows} rows. Assigned {_type} has {len(value)} rows.')

        # special parsing of various object types
        parsed_value = object_formatter.parse_dtype(value)
        if parsed_value is not None:
            value = parsed_value

        # have we modified value?
        if set_item or id(value) != id_value:
            self._data[column] = value
            self._dtypes[column] = self._format_dtype(value.dtype)

    def parse_object_dtypes(self) -> 'Tafra':
        """
        Parse the object dtypes using the :class:`ObjectFormatter` instance.

        Returns
        -------
        tafra: Tafra
            A copy of this :class:`Tafra` with object dtypes parsed.
        """
        tafra = self.copy()
        tafra.parse_object_dtypes_inplace()
        return tafra

    def parse_object_dtypes_inplace(self) -> None:
        """
        Inplace version.

        Parse the object dtypes using the :class:`ObjectFormatter` instance.

        Returns
        -------
        None: None
        """
        for column, value in self._data.items():
            # parse_dtype returns None when the column needs no conversion
            parsed_value = object_formatter.parse_dtype(value)
            if parsed_value is not None:
                self._data[column] = parsed_value
                self._dtypes[column] = self._format_dtype(parsed_value.dtype)

    def _validate_columns(self, columns: Iterable[str]) -> None:
        """
        Validate that the column name(s) exists in :attr:`_data`.

        Parameters
        ----------
        columns: Iterable[str]
            The column names to validate.

        Returns
        -------
        None: None
        """
        for column in columns:
            if column not in self._data.keys():
                raise ValueError(f'Column {column} does not exist in `tafra`.')

    def _validate_dtypes(self, dtypes: Dict[str, Any]) -> Dict[str, str]:
        """
        Validate that the dtypes are internally used names and that the columns exist in
        :attr:`_data`.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to validate.

        Returns
        -------
        dtypes: Dict[str, str]
            The validated types.
        """

        self._validate_columns(dtypes.keys())
        return {column: self._format_dtype(dtype) for column, dtype in dtypes.items()}
844 | Otherwise, pass through and let numpy raise error if it is not a valid dtype. 845 | 846 | Parameters 847 | ---------- 848 | dtype: Any 849 | The dtype to parse. 850 | 851 | Returns 852 | ------- 853 | dtype: str 854 | The parsed dtype. 855 | """ 856 | _dtype = np.dtype(dtype) 857 | name = _dtype.type.__name__ 858 | if 'str' in name: 859 | return 'str' 860 | 861 | return name.replace('_', '') 862 | 863 | @staticmethod 864 | def _reduce_dtype(dtype: Any) -> str: 865 | """ 866 | Parse a dtype to the base type. 867 | 868 | Parameters 869 | ---------- 870 | dtype: Any 871 | The dtype to parse. 872 | 873 | Returns 874 | ------- 875 | dtype: str 876 | The parsed dtype. 877 | """ 878 | name = np.dtype(dtype).type.__name__ 879 | m = re.search(r'([a-z]+)', name) 880 | if m: 881 | return m.group(1) 882 | 883 | # are there any dtypes without text names? 884 | return name # pragma: no cover 885 | 886 | @classmethod 887 | def from_records(cls, records: Iterable[Iterable[Any]], columns: Iterable[str], 888 | dtypes: Optional[Iterable[Any]] = None, **kwargs: Any) -> 'Tafra': 889 | """ 890 | Construct a :class:`Tafra` from an Iterator of records, e.g. from a SQL query. The 891 | records should be a nested Iterable, but can also be fed a cursor method such as 892 | ``cur.fetchmany()`` or ``cur.fetchall()``. 893 | 894 | Parameters 895 | ---------- 896 | records: ITerable[Iteralble[str]] 897 | The records to turn into a :class:`Tafra`. 898 | 899 | columns: Iterable[str] 900 | The column names to use. 901 | 902 | dtypes: Optional[Iterable[Any]] = None 903 | The dtypes of the columns. 904 | 905 | Returns 906 | ------- 907 | tafra: Tafra 908 | The constructed :class:`Tafra`. 
909 | """ 910 | if dtypes is None: 911 | return Tafra({column: value for column, value in zip(columns, zip(*records))}, **kwargs) 912 | 913 | return Tafra( 914 | {column: value for column, value in zip(columns, zip(*records))}, 915 | {column: value for column, value in zip(columns, dtypes)}, 916 | **kwargs 917 | ) 918 | 919 | @classmethod 920 | def from_series(cls, s: Series, dtype: Optional[str] = None, **kwargs: Any) -> 'Tafra': 921 | """ 922 | Construct a :class:`Tafra` from a :class:`pandas.Series`. If ``dtype`` is not 923 | given, take from :attr:`pandas.Series.dtype`. 924 | 925 | Parameters 926 | ---------- 927 | df: pandas.Series 928 | The series used to build the :class:`Tafra`. 929 | 930 | dtype: Optional[str] = None 931 | The dtypes of the column. 932 | 933 | Returns 934 | ------- 935 | tafra: Tafra 936 | The constructed :class:`Tafra`. 937 | """ 938 | if dtype is None: 939 | dtype = s.dtype 940 | dtypes = {s.name: cls._format_dtype(dtype)} 941 | 942 | return cls( 943 | {s.name: s.values.astype(dtypes[s.name])}, 944 | dtypes, 945 | **kwargs 946 | ) 947 | 948 | @classmethod 949 | def from_dataframe(cls, df: DataFrame, dtypes: Optional[Dict[str, Any]] = None, 950 | **kwargs: Any) -> 'Tafra': 951 | """ 952 | Construct a :class:`Tafra` from a :class:`pandas.DataFrame`. If ``dtypes`` are not 953 | given, take from :attr:`pandas.DataFrame.dtypes`. 954 | 955 | Parameters 956 | ---------- 957 | df: pandas.DataFrame 958 | The dataframe used to build the :class:`Tafra`. 959 | 960 | dtypes: Optional[Dict[str, Any]] = None 961 | The dtypes of the columns. 962 | 963 | Returns 964 | ------- 965 | tafra: Tafra 966 | The constructed :class:`Tafra`. 
967 | """ 968 | if dtypes is None: 969 | dtypes = {c: t for c, t in zip(df.columns, df.dtypes)} 970 | dtypes = {c: cls._format_dtype(t) for c, t in dtypes.items()} 971 | 972 | return cls( 973 | {c: df[c].values.astype(dtypes[c]) for c in df.columns}, 974 | {c: dtypes[c] for c in df.columns}, 975 | **kwargs 976 | ) 977 | 978 | @classmethod 979 | def read_sql(cls, query: str, cur: Cursor) -> 'Tafra': 980 | """ 981 | Execute a SQL SELECT statement using a :class:`pyodbc.Cursor` and return a Tuple 982 | of column names and an Iterator of records. 983 | 984 | Parameters 985 | ---------- 986 | query: str 987 | The SQL query. 988 | 989 | cur: pyodbc.Cursor 990 | The ``pyodbc`` cursor. 991 | 992 | Returns 993 | ------- 994 | tafra: Tafra 995 | The constructed :class:`Tafra`. 996 | """ 997 | cur.execute(query) 998 | 999 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 1000 | 1001 | head = cur.fetchone() 1002 | if head is None: 1003 | return Tafra({column: () for column in columns}) 1004 | 1005 | return Tafra.from_records(chain([head], cur.fetchall()), columns, dtypes) 1006 | 1007 | @classmethod 1008 | def read_sql_chunks(cls, query: str, cur: Cursor, chunksize: int = 100) -> Iterator['Tafra']: 1009 | """ 1010 | Execute a SQL SELECT statement using a :class:`pyodbc.Cursor` and return a Tuple 1011 | of column names and an Iterator of records. 1012 | 1013 | Parameters 1014 | ---------- 1015 | query: str 1016 | The SQL query. 1017 | 1018 | cur: pyodbc.Cursor 1019 | The ``pyodbc`` cursor. 1020 | 1021 | Returns 1022 | ------- 1023 | tafra: Tafra 1024 | The constructed :class:`Tafra`. 
1025 | """ 1026 | cur.execute(query) 1027 | 1028 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 1029 | 1030 | head = cur.fetchone() 1031 | if head is None: 1032 | yield Tafra({column: () for column in columns}) 1033 | return 1034 | 1035 | def chunks(iterable: Iterable[Any], chunksize: int = 1000) -> Iterator[Iterable[Any]]: 1036 | for f in iterable: 1037 | yield list(chain([f], islice(iterable, chunksize - 1))) 1038 | 1039 | for chunk in chunks(chain([head], cur), chunksize): 1040 | yield Tafra.from_records(chunk, columns, dtypes) 1041 | 1042 | @classmethod 1043 | def read_csv(cls, csv_file: _Union[str, Path, TextIOWrapper, IO[str]], guess_rows: int = 5, 1044 | missing: Optional[str] = '', dtypes: Optional[Dict[str, Any]] = None, 1045 | **csvkw: Dict[str, Any] 1046 | ) -> 'Tafra': 1047 | """ 1048 | Read a CSV file with a header row, infer the types of each column, 1049 | and return a Tafra containing the file's contents. 1050 | 1051 | Parameters 1052 | ---------- 1053 | csv_file: Union[str, TextIOWrapper] 1054 | The path to the CSV file, or an open file-like object. 1055 | 1056 | guess_rows: int 1057 | The number of rows to use when guessing column types. 1058 | 1059 | dtypes: Optional[Dict[str, str]] 1060 | dtypes by column name; by default, all dtypes will be inferred 1061 | from the file contents. 1062 | 1063 | **csvkw: Dict[str, Any] 1064 | Additional keyword arguments passed to csv.reader. 1065 | 1066 | Returns 1067 | ------- 1068 | tafra: Tafra 1069 | The constructed :class:`Tafra`. 
1070 | """ 1071 | reader = CSVReader(cast(_Union[str, Path, TextIOWrapper], csv_file), 1072 | guess_rows, missing, **csvkw) 1073 | return Tafra(reader.read(), dtypes=dtypes) 1074 | 1075 | @classmethod 1076 | def as_tafra(cls, maybe_tafra: _Union['Tafra', DataFrame, Series, Dict[str, Any], Any] 1077 | ) -> Optional['Tafra']: 1078 | """ 1079 | Returns the unmodified `tafra`` if already a :class:`Tafra`, else construct a 1080 | :class:`Tafra` from known types or subtypes of :class:`DataFrame` or `dict`. 1081 | Structural subtypes of :class:`DataFrame` or :class:`Series` are also valid, 1082 | as are classes that have ``cls.__name__ == 'DataFrame'`` or 1083 | ``cls.__name__ == 'Series'``. 1084 | 1085 | Parameters 1086 | ---------- 1087 | maybe_tafra: Union['tafra', DataFrame] 1088 | The object to ensure is a :class:`Tafra`. 1089 | 1090 | Returns 1091 | ------- 1092 | tafra: Optional[Tafra] 1093 | The :class:`Tafra`, or None is ``maybe_tafra`` is an unknown 1094 | type. 1095 | """ 1096 | if isinstance(maybe_tafra, Tafra): 1097 | return maybe_tafra 1098 | 1099 | elif isinstance(maybe_tafra, Series): # pragma: no cover 1100 | return cls.from_series(maybe_tafra) 1101 | 1102 | elif type(maybe_tafra).__name__ == 'Series': # pragma: no cover 1103 | return cls.from_series(cast(Series, maybe_tafra)) 1104 | 1105 | elif isinstance(maybe_tafra, DataFrame): # pragma: no cover 1106 | return cls.from_dataframe(maybe_tafra) 1107 | 1108 | elif type(maybe_tafra).__name__ == 'DataFrame': # pragma: no cover 1109 | return cls.from_dataframe(cast(DataFrame, maybe_tafra)) 1110 | 1111 | elif isinstance(maybe_tafra, dict): 1112 | return cls(maybe_tafra) 1113 | 1114 | raise TypeError(f'Unknown type `{type(maybe_tafra)}` for conversion to `Tafra`') 1115 | 1116 | @property 1117 | def columns(self) -> Tuple[str, ...]: 1118 | """ 1119 | The names of the columns. Equivalent to `Tafra`.keys(). 1120 | 1121 | Returns 1122 | ------- 1123 | columns: Tuple[str, ...] 1124 | The column names. 
1125 | """ 1126 | return tuple(self._data.keys()) 1127 | 1128 | @columns.setter 1129 | def columns(self, value: Any) -> None: 1130 | raise ValueError('Assignment to `columns` is forbidden.') 1131 | 1132 | @property 1133 | def rows(self) -> int: 1134 | """ 1135 | The number of rows of the first item in :attr:`data`. The :func:`len()` 1136 | of all items have been previously validated. 1137 | 1138 | Returns 1139 | ------- 1140 | rows: int 1141 | The number of rows of the :class:`Tafra`. 1142 | """ 1143 | return self.__len__() 1144 | 1145 | @rows.setter 1146 | def rows(self, value: Any) -> None: 1147 | raise ValueError('Assignment to `rows` is forbidden.') 1148 | 1149 | @property # type: ignore 1150 | def data(self) -> Dict[str, np.ndarray]: 1151 | """ 1152 | The :class:`Tafra` data. 1153 | 1154 | Returns 1155 | ------- 1156 | data: Dict[str, np.ndarray] 1157 | The data. 1158 | """ 1159 | return self._data 1160 | 1161 | @data.setter 1162 | def data(self, value: Any) -> None: 1163 | raise ValueError('Assignment to `data` is forbidden.') 1164 | 1165 | @property # type: ignore 1166 | def dtypes(self) -> Dict[str, str]: 1167 | """ 1168 | The :class:`Tafra` dtypes. 1169 | 1170 | Returns 1171 | ------- 1172 | dtypes: Dict[str, str] 1173 | The dtypes. 1174 | """ 1175 | return self._dtypes 1176 | 1177 | @dtypes.setter 1178 | def dtypes(self, value: Any) -> None: 1179 | raise ValueError('Assignment to `dtypes` is forbidden.') 1180 | 1181 | @property 1182 | def size(self) -> int: 1183 | """ 1184 | The :class:`Tafra` size. 1185 | 1186 | Returns 1187 | ------- 1188 | size: int 1189 | The size. 1190 | """ 1191 | return self.rows * len(self.columns) 1192 | 1193 | @size.setter 1194 | def size(self, value: Any) -> None: 1195 | raise ValueError('Assignment to `size` is forbidden.') 1196 | 1197 | @property 1198 | def ndim(self) -> int: 1199 | """ 1200 | The :class:`Tafra` number of dimensions. 1201 | 1202 | Returns 1203 | ------- 1204 | ndim: int 1205 | The number of dimensions. 
1206 | """ 1207 | return max(2, len(self.columns)) 1208 | 1209 | @ndim.setter 1210 | def ndim(self, value: Any) -> None: 1211 | raise ValueError('Assignment to `ndim` is forbidden.') 1212 | 1213 | @property 1214 | def shape(self) -> Tuple[int, int]: 1215 | """ 1216 | The :class:`Tafra` shape. 1217 | 1218 | Returns 1219 | ------- 1220 | shape: int 1221 | The shape. 1222 | """ 1223 | return self.rows, len(self.columns) 1224 | 1225 | @shape.setter 1226 | def shape(self, value: Any) -> None: 1227 | raise ValueError('Assignment to `shape` is forbidden.') 1228 | 1229 | def row_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1230 | """ 1231 | Map a function over rows. To apply to specific columns, use :meth:`select` 1232 | first. The function must operate on :class:`Tafra`. 1233 | 1234 | Parameters 1235 | ---------- 1236 | fn: Callable[..., Any] 1237 | The function to map. 1238 | 1239 | *args: Any 1240 | Additional positional arguments to ``fn``. 1241 | 1242 | **kwargs: Any 1243 | Additional keyword arguments to ``fn``. 1244 | 1245 | Returns 1246 | ------- 1247 | iter_tf: Iterator[Any] 1248 | An iterator to map the function. 1249 | """ 1250 | return (fn(tf, *args, **kwargs) for tf in self.__iter__()) 1251 | 1252 | def tuple_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1253 | """ 1254 | Map a function over rows. This is faster than :meth:`row_map`. To apply to 1255 | specific columns, use :meth:`select` first. The function must operate on 1256 | :class:`NamedTuple` from :meth:`itertuples`. 1257 | 1258 | Parameters 1259 | ---------- 1260 | fn: Callable[..., Any] 1261 | The function to map. 1262 | 1263 | name: Optional[str] = 'Tafra' 1264 | The name for the :class:`NamedTuple`. If ``None``, construct a 1265 | :class:`Tuple` instead. Must be given as a keyword argument. 1266 | 1267 | *args: Any 1268 | Additional positional arguments to ``fn``. 
1269 | 1270 | **kwargs: Any 1271 | Additional keyword arguments to ``fn``. 1272 | 1273 | Returns 1274 | ------- 1275 | iter_tf: Iterator[Any] 1276 | An iterator to map the function. 1277 | """ 1278 | name = kwargs.pop('name', 'Tafra') 1279 | return (fn(tf, *args, **kwargs) for tf in self.itertuples(name)) 1280 | 1281 | def col_map(self, fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Iterator[Any]: 1282 | """ 1283 | Map a function over columns. To apply to specific columns, use :meth:`select` 1284 | first. The function must operate on :class:`Tuple[str, np.ndarray]`. 1285 | 1286 | Parameters 1287 | ---------- 1288 | fn: Callable[..., Any] 1289 | The function to map. 1290 | 1291 | *args: Any 1292 | Additional positional arguments to ``fn``. 1293 | 1294 | **kwargs: Any 1295 | Additional keyword arguments to ``fn``. 1296 | 1297 | Returns 1298 | ------- 1299 | iter_tf: Iterator[Any] 1300 | An iterator to map the function. 1301 | """ 1302 | 1303 | return (fn(value, *args, **kwargs) for column, value in self.itercols()) 1304 | 1305 | def key_map(self, fn: Callable[..., Any], 1306 | *args: Any, **kwargs: Any) -> Iterator[Tuple[str, Any]]: 1307 | """ 1308 | Map a function over columns like :meth:col_map, but return :class:`Tuple` of the 1309 | key with the function result. To apply to specific columns, use :meth:`select` 1310 | first. The function must operate on :class:`Tuple[str, np.ndarray]`. 1311 | 1312 | Parameters 1313 | ---------- 1314 | fn: Callable[..., Any] 1315 | The function to map. 1316 | 1317 | *args: Any 1318 | Additional positional arguments to ``fn``. 1319 | 1320 | **kwargs: Any 1321 | Additional keyword arguments to ``fn``. 1322 | 1323 | Returns 1324 | ------- 1325 | iter_tf: Iterator[Any] 1326 | An iterator to map the function. 
1327 | """ 1328 | return ((column, fn(value, *args, **kwargs)) for column, value in self.itercols()) 1329 | 1330 | def pipe(self, fn: Callable[Concatenate['Tafra', P], 'Tafra'], 1331 | *args: Any, **kwargs: Any) -> 'Tafra': 1332 | """ 1333 | Apply a function to the :class:`Tafra` and return the resulting :class:`Tafra`. Primarily 1334 | used to build a tranformer pipeline. 1335 | 1336 | Parameters 1337 | ---------- 1338 | fn: Callable[[], 'Tafra'] 1339 | The function to apply. 1340 | 1341 | *args: Any 1342 | Additional positional arguments to ``fn``. 1343 | 1344 | **kwargs: Any 1345 | Additional keyword arguments to ``fn``. 1346 | 1347 | Returns 1348 | ------- 1349 | tafra: Tafra 1350 | A new :class:`Tafra` result of the function. 1351 | """ 1352 | return fn(self, *args, **kwargs) 1353 | 1354 | def select(self, columns: Iterable[str]) -> 'Tafra': 1355 | """ 1356 | Use column names to slice the :class:`Tafra` columns analogous to SQL SELECT. 1357 | This does not copy the data. Call :meth:`copy` to obtain a copy of the sliced 1358 | data. 1359 | 1360 | Parameters 1361 | ---------- 1362 | columns: Iterable[str] 1363 | The column names to slice from the :class:`Tafra`. 1364 | 1365 | Returns 1366 | ------- 1367 | tafra: Tafra 1368 | the :class:`Tafra` with the sliced columns. 1369 | """ 1370 | if isinstance(columns, str): 1371 | columns = [columns] 1372 | self._validate_columns(columns) 1373 | 1374 | return Tafra( 1375 | {column: self._data[column] for column in columns}, 1376 | {column: self._dtypes[column] for column in columns}, 1377 | validate=False 1378 | ) 1379 | 1380 | def head(self, n: int = 5) -> 'Tafra': 1381 | """ 1382 | Display the head of the :class:`Tafra`. 1383 | 1384 | Parameters 1385 | ---------- 1386 | n: int = 5 1387 | The number of rows to display. 1388 | 1389 | Returns 1390 | ------- 1391 | None: None 1392 | """ 1393 | return self._slice(slice(n)) 1394 | 1395 | def keys(self) -> KeysView[str]: 1396 | """ 1397 | Return the keys of :attr:`data`, i.e. 
like :meth:`dict.keys()`. 1398 | 1399 | Returns 1400 | ------- 1401 | data keys: KeysView[str] 1402 | The keys of the data property. 1403 | """ 1404 | return self._data.keys() 1405 | 1406 | def values(self) -> ValuesView[np.ndarray]: 1407 | """ 1408 | Return the values of :attr:`data`, i.e. like :meth:`dict.values()`. 1409 | 1410 | Returns 1411 | ------- 1412 | data values: ValuesView[np.ndarray] 1413 | The values of the data property. 1414 | """ 1415 | return self._data.values() 1416 | 1417 | def items(self) -> ItemsView[str, np.ndarray]: 1418 | """ 1419 | Return the items of :attr:`data`, i.e. like :meth:`dict.items()`. 1420 | 1421 | Returns 1422 | ------- 1423 | items: ItemsView[str, np.ndarray] 1424 | The data items. 1425 | """ 1426 | return self._data.items() 1427 | 1428 | def get(self, key: str, default: Any = None) -> Any: 1429 | """ 1430 | Return from the :meth:`get` function of :attr:`data`, i.e. like 1431 | :meth:`dict.get()`. 1432 | 1433 | Parameters 1434 | ---------- 1435 | key: str 1436 | The key value in the data property. 1437 | 1438 | default: Any 1439 | The default to return if the key does not exist. 1440 | 1441 | Returns 1442 | ------- 1443 | value: Any 1444 | The value for the key, or the default if the key does not 1445 | exist. 1446 | """ 1447 | return self._data.get(key, default) 1448 | 1449 | def update(self, other: 'Tafra') -> 'Tafra': 1450 | """ 1451 | Update the data and dtypes of this :class:`Tafra` with another :class:`Tafra`. 1452 | Length of rows must match, while data of different ``dtype`` will overwrite. 1453 | 1454 | Parameters 1455 | ---------- 1456 | other: Tafra 1457 | The other :class:`Tafra` from which to update. 1458 | 1459 | Returns 1460 | ------- 1461 | None: None 1462 | """ 1463 | tafra = self.copy() 1464 | tafra.update_inplace(other) 1465 | return tafra 1466 | 1467 | def update_inplace(self, other: 'Tafra') -> None: 1468 | """ 1469 | Inplace version. 
    def _coalesce_dtypes(self) -> None:
        """
        Update :attr:`dtypes` with missing keys that exist in :attr:`data`.

        **Must be called if :attr:`data` or :attr:`dtypes` is directly modified!**

        Returns
        -------
        None: None
        """
        for column in self._data.keys():
            if column not in self._dtypes:
                # derive the missing dtype entry from the stored ndarray
                self._dtypes[column] = self._format_dtype(self._data[column].dtype)

    def update_dtypes(self, dtypes: Dict[str, Any]) -> 'Tafra':
        """
        Apply new dtypes.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to update. If ``None``, create from entries in :attr:`data`.

        Returns
        -------
        tafra: Tafra
            The updated :class:`Tafra`.
        """
        tafra = self.copy()
        tafra.update_dtypes_inplace(dtypes)
        return tafra

    def update_dtypes_inplace(self, dtypes: Dict[str, Any]) -> None:
        """
        Inplace version.

        Apply new dtypes.

        Parameters
        ----------
        dtypes: Dict[str, Any]
            The dtypes to update. If ``None``, create from entries in :attr:`data`.

        Returns
        -------
        None: None
        """
        # normalize dtype spellings and check the columns exist
        dtypes = self._validate_dtypes(dtypes)
        self._dtypes.update(dtypes)

        for column in dtypes.keys():
            # only cast columns whose stored dtype actually differs
            if self._format_dtype(self._data[column].dtype) != self._dtypes[column]:
                try:
                    self._data[column] = self._data[column].astype(self._dtypes[column])
                except ValueError:
                    # fall back: replace sentinel values (currently only the
                    # empty string) with None, then retry the cast
                    REPL_VALS = ['', ]
                    for repl_val in REPL_VALS:
                        where_repl = np.equal(self._data[column], repl_val)
                        self._data[column][where_repl] = None
                    # NOTE(review): whether this retry belongs inside or after the
                    # loop is ambiguous in this copy of the source; with a single
                    # REPL_VALS entry the behavior is identical -- confirm upstream
                    self._data[column] = self._data[column].astype(self._dtypes[column])

    def rename(self, renames: Dict[str, str]) -> 'Tafra':
        """
        Rename columns in the :class:`Tafra` from a :class:`dict`.

        Parameters
        ----------
        renames: Dict[str, str]
            The map from current names to new names.

        Returns
        -------
        tafra: Tafra
            The :class:`Tafra` with updated names.
        """

        tafra = self.copy()
        tafra.rename_inplace(renames)
        return tafra

    def rename_inplace(self, renames: Dict[str, str]) -> None:
        """
        In-place version.

        Rename columns in the :class:`Tafra` from a :class:`dict`.

        Parameters
        ----------
        renames: Dict[str, str]
            The map from current names to new names.

        Returns
        -------
        None: None
        """
        self._validate_columns(renames.keys())

        for cur, new in renames.items():
            # pop + reinsert keeps _data and _dtypes keys in sync
            self._data[new] = self._data.pop(cur)
            self._dtypes[new] = self._dtypes.pop(cur)
        return None
1594 | """ 1595 | self._validate_columns(renames.keys()) 1596 | 1597 | for cur, new in renames.items(): 1598 | self._data[new] = self._data.pop(cur) 1599 | self._dtypes[new] = self._dtypes.pop(cur) 1600 | return None 1601 | 1602 | def delete(self, columns: Iterable[str]) -> 'Tafra': 1603 | """ 1604 | Remove a column from :attr:`data` and :attr:`dtypes`. 1605 | 1606 | Parameters 1607 | ---------- 1608 | column: str 1609 | The column to remove. 1610 | 1611 | Returns 1612 | ------- 1613 | tafra: Optional[Tafra] 1614 | The :class:`Tafra` with the deleted column. 1615 | """ 1616 | if isinstance(columns, str): 1617 | columns = [columns] 1618 | self._validate_columns(columns) 1619 | 1620 | return Tafra( 1621 | {column: value.copy() for column, value in self._data.items() 1622 | if column not in columns}, 1623 | {column: value for column, value in self._dtypes.items() 1624 | if column not in columns}, 1625 | validate=False 1626 | ) 1627 | 1628 | def delete_inplace(self, columns: Iterable[str]) -> None: 1629 | """ 1630 | In-place version. 1631 | 1632 | Remove a column from :attr:`data` and :attr:`dtypes`. 1633 | 1634 | Parameters 1635 | ---------- 1636 | column: str 1637 | The column to remove. 1638 | 1639 | Returns 1640 | ------- 1641 | tafra: Optional[Tafra] 1642 | The :class:`Tafra` with the deleted column. 1643 | """ 1644 | if isinstance(columns, str): 1645 | columns = [columns] 1646 | self._validate_columns(columns) 1647 | 1648 | for column in columns: 1649 | _ = self._data.pop(column, None) 1650 | _ = self._dtypes.pop(column, None) 1651 | 1652 | def copy(self, order: str = 'C') -> 'Tafra': 1653 | """ 1654 | Create a copy of a :class:`Tafra`. 1655 | 1656 | Parameters 1657 | ---------- 1658 | order: str = 'C' {‘C’, ‘F’, ‘A’, ‘K’} 1659 | Controls the memory layout of the copy. ‘C’ means C-order, ‘F’ means 1660 | F-order, ‘A’ means ‘F’ if a is Fortran contiguous, ‘C’ otherwise. ‘K’ 1661 | means match the layout of a as closely as possible. 
1662 | 1663 | Returns 1664 | ------- 1665 | tafra: Tafra 1666 | A copied :class:`Tafra`. 1667 | """ 1668 | return Tafra( 1669 | {column: value.copy(order=order) 1670 | for column, value in self._data.items()}, 1671 | self._dtypes.copy(), 1672 | validate=False 1673 | ) 1674 | 1675 | def coalesce(self, column: str, fills: Iterable[ 1676 | Iterable[_Union[None, str, int, float, bool, np.ndarray]] 1677 | ]) -> np.ndarray: 1678 | """ 1679 | Fill ``None`` values from ``fills``. Analogous to ``SQL COALESCE`` or 1680 | :meth:`pandas.fillna`. 1681 | 1682 | Parameters 1683 | ---------- 1684 | column: str 1685 | The column to coalesce. 1686 | 1687 | fills: Iterable[Union[str, int, float, bool, np.ndarray]: 1688 | 1689 | Returns 1690 | ------- 1691 | data: np.ndarray 1692 | The coalesced data. 1693 | """ 1694 | # TODO: handle dtype? 1695 | iter_fills = iter(fills) 1696 | head = next(iter_fills) 1697 | 1698 | if column in self._data.keys(): 1699 | value = self._data[column].copy() 1700 | else: 1701 | value = np.empty(self._rows, np.asarray(head).dtype) 1702 | 1703 | for _fill in chain([head], iter_fills): 1704 | fill = np.atleast_1d(_fill) 1705 | where_na = np.full(self._rows, False) 1706 | where_na |= value == np.array([None]) 1707 | try: 1708 | where_na |= np.isnan(value) 1709 | except: 1710 | pass 1711 | 1712 | if len(fill) == 1: 1713 | value[where_na] = fill 1714 | else: 1715 | value[where_na] = fill[where_na] 1716 | 1717 | return value 1718 | 1719 | def coalesce_inplace(self, column: str, fills: Iterable[ 1720 | Iterable[_Union[None, str, int, float, bool, np.ndarray]] 1721 | ]) -> None: 1722 | """ 1723 | In-place version. 1724 | 1725 | Fill ``None`` values from ``fills``. Analogous to ``SQL COALESCE`` or 1726 | :meth:`pandas.fillna`. 1727 | 1728 | Parameters 1729 | ---------- 1730 | column: str 1731 | The column to coalesce. 
1732 | 1733 | fills: Iterable[Union[str, int, float, bool, np.ndarray]: 1734 | 1735 | Returns 1736 | ------- 1737 | data: np.ndarray 1738 | The coalesced data. 1739 | """ 1740 | self._data[column] = self.coalesce(column, fills) 1741 | self.update_dtypes_inplace({column: self._data[column].dtype}) 1742 | 1743 | def _cast_record(self, dtype: str, data: np.ndarray, cast_null: bool) -> Optional[float]: 1744 | """ 1745 | Casts needed to generate records for database insert. 1746 | 1747 | Will cast ``np.nan`` to ``None``. Requires changing ``dtype`` to 1748 | ``object``. 1749 | 1750 | Parameters 1751 | ---------- 1752 | dtype: str 1753 | The dtype of the data value. 1754 | 1755 | data: np.ndarray 1756 | The data to have its values cast. 1757 | 1758 | cast_null: bool 1759 | Perform the cast for ``np.nan`` 1760 | 1761 | Returns 1762 | ------- 1763 | value: Any 1764 | The cast value. 1765 | """ 1766 | _dtype = self._reduce_dtype(dtype) 1767 | value: Any = RECORD_TYPE[_dtype](data.item()) 1768 | if cast_null and _dtype == 'float' and np.isnan(data.item()): 1769 | return None 1770 | return value 1771 | 1772 | def to_records(self, columns: Optional[Iterable[str]] = None, 1773 | cast_null: bool = True) -> Iterator[Tuple[Any, ...]]: 1774 | """ 1775 | Return a :class:`Iterator` of :class:`Tuple`, each being a record (i.e. row) and 1776 | allowing heterogeneous typing. Useful for e.g. sending records back to a 1777 | database. 1778 | 1779 | Parameters 1780 | ---------- 1781 | columns: Optional[Iterable[str]] = None 1782 | The columns to extract. If ``None``, extract all columns. 1783 | 1784 | cast_null: bool 1785 | Cast ``np.nan`` to None. 
Necessary for :mod:``pyodbc`` 1786 | 1787 | Returns 1788 | ------- 1789 | records: Iterator[Tuple[Any, ...]] 1790 | """ 1791 | if columns is None: 1792 | columns = self.columns 1793 | else: 1794 | if isinstance(columns, str): 1795 | columns = [columns] 1796 | self._validate_columns(columns) 1797 | 1798 | return (tuple( 1799 | None if len(self._data[c]) <= row else self._cast_record( 1800 | self._dtypes[c], self._data[c][[row]], 1801 | cast_null 1802 | ) 1803 | for c in columns) 1804 | for row in range(self._rows)) 1805 | 1806 | def to_list(self, columns: Optional[Iterable[str]] = None, 1807 | inner: bool = False) -> _Union[List[np.ndarray], List[List[Any]]]: 1808 | """ 1809 | Return a list of homogeneously typed columns (as :class:`numpy.ndarray`). If a 1810 | generator is needed, use :meth:`to_records`. If ``inner == True`` each column 1811 | will be cast from :class:`numpy.ndarray` to a :class:`List`. 1812 | 1813 | Parameters 1814 | ---------- 1815 | columns: Optional[Iterable[str]] = None 1816 | The columns to extract. If ``None``, extract all columns. 1817 | 1818 | inner: bool = False 1819 | Cast all :class:`np.ndarray` to :class`List`. 1820 | 1821 | Returns 1822 | ------- 1823 | list: Union[List[np.ndarray], List[List[Any]]] 1824 | """ 1825 | if columns is None: 1826 | columns = self.columns 1827 | else: 1828 | if isinstance(columns, str): 1829 | columns = [columns] 1830 | self._validate_columns(columns) 1831 | 1832 | if inner: 1833 | return [list(self._data[c]) for c in columns] 1834 | return [self._data[c] for c in columns] 1835 | 1836 | def to_tuple(self, columns: Optional[Iterable[str]] = None, name: Optional[str] = 'Tafra', 1837 | inner: bool = False) -> _Union[Tuple[np.ndarray], Tuple[Tuple[Any, ...]]]: 1838 | """ 1839 | Return a :class:`NamedTuple` or :class:`Tuple`. If a generator is needed, use 1840 | :meth:`to_records`. If ``inner == True`` each column will be cast from 1841 | :class:`np.ndarray` to a :class:`Tuple`. 
If `name` is `None`, returns a 1842 | :class:`Tuple` instead. 1843 | 1844 | Parameters 1845 | ---------- 1846 | columns: Optional[Iterable[str]] = None 1847 | The columns to extract. If ``None``, extract all columns. 1848 | 1849 | name: Optional[str] = 'Tafra' 1850 | The name for the :class:`NamedTuple`. If ``None``, construct a 1851 | :class:`Tuple` instead. 1852 | 1853 | inner: bool = False 1854 | Cast all :class:`np.ndarray` to :class`List`. 1855 | 1856 | Returns 1857 | ------- 1858 | list: Union[Tuple[np.ndarray], Tuple[Tuple[Any, ...]]] 1859 | """ 1860 | if columns is None: 1861 | columns = self.columns 1862 | else: 1863 | if isinstance(columns, str): 1864 | columns = [columns] 1865 | self._validate_columns(columns) 1866 | 1867 | if name is None: 1868 | if inner: 1869 | return tuple(tuple(self._data[c]) for c in columns) # type: ignore 1870 | return tuple(self._data[c] for c in columns) # type: ignore 1871 | 1872 | TafraNT = namedtuple(name, columns, rename=True) # type: ignore 1873 | 1874 | if inner: 1875 | return TafraNT._make((tuple(self._data[c]) for c in columns)) # type: ignore 1876 | return TafraNT._make((self._data[c] for c in columns)) # type: ignore 1877 | 1878 | def to_array(self, columns: Optional[Iterable[str]] = None) -> np.ndarray: 1879 | """ 1880 | Return an object array. 1881 | 1882 | Parameters 1883 | ---------- 1884 | columns: Optional[Iterable[str]] = None 1885 | The columns to extract. If ``None``, extract all columns. 1886 | 1887 | Returns 1888 | ------- 1889 | array: np.ndarray 1890 | """ 1891 | if columns is None: 1892 | columns = self.columns 1893 | else: 1894 | if isinstance(columns, str): 1895 | columns = [columns] 1896 | self._validate_columns(columns) 1897 | 1898 | return np.array([self._data[c] for c in columns], dtype=object).T 1899 | 1900 | def to_pandas(self, columns: Optional[Iterable[str]] = None) -> DataFrame: 1901 | """ 1902 | Construct a :class:`pandas.DataFrame`. 
1903 | 1904 | Parameters 1905 | ---------- 1906 | columns: Iterable[str] 1907 | The columns to write. IF ``None``, write all columns. 1908 | 1909 | Returns 1910 | ------- 1911 | dataframe: :class:`pandas.DataFrame` 1912 | """ 1913 | try: 1914 | import pandas as pd # type: ignore 1915 | except ImportError as e: # pragma: no cover 1916 | raise ImportError('`pandas` does not appear to be installed.') 1917 | 1918 | if columns is None: 1919 | columns = self.columns 1920 | else: 1921 | if isinstance(columns, str): 1922 | columns = [columns] 1923 | self._validate_columns(columns) 1924 | 1925 | return pd.DataFrame({ 1926 | column: pd.Series(value) for column, value in self._data.items() 1927 | if column in columns 1928 | }) 1929 | 1930 | def to_csv(self, filename: _Union[str, Path, TextIOWrapper, IO[str]], 1931 | columns: Optional[Iterable[str]] = None) -> None: 1932 | """ 1933 | Write the :class:`Tafra` to a CSV. 1934 | 1935 | Parameters 1936 | ---------- 1937 | filename: Union[str, Path] 1938 | The path of the filename to write. 1939 | 1940 | columns: Iterable[str] 1941 | The columns to write. IF ``None``, write all columns. 
1942 | """ 1943 | if columns is None: 1944 | columns = self.columns 1945 | else: 1946 | if isinstance(columns, str): 1947 | columns = [columns] 1948 | self._validate_columns(columns) 1949 | 1950 | if isinstance(filename, (str, Path)): 1951 | f = open(filename, 'w', newline='') 1952 | should_close = True 1953 | 1954 | elif isinstance(filename, TextIOWrapper): 1955 | if 'w' not in filename.mode: 1956 | raise ValueError(f'file must be opened in write mode, got {filename.mode}') 1957 | f = filename 1958 | should_close = False 1959 | 1960 | f.reconfigure(newline='') 1961 | 1962 | writer = csv.writer(f, delimiter=',', quotechar='"') 1963 | writer.writerow((column for column in self._data.keys() if column in columns)) 1964 | writer.writerows(self.to_records(columns)) 1965 | 1966 | if should_close: 1967 | f.close() 1968 | 1969 | def union(self, other: 'Tafra') -> 'Tafra': 1970 | """ 1971 | Helper function to implement :meth:`tafra.group.Union.apply`. 1972 | 1973 | Union two :class:`Tafra` together. Analogy to SQL UNION or `pandas.append`. All 1974 | column names and dtypes must match. 1975 | 1976 | Parameters 1977 | ---------- 1978 | other: Tafra 1979 | The other tafra to union. 1980 | 1981 | Returns 1982 | ------- 1983 | tafra: Tafra 1984 | A new tafra with the unioned data. 1985 | """ 1986 | return Union().apply(self, other) 1987 | 1988 | def union_inplace(self, other: 'Tafra') -> None: 1989 | """ 1990 | Inplace version. 1991 | 1992 | 1993 | Helper function to implement :meth:`tafra.group.Union.apply_inplace`. 1994 | 1995 | Union two :class:`Tafra` together. Analogy to SQL UNION or `pandas.append`. All 1996 | column names and dtypes must match. 1997 | 1998 | Parameters 1999 | ---------- 2000 | other: Tafra 2001 | The other tafra to union. 
2002 | 2003 | Returns 2004 | ------- 2005 | None: None 2006 | """ 2007 | Union().apply_inplace(self, other) 2008 | 2009 | def group_by(self, columns: Iterable[str], aggregation: 'InitAggregation' = {}, 2010 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] = dict()) -> 'Tafra': 2011 | """ 2012 | Helper function to implement :meth:`tafra.group.GroupBy.apply`. 2013 | 2014 | Aggregation by a set of unique values. 2015 | 2016 | Analogy to SQL ``GROUP BY``, not :meth:`pandas.DataFrame.groupby()`. 2017 | 2018 | Parameters 2019 | ---------- 2020 | columns: Iterable[str] 2021 | The column names to group by. 2022 | 2023 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 2024 | Tuple[Callable[[np.ndarray], Any], str]]] 2025 | Optional. A mapping for columns and aggregation functions. Should be 2026 | given as {'column': fn} or {'new_column': (fn, 'column')}. 2027 | 2028 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 2029 | Optional. A mapping for new columns names to the function to apply to 2030 | the enumeration. Should be given as {'new_column': fn}. 2031 | 2032 | Returns 2033 | ------- 2034 | tafra: Tafra 2035 | The aggregated :class:`Tafra`. 2036 | """ 2037 | return GroupBy(columns, aggregation, iter_fn).apply(self) 2038 | 2039 | def transform(self, columns: Iterable[str], aggregation: 'InitAggregation' = {}, 2040 | iter_fn: Dict[str, Callable[[np.ndarray], Any]] = dict()) -> 'Tafra': 2041 | """ 2042 | Helper function to implement :meth:`tafra.group.Transform.apply`. 2043 | 2044 | Apply a function to each unique set of values and join to the original table. 2045 | Analogy to :meth:`pandas.DataFrame.groupby().transform()`, 2046 | i.e. a SQL ``GROUP BY`` and ``LEFT JOIN`` back to the original table. 2047 | 2048 | Parameters 2049 | ---------- 2050 | group_by: Iterable[str] 2051 | The column names to group by. 2052 | 2053 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 2054 | Tuple[Callable[[np.ndarray], Any], str]]] 2055 | Optional. 
A mapping for columns and aggregation functions. Should be 2056 | given as {'column': fn} or {'new_column': (fn, 'column')}. 2057 | 2058 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 2059 | Optional. A mapping for new columns names to the function to apply to 2060 | the enumeration. Should be given as {'new_column': fn}. 2061 | 2062 | Returns 2063 | ------- 2064 | tafra: Tafra 2065 | The transformed :class:`Tafra`. 2066 | """ 2067 | return Transform(columns, aggregation, iter_fn).apply(self) 2068 | 2069 | def iterate_by(self, columns: Iterable[str]) -> Iterator['GroupDescription']: 2070 | """ 2071 | Helper function to implement :meth:`tafra.group.IterateBy.apply`. 2072 | 2073 | A generator that yields a :class:`Tafra` for each set of unique values. Analogy 2074 | to `pandas.DataFrame.groupby()`, i.e. an :class:`Iterator` of :class:`Tafra`. 2075 | 2076 | Yields tuples of ((unique grouping values, ...), row indices array, subset 2077 | tafra) 2078 | 2079 | Parameters 2080 | ---------- 2081 | group_by: Iterable[str] 2082 | The column names to group by. 2083 | 2084 | Returns 2085 | ------- 2086 | tafras: Iterator[GroupDescription] 2087 | An iterator over the grouped :class:`Tafra`. 2088 | """ 2089 | yield from IterateBy(columns).apply(self) 2090 | 2091 | def inner_join(self, right: 'Tafra', on: Iterable[Tuple[str, str, str]], 2092 | select: Iterable[str] = list()) -> 'Tafra': 2093 | """ 2094 | Helper function to implement :meth:`tafra.group.InnerJoin.apply`. 2095 | 2096 | An inner join. 2097 | 2098 | Analogy to SQL INNER JOIN, or `pandas.merge(..., how='inner')`, 2099 | 2100 | Parameters 2101 | ---------- 2102 | right: Tafra 2103 | The right-side :class:`Tafra` to join. 2104 | 2105 | on: Iterable[Tuple[str, str, str]] 2106 | The columns and operator to join on. 
Should be given as 2107 | ('left column', 'right column', 'op') Valid ops are: 2108 | 2109 | '==' : equal to 2110 | '!=' : not equal to 2111 | '<' : less than 2112 | '<=' : less than or equal to 2113 | '>' : greater than 2114 | '>=' : greater than or equal to 2115 | 2116 | select: Iterable[str] = [] 2117 | The columns to return. If not given, all unique columns names are 2118 | returned. If the column exists in both :class`Tafra`, prefers the left 2119 | over the right. 2120 | 2121 | Returns 2122 | ------- 2123 | tafra: Tafra 2124 | The joined :class:`Tafra`. 2125 | """ 2126 | return InnerJoin(on, select).apply(self, right) 2127 | 2128 | def left_join(self, right: 'Tafra', on: Iterable[Tuple[str, str, str]], 2129 | select: Iterable[str] = list()) -> 'Tafra': 2130 | """ 2131 | Helper function to implement :meth:`tafra.group.LeftJoin.apply`. 2132 | 2133 | A left join. 2134 | 2135 | Analogy to SQL LEFT JOIN, or `pandas.merge(..., how='left')`, 2136 | 2137 | Parameters 2138 | ---------- 2139 | right: Tafra 2140 | The right-side :class:`Tafra` to join. 2141 | 2142 | on: Iterable[Tuple[str, str, str]] 2143 | The columns and operator to join on. Should be given as 2144 | ('left column', 'right column', 'op') Valid ops are: 2145 | 2146 | '==' : equal to 2147 | '!=' : not equal to 2148 | '<' : less than 2149 | '<=' : less than or equal to 2150 | '>' : greater than 2151 | '>=' : greater than or equal to 2152 | 2153 | select: Iterable[str] = [] 2154 | The columns to return. If not given, all unique columns names are 2155 | returned. If the column exists in both :class`Tafra`, prefers the left 2156 | over the right. 2157 | 2158 | Returns 2159 | ------- 2160 | tafra: Tafra 2161 | The joined :class:`Tafra`. 2162 | """ 2163 | return LeftJoin(on, select).apply(self, right) 2164 | 2165 | def cross_join(self, right: 'Tafra', 2166 | select: Iterable[str] = list()) -> 'Tafra': 2167 | """ 2168 | Helper function to implement :meth:`tafra.group.CrossJoin.apply`. 
2169 | 2170 | A cross join. 2171 | 2172 | Analogy to SQL CROSS JOIN, or `pandas.merge(..., how='outer') using temporary 2173 | columns of static value to intersect all rows`. 2174 | 2175 | Parameters 2176 | ---------- 2177 | right: Tafra 2178 | The right-side :class:`Tafra` to join. 2179 | 2180 | select: Iterable[str] = [] 2181 | The columns to return. If not given, all unique columns names are 2182 | returned. If the column exists in both :class`Tafra`, prefers the left 2183 | over the right. 2184 | 2185 | Returns 2186 | ------- 2187 | tafra: Tafra 2188 | The joined :class:`Tafra`. 2189 | """ 2190 | return CrossJoin([], select).apply(self, right) 2191 | 2192 | def to_field_name(maybe_text: _Union[str, int, float]) -> str: # pragma: no cover 2193 | text = str(maybe_text) 2194 | 2195 | # Remove invalid characters 2196 | mid_text = re.sub('[^0-9a-zA-Z]', '', text) 2197 | 2198 | # Remove leading characters until we find a letter 2199 | final_text = re.sub('^[^a-zA-Z]+', '', mid_text) 2200 | 2201 | if final_text == '': 2202 | final_text = 'field_' + mid_text 2203 | 2204 | return final_text 2205 | 2206 | def _in_notebook() -> bool: # pragma: no cover 2207 | """ 2208 | Checks if running in a Jupyter Notebook. 2209 | 2210 | Returns 2211 | ------- 2212 | in_notebook: bool 2213 | """ 2214 | try: 2215 | from IPython import get_ipython # type: ignore 2216 | if 'IPKernelApp' in get_ipython().config: 2217 | return True 2218 | except Exception as e: 2219 | pass 2220 | return False 2221 | 2222 | # Import here to resolve circular dependency 2223 | from .group import (GroupBy, Transform, IterateBy, InnerJoin, LeftJoin, CrossJoin, Union, 2224 | InitAggregation, GroupDescription) 2225 | -------------------------------------------------------------------------------- /tafra/csvreader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. 
Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | from pathlib import Path 16 | import csv 17 | import dataclasses as dc 18 | 19 | import numpy as np 20 | 21 | from enum import Enum, auto 22 | from io import TextIOWrapper 23 | from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type 24 | from typing import Union, cast 25 | 26 | # this doesn't type well in Python 27 | @dc.dataclass(frozen=True) 28 | class ReadableType: 29 | dtype: Type[Any] 30 | parse: Callable[[str], Any] 31 | 32 | def _parse_bool(val: str) -> bool: 33 | folded = val.casefold() 34 | if folded in ('false', 'no', 'f'): 35 | return False 36 | if folded in ('true', 'yes', 't'): 37 | return True 38 | raise ValueError('not a boolean') 39 | 40 | # numpy-stubs is a lie about many of these, hence the type: ignore spam 41 | _TYPE_PRECEDENCE: List[ReadableType] = [ 42 | ReadableType(np.int32, cast(Callable[[str], Any], np.int32)), 43 | ReadableType(np.int64, cast(Callable[[str], Any], np.int64)), 44 | # np.float32, # nervous about ever inferring this 45 | ReadableType(np.float64, cast(Callable[[str], Any], np.float64)), 46 | ReadableType(bool, _parse_bool), 47 | # TODO: date, 48 | # TODO: datetime, 49 | ] 50 | 51 | _TYPE_OBJECT: ReadableType = ReadableType(object, lambda x: x) 52 | 53 | class ReaderState(Enum): 54 | AWAIT_GUESSABLE = auto() 55 | EARLY_EOF = auto() 56 | GUESS = auto() 57 | READ = auto() 58 | EOF = auto() 59 | DONE = auto() 60 | 61 | class CSVReader: 62 | def __init__(self, source: Union[str, Path, TextIOWrapper], 63 | guess_rows: int = 5, missing: Optional[str] = '', 64 | **csvkw: Dict[str, Any]): 65 | if isinstance(source, (str, Path)): 66 | self._stream = open(source, newline='') 67 | self._should_close = True 68 | elif isinstance(source, TextIOWrapper): 69 | source.reconfigure(newline='') 70 | self._stream = source 71 | self._should_close = False 72 | reader = 
csv.reader(self._stream, dialect='excel', **csvkw) 73 | self._header = _unique_header(next(reader)) 74 | self._reader = (self._decode_missing(t) for t in reader) 75 | self._guess_types = { 76 | col: _TYPE_PRECEDENCE[0] for col in self._header 77 | } 78 | self._guess_data: Dict[str, List[Any]] = { 79 | col: list() for col in self._header 80 | } 81 | self._data: Dict[str, List[Any]] = dict() 82 | self._guess_rows = guess_rows 83 | self._missing = missing 84 | self._rows = 0 85 | self._state = ReaderState.AWAIT_GUESSABLE 86 | 87 | def read(self) -> Dict[str, np.ndarray]: 88 | while self._state != ReaderState.DONE: 89 | self._step() 90 | return self._finalize() 91 | 92 | def _step(self) -> None: 93 | if self._state == ReaderState.AWAIT_GUESSABLE: 94 | self.state_await_guessable() 95 | return 96 | 97 | if self._state == ReaderState.GUESS: 98 | self.state_guess() 99 | return 100 | 101 | if self._state == ReaderState.READ: 102 | self.state_read() 103 | return 104 | 105 | if self._state == ReaderState.EARLY_EOF: 106 | self.state_early_eof() 107 | return 108 | 109 | if self._state == ReaderState.EOF: 110 | self.state_eof() 111 | return 112 | 113 | if self._state == ReaderState.DONE: # pragma: no cover 114 | return 115 | 116 | def state_await_guessable(self) -> None: 117 | try: 118 | row = next(self._reader) 119 | except StopIteration: 120 | self._state = ReaderState.EARLY_EOF 121 | return 122 | 123 | self._rows += 1 124 | if len(row) != len(self._header): 125 | raise ValueError(f'length of row #{self._rows}' 126 | ' does not match header length') 127 | 128 | for col, val in zip(self._header, row): 129 | self._guess_data[col].append(val) 130 | 131 | if self._rows == self._guess_rows: 132 | self._state = ReaderState.GUESS 133 | 134 | def state_guess(self) -> None: 135 | for col in self._header: 136 | ty, parsed = _guess_column(_TYPE_PRECEDENCE, 137 | self._guess_data[col]) 138 | self._guess_types[col] = ty 139 | self._data[col] = parsed 140 | self._state = ReaderState.READ 
141 | 142 | def state_read(self) -> None: 143 | try: 144 | row = next(self._reader) 145 | except StopIteration: 146 | self._state = ReaderState.EOF 147 | return 148 | 149 | self._rows += 1 150 | if len(row) != len(self._header): 151 | raise ValueError(f'length of row #{self._rows}' 152 | ' does not match header length') 153 | 154 | for col, val in zip(self._header, row): 155 | try: 156 | self._data[col].append(self._guess_types[col].parse(val)) # type: ignore 157 | except: 158 | self._promote(col, val) 159 | 160 | def state_early_eof(self) -> None: 161 | if self._should_close: 162 | self._stream.close() 163 | 164 | for col in self._header: 165 | ty, parsed = _guess_column(_TYPE_PRECEDENCE, 166 | self._guess_data[col]) 167 | self._guess_types[col] = ty 168 | self._data[col] = parsed 169 | 170 | self._state = ReaderState.DONE 171 | 172 | def state_eof(self) -> None: 173 | if self._should_close: 174 | self._stream.close() 175 | self._state = ReaderState.DONE 176 | 177 | def _promote(self, col: str, val: Optional[str]) -> None: 178 | ty_ix = _TYPE_PRECEDENCE.index(self._guess_types[col]) 179 | try_next = _TYPE_PRECEDENCE[ty_ix + 1:] 180 | stringized = self._encode_missing(self._data[col]) 181 | stringized.append(val) 182 | ty, parsed = _guess_column(try_next, stringized) 183 | self._guess_types[col] = ty 184 | self._data[col] = parsed 185 | 186 | def _finalize(self) -> Dict[str, np.ndarray]: 187 | assert self._state == ReaderState.DONE, 'CSVReader is not in DONE state.' 
188 | return { 189 | col: np.array(self._data[col], dtype=self._guess_types[col].dtype) 190 | for col in self._header 191 | } 192 | 193 | def _decode_missing(self, row: List[str]) -> Sequence[Optional[str]]: 194 | if self._missing is None: 195 | return row 196 | return [v if v != self._missing else None for v in row] 197 | 198 | def _encode_missing(self, row: Sequence[Optional[Any]]) -> List[Optional[str]]: 199 | return [str(v) if v is not None else self._missing for v in row] 200 | 201 | def _unique_header(header: List[str]) -> List[str]: 202 | uniq: List[str] = list() 203 | for col in header: 204 | col_unique = col 205 | i = 2 206 | while col_unique in uniq: 207 | col_unique = f'{col} ({i})' 208 | i += 1 209 | uniq.append(col_unique) 210 | return uniq 211 | 212 | # the "real" return type is a dependent pair (t: ReadableType ** List[t.dtype]) 213 | def _guess_column(precedence: List[ReadableType], vals: List[Optional[str]] 214 | ) -> Tuple[ReadableType, List[Any]]: 215 | for ty in precedence: 216 | try: 217 | # mypy doesn't really get that the thing we're mapping is not a method 218 | # on `ty` but a data member 219 | typed = list(map(ty.parse, vals)) # type: ignore 220 | return ty, typed 221 | except: 222 | next 223 | return _TYPE_OBJECT, vals 224 | -------------------------------------------------------------------------------- /tafra/formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | from typing import Callable, Dict, Tuple, Any, Iterator, MutableMapping, Type, Optional 16 | 17 | import numpy as np 18 | 19 | 20 | class ObjectFormatter(Dict[str, Callable[[np.ndarray], np.ndarray]], 21 | MutableMapping[str, Callable[[np.ndarray], np.ndarray]]): 22 | """ 23 | A dictionary that contains mappings for formatting objects. Some numpy objects 24 | should be cast to other types, e.g. the :class:`decimal.Decimal` type cannot 25 | operate with :class:`np.float`. These mappings are defined in this class. 26 | 27 | Each mapping must define a function that takes a :class:`np.ndarray` and 28 | returns a :class:`np.ndarray`. 29 | 30 | The key for each mapping is the name of the type of the actual value, 31 | looked up from the first element of the :class:`np.ndarray`, i.e. 32 | ``type(array[0]).__name__``. 33 | """ 34 | test_array = np.arange(4) 35 | 36 | def __setitem__(self, dtype: str, value: Callable[[np.ndarray], np.ndarray]) -> None: 37 | """ 38 | Set the dtype formatter. 39 | """ 40 | try: 41 | if not isinstance(value(self.test_array), np.ndarray): 42 | raise ValueError( 43 | 'Must provide a function that takes an ``np.ndarray`` and returns ' 44 | 'an np.ndarray.') 45 | except Exception as e: 46 | raise ValueError( 47 | 'Must provide a function that takes an ``np.ndarray`` and returns ' 48 | 'an np.ndarray.') 49 | 50 | dict.__setitem__(self, dtype, value) 51 | 52 | def __getitem__(self, dtype: str) -> Callable[[np.ndarray], np.ndarray]: 53 | """ 54 | Get the dtype formatter. 55 | """ 56 | return dict.__getitem__(self, dtype) 57 | 58 | def __delitem__(self, dtype: str) -> None: 59 | """ 60 | Delete the dtype formatter. 
61 | """ 62 | dict.__delitem__(self, dtype) 63 | 64 | def __repr__(self) -> str: 65 | return self.__str__() 66 | 67 | def __str__(self) -> str: 68 | if self.__len__() < 1: 69 | return r'{}' 70 | return '{' + '\n'.join(f'{c}: {v}' for c, v in self.items()) + '}' 71 | 72 | def __iter__(self) -> Iterator[Any]: 73 | yield from dict.__iter__(self) 74 | 75 | def __len__(self) -> int: 76 | return dict.__len__(self) 77 | 78 | def copy(self) -> Dict[str, Any]: 79 | return {k: dict.__getitem__(self, k) for k in self} 80 | 81 | def parse_dtype(self, value: np.ndarray) -> Optional[np.ndarray]: 82 | """ 83 | Parse an object dtype. 84 | 85 | Parameters 86 | ---------- 87 | value: np.ndarray 88 | The :class:`np.ndarray` to be parsed. 89 | 90 | Returns 91 | ------- 92 | value, modified: Tuple(np.ndarray, bool) 93 | The :class:`np.ndarray` and whether it was modified or not. 94 | """ 95 | if value.dtype != np.dtype(object): 96 | return None 97 | 98 | type_name = type(value[0]).__name__ 99 | if type_name in self.keys(): 100 | value = self[type_name](value) 101 | return value 102 | 103 | return None 104 | -------------------------------------------------------------------------------- /tafra/group.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | __all__ = ['GroupBy', 'Transform', 'IterateBy', 'InnerJoin', 'LeftJoin'] 16 | 17 | import operator 18 | from collections import OrderedDict 19 | from itertools import chain 20 | import dataclasses as dc 21 | 22 | import numpy as np 23 | 24 | from typing import (Any, Callable, Dict, Mapping, List, Tuple, Optional, Union as _Union, Sequence, 25 | Iterable, Iterator) 26 | from typing import cast 27 | 28 | 29 | JOIN_OPS: Dict[str, Callable[[Any, Any], Any]] = { 30 | '==': operator.eq, 31 | '!=': operator.ne, 32 | '<': operator.lt, 33 | '<=': operator.le, 34 | '>': operator.gt, 35 | '>=': operator.ge 36 | } 37 | 38 | # for the passed argument to an aggregation 39 | InitAggregation = Mapping[ 40 | str, 41 | _Union[ 42 | Callable[[np.ndarray], Any], 43 | Tuple[Callable[[np.ndarray], Any], str] 44 | ] 45 | ] 46 | 47 | 48 | # for the result type of IterateBy 49 | GroupDescription = Tuple[ 50 | Tuple[Any, ...], # tuple of unique values from group-by columns 51 | np.ndarray, # int array of row indices into original tafra for this group 52 | 'Tafra' # sub-tafra for the group 53 | ] 54 | 55 | 56 | class Union: 57 | """ 58 | Union two :class:`Tafra` together. Analogy to SQL UNION or 59 | `pandas.append`. All column names and dtypes must match. 60 | """ 61 | @staticmethod 62 | def _validate(left: 'Tafra', right: 'Tafra') -> None: 63 | """ 64 | Validate the :class:`Tafra` before applying. 
65 | """ 66 | # These should be unreachable unless attributes were directly modified 67 | if len(left._data) != len(left._dtypes): 68 | assert 0, 'This `Tafra` length of data and dtypes do not match' 69 | if len(right._data) != len(right._dtypes): 70 | assert 0, 'right `Tafra` length of data and dtypes do not match' 71 | 72 | # ensure same number of columns 73 | if len(left._data) != len(right._data) or len(left._dtypes) != len(right._dtypes): 74 | raise ValueError( 75 | 'This `Tafra` column count does not match right `Tafra` column count.') 76 | 77 | # ensure all columns in this `Tafra` exist in right `Tafra` 78 | # if len() is same AND all columns in this exist in right, 79 | # do not need to check right `Tafra` columns in this `Tafra`. 80 | for (data_column, value), (dtype_column, dtype) \ 81 | in zip(left._data.items(), left._dtypes.items()): 82 | 83 | if data_column not in right._data or dtype_column not in right._dtypes: 84 | raise TypeError( 85 | f'This `Tafra` column `{data_column}` does not exist in right `Tafra`.') 86 | 87 | elif value.dtype != right._data[data_column].dtype: 88 | raise TypeError( 89 | f'This `Tafra` column `{data_column}` dtype `{value.dtype}` ' 90 | f'does not match right `Tafra` dtype `{right._data[data_column].dtype}`.') 91 | 92 | # should not happen unless dtypes manually changed, but let's check it 93 | elif dtype != right._dtypes[dtype_column]: 94 | raise TypeError( 95 | f'This `Tafra` column `{data_column}` dtype `{dtype}` ' 96 | f'does not match right `Tafra` dtype `{right._dtypes[dtype_column]}`.') 97 | 98 | def apply(self, left: 'Tafra', right: 'Tafra') -> 'Tafra': 99 | """ 100 | Apply the :class:`Union_` to the :class:`Tafra`. 101 | 102 | Parameters 103 | ---------- 104 | left: Tafra 105 | The left :class:`Tafra` to union. 106 | 107 | right: Tafra 108 | The right :class:`Tafra` to union. 109 | 110 | Returns 111 | ------- 112 | tafra: Tafra 113 | The unioned :class`Tafra`. 
114 | """ 115 | self._validate(left, right) 116 | 117 | return Tafra( 118 | {column: np.append(value, right._data[column]) for column, value in left._data.items()}, 119 | left._dtypes.copy() 120 | ) 121 | 122 | def apply_inplace(self, left: 'Tafra', right: 'Tafra') -> None: 123 | """ 124 | In-place version. 125 | 126 | Apply the :class:`Union_` to the :class:`Tafra`. 127 | 128 | Parameters 129 | ---------- 130 | left: Tafra 131 | The left :class:`Tafra` to union. 132 | 133 | right: Tafra 134 | The right :class:`Tafra` to union. 135 | 136 | Returns 137 | ------- 138 | tafra: Tafra 139 | The unioned :class`Tafra`. 140 | """ 141 | self._validate(left, right) 142 | 143 | for column, value in left._data.items(): 144 | left._data[column] = np.append(value, right._data[column]) 145 | left._update_rows() 146 | 147 | @dc.dataclass 148 | class GroupSet: 149 | """ 150 | A `GroupSet` is the set of columns by which we construct our groups. 151 | """ 152 | 153 | @staticmethod 154 | def _unique_groups(tafra: 'Tafra', columns: Iterable[str]) -> List[Any]: 155 | """ 156 | Construct a unique set of grouped values. 157 | Uses :class:``OrderedDict`` rather than :class:``set`` to maintain order. 158 | """ 159 | return list(OrderedDict.fromkeys(zip(*(tafra._data[col] for col in columns)))) 160 | 161 | @staticmethod 162 | def _validate(tafra: 'Tafra', columns: Iterable[str]) -> None: # pragma: no cover 163 | """ 164 | Validate the :class:`Tafra` before applying. 165 | """ 166 | assert tafra._rows >= 1, 'No rows exist in `tafra`.' 167 | tafra._validate_columns(columns) 168 | 169 | 170 | @dc.dataclass 171 | class AggMethod(GroupSet): 172 | """ 173 | Basic methods for aggregations over a data table. 
174 | """ 175 | group_by_cols: Iterable[str] 176 | aggregation: dc.InitVar[InitAggregation] 177 | _aggregation: Mapping[str, Tuple[Callable[[np.ndarray], Any], str]] = dc.field(init=False) 178 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 179 | 180 | def __post_init__(self, aggregation: InitAggregation) -> None: 181 | self._aggregation = dict() 182 | for rename, agg in aggregation.items(): 183 | if callable(agg): 184 | self._aggregation[rename] = (agg, rename) 185 | elif (isinstance(agg, Sequence) and len(agg) == 2 186 | and callable(cast(Tuple[Callable[[np.ndarray], Any], str], agg)[0])): 187 | self._aggregation[rename] = agg 188 | else: 189 | raise ValueError(f'{rename}: {agg} is not a valid aggregation argument') 190 | 191 | for rename, agg in self.iter_fn.items(): 192 | if not callable(agg): 193 | raise ValueError(f'{rename}: {agg} is not a valid aggregation argument') 194 | 195 | def result_factory(self, fn: Callable[[str, str], np.ndarray]) -> Dict[str, np.ndarray]: 196 | """ 197 | Factory function to generate the dict for the results set. 198 | A function to take the new column name and source column name 199 | and return an empty `np.ndarray` should be given. 200 | """ 201 | return { 202 | rename: fn(rename, col) for rename, col in ( 203 | *((col, col) for col in self.group_by_cols), 204 | *((rename, agg[1]) for rename, agg in self._aggregation.items()) 205 | ) 206 | } 207 | 208 | def iter_fn_factory(self, fn: Callable[[], np.ndarray]) -> Dict[str, np.ndarray]: 209 | return {rename: fn() for rename in self.iter_fn.keys()} 210 | 211 | def apply(self, tafra: 'Tafra') -> 'Tafra': 212 | raise NotImplementedError 213 | 214 | 215 | class GroupBy(AggMethod): 216 | """ 217 | Aggregation by a set of unique values. 218 | 219 | Analogy to SQL ``GROUP BY``, not :meth:`pandas.DataFrame.groupby()`. 220 | 221 | Parameters 222 | ---------- 223 | columns: Iterable[str] 224 | The column names to group by. 
225 | 226 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 227 | Optional. Tuple[Callable[[np.ndarray], Any], str]]] 228 | A mapping for columns and aggregation functions. Should be 229 | given as {'column': fn} or {'new_column': (fn, 'column')}. 230 | 231 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 232 | Optional. A mapping for new columns names to the function to apply to 233 | the enumeration. Should be given as {'new_column': fn}. 234 | """ 235 | 236 | def apply(self, tafra: 'Tafra') -> 'Tafra': 237 | """ 238 | Apply the :class:`GroupBy` to the :class:`Tafra`. 239 | 240 | Parameters 241 | ---------- 242 | tafra: Tafra 243 | The tafra to apply the operation to. 244 | 245 | Returns 246 | ------- 247 | tafra: Tafra 248 | The aggregated :class:`Tafra`. 249 | """ 250 | self._validate(tafra, ( 251 | *self.group_by_cols, 252 | *(col for (_, col) in self._aggregation.values()) 253 | )) 254 | unique = self._unique_groups(tafra, self.group_by_cols) 255 | result = self.result_factory( 256 | lambda rename, col: np.empty(len(unique), dtype=tafra._data[col].dtype)) 257 | iter_fn = self.iter_fn_factory(lambda: np.ones(len(unique), dtype=int)) 258 | ones = np.ones(tafra._rows, dtype=int) 259 | 260 | for i, u in enumerate(unique): 261 | which_rows = np.full(tafra._rows, True) 262 | 263 | for val, col in zip(u, self.group_by_cols): 264 | which_rows &= tafra._data[col] == val 265 | result[col][i] = val 266 | 267 | for rename, (fn, col) in self._aggregation.items(): 268 | result[rename][i] = fn(tafra._data[col][which_rows]) 269 | 270 | for rename, fn in self.iter_fn.items(): 271 | iter_fn[rename][i] = fn(i * ones[which_rows]) 272 | 273 | result.update(iter_fn) 274 | return Tafra(result) 275 | 276 | 277 | class Transform(AggMethod): 278 | """ 279 | Apply a function to each unique set of values and join to the original table. 280 | 281 | Analogy to :meth:`pandas.DataFrame.groupby().transform()`, 282 | i.e. 
a SQL ``GROUP BY`` and ``LEFT JOIN`` back to the original table. 283 | 284 | Parameters 285 | ---------- 286 | group_by: Iterable[str] 287 | The column names to group by. 288 | 289 | aggregation: Mapping[str, Union[Callable[[np.ndarray], Any], \ 290 | Tuple[Callable[[np.ndarray], Any], str]]] 291 | Optional. A mapping for columns and aggregation functions. Should be 292 | given as {'column': fn} or {'new_column': (fn, 'column')}. 293 | 294 | iter_fn: Mapping[str, Callable[[np.ndarray], Any]] 295 | Optional. A mapping for new columns names to the function to apply to 296 | the enumeration. Should be given as {'new_column': fn}. 297 | """ 298 | 299 | def apply(self, tafra: 'Tafra') -> 'Tafra': 300 | """ 301 | Apply the :class:`Transform` to the :class:`Tafra`. 302 | 303 | Parameters 304 | ---------- 305 | tafra: Tafra 306 | The tafra to apply the operation to. 307 | 308 | Returns 309 | ------- 310 | tafra: Tafra 311 | The transformed :class:`Tafra`. 312 | """ 313 | self._validate(tafra, ( 314 | *self.group_by_cols, 315 | *(col for (_, col) in self._aggregation.values()) 316 | )) 317 | unique = self._unique_groups(tafra, self.group_by_cols) 318 | result = self.result_factory( 319 | lambda rename, col: np.empty_like(tafra._data[col])) 320 | iter_fn = self.iter_fn_factory(lambda: np.ones(tafra._rows, dtype=int)) 321 | ones = np.ones(tafra._rows, dtype=int) 322 | 323 | for i, u in enumerate(unique): 324 | which_rows = np.full(tafra._rows, True) 325 | 326 | for val, col in zip(u, self.group_by_cols): 327 | which_rows &= tafra._data[col] == val 328 | result[col][which_rows] = tafra._data[col][which_rows] 329 | 330 | for rename, agg in self._aggregation.items(): 331 | fn, col = agg 332 | result[rename][which_rows] = fn(tafra._data[col][which_rows]) 333 | 334 | for rename, fn in self.iter_fn.items(): 335 | iter_fn[rename][which_rows] = fn(i * ones[which_rows]) 336 | 337 | result.update(iter_fn) 338 | return Tafra(result) 339 | 340 | 341 | @dc.dataclass 342 | class 
IterateBy(GroupSet): 343 | """ 344 | A generator that yields a :class:`Tafra` for each set of unique values. 345 | 346 | Analogy to `pandas.DataFrame.groupby()`, i.e. an Sequence of `Tafra` objects. 347 | Yields tuples of ((unique grouping values, ...), row indices array, subset tafra) 348 | 349 | Parameters 350 | ---------- 351 | group_by: Iterable[str] 352 | The column names to group by. 353 | """ 354 | group_by_cols: Iterable[str] 355 | 356 | def apply(self, tafra: 'Tafra') -> Iterator[GroupDescription]: 357 | """ 358 | Apply the :class:`IterateBy` to the :class:`Tafra`. 359 | 360 | Parameters 361 | ---------- 362 | tafra: Tafra 363 | The tafra to apply the operation to. 364 | 365 | Returns 366 | ------- 367 | tafras: Iterator[GroupDescription] 368 | An iterator over the grouped :class:`Tafra`. 369 | """ 370 | self._validate(tafra, self.group_by_cols) 371 | unique = self._unique_groups(tafra, self.group_by_cols) 372 | 373 | for u in unique: 374 | which_rows = np.full(tafra._rows, True) 375 | for val, col in zip(u, self.group_by_cols): 376 | which_rows &= tafra._data[col] == val 377 | 378 | if len(u) == 1: 379 | u = u[0] 380 | 381 | yield (u, which_rows, tafra._ndindex(which_rows)) 382 | 383 | 384 | @dc.dataclass 385 | class Join(GroupSet): 386 | """ 387 | Base class for SQL-like JOINs. 
388 | """ 389 | on: Iterable[Tuple[str, str, str]] 390 | select: Iterable[str] 391 | 392 | def _validate_dtypes(self, l_table: 'Tafra', r_table: 'Tafra') -> None: 393 | for l_column, r_column, _ in self.on: 394 | l_value = l_table._data[l_column] 395 | r_value = r_table._data[r_column] 396 | l_dtype = l_table._dtypes[l_column] 397 | r_dtype = r_table._dtypes[r_column] 398 | 399 | if l_value.dtype != r_value.dtype: 400 | raise TypeError( 401 | f'This `Tafra` column `{l_column}` dtype `{l_value.dtype}` ' 402 | f'does not match other `Tafra` dtype `{r_value.dtype}`.') 403 | 404 | # should not happen unless dtypes manually changed, but let's check it 405 | elif l_dtype != r_dtype: 406 | raise TypeError( 407 | f'This `Tafra` column `{l_column}` dtype `{l_dtype}` ' 408 | f'does not match other `Tafra` dtype `{r_dtype}`.') 409 | 410 | @staticmethod 411 | def _validate_ops(ops: Iterable[str]) -> None: 412 | for op in ops: 413 | _op = JOIN_OPS.get(op, None) 414 | if _op is None: 415 | raise TypeError(f'The operator {op} is not valid.') 416 | 417 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 418 | raise NotImplementedError 419 | 420 | 421 | class InnerJoin(Join): 422 | """ 423 | An inner join. 424 | 425 | Analogy to SQL INNER JOIN, or `pandas.merge(..., how='inner')`, 426 | 427 | Parameters 428 | ---------- 429 | right: Tafra 430 | The right-side :class:`Tafra` to join. 431 | 432 | on: Iterable[Tuple[str, str, str]] 433 | The columns and operator to join on. Should be given as 434 | ('left column', 'right column', 'op') Valid ops are: 435 | 436 | '==' : equal to 437 | '!=' : not equal to 438 | '<' : less than 439 | '<=' : less than or equal to 440 | '>' : greater than 441 | '>=' : greater than or equal to 442 | 443 | select: Iterable[str] = [] 444 | The columns to return. If not given, all unique columns names 445 | are returned. If the column exists in both :class`Tafra`, 446 | prefers the left over the right. 
447 | """ 448 | 449 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 450 | """ 451 | Apply the :class:`InnerJoin` to the :class:`Tafra`. 452 | 453 | Parameters 454 | ---------- 455 | left_t: Tafra 456 | The left tafra to join. 457 | 458 | right_t: Tafra 459 | The right tafra to join. 460 | 461 | Returns 462 | ------- 463 | tafra: Tafra 464 | The joined :class:`Tafra`. 465 | """ 466 | left_cols, right_cols, ops = list(zip(*self.on)) 467 | self._validate(left_t, left_cols) 468 | self._validate(right_t, right_cols) 469 | self._validate_dtypes(left_t, right_t) 470 | self._validate_ops(ops) 471 | 472 | _on = tuple((left_col, right_col, JOIN_OPS[op]) for left_col, right_col, op in self.on) 473 | 474 | join: Dict[str, List[Any]] = {column: list() for column in chain( 475 | left_t._data.keys(), 476 | right_t._data.keys() 477 | ) if not self.select 478 | or (self.select and column in self.select)} 479 | 480 | # right-to-left so left dtypes overwrite 481 | dtypes: Dict[str, str] = {column: dtype for column, dtype in chain( 482 | right_t._dtypes.items(), 483 | left_t._dtypes.items() 484 | ) if column in join.keys()} 485 | 486 | for i in range(left_t._rows): 487 | right_rows = np.full(right_t._rows, True) 488 | 489 | for left_col, right_col, op in _on: 490 | right_rows &= op(left_t[left_col][i], right_t[right_col]) 491 | 492 | right_count = np.sum(right_rows) 493 | 494 | # this is the only difference from the LeftJoin 495 | if right_count <= 0: 496 | continue 497 | 498 | for column in join.keys(): 499 | if column in left_t._data: 500 | join[column].extend(max(1, right_count) * [left_t[column][i]]) 501 | 502 | elif column in right_t._data: 503 | join[column].extend(right_t[column][right_rows]) 504 | 505 | return Tafra( 506 | {column: np.array(value) 507 | for column, value in join.items()}, 508 | dtypes 509 | ) 510 | 511 | 512 | class LeftJoin(Join): 513 | """ 514 | A left join. 
515 | 516 | Analogy to SQL LEFT JOIN, or `pandas.merge(..., how='left')`, 517 | 518 | Parameters 519 | ---------- 520 | right: Tafra 521 | The right-side :class:`Tafra` to join. 522 | 523 | on: Iterable[Tuple[str, str, str]] 524 | The columns and operator to join on. Should be given as 525 | ('left column', 'right column', 'op') Valid ops are: 526 | 527 | '==' : equal to 528 | '!=' : not equal to 529 | '<' : less than 530 | '<=' : less than or equal to 531 | '>' : greater than 532 | '>=' : greater than or equal to 533 | 534 | select: Iterable[str] = [] 535 | The columns to return. If not given, all unique columns names 536 | are returned. If the column exists in both :class`Tafra`, 537 | prefers the left over the right. 538 | """ 539 | 540 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 541 | """ 542 | Apply the :class:`LeftJoin` to the :class:`Tafra`. 543 | 544 | Parameters 545 | ---------- 546 | left_t: Tafra 547 | The left tafra to join. 548 | 549 | right_t: Tafra 550 | The right tafra to join. 551 | 552 | Returns 553 | ------- 554 | tafra: Tafra 555 | The joined :class:`Tafra`. 
556 | """ 557 | left_cols, right_cols, ops = list(zip(*self.on)) 558 | self._validate(left_t, left_cols) 559 | self._validate(right_t, right_cols) 560 | self._validate_dtypes(left_t, right_t) 561 | self._validate_ops(ops) 562 | 563 | _on = tuple((left_col, right_col, JOIN_OPS[op]) for left_col, right_col, op in self.on) 564 | 565 | join: Dict[str, List[Any]] = {column: list() for column in chain( 566 | left_t._data.keys(), 567 | right_t._data.keys() 568 | ) if not self.select 569 | or (self.select and column in self.select)} 570 | 571 | dtypes: Dict[str, str] = {column: dtype for column, dtype in chain( 572 | left_t._dtypes.items(), 573 | right_t._dtypes.items() 574 | ) if column in join.keys()} 575 | 576 | for i in range(left_t._rows): 577 | right_rows = np.full(right_t._rows, True) 578 | 579 | for left_col, right_col, op in _on: 580 | right_rows &= op(left_t[left_col][i], right_t[right_col]) 581 | 582 | right_count = np.sum(right_rows) 583 | 584 | for column in join.keys(): 585 | if column in left_t._data: 586 | join[column].extend(max(1, right_count) * [left_t[column][i]]) 587 | 588 | elif column in right_t._data: 589 | if right_count <= 0: 590 | join[column].append(None) 591 | if dtypes[column] != 'object': 592 | dtypes[column] = 'object' 593 | else: 594 | join[column].extend(right_t[column][right_rows]) 595 | 596 | return Tafra( 597 | {column: np.array(value) 598 | for column, value in join.items()}, 599 | dtypes 600 | ) 601 | 602 | 603 | @dc.dataclass 604 | class CrossJoin(Join): 605 | """ 606 | A cross join. 607 | 608 | Analogy to SQL CROSS JOIN, or `pandas.merge(..., how='outer') 609 | using temporary columns of static value to intersect all rows`. 610 | 611 | Parameters 612 | ---------- 613 | right: Tafra 614 | The right-side :class:`Tafra` to join. 615 | 616 | select: Iterable[str] = [] 617 | The columns to return. If not given, all unique columns names 618 | are returned. If the column exists in both :class`Tafra`, 619 | prefers the left over the right. 
620 | """ 621 | 622 | def apply(self, left_t: 'Tafra', right_t: 'Tafra') -> 'Tafra': 623 | """ 624 | Apply the :class:`CrossJoin` to the :class:`Tafra`. 625 | 626 | Parameters 627 | ---------- 628 | left_t: Tafra 629 | The left tafra to join. 630 | 631 | right_t: Tafra 632 | The right tafra to join. 633 | 634 | Returns 635 | ------- 636 | tafra: Tafra 637 | The joined :class:`Tafra`. 638 | """ 639 | self._validate_dtypes(left_t, right_t) 640 | 641 | left_rows = left_t._rows 642 | right_rows = right_t._rows 643 | 644 | select = set(self.select) 645 | if len(select) > 0: 646 | left_cols = list(select.intersection(list(left_t._data.keys()))) 647 | right_cols = list(select.intersection(list(right_t._data.keys()))) 648 | 649 | if len(left_cols) == 0: 650 | raise IndexError('No columns given to select from left `Tafra`.') 651 | if len(right_cols) == 0: 652 | raise IndexError('No columns given to select from right `Tafra`.') 653 | 654 | else: 655 | left_cols = list(left_t._data.keys()) 656 | right_cols = list(right_t._data.keys()) 657 | 658 | left_new = Tafra(left_t[left_cols].key_map(np.tile, reps=right_rows)) 659 | right_new = Tafra(right_t[right_cols].key_map(np.tile, reps=left_rows)) 660 | 661 | left_new.update_inplace(right_new) 662 | 663 | return left_new 664 | 665 | 666 | # Import here to resolve circular dependency 667 | from .base import Tafra 668 | -------------------------------------------------------------------------------- /tafra/protocol.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tafra: a minimalist dataframe 3 | 4 | Copyright (c) 2020 Derrick W. Turk and David S. Fulford 5 | 6 | Author 7 | ------ 8 | Derrick W. Turk 9 | David S. 
Fulford 10 | 11 | Notes 12 | ----- 13 | Created on April 25, 2020 14 | """ 15 | import numpy as np 16 | from typing import Dict, List, Tuple, Any, Optional, Type, Iterable, Iterator 17 | from typing_extensions import Protocol, runtime_checkable 18 | 19 | 20 | @runtime_checkable 21 | class Series(Protocol): 22 | name: str 23 | values: np.ndarray 24 | dtype: str 25 | 26 | 27 | @runtime_checkable 28 | class DataFrame(Protocol): 29 | """ 30 | A fake class to satisfy typing of a ``pandas.DataFrame`` without a dependency. 31 | """ 32 | _data: Dict[str, Series] 33 | columns: List[str] 34 | dtypes: List[str] 35 | 36 | def __getitem__(self, column: str) -> Series: 37 | raise NotImplementedError 38 | 39 | def __setitem__(self, column: str, value: np.ndarray) -> None: 40 | raise NotImplementedError 41 | 42 | @runtime_checkable 43 | class Cursor(Protocol): 44 | """ 45 | A fake class to satisfy typing of a ``pyodbc.Cursor`` without a dependency. 46 | """ 47 | description: Tuple[Tuple[str, Type[Any], Optional[int], int, int, int, bool]] 48 | 49 | def __iter__(self) -> Iterator[Tuple[Any, ...]]: 50 | raise NotImplementedError 51 | 52 | def __next__(self) -> Tuple[Any, ...]: 53 | raise NotImplementedError 54 | 55 | def execute(self, sql: str) -> None: 56 | raise NotImplementedError 57 | 58 | def fetchone(self) -> Optional[Tuple[Any, ...]]: 59 | raise NotImplementedError 60 | 61 | def fetchmany(self, size: int) -> List[Tuple[Any, ...]]: 62 | raise NotImplementedError 63 | 64 | def fetchall(self) -> List[Tuple[Any, ...]]: 65 | raise NotImplementedError 66 | -------------------------------------------------------------------------------- /tafra/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petbox-dev/tafra/c8bd5452f314e498fc7a7dbc9a30d1f6efde4174/tafra/py.typed -------------------------------------------------------------------------------- /tafra/version.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '1.0.10' 2 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petbox-dev/tafra/c8bd5452f314e498fc7a7dbc9a30d1f6efde4174/test/__init__.py -------------------------------------------------------------------------------- /test/ex1.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,true,9.6 7 | 6,false,-10.1 8 | -------------------------------------------------------------------------------- /test/ex2.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | -------------------------------------------------------------------------------- /test/ex3.csv: -------------------------------------------------------------------------------- 1 | a,b,b 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,jimmy,9.6 7 | 6,false,-10.1 8 | -------------------------------------------------------------------------------- /test/ex4.csv: -------------------------------------------------------------------------------- 1 | a,b,b 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,false,2.1 6 | 5,false,9.6 7 | 6,jimmy,-10.1 8 | -------------------------------------------------------------------------------- /test/ex5.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,true,23 3 | 2,false,45.6 4 | 3,true,90.2 5 | 4,2.1 6 | 5,false,9.6 7 | 6,true,-10.1 8 | 7.3,false,2.3 9 | -------------------------------------------------------------------------------- /test/ex6.csv: -------------------------------------------------------------------------------- 1 | t,te,dp,dp_prime,dp_prime_te 2 | 
0.00417,0.00417,0.57,, 3 | 8.33E-03,8.33E-03,3.81,6.75,6.76 4 | 1.25E-02,1.25E-02,6.55,9.87,9.88 5 | 1.67E-02,1.67E-02,10.03,13.98,13.99 6 | 2.08E-02,2.08E-02,13.27,17.29,17.32 7 | 2.50E-02,2.50E-02,16.77,20.08,20.12 8 | -------------------------------------------------------------------------------- /test/test.bat: -------------------------------------------------------------------------------- 1 | :: Run tests and generate report 2 | 3 | flake8 %~dp0..\tafra 4 | mypy %~dp0..\tafra 5 | 6 | sphinx-build -W -b html -a %~dp0..\docs %~dp0..\docs\_build\html 7 | 8 | pytest 9 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | 6 | echo flake8 ../tafra 7 | flake8 $DIR/../tafra 8 | echo 9 | 10 | echo mypy ../tafra 11 | mypy $DIR/../tafra 12 | echo 13 | 14 | echo sphinx-build -W -b html -a ../docs ../docs/_build/html 15 | sphinx-build -W -b html -a $DIR/..docs $DIR/..docs/_build/html 16 | echo 17 | 18 | echo pytest 19 | pytest 20 | -------------------------------------------------------------------------------- /test/test_tafra.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import platform 3 | import warnings 4 | from decimal import Decimal 5 | from datetime import date, datetime 6 | 7 | import numpy as np 8 | from tafra import Tafra, object_formatter 9 | import pandas as pd # type: ignore 10 | from itertools import islice 11 | 12 | from typing import Dict, List, Any, Iterator, Iterable, Sequence, Tuple, Optional, Type 13 | 14 | import pytest # type: ignore 15 | from unittest.mock import MagicMock 16 | 17 | 18 | class TestClass: 19 | ... 
20 | 21 | 22 | class Series: 23 | name: str = 'x' 24 | values: np.ndarray = np.arange(5) 25 | dtype: str = 'int' 26 | 27 | 28 | class DataFrame: 29 | _data: Dict[str, Series] = {'x': Series(), 'y': Series()} 30 | columns: List[str] = ['x', 'y'] 31 | dtypes: List[str] = ['int', 'int'] 32 | 33 | def __getitem__(self, column: str) -> Series: 34 | return self._data[column] 35 | 36 | def __setitem__(self, column: str, value: np.ndarray) -> None: 37 | self._data[column].values = value 38 | 39 | 40 | class Cursor: 41 | description = ( 42 | ('Fruit', str, None, 1, 1, 1, True), 43 | ('Amount', int, None, 1, 1, 1, True), 44 | ('Price', float, None, 1, 1, 1, True) 45 | ) 46 | _iter = [ 47 | ('Apples', 5, .95), 48 | ('Pears', 2, .80) 49 | ] 50 | idx = 0 51 | 52 | def __iter__(self) -> Iterator[Tuple[Any, ...]]: 53 | return self 54 | 55 | def __next__(self) -> Tuple[Any, ...]: 56 | try: 57 | item = self._iter[self.idx] 58 | except IndexError: 59 | raise StopIteration() 60 | self.idx += 1 61 | return item 62 | 63 | def execute(self, sql: str) -> None: 64 | ... 
65 | 66 | def fetchone(self) -> Optional[Tuple[Any, ...]]: 67 | try: 68 | return next(self) 69 | except: 70 | return None 71 | 72 | def fetchmany(self, size: int) -> List[Tuple[Any, ...]]: 73 | return list(islice(self, size)) 74 | 75 | def fetchall(self) -> List[Tuple[Any, ...]]: 76 | return [rec for rec in self] 77 | 78 | 79 | def build_tafra() -> Tafra: 80 | return Tafra({ 81 | 'x': np.array([1, 2, 3, 4, 5, 6]), 82 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 83 | 'z': np.array([0, 0, 0, 1, 1, 1]) 84 | }) 85 | 86 | 87 | def check_tafra(t: Tafra, check_rows: bool = True) -> bool: 88 | assert len(t._data) == len(t._dtypes) 89 | for c in t.columns: 90 | assert isinstance(t[c], np.ndarray) 91 | assert isinstance(t.data[c], np.ndarray) 92 | assert isinstance(t._data[c], np.ndarray) 93 | assert isinstance(t.dtypes[c], str) 94 | assert isinstance(t._dtypes[c], str) 95 | if check_rows: 96 | assert t._rows == len(t._data[c]) 97 | pd.Series(t._data[c]) 98 | 99 | columns = [c for c in t.columns][:-1] 100 | 101 | _ = t.to_records() 102 | _ = t.to_records(columns=columns) 103 | _ = t.to_list() 104 | _ = t.to_list(columns=columns) 105 | _ = t.to_list(inner=True) 106 | _ = t.to_list(columns=columns, inner=True) 107 | _ = t.to_tuple() 108 | _ = t.to_tuple(columns=columns) 109 | _ = t.to_tuple(name=None) 110 | _ = t.to_tuple(name='tf') 111 | _ = t.to_tuple(columns=columns, name=None) 112 | _ = t.to_tuple(columns=columns, name='tf') 113 | _ = t.to_tuple(inner=True) 114 | _ = t.to_tuple(inner=True, name=None) 115 | _ = t.to_tuple(inner=True, name='tf') 116 | _ = t.to_tuple(columns=columns, inner=True) 117 | _ = t.to_tuple(columns=columns, inner=True, name=None) 118 | _ = t.to_tuple(columns=columns, inner=True, name='tf') 119 | _ = t.to_array() 120 | _ = t.to_array(columns=columns) 121 | df = t.to_pandas() 122 | df = t.to_pandas(columns=columns) 123 | assert isinstance(df, pd.DataFrame) 124 | write_path = Path('test/test_to_csv.csv') 125 | 
t.to_csv(write_path) 126 | # t.to_csv(write_path, columns=columns) 127 | 128 | return True 129 | 130 | def test_constructions() -> None: 131 | t = build_tafra() 132 | check_tafra(t) 133 | 134 | t = Tafra({ 135 | 'x': np.array([1, 2, 3, 4, 5, 6]), 136 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 137 | 'z': np.array([0, 0, 0, 1, 1, 1]) 138 | }, validate=False) 139 | check_tafra(t) 140 | 141 | t = Tafra({ 142 | 'x': np.array([1, 2, 3, 4, 5, 6]), 143 | 'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'), 144 | 'z': np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) 145 | }, validate=False, check_rows=False) 146 | check_tafra(t, check_rows=False) 147 | 148 | with pytest.raises(TypeError) as e: 149 | t = Tafra() # type: ignore # noqa 150 | 151 | with pytest.raises(ValueError) as e: 152 | t = Tafra({}) 153 | 154 | t = Tafra({'x': None}) 155 | with warnings.catch_warnings(record=True) as w: 156 | check_tafra(t) 157 | 158 | t = Tafra({'x': Decimal('1.23456')}) 159 | check_tafra(t) 160 | 161 | t = Tafra({'x': np.array(1)}) 162 | check_tafra(t) 163 | 164 | t = Tafra({'x': np.array([1])}) 165 | check_tafra(t) 166 | 167 | t = Tafra({'x': [True, False]}) 168 | check_tafra(t) 169 | 170 | t = Tafra({'x': 'test'}) 171 | check_tafra(t) 172 | 173 | t = Tafra((('x', np.arange(6)),)) 174 | check_tafra(t) 175 | 176 | t = Tafra([('x', np.arange(6))]) 177 | check_tafra(t) 178 | 179 | t = Tafra([['x', np.arange(6)]]) 180 | check_tafra(t) 181 | 182 | t = Tafra([(np.array('x'), np.arange(6))]) 183 | check_tafra(t) 184 | 185 | t = Tafra([(np.array(['x']), np.arange(6))]) 186 | check_tafra(t) 187 | 188 | t = Tafra([('x', np.arange(6)), ('y', np.linspace(0, 1, 6))]) 189 | check_tafra(t) 190 | 191 | t = Tafra([['x', np.arange(6)], ('y', np.linspace(0, 1, 6))]) 192 | check_tafra(t) 193 | 194 | t = Tafra([('x', np.arange(6)), ['y', np.linspace(0, 1, 6)]]) 195 | check_tafra(t) 196 | 197 | t = Tafra([['x', np.arange(6)], ['y', np.linspace(0, 1, 6)]]) 198 | 
check_tafra(t) 199 | 200 | t = Tafra([{'x': np.arange(6)}, {'y': np.linspace(0, 1, 6)}]) 201 | check_tafra(t) 202 | 203 | t = Tafra(iter([{'x': np.arange(6)}, {'y': np.linspace(0, 1, 6)}])) 204 | check_tafra(t) 205 | 206 | def iterator() -> Iterator[Dict[str, np.ndarray]]: 207 | yield {'x': np.array([1, 2, 3, 4, 5, 6])} 208 | yield {'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')} 209 | yield {'z': np.array([0, 0, 0, 1, 1, 1])} 210 | 211 | t = Tafra(iterator()) 212 | check_tafra(t) 213 | 214 | class DictIterable: 215 | def __iter__(self) -> Iterator[Dict[str, np.ndarray]]: 216 | yield {'x': np.array([1, 2, 3, 4, 5, 6])} 217 | yield {'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')} 218 | yield {'z': np.array([0, 0, 0, 1, 1, 1])} 219 | 220 | t = Tafra(DictIterable()) 221 | check_tafra(t) 222 | 223 | t = Tafra(iter(DictIterable())) 224 | check_tafra(t) 225 | 226 | class SequenceIterable: 227 | def __iter__(self) -> Iterator[Any]: 228 | yield ('x', np.array([1, 2, 3, 4, 5, 6])) 229 | yield ['y', np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object')] 230 | yield ('z', np.array([0, 0, 0, 1, 1, 1])) 231 | 232 | t = Tafra(SequenceIterable()) 233 | check_tafra(t) 234 | 235 | class SequenceIterable2: 236 | def __iter__(self) -> Iterator[Any]: 237 | yield (np.array(['x']), np.array([1, 2, 3, 4, 5, 6])) 238 | yield [np.array(['y']), np.array(['one', 'two', 'one', 'two', 'one', 'two'], 239 | dtype='object')] 240 | yield (np.array(['z']), np.array([0, 0, 0, 1, 1, 1])) 241 | 242 | t = Tafra(SequenceIterable2()) 243 | check_tafra(t) 244 | 245 | t = Tafra(iter(SequenceIterable2())) 246 | check_tafra(t) 247 | 248 | t = Tafra(enumerate(np.arange(6))) 249 | check_tafra(t) 250 | 251 | t = build_tafra() 252 | df = pd.DataFrame(t.data) 253 | _ = Tafra.from_series(df['x']) 254 | check_tafra(_) 255 | 256 | _ = Tafra.from_dataframe(df) 257 | check_tafra(_) 258 | 259 | _ = Tafra.as_tafra(df) 260 | check_tafra(_) 261 | 262 | _ 
= Tafra.as_tafra(df['x']) 263 | check_tafra(_) 264 | 265 | _ = Tafra.as_tafra(t) 266 | check_tafra(_) 267 | 268 | _ = Tafra.as_tafra({'x': np.array(1)}) 269 | check_tafra(_) 270 | 271 | _ = Tafra.from_series(Series()) 272 | check_tafra(_) 273 | 274 | _ = Tafra.as_tafra(Series()) 275 | check_tafra(_) 276 | 277 | _ = Tafra.from_dataframe(DataFrame()) # type: ignore 278 | check_tafra(_) 279 | 280 | _ = Tafra.as_tafra(DataFrame()) 281 | check_tafra(_) 282 | 283 | with pytest.raises(TypeError) as e: 284 | t = Tafra([{1, 2}]) # type: ignore 285 | 286 | class BadIterable: 287 | def __iter__(self) -> Iterator[Any]: 288 | yield {1, 2} 289 | yield {3.1412159, .5772156} 290 | 291 | with pytest.raises(TypeError) as e: 292 | t = Tafra(BadIterable()) 293 | 294 | with pytest.raises(TypeError) as e: 295 | t = Tafra(iter(BadIterable())) 296 | 297 | with pytest.raises(TypeError) as e: 298 | _ = Tafra(np.arange(6)) 299 | 300 | with pytest.raises(TypeError) as e: 301 | _ = Tafra.as_tafra(np.arange(6)) 302 | 303 | with pytest.raises(ValueError) as e: 304 | t = Tafra({'x': np.array([1, 2]), 'y': np.array([3., 4., 5.])}) 305 | 306 | def test_read_sql() -> None: 307 | 308 | cur = Cursor() 309 | columns, dtypes = zip(*((d[0], d[1]) for d in cur.description)) 310 | records = cur.fetchall() 311 | t = Tafra.from_records(records, columns) 312 | check_tafra(t) 313 | 314 | t = Tafra.from_records(records, columns, dtypes) 315 | check_tafra(t) 316 | 317 | cur = Cursor() 318 | t = Tafra.read_sql('SELECT * FROM [Table]', cur) # type: ignore 319 | check_tafra(t) 320 | 321 | cur = Cursor() 322 | cur._iter = [] 323 | t = Tafra.read_sql('SELECT * FROM [Table]', cur) # type: ignore 324 | check_tafra(t) 325 | 326 | cur = Cursor() 327 | for t in Tafra.read_sql_chunks('SELECT * FROM [Table]', cur): # type: ignore 328 | check_tafra(t) 329 | 330 | cur = Cursor() 331 | cur._iter = [] 332 | for t in Tafra.read_sql_chunks('SELECT * FROM [Table]', cur): # type: ignore 333 | check_tafra(t) 334 | 335 | 336 | def 
test_destructors() -> None: 337 | def gen_values() -> Iterator[Dict[str, np.ndarray]]: 338 | yield {'x': np.arange(6)} 339 | yield {'y': np.arange(6)} 340 | 341 | t = Tafra(gen_values()) 342 | check_tafra(t) 343 | 344 | t = build_tafra() 345 | t = t.update_dtypes({'x': 'float'}) 346 | t.data['x'][2] = np.nan 347 | check_tafra(t) 348 | 349 | _ = tuple(t.to_records()) 350 | _ = tuple(t.to_records(columns='x')) 351 | _ = tuple(t.to_records(columns=['x'])) 352 | _ = tuple(t.to_records(columns=['x', 'y'])) 353 | _ = tuple(t.to_records(cast_null=False)) 354 | _ = tuple(t.to_records(columns='x', cast_null=False)) 355 | _ = tuple(t.to_records(columns=['x'], cast_null=False)) 356 | _ = tuple(t.to_records(columns=['x', 'y'], cast_null=False)) 357 | 358 | _ = t.to_list() 359 | _ = t.to_list(columns='x') 360 | _ = t.to_list(columns=['x']) 361 | _ = t.to_list(columns=['x', 'y']) 362 | 363 | _ = t.to_list(inner=True) 364 | _ = t.to_list(columns='x', inner=True) 365 | _ = t.to_list(columns=['x'], inner=True) 366 | _ = t.to_list(columns=['x', 'y'], inner=True) 367 | 368 | _ = t.to_tuple() 369 | _ = t.to_tuple(columns='x') 370 | _ = t.to_tuple(columns=['x']) 371 | _ = t.to_tuple(columns=['x', 'y']) 372 | 373 | _ = t.to_tuple(inner=True) 374 | _ = t.to_tuple(columns='x', inner=True) 375 | _ = t.to_tuple(columns=['x'], inner=True) 376 | _ = t.to_tuple(columns=['x', 'y'], inner=True) 377 | 378 | _ = t.to_array() 379 | _ = t.to_array(columns='x') 380 | _ = t.to_array(columns=['x']) 381 | _ = t.to_array(columns=['x', 'y']) 382 | 383 | _ = t.to_pandas() 384 | _ = t.to_pandas(columns='x') 385 | _ = t.to_pandas(columns=['x']) 386 | _ = t.to_pandas(columns=['x', 'y']) 387 | 388 | filepath = Path('test/test_to_csv.csv') 389 | t.to_csv(filepath) 390 | t.to_csv(filepath, columns='x') 391 | t.to_csv(filepath, columns=['x']) 392 | t.to_csv(filepath, columns=['x', 'y']) 393 | 394 | 395 | def test_properties() -> None: 396 | t = build_tafra() 397 | _ = t.columns 398 | _ = t.rows 399 | _ = t.data 
400 | _ = t.dtypes 401 | _ = t.size 402 | _ = t.ndim 403 | _ = t.shape 404 | 405 | with pytest.raises(ValueError) as e: 406 | t.columns = ['x', 'a'] # type: ignore 407 | 408 | with pytest.raises(ValueError) as e: 409 | t.rows = 3 410 | 411 | with pytest.raises(ValueError) as e: 412 | t.data = {'x': np.arange(6)} 413 | 414 | with pytest.raises(ValueError) as e: 415 | t.dtypes = {'x': 'int'} 416 | 417 | with pytest.raises(ValueError) as e: 418 | t.size = 3 419 | 420 | with pytest.raises(ValueError) as e: 421 | t.ndim = 3 422 | 423 | with pytest.raises(ValueError) as e: 424 | t.shape = (10, 2) 425 | 426 | def test_views() -> None: 427 | t = build_tafra() 428 | _ = t.keys() 429 | _ = t.values() 430 | _ = t.items() 431 | _ = t.get('x') 432 | 433 | def test_assignment() -> None: 434 | t = build_tafra() 435 | t['x'] = np.arange(6) 436 | t['x'] = 3 437 | t['x'] = 6 438 | t['x'] = 'test' 439 | t['x'] = list(range(6)) 440 | t['x'] = np.array(6) 441 | t['x'] = np.array([6]) 442 | t['x'] = iter([1, 2, 3, 4, 5, 6]) 443 | t['x'] = range(6) 444 | check_tafra(t) 445 | 446 | with pytest.raises(ValueError) as e: 447 | t['x'] = np.arange(3) 448 | 449 | def test_dtype_update() -> None: 450 | t = build_tafra() 451 | assert t._data['x'].dtype != np.dtype(object) 452 | t.update_dtypes_inplace({'x': 'O'}) 453 | assert t._data['x'].dtype == np.dtype(object) 454 | check_tafra(t) 455 | 456 | t = build_tafra() 457 | assert t._data['x'].dtype != np.dtype(object) 458 | _ = t.update_dtypes({'x': 'O'}) 459 | assert _._data['x'].dtype == np.dtype(object) 460 | check_tafra(_) 461 | 462 | 463 | def test_select() -> None: 464 | t = build_tafra() 465 | _ = t.select('x') 466 | _ = t.select(['x']) 467 | _ = t.select(['x', 'y']) 468 | 469 | with pytest.raises(ValueError) as e: 470 | _ = t.select('a') 471 | 472 | def test_formatter() -> None: 473 | _ = str(object_formatter) 474 | 475 | t = Tafra({'x': Decimal(1.2345)}) 476 | assert t._dtypes['x'] == 'float64' 477 | assert t['x'].dtype == np.dtype(float) 
def test_prints() -> None:
    # Smoke-test all display paths: pformat/pprint/head plus the IPython
    # and Jupyter repr hooks.
    t = build_tafra()
    _ = t.pformat()
    t.pprint()
    t.head(5)

    # _repr_pretty_ expects an IPython printer; emulate it with a mock
    # whose .text() simply prints
    mock = MagicMock()
    mock.text = print
    t._repr_pretty_(mock, True)
    t._repr_pretty_(mock, False)

    _ = t._repr_html_()


def test_dunder() -> None:
    # len() and str() must both succeed.
    t = build_tafra()
    l = len(t)
    s = str(t)


def test_update() -> None:
    t = build_tafra()
    t2 = build_tafra()
    _ = t2.update(t2)
    check_tafra(_)

    t.update_inplace(t2)
    check_tafra(t)

    # update() also accepts a plain dict of columns
    _ = t.update(t2._data)  # type: ignore
    check_tafra(_)


def test_coalesce_dtypes() -> None:
    # A column written directly into _data has no dtype entry until
    # _coalesce_dtypes() reconciles the two mappings.
    t = build_tafra()
    t._data['a'] = np.arange(6)
    assert 'a' not in t._dtypes

    t._coalesce_dtypes()
    assert 'a' in t._dtypes
    check_tafra(t)


def test_update_dtypes() -> None:
    # dtype updates accept Python types as well as dtype strings; check
    # both the in-place and the copying variant.
    t = build_tafra()
    t.update_dtypes_inplace({'x': float})
    check_tafra(t)
    assert t['x'].dtype == 'float'
    assert isinstance(t['x'][0], np.float64)

    t = build_tafra()
    _ = t.update_dtypes({'x': float})
    check_tafra(_)
    assert _['x'].dtype == 'float'
    assert isinstance(_['x'][0], np.float64)


def test_rename() -> None:
    # In-place rename mutates t...
    t = build_tafra()
    t.rename_inplace({'x': 'a'})
    assert 'a' in t.data
    assert 'a' in t.dtypes
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    # ...while rename() returns a new tafra.
    t = build_tafra()
    _ = t.rename({'x': 'a'})
    assert 'a' in _.data
    assert 'a' in _.dtypes
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(_)
def test_delete() -> None:
    # delete_inplace accepts a single name or a list of names and removes
    # both the data column and its dtype entry.
    t = build_tafra()
    t.delete_inplace('x')
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    t = build_tafra()
    t.delete_inplace(['x'])
    assert 'x' not in t.data
    assert 'x' not in t.dtypes
    check_tafra(t)

    t = build_tafra()
    t.delete_inplace(['x', 'y'])
    assert 'x' not in t.data
    assert 'y' not in t.dtypes
    assert 'x' not in t.data
    assert 'y' not in t.dtypes
    check_tafra(t)

    # delete() returns a new tafra and leaves the original intact
    t = build_tafra()
    _ = t.delete('x')
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(t)
    check_tafra(_)

    t = build_tafra()
    _ = t.delete(['x'])
    assert 'x' not in _.data
    assert 'x' not in _.dtypes
    check_tafra(t)
    check_tafra(_)

    t = build_tafra()
    _ = t.delete(['x', 'y'])
    assert 'x' not in _.data
    assert 'y' not in _.dtypes
    assert 'x' not in _.data
    assert 'y' not in _.dtypes
    check_tafra(t)
    check_tafra(_)


def test_iter_methods() -> None:
    # Exhaust every iterator flavour offered by the tafra.
    t = build_tafra()
    for _ in t:
        pass

    for _ in t.iterrows():
        pass

    for _ in t.itercols():
        pass

    for _ in t.itertuples():
        pass

    for _ in t.itertuples(name='test'):
        pass

    for _ in t.itertuples(name=None):
        pass


def test_groupby() -> None:
    t = build_tafra()
    gb = t.group_by(
        ['y', 'z'], {'x': sum}, {'count': len}
    )
    check_tafra(gb)


def test_groupby_iter_fn() -> None:
    # an aggregation value may also be a (fn, source_column) pair that
    # writes the result under a new column name
    t = build_tafra()
    gb = t.group_by(
        ['y', 'z'], {
            'x': sum,
            'new_x': (sum, 'x')
        }, {'count': len}
    )
    check_tafra(gb)
def test_transform() -> None:
    # transform() applies group-by aggregations back onto a tafra of the
    # original (full) length.
    t = build_tafra()
    tr = t.transform(
        ['y', 'z'], {'x': sum}, {'id': max}
    )
    check_tafra(tr)


def test_iterate_by_attr() -> None:
    # Columns may be addressed either as attributes or by key; both views
    # write through to the same underlying arrays.
    t = build_tafra()
    t.id = np.empty(t.rows, dtype=int)  # type: ignore
    t['id'] = np.empty(t.rows, dtype=int)
    for i, (u, ix, grouped) in enumerate(t.iterate_by(['y', 'z'])):
        t['x'][ix] = sum(grouped['x'])
        t.id[ix] = len(grouped['x'])  # type: ignore
        t['id'][ix] = max(grouped['x'])
    check_tafra(t)


def test_iterate_by() -> None:
    # iterate_by yields (unique_value, index, group) with the group
    # itself being a Tafra.
    t = build_tafra()
    for u, ix, grouped in t.iterate_by(['y']):
        assert isinstance(grouped, Tafra)


def test_group_by_in_iterate_by() -> None:
    # FIX: renamed from `group_by_in_iterate_by` -- without the `test_`
    # prefix pytest never collected this test, so it silently never ran.
    t = build_tafra()
    for u, ix, grouped in t.iterate_by(['y']):
        assert isinstance(grouped.group_by(['z'], {'x': sum}), Tafra)


# backward-compatible alias for the old (uncollected) name
group_by_in_iterate_by = test_group_by_in_iterate_by


def test_update_transform() -> None:
    t = build_tafra()
    t.update(t.transform(['y'], {}, {'id': max}))

    # demean x within each y-group in place
    for u, ix, it in t.iterate_by(['y']):
        t['x'][ix] = it['x'] - np.mean(it['x'])
    check_tafra(t)


def test_transform_assignment() -> None:
    # Writing into a grouped slice must not corrupt either tafra.
    t = build_tafra()
    for u, ix, it in t.iterate_by(['y']):
        it['x'][0] = 9
    check_tafra(t)
    check_tafra(it)


def test_invalid_agg() -> None:
    # Aggregation mappings must be {column: function}; the reversed
    # {function: column} orientation is rejected.
    t = build_tafra()
    with pytest.raises(ValueError) as e:
        gb = t.group_by(
            ['y', 'z'], {sum: 'x'}  # type: ignore
        )

    with pytest.raises(ValueError) as e:
        gb = t.group_by(
            ['y', 'z'], {}, {len: 'count'}  # type: ignore
        )


def test_map() -> None:
    # row_map/tuple_map/col_map forward extra args to the mapped
    # function; key_map maps over (key, column) pairs.
    t = build_tafra()

    # FIX: annotation previously claimed `-> Tafra` but the function
    # returns a list of tafras; string annotation avoids requiring a
    # typing import at runtime.
    def repeat(tf: Tafra, repeats: int) -> 'List[Tafra]':
        return [tf for _ in range(repeats)]

    _ = list(t.row_map(repeat, 6))
    _ = list(t.tuple_map(repeat, 6))
    _ = list(t.col_map(repeat, repeats=6))
    _ = Tafra(t.key_map(np.repeat, repeats=6))
def test_pipe() -> None:
    # pipe() and the >> operator both thread a tafra through a callable;
    # extra positional/keyword args are forwarded by pipe().
    def fn1(t: Tafra) -> Tafra:
        return t[t['y'] == 'one']
    def fn2(t: Tafra) -> Tafra:
        return t[t['z'] == 0]

    t = build_tafra()
    check_tafra(t.pipe(fn1))
    check_tafra(t >> fn1)
    check_tafra(t.pipe(fn1).pipe(fn2))
    check_tafra(t >> fn1 >> fn2)

    def fn3(t: Tafra, i: int) -> Tafra:
        return t[t['x'] == i]

    check_tafra(t.pipe(fn3, 1))
    check_tafra(t.pipe(fn3, i=1))
    check_tafra(t >> (lambda t: fn3(t, i=1)))


def test_union() -> None:
    # Happy path: union of two identical schemas doubles the row count.
    t = build_tafra()
    t2 = build_tafra()

    _ = t2.union(t)
    check_tafra(_)
    assert len(_) == len(t) + len(t2)

    t2.union_inplace(t)
    check_tafra(t2)
    assert len(t2) == 2 * len(t)

    # dtype entry present on only one side -> error
    t = build_tafra()
    t2 = build_tafra()
    t._dtypes['a'] = 'int'
    with pytest.raises(Exception) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2._dtypes['a'] = 'int'
    with pytest.raises(Exception) as e:
        t.union_inplace(t2)

    # extra column present on only one side -> ValueError
    t = build_tafra()
    t2 = build_tafra()
    t['a'] = np.arange(6)
    with pytest.raises(ValueError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2['a'] = np.arange(6)
    with pytest.raises(ValueError) as e:
        t.union_inplace(t2)

    # mismatched column names -> TypeError
    t = build_tafra()
    t2 = build_tafra()
    t.rename_inplace({'x': 'a'})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2.rename_inplace({'x': 'a'})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    # mismatched dtypes -> TypeError (either via a real dtype change or
    # by poking the dtype mapping directly)
    t = build_tafra()
    t2 = build_tafra()
    t.update_dtypes_inplace({'x': float})
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)

    t = build_tafra()
    t2 = build_tafra()
    t2._dtypes['x'] = 'float'
    with pytest.raises(TypeError) as e:
        t.union_inplace(t2)
def test_slice() -> None:
    # Slicing produces a view: writes through the slice hit the parent.
    t = build_tafra()
    _ = t[:3]
    _['x'][0] = 0
    check_tafra(_)

    t = build_tafra()
    _ = t[slice(0, 3)]
    _['x'][0] = 7
    check_tafra(_)
    check_tafra(t)

    # .copy() detaches the slice from the parent
    t = build_tafra()
    _ = t[:3].copy()
    _['x'][0] = 9
    check_tafra(_)
    check_tafra(t)

    # boolean-mask indexing
    t = build_tafra()
    _ = t[t['x'] <= 4]
    _['x'][1] = 15
    check_tafra(_)
    check_tafra(t)

    # every supported indexer flavour: int, int list/array, bool
    # list/array, column-name list/tuple, mixed bool-int list
    t = build_tafra()
    _ = t[2]
    _ = t[[1, 3]]
    _ = t[np.array([2, 4])]
    _ = t[[True, False, True, True, False, True]]
    _ = t[np.array([True, False, True, True, False, True])]
    _ = t[['x', 'y']]
    _ = t[('x', 'y')]
    _ = t[[True, 2]]
    check_tafra(_)
    check_tafra(t)

    # invalid indexers: 2-D arrays, wrong-length masks, int tuples,
    # floats, mixed str-int lists, dicts, arbitrary objects
    with pytest.raises(IndexError) as e:
        _ = t[np.array([[1, 2]])]

    with pytest.raises(IndexError) as e:
        _ = t[[True, False]]

    with pytest.raises(IndexError) as e:
        _ = t[np.array([True, False])]

    with pytest.raises(IndexError) as e:
        _ = t[(1, 2)]  # noqa

    with pytest.raises(IndexError) as e:
        _ = t[(1, 2.)]  # type: ignore # noqa

    with pytest.raises(ValueError) as e:
        _ = t[['x', 2]]

    with pytest.raises(TypeError) as e:
        _ = t[{'x': [1, 2]}]  # type: ignore

    with pytest.raises(TypeError) as e:
        _ = t[TestClass()]  # type: ignore # noqa

    with pytest.raises(IndexError) as e:
        _ = t[[1, 2.]]  # type: ignore

    with pytest.raises(IndexError) as e:
        _ = t[np.array([1, 2.])]


def test_invalid_dtypes() -> None:
    # unknown dtype strings are rejected
    t = build_tafra()
    with pytest.raises(Exception) as e:
        t.update_dtypes({'x': 'flot', 'y': 'st'})


def test_invalid_assignment() -> None:
    t = build_tafra()
    _ = build_tafra()
    # inject a row-count mismatch directly into the backing dict
    _._data['x'] = np.arange(5)

    with pytest.raises(Exception) as e:
        _._update_rows()

    with pytest.raises(Exception) as e:
        _ = t.update(_)

    with pytest.raises(Exception) as e:
        t.update_inplace(_)

    # 2-D inputs of shape (n, 1) or (1, n) are squeezed to 1-D with a
    # warning whose message text is pinned here
    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.arange(6)[:, None]
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6))
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6)).T
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    with warnings.catch_warnings(record=True) as w:
        t['x'] = np.atleast_2d(np.arange(6))
        assert str(w[0].message) == '`np.squeeze(ndarray)` applied to set ndim == 1.'

    # genuinely 2-D data cannot be squeezed and must raise
    with pytest.raises(Exception) as e:
        t['x'] = np.repeat(np.arange(6)[:, None], repeats=2, axis=1)
def test_datetime() -> None:
    # NOTE(review): the middle of this function was garbled in the source
    # dump (`{'d': ' None:` -- the text from the dtype string through the
    # next def line was eaten). Reconstructed as '<M8[D]' (numpy
    # day-resolution datetime64 spec) from the surrounding lines --
    # confirm against the upstream test file.
    t = build_tafra()
    t['d'] = np.array([np.datetime64(_, 'D') for _ in range(6)])
    t.update_dtypes({'d': '<M8[D]'})
    check_tafra(t)


def test_datetime_object() -> None:
    # datetime objects land as dtype=object until a formatter parses them
    t = build_tafra()
    t['d'] = np.array([datetime.fromisoformat(f'2020-0{_+1}-01') for _ in range(6)])
    assert t._dtypes['d'] == 'object'
    check_tafra(t)

    # registering a datetime formatter lets parse_object_dtypes() cast
    # the object column to datetime64[D]
    object_formatter['datetime'] = lambda x: x.astype('datetime64[D]')
    t2 = t.parse_object_dtypes()
    assert t2['d'].dtype == np.dtype('datetime64[D]')
    check_tafra(t2)

    t.parse_object_dtypes_inplace()
    assert t['d'].dtype == np.dtype('datetime64[D]')
    check_tafra(t)


def test_coalesce() -> None:
    # coalesce() fills None entries from successive fallback sequences;
    # coalescing onto a missing column ('y') creates it. The second
    # fallback never supplies index 3, so t['y'][3] stays None.
    t = Tafra({'x': np.array([1, 2, None, 4, None])})
    t['x'] = t.coalesce('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    t['y'] = t.coalesce('y', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    assert np.all(t['x'] != np.array(None))
    assert t['y'][3] == np.array(None)
    check_tafra(t)

    t = Tafra({'x': np.array([1, 2, None, 4, None])})
    t.coalesce_inplace('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    t.coalesce_inplace('y', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
    assert np.all(t['x'] != np.array(None))
    assert t['y'][3] == np.array(None)
    check_tafra(t)

    # single-row edge case
    t = Tafra({'x': np.array([None])})
    t.coalesce('x', [[1], [None]])
    check_tafra(t)


def test_left_join_equi() -> None:
    # Left join over single- and multi-predicate equi joins plus one
    # inequality predicate, with varying right-hand key multiplicity.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([2, 2, 2, 3, 3, 3])
    })
    t = l.left_join(r, [('x', 'a', '=='), ('z', 'c', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        '_a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '=='), ('x', '_a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.left_join(r, [('x', 'a', '<')], ['x', 'y', 'a', 'b'])
    check_tafra(t)
def test_inner_join() -> None:
    # Inner join over an equi predicate and one inequality predicate,
    # with varying key multiplicity on the right-hand side.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    # unique right-hand keys
    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # duplicated right-hand keys (pairs)
    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # duplicated right-hand keys (triples)
    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
    check_tafra(t)

    # inequality predicate
    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    t = l.inner_join(r, [('x', 'a', '<=')], ['x', 'y', 'a', 'b'])
    check_tafra(t)
def test_cross_join() -> None:
    # Cartesian product of two tafras; `select` restricts the output
    # columns but must name columns from both sides.
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 2, 2, 3, 3]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })
    t = l.cross_join(r)
    check_tafra(t)

    r = Tafra({
        'a': np.array([1, 1, 1, 2, 2, 2]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    t = l.cross_join(r, select=['x', 'z', 'a', 'c'])
    check_tafra(t)

    # select drawing from only one side raises
    with pytest.raises(IndexError) as e:
        t = l.cross_join(r, select=['x', 'z'])

    with pytest.raises(IndexError) as e:
        t = l.cross_join(r, select=['a', 'c'])


def test_left_join_invalid() -> None:
    l = Tafra({
        'x': np.array([1, 2, 3, 4, 5, 6]),
        'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'z': np.array([0, 0, 0, 1, 1, 1])
    })

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    # unknown join operator
    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '===')], ['x', 'y', 'a', 'b'])

    # dtype mismatch between join keys (int vs float)
    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6], dtype='float'),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])

    r = Tafra({
        'a': np.array([1, 2, 3, 4, 5, 6]),
        'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
        'c': np.array([0, 0, 0, 1, 1, 1])
    })

    # dtype mapping poked out of sync with the actual data also raises
    l._dtypes['x'] = 'float'
    with pytest.raises(TypeError) as e:
        t = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
def test_csv() -> None:
    # End-to-end CSV round-trips against the fixture files test/ex1..6.csv.
    write_path = 'test/test_to_csv.csv'

    def write_reread(t: Tafra) -> None:
        # write t out and read it back with its own dtypes; data and
        # dtypes must round-trip exactly
        t.to_csv(write_path)
        t2 = Tafra.read_csv(write_path, dtypes=t.dtypes)

        for c1, c2 in zip(t.columns, t2.columns):
            assert np.array_equal(t.data[c1], t2.data[c2])
            assert np.array_equal(t.dtypes[c1], t2.dtypes[c2])

    # straightforward CSV - inference heuristic works
    path = Path('test/ex1.csv')
    t = Tafra.read_csv(path)
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # test again with TextIOWrapper
    with open('test/ex1.csv', 'r') as f:
        t = Tafra.read_csv(f)
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # writing to an already-open handle works in 'w' mode but raises on a
    # read-mode handle
    with open(write_path, 'w') as f:
        t.to_csv(f)
    with pytest.raises(ValueError) as e:
        with open(write_path) as f:
            t.to_csv(f)

    # short CSV - ends during inference period
    t = Tafra.read_csv('test/ex2.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'bool'
    assert t.dtypes['c'] == 'float64'
    assert t.rows == 2
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # harder CSV - promote to object during inference period,
    # duplicate column name
    t = Tafra.read_csv('test/ex3.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # as above, but with a promotion required after inference period
    # (heuristic fails)
    t = Tafra.read_csv('test/ex4.csv')
    assert t.dtypes['a'] == 'int32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)

    # bad CSV - missing column on row #4
    with pytest.raises(ValueError) as e:
        t = Tafra.read_csv('test/ex5.csv')

    # bad CSV - missing column on row #4 - after guess rows
    with pytest.raises(ValueError) as e:
        t = Tafra.read_csv('test/ex5.csv', guess_rows=2)

    # missing column - but numpy will automatically convert missing (None) to nan
    t = Tafra.read_csv('test/ex6.csv')
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float64'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # missing column - do not automatically cast
    t = Tafra.read_csv('test/ex6.csv', missing=None)
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'object'
    assert t.dtypes['dp_prime_te'] == 'object'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # object columns can be cast to float afterwards
    t.update_dtypes_inplace({'dp_prime': float, 'dp_prime_te': 'float64'})
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float64'
    check_tafra(t)

    # force dtypes on missing columns
    t = Tafra.read_csv('test/ex6.csv', missing=None, dtypes={'dp_prime': np.float64, 'dp_prime_te': np.float32})
    assert t.dtypes['dp'] == 'float64'
    assert t.dtypes['dp_prime'] == 'float64'
    assert t.dtypes['dp_prime_te'] == 'float32'
    assert t.dtypes['t'] == 'float64'
    assert t.dtypes['te'] == 'float64'
    check_tafra(t)

    # override a column type
    t = Tafra.read_csv('test/ex4.csv', dtypes={'a': 'float32'})
    assert t.dtypes['a'] == 'float32'
    assert t.dtypes['b'] == 'object'
    assert t.dtypes['b (2)'] == 'float64'
    assert t.rows == 6
    assert len(t.columns) == 3
    check_tafra(t)
    write_reread(t)