├── .flake8 ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS.md ├── COPYING ├── Makefile ├── NEWS.md ├── README.md ├── benchmark-versions.sh ├── benchmark.py ├── bin ├── di-csv2json ├── di-format-geojson ├── di-geojson2csv ├── di-json2csv └── di-open ├── data ├── README.md ├── downloads.csv ├── downloads.json ├── holidays.csv ├── holidays.json ├── listings-reviews.csv ├── listings-reviews.json ├── listings.csv ├── listings.json ├── listings.py ├── neighbourhoods.geojson ├── vehicles.csv └── vehicles.json ├── dataiter ├── __init__.py ├── aggregate.py ├── data_frame.py ├── deco.py ├── dt.py ├── dtypes.py ├── geojson.py ├── io.py ├── list_of_dicts.py ├── regex.py ├── test │ ├── __init__.py │ ├── test_aggregate.py │ ├── test_data_frame.py │ ├── test_dt.py │ ├── test_geojson.py │ ├── test_io.py │ ├── test_list_of_dicts.py │ ├── test_regex.py │ ├── test_util.py │ └── test_vector.py ├── util.py └── vector.py ├── doc ├── Makefile ├── aggregation.rst ├── check.py ├── comparison.rst ├── comparison │ ├── Makefile │ ├── README.md │ ├── blocks │ │ ├── .flake8 │ │ ├── aggregate-dataiter.py │ │ ├── aggregate-dplyr.R │ │ ├── aggregate-pandas.py │ │ ├── cbind-dataiter.py │ │ ├── cbind-dplyr.R │ │ ├── cbind-pandas.py │ │ ├── chain-dataiter.py │ │ ├── chain-dplyr.R │ │ ├── chain-pandas.py │ │ ├── colnames-dataiter.py │ │ ├── colnames-dplyr.R │ │ ├── colnames-pandas.py │ │ ├── filter-dataiter.py │ │ ├── filter-dplyr.R │ │ ├── filter-pandas.py │ │ ├── grouped-modify-dataiter.py │ │ ├── grouped-modify-dplyr.R │ │ ├── grouped-modify-pandas.py │ │ ├── head-dataiter.py │ │ ├── head-dplyr.R │ │ ├── head-pandas.py │ │ ├── import-dataiter.py │ │ ├── import-dplyr.R │ │ ├── import-pandas.py │ │ ├── index-dataiter.py │ │ ├── index-dplyr.R │ │ ├── index-pandas.py │ │ ├── io-binary-dataiter.py │ │ ├── io-binary-dplyr.R │ │ ├── io-binary-pandas.py │ │ ├── io-csv-dataiter.py │ │ ├── io-csv-dplyr.R │ │ ├── io-csv-pandas.py │ │ ├── join-dataiter.py │ │ ├── join-dplyr.R │ │ ├── join-pandas.py │ │ ├── modify-dataiter.py │ │ ├── modify-dplyr.R │ │ ├── modify-pandas.py │ │ ├── non-join-dataiter.py │ │ ├── non-join-dplyr.R │ │ ├── non-join-pandas.py │ │ ├── rbind-dataiter.py │ │ ├── rbind-dplyr.R │ │ ├── rbind-pandas.py │ │ ├── rename-dataiter.py │ │ ├── rename-dplyr.R │ │ ├── rename-pandas.py │ │ ├── select-dataiter.py │ │ ├── select-dplyr.R │ │ ├── select-pandas.py │ │ ├── size-dataiter.py │ │ ├── size-dplyr.R │ │ ├── size-pandas.py │ │ ├── sort-dataiter.py │ │ ├── sort-dplyr.R │ │ ├── sort-pandas.py │ │ ├── unique-dataiter.py │ │ ├── unique-dplyr.R │ │ └── unique-pandas.py │ ├── build.py │ ├── generate.sh │ ├── index.html │ ├── prism.css │ └── prism.js ├── conf.py ├── data-frame-column.rst ├── data-frame.rst ├── dataiter.rst ├── dt.rst ├── dtypes.rst ├── geojson.rst ├── index.rst ├── list-of-dicts.rst ├── output.py ├── quick-start.rst ├── regex.rst ├── requirements.txt └── vector.rst ├── pyproject.toml ├── requirements.txt ├── tools ├── check-missing.py └── release └── validation ├── generate-df.py ├── generate-ld.py ├── generate.R ├── validate-df.sh └── validate-ld.sh /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = E1,E9,F 3 | ignore = E125,E129 4 | exclude = doc/comparison/blocks,venv 5 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push] 3 | jobs: 4 
| test: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python-version: ["3.9", "3.10", "3.11", "3.12"] 9 | steps: 10 | - uses: actions/checkout@v4 11 | - uses: actions/setup-python@v5 12 | with: 13 | python-version: ${{ matrix.python-version }} 14 | - run: pip install -U attd flake8 'numpy>=2.0,<3.0' pandas pyarrow pytest wcwidth 15 | - run: make check 16 | - run: make test 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.lprof 2 | *.prof 3 | .env 4 | .envrc 5 | .pytest_cache 6 | __pycache__ 7 | benchmark-head.py 8 | benchmark-versions.csv 9 | benchmark-versions.ods 10 | build 11 | dataiter.egg-info 12 | dist 13 | doc/_build 14 | doc/comparison/comparison.html 15 | test.py 16 | tmp.csv 17 | validation/*.csv 18 | venv 19 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # https://docs.readthedocs.io/en/stable/config-file/v2.html 2 | version: 2 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | sphinx: 8 | configuration: doc/conf.py 9 | python: 10 | install: 11 | - requirements: doc/requirements.txt 12 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Osmo Salomaa 2 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019-2025 Osmo Salomaa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | # EDITOR must wait! 4 | EDITOR = nano 5 | PREFIX = /usr/local 6 | PYTHON = python3 7 | 8 | check: 9 | flake8 . 10 | flake8 . 
`grep -Fl '#!/usr/bin/env python3' bin/*` 11 | 12 | clean: 13 | rm -rf *.lprof 14 | rm -rf *.prof 15 | rm -rf build 16 | rm -rf dataiter.egg-info 17 | rm -rf dist 18 | rm -rf doc/_build 19 | rm -rf doc/comparison/comparison.html 20 | rm -rf validation/*.csv 21 | rm -rf __pycache__ 22 | rm -rf */__pycache__ 23 | rm -rf */*/__pycache__ 24 | rm -rf .pytest_cache 25 | rm -rf */.pytest_cache 26 | rm -rf */*/.pytest_cache 27 | 28 | doc: 29 | $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build -C doc clean html 30 | 31 | doc-check: 32 | PYTHONPATH=. doc/check.py 33 | 34 | doc-open: 35 | xdg-open doc/_build/html/index.html 36 | 37 | doc-watch: 38 | watchexec -e py,rst --workdir doc $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build html 39 | 40 | install: 41 | pip3 install --break-system-packages . 42 | 43 | # Non-essential scripts, not installed by default. 44 | # Note that these don't go through setuptools rewriting, 45 | # instead they just do a plain unspecified dataiter import. 46 | install-cli: 47 | mkdir -p $(PREFIX)/bin 48 | for X in `ls bin | grep di-`; do \ 49 | cp -fv bin/$$X $(PREFIX)/bin && \ 50 | chmod +x $(PREFIX)/bin/$$X; \ 51 | done 52 | 53 | # Interactive! 54 | publish: 55 | $(MAKE) clean 56 | python3 -m build 57 | test -s dist/dataiter-*-py3-none-any.whl 58 | test -s dist/dataiter-*.tar.gz 59 | ls -l dist 60 | @printf "Press Enter to upload or Ctrl+C to abort: "; read _ 61 | twine upload dist/* 62 | sudo pip3 uninstall --break-system-packages -y dataiter || true 63 | sudo pip3 uninstall --break-system-packages -y dataiter || true 64 | sudo pip3 install --break-system-packages -U dataiter 65 | $(MAKE) test-installed 66 | 67 | # Interactive! 68 | release: 69 | $(MAKE) check doc-check test validate clean 70 | @echo "BUMP VERSION NUMBERS" 71 | $(EDITOR) bin/di-open 72 | $(EDITOR) dataiter/__init__.py 73 | $(EDITOR) benchmark-versions.sh 74 | @echo "ADD RELEASE NOTES" 75 | $(EDITOR) NEWS.md 76 | sudo $(MAKE) install clean 77 | $(MAKE) test-installed 78 | tools/release 79 | 80 | test: 81 | py.test . 82 | 83 | test-installed: 84 | cd && python3 -c "import dataiter; dataiter.DataFrame()" 85 | cd && python3 -c "import dataiter; dataiter.ListOfDicts()" 86 | 87 | validate: 88 | cd validation && DATAITER_USE_NUMBA=false ./validate-df.sh 89 | cd validation && DATAITER_USE_NUMBA=true ./validate-df.sh 90 | cd validation && ./validate-ld.sh 91 | 92 | venv: 93 | rm -rf venv 94 | $(PYTHON) -m venv venv 95 | . venv/bin/activate && \ 96 | pip install -U pip setuptools wheel && \ 97 | pip install -r requirements.txt 98 | 99 | .PHONY: check clean doc doc-check doc-open doc-watch install install-cli publish release test test-installed validate venv 100 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2025-02-07: Dataiter 1.0 2 | ======================== 3 | 4 | * Silence warnings about writing NPZ files with StringDType: 5 | "UserWarning: Custom dtypes are saved as python objects using the 6 | pickle protocol. Loading this file requires allow_pickle=True to be 7 | set." 8 | 9 | Dataiter can now be considered stable. If upgrading from <= 0.51, 10 | please read the release notes for 0.99–0.9999. 
11 | 12 | 2025-01-12: Dataiter 0.9999 13 | =========================== 14 | 15 | * New module `dataiter.regex` for vectorized regular expressions 16 | * Add proxy object `Vector.dt` for `dataiter.dt` 17 | * Add proxy object `Vector.re` for `dataiter.regex` 18 | * Add proxy object `Vector.str` for `numpy.strings` 19 | * Use PyArrow instead of Pandas to read and write CSV files 20 | * Replace Pandas dependency with PyArrow 21 | 22 | This is likely to be a breaking change for some rare, weirdly formatted 23 | CSV files that Pandas and PyArrow might parse differently, resulting in 24 | something like differently guessed data types or differently detected 25 | missing value markers. The note about stability below release 0.99 still 26 | applies. 27 | 28 | 2024-12-15: Dataiter 0.999 29 | ========================== 30 | 31 | * `DataFrame.from_arrow`: Remove `strings_as_object` argument 32 | * `DataFrame.from_pandas`: Remove `strings_as_object` argument 33 | * `DataFrame.read_csv`: Remove `strings_as_object` argument 34 | * `DataFrame.read_parquet`: Remove `strings_as_object` argument 35 | * `GeoJSON.read`: Remove `strings_as_object` argument 36 | * `ListOfDicts.to_data_frame`: Remove `strings_as_object` argument 37 | * `read_csv`: Remove `strings_as_object` argument 38 | * `read_geojson`: Remove `strings_as_object` argument 39 | * `read_parquet`: Remove `strings_as_object` argument 40 | * `Vector.as_string`: Remove `length` argument 41 | * `Vector.is_na`: Fix to work in multidimensional cases where the 42 | elements of an object vector are arrays/vectors 43 | * `Vector.rank`: Change default `method` to "min" 44 | * `Vector.rank`: Remove `method` "average" 45 | 46 | This is a breaking change to switch the string data type from the 47 | fixed-width `str_` a.k.a. `<U` to the variable-width `StringDType` introduced in NumPy 2.0. Note also that, because of the NumPy >= 2.0 requirement, any NPZ or Pickle 53 | files saved cannot be opened using Dataiter < 0.99 and NumPy < 2.0. If 54 | you need that kind of interoperability, consider using the Parquet file 55 | format (see the sketch after the 0.99 entry below). 56 | 57 | 2024-08-17: Dataiter 0.99 58 | ========================= 59 | 60 | * Adapt to changes in NumPy 2.0 61 | * Bump NumPy dependency to >= 2.0 62 | 63 | This is a minimal change to be NumPy 2.0 compatible. In the 0.99+ 64 | releases, we plan to adopt the new NumPy string dtype and fix any 65 | regressions that come up, leading to a 1.0 release when everything looks 66 | to be working reliably (#26). Anyone looking for extreme stability 67 | should consider avoiding the 0.99+ releases and waiting for 1.0.
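The 0.999 entry above points to the Parquet file format when saved files need to stay readable across Dataiter and NumPy versions. Below is a minimal, illustrative sketch of such a round trip using the `write_parquet` method (added in 0.42) and the `read_parquet` function; the column values and the file name `example.parquet` are made up for this example and are not part of the source.

```python
# Illustrative sketch only: round-trip a small data frame through Parquet,
# the format suggested above for cross-version interoperability.
import dataiter as di

data = di.DataFrame(x=[1, 2, 3], y=["a", "b", "c"])
data.write_parquet("example.parquet")  # hypothetical file name
data = di.read_parquet("example.parquet")
print(data)
```

Unlike NPZ or Pickle, a Parquet file is not tied to NumPy's own serialization, which is why the note above recommends it when older Dataiter/NumPy combinations still need to read the data.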
68 | 69 | 2024-06-24: Dataiter 0.51 70 | ========================= 71 | 72 | * Mark NumPy dependency as < 2.0 73 | 74 | 2024-04-06: Dataiter 0.50 75 | ========================= 76 | 77 | * `ListOfDicts.drop_na`: New method 78 | * `ListOfDicts.keys`: New method 79 | * `ListOfDicts.print_memory_use`: New method 80 | * Fix tabular display of Unicode characters with width != 1 81 | * Add dependency on wcwidth: https://pypi.org/project/wcwidth 82 | 83 | 2023-11-08: Dataiter 0.49 84 | ========================= 85 | 86 | * `dt`: Handle all NaT input 87 | * Migrate from `setup.py` to `hatch` and `pyproject.toml` 88 | 89 | 2023-10-08: Dataiter 0.48 90 | ========================= 91 | 92 | * `Vector.as_datetime`: Add `precision` argument 93 | * `Vector.concat`: New method 94 | * `Vector.sort`: Fix sorting object vectors 95 | 96 | 2023-09-09: Dataiter 0.47 97 | ========================= 98 | 99 | * `DataFrame`: Fix column and method name clash errors in certain operations 100 | * `dt.replace`: Allow vector arguments the same length as `x` 101 | 102 | 2023-09-05: Dataiter 0.46 103 | ========================= 104 | 105 | * `DataFrame.count`: New method, shorthand for 106 | `data.group_by(...).aggregate(n=di.count())` 107 | * `Vector.rank`: Handle empty and all-NA vectors 108 | 109 | 2023-06-14: Dataiter 0.45 110 | ========================= 111 | 112 | * `USE_NUMBA_CACHE`: New option, read from environment variable 113 | `DATAITER_USE_NUMBA_CACHE` if exists, defauls to `True` 114 | * Fix a possible issue with Numba caching 115 | 116 | 2023-06-13: Dataiter 0.44 117 | ========================= 118 | 119 | * Use `numba.extending.overload` instead of the deprecated 120 | `numba.generated_jit` 121 | 122 | 2023-06-08: Dataiter 0.43 123 | ========================= 124 | 125 | * `DataFrame`: Don't try to do joins on NA values in `by` columns 126 | * `DataFrame.drop_na`: New method 127 | 128 | 2023-05-30: Dataiter 0.42 129 | ========================= 130 | 131 | * `DataFrame`: Truncate multiline strings when printing 132 | * `DataFrame.from_arrow`: New method 133 | * `DataFrame.read_parquet`: New method 134 | * `DataFrame.to_arrow`: New method 135 | * `DataFrame.write_parquet`: New method 136 | * `read_parquet`: New function 137 | * `Vector.__init__`: Fix type guessing when mixing Python and NumPy 138 | floats or integers and missing values 139 | * Allow using a thousand separator when printing numbers, 140 | off by default, can be set with `dataiter.PRINT_THOUSAND_SEPARATOR` 141 | 142 | 2023-03-11: Dataiter 0.41 143 | ========================= 144 | 145 | * Fix printing really small numbers 146 | 147 | 2023-02-21: Dataiter 0.40.1 148 | =========================== 149 | 150 | * `DataFrame.modify`: Fix grouped modify on unsorted data frame 151 | 152 | 2023-02-20: Dataiter 0.40 153 | ========================= 154 | 155 | * `Vector.map`: Add `dtype` argument 156 | 157 | 2023-02-06: Dataiter 0.39.1 158 | =========================== 159 | 160 | * `ListOfDicts.to_data_frame`: Add `strings_as_object` argument 161 | 162 | 2023-01-21: Dataiter 0.39 163 | ========================= 164 | 165 | * `read_csv`, `read_geojson`, `DataFrame.from_pandas`, 166 | `DataFrame.read_csv`, `GeoJSON.read`: Add `strings_as_object` argument 167 | 168 | 2022-12-15: Dataiter 0.38 169 | ========================= 170 | 171 | * `DataFrame.slice_off`: New method 172 | * `GeoJSON.to_data_frame`: New method 173 | * Fix error with new column placeholder attributes in conjunction with 174 | pop, popitem and clear 175 | 176 | 2022-11-17: Dataiter 0.37 
177 | ========================= 178 | 179 | * `DataFrame`: Add placeholder attributes for columns so that 180 | tab completion of columns as attributes at a shell works 181 | * `dt.from_string`: New function 182 | * `dt.to_string`: New function 183 | * `nrow`: Remove deprecated aggregation function 184 | * Don't use Numba for aggregation involving strings due to bad performance 185 | 186 | 2022-10-16: Dataiter 0.36 187 | ========================= 188 | 189 | * `dt`: New module for dealing with dates and datetimes 190 | 191 | 2022-10-03: Dataiter 0.35 192 | ========================= 193 | 194 | * `DataFrame.from_pandas`: Speed up by avoiding unnecessary conversions 195 | * `DataFrame.full_join`: Fix join and output when `by` is a tuple 196 | * `GeoJSON`: Fix printing object 197 | 198 | 2022-09-17: Dataiter 0.34 199 | ========================= 200 | 201 | * `Vector`: Handle timedeltas correctly for NA checks and printing 202 | * `Vector.is_timedelta`: New method 203 | 204 | 2022-09-03: Dataiter 0.33 205 | ========================= 206 | 207 | * `DataFrame.sort`: Convert object to string for sorting 208 | * `Vector.sort`: Convert object to string for sorting 209 | * Fix conditional Numba use when importing the numba package works, 210 | but caching doesn't 211 | * Add `di-open` cli command (currently not part of the default install, 212 | but can be installed from source using `make install-cli`) 213 | 214 | 2022-04-02: Dataiter 0.32 215 | ========================= 216 | 217 | * `DataFrame.modify`: Add support for grouped modification (#19) 218 | * `DataFrame.split`: New method 219 | * `ListOfDicts.split`: New method 220 | 221 | 2022-02-26: Dataiter 0.31 222 | ========================= 223 | 224 | * `DataFrame.compare`: New experimental method 225 | * `Vector.as_string`: Add `length` argument 226 | * Change the documentation to default to the latest release ("stable") 227 | instead of the development version ("latest") 228 | 229 | 2022-02-19: Dataiter 0.30 230 | ========================= 231 | 232 | * Use keyword-only arguments where appropriate – the general principle 233 | is that mandatory arguments are allowed as positional, but optional 234 | modifiers are keyword only 235 | * Rename all instances of "missing" to "na", such as `Vector.is_missing` 236 | to `Vector.is_na`, the only exception being 237 | `ListOfDicts.fill_missing`, which becomes 238 | `ListOfDicts.fill_missing_keys` 239 | * Truncate data frame object and string columns at 240 | `PRINT_TRUNCATE_WIDTH` (default 32) for printing 241 | 242 | 2022-02-09: Dataiter 0.29.2 243 | =========================== 244 | 245 | * Fix aggregation functions to work with all main data types: 246 | boolean, integer, float, date, datetime and string 247 | * Fix aggregation functions to handle all missing values (NaN, NaT, 248 | blank string) correctly, the same as implemented in Vector 249 | * Rename aggregation functions' `dropna` arguments to `drop_missing` 250 | * `first`, `last`, `nth`: Add `drop_missing` argument 251 | * `Vector.drop_missing`: New method 252 | 253 | 2022-01-30: Dataiter 0.29.1 254 | =========================== 255 | 256 | * `mode`: Fix to return first in case of ties (requires Python >= 3.8) 257 | * `std`, `var`: Add `ddof` argument (defaults to 0 on account of Numba limitations) 258 | * Don't try to dropna for non-float vectors in aggregation functions 259 | 260 | 2022-01-29: Dataiter 0.29 261 | ========================= 262 | 263 | * Add shorthand helper functions for use with `DataFrame.aggregate`, 264 | optionally using 
Numba JIT-compiled code for speed 265 | - https://dataiter.readthedocs.io/en/latest/aggregation.html 266 | - https://dataiter.readthedocs.io/en/latest/data-frame.html#dataiter.DataFrame.aggregate 267 | - https://dataiter.readthedocs.io/en/latest/dataiter.html 268 | * `DataFrame.map`: New method 269 | * `ncol`: Removed 270 | * `nrow`: Deprecated in favor of `dataiter.count` 271 | * `read_csv`: New alias for `DataFrame.read_csv` 272 | * `read_geojson`: New alias for `GeoJSON.read` 273 | * `read_json`: New alias for `ListOfDicts.read_json` 274 | * `read_npz`: New alias for `DataFrame.read_npz` 275 | 276 | 2022-01-09: Dataiter 0.28 277 | ========================= 278 | 279 | * `DataFrame`: Make object columns work in various operations 280 | * `DataFrame.from_json`: Add arguments `columns` and `dtypes` 281 | * `DataFrame.from_pandas`: Add argument `dtypes` 282 | * `DataFrame.full_join`: Speed up 283 | * `DataFrame.read_csv`: Add argument `dtypes` 284 | * `DataFrame.read_json`: Add arguments `columns` and `dtypes` 285 | * `GeoJSON.read`: Add arguments `columns` and `dtypes` 286 | * `ListOfDicts.fill_missing`: New method 287 | * `ListOfDicts.from_json`: Add arguments `keys` and `types` 288 | * `ListOfDicts.full_join`: Speed up 289 | * `ListOfDicts.read_csv`: Add argument `types`, rename `columns` to `keys` 290 | * `ListOfDicts.read_json`: Add arguments `keys` and `types` 291 | 292 | 2022-01-01: Dataiter 0.27 293 | ========================= 294 | 295 | * `DataFrame`: Fix error message when column not found 296 | * `DataFrame.aggregate`: Speed up 297 | * `DataFrame.full_join`: Fix to join all possible columns 298 | * `DataFrame.read_csv`: Try to avoid mixed types 299 | * `ListOfDicts.full_join`: Fix to join all possible keys 300 | * `ListOfDicts.write_csv`: Use minimal quoting 301 | * `Vector.get_memory_use`: New method 302 | * `Vector.rank`: Rewrite, add `method` argument 303 | * `*.read_*`: Rename `fname` argument `path` 304 | * `*.write_*`: Rename `fname` argument `path` 305 | * Add comparison table dplyr vs. Dataiter vs. 
Pandas to documentation: 306 | 307 | 308 | 2021-12-02: Dataiter 0.26 309 | ========================= 310 | 311 | * `DataFrame.read_npz`: New method to read NumPy npz format 312 | * `DataFrame.write_npz`: New method to write NumPy npz format 313 | * `*.read_*`: Decompress `.bz2|.gz|.xz` automatically 314 | * `*.write_*`: Compress `.bz2|.gz|.xz` automatically 315 | 316 | 2021-11-13: Dataiter 0.25 317 | ========================= 318 | 319 | * `DataFrame.print_missing_counts`: Fix when nothing missing 320 | * `Vector.replace_missing`: New method 321 | 322 | 2021-10-27: Dataiter 0.24 323 | ========================= 324 | 325 | * `DataFrame.print_memory_use`: New method 326 | * `ListOfDicts.write_csv`: Use less memory 327 | 328 | 2021-07-08: Dataiter 0.23 329 | ========================= 330 | 331 | * `Vector.is_*`: Change to be methods instead of properties 332 | * Drop deprecated use of `np.int` 333 | * Drop deprecated comparisons against NaN 334 | 335 | 2021-05-13: Dataiter 0.22 336 | ========================= 337 | 338 | * `ListOfDicts.map`: New method 339 | 340 | 2021-03-08: Dataiter 0.21 341 | ========================= 342 | 343 | * `DataFrame.read_csv`: Add `columns` argument 344 | * `ListOfDicts.read_csv`: Add `columns` argument 345 | 346 | 2021-03-06: Dataiter 0.20 347 | ========================= 348 | 349 | * `DataFrame.*_join`: Handle differing by names via tuple argument 350 | * `ListOfDicts.*_join`: Handle differing by names via tuple argument 351 | 352 | 2021-03-04: Dataiter 0.19 353 | ========================= 354 | 355 | * Use terminal window width as maximum print width 356 | * `Vector.__init__`: Handle NaN values in non-float vectors 357 | 358 | 2021-03-03: Dataiter 0.18 359 | ========================= 360 | 361 | * `Vector.__init__`: Accept generators/iterators 362 | * `Vector.map`: New method 363 | 364 | 2021-02-27: Dataiter 0.17 365 | ========================= 366 | 367 | * `DataFrame.print_missing_counts`: New method 368 | * `GeoJSON.read`: Handle properties differing between features 369 | * `ListOfDicts.print_missing_counts`: New method 370 | * `Vector.as_object`: New method 371 | 372 | 2020-10-03: Dataiter 0.16.1 373 | =========================== 374 | 375 | * `GeoJSON.read`: Use warnings, not errors for ignored excess feature keys 376 | 377 | 2020-09-26: Dataiter 0.16 378 | ========================= 379 | 380 | * `GeoJSON`: New class 381 | 382 | 2020-09-12: Dataiter 0.15 383 | ========================= 384 | 385 | * `ListOfDicts.sort`: Handle descending sort for all types 386 | 387 | 2020-08-22: Dataiter 0.14 388 | ========================= 389 | 390 | * `ListOfDicts`: Make obsoletion a warning instead of an error 391 | 392 | 2020-08-15: Dataiter 0.13 393 | ========================= 394 | 395 | * `DataFrame`: Fix error printing blank strings (#8) 396 | 397 | 2020-07-25: Dataiter 0.12 398 | ========================= 399 | 400 | * `DataFrame.filter`: Add `colname_value_pairs` argument 401 | * `DataFrame.filter_out`: Add `colname_value_pairs` argument 402 | * `ListOfDicts.__init__`: Remove arguments not intended for external use 403 | * `ListOfDicts.rename`: Preserve order of keys 404 | * Add documentation: https://dataiter.readthedocs.io/ 405 | 406 | 2020-06-02: Dataiter 0.11 407 | ========================= 408 | 409 | * `Vector.__init__`: Speed up by fixing type deduction 410 | 411 | 2020-05-28: Dataiter 0.10.1 412 | =========================== 413 | 414 | * `ListOfDicts.select`: Fix return value (#7) 415 | 416 | 2020-05-21: Dataiter 0.10 417 | ========================= 
418 | 419 | * `DataFrame.aggregate`: Fix `UnicodeEncodeError` with string columns 420 | * `DataFrame.unique`: Fix `UnicodeEncodeError` with string columns 421 | * `ListOfDicts.select`: Return keys in requested order 422 | * `Vector.__repr__`: Add custom conversion to string for display 423 | * `Vector.__str__`: Add custom conversion to string for display 424 | * `Vector.to_string`: Add custom conversion to string for display 425 | * `Vector.to_strings`: Add custom conversion to string for display 426 | 427 | 2020-05-11: Dataiter 0.9 428 | ======================== 429 | 430 | * `Array`: Rename to `Vector` 431 | * `Vector.head`: New method 432 | * `Vector.range`: New method 433 | * `Vector.sample`: New method 434 | * `Vector.sort`: New method 435 | * `Vector.tail`: New method 436 | * `Vector.unique`: New method 437 | 438 | 2020-05-10: Dataiter 0.8 439 | ======================== 440 | 441 | * `DataFrame`: New class 442 | * `ListOfDicts.__add__`: New method to support the `+` operator 443 | * `ListOfDicts.__init__`: Rename, reorder arguments 444 | * `ListOfDicts.__mul__`: New method to support the `*` operator 445 | * `ListOfDicts.__repr__`: New method, format as JSON 446 | * `ListOfDicts.__rmul__`: New method to support the `*` operator 447 | * `ListOfDicts.__setitem__`: New method, coerce to `AttributeDict` 448 | * `ListOfDicts.__str__`: New method, format as JSON 449 | * `ListOfDicts.aggregate`: Speed up 450 | * `ListOfDicts.anti_join`: New method 451 | * `ListOfDicts.append`: New method 452 | * `ListOfDicts.clear`: New method 453 | * `ListOfDicts.extend`: New method 454 | * `ListOfDicts.full_join`: New method 455 | * `ListOfDicts.head`: New method 456 | * `ListOfDicts.inner_join`: New method 457 | * `ListOfDicts.insert`: New method 458 | * `ListOfDicts.join`: Removed in favor of specific join types 459 | * `ListOfDicts.left_join`: New method 460 | * `ListOfDicts.pluck`: Add argument "default" to handle missing keys 461 | * `ListOfDicts.print_`: New method 462 | * `ListOfDicts.read_csv`: Add explicit arguments 463 | * `ListOfDicts.read_json`: Relay arguments to `json.loads` 464 | * `ListOfDicts.read_pickle`: New method 465 | * `ListOfDicts.reverse`: New method 466 | * `ListOfDicts.sample`: New method 467 | * `ListOfDicts.semi_join`: New method 468 | * `ListOfDicts.sort`: Change arguments to support sort direction better 469 | * `ListOfDicts.tail`: New method 470 | * `ListOfDicts.to_data_frame`: New method 471 | * `ListOfDicts.to_pandas`: New method 472 | * `ListOfDicts.unique`: Return unique by all keys if none given 473 | * `ListOfDicts.write_csv`: Add explicit arguments 474 | * `ListOfDicts.write_pickle`: New method 475 | 476 | 2019-12-03: Dataiter 0.7 477 | ======================== 478 | 479 | * Make `sort` handle `None` values, sorted last 480 | 481 | 2019-11-29: Dataiter 0.6 482 | ======================== 483 | 484 | * Fix `ObsoleteError` after multiple modifying actions 485 | 486 | 2019-11-10: Dataiter 0.5 487 | ======================== 488 | 489 | * Add `read_csv` 490 | * Add `read_json` 491 | * Add `write_csv` 492 | * Add `write_json` 493 | 494 | 2019-11-01: Dataiter 0.4 495 | ======================== 496 | 497 | * Fix `ObsoleteError` with `deepcopy` 498 | * Define `__deepcopy__` so that `copy.deepcopy` works too 499 | * Add `copy` (and `__copy__` for `copy.copy`) 500 | 501 | 2019-11-01: Dataiter 0.3 502 | ======================== 503 | 504 | * Mark `ListOfDicts` object obsolete thus preventing (accidental) use if 505 | a chained successor has modified the shared dicts 506 | * Add 
`modify_if` 507 | 508 | 2019-10-31: Dataiter 0.2 509 | ======================== 510 | 511 | * Speed up, mostly by avoiding copying (methods that modify dicts now do 512 | it in place rather than making a copy) 513 | 514 | 2019-09-29: Dataiter 0.1 515 | ======================== 516 | 517 | * Initial release 518 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple, Light-Weight Data Frames for Python 2 | =========================================== 3 | 4 | [![PyPI](https://img.shields.io/pypi/v/dataiter.svg)](https://pypi.org/project/dataiter) 5 | [![Downloads](https://pepy.tech/badge/dataiter/month)](https://pepy.tech/project/dataiter) 6 | 7 | Dataiter's **`DataFrame`** is a class for tabular data similar to R's 8 | `data.frame`, implementing all common operations to manipulate data. It 9 | is under the hood a dictionary of NumPy arrays and thus capable of fast 10 | vectorized operations. You can consider it to be a light-weight 11 | alternative to Pandas with a simple and consistent API. Performance-wise 12 | Dataiter relies on NumPy and Numba and is likely to be at best 13 | comparable to Pandas. 14 | 15 | ## Installation 16 | 17 | ```bash 18 | # Latest stable version 19 | pip install -U dataiter 20 | 21 | # Latest development version 22 | pip install -U git+https://github.com/otsaloma/dataiter 23 | 24 | # Numba (optional) 25 | pip install -U numba 26 | ``` 27 | 28 | Dataiter optionally uses **Numba** to speed up certain operations. If 29 | you have Numba installed, Dataiter will use it automatically. It's 30 | currently not a hard dependency, so you need to install it separately. 31 | 32 | ## Quick Start 33 | 34 | ```python 35 | >>> import dataiter as di 36 | >>> data = di.read_csv("data/listings.csv") 37 | >>> data.filter(hood="Manhattan", guests=2).sort(price=1).head() 38 | . 39 | id hood zipcode guests sqft price 40 | int64 string string int64 float64 int64 41 | ──────── ───────── ─────── ────── ─────── ───── 42 | 0 42279170 Manhattan 10013 2 nan 0 43 | 1 42384530 Manhattan 10036 2 nan 0 44 | 2 18835820 Manhattan 10021 2 nan 10 45 | 3 20171179 Manhattan 10027 2 nan 10 46 | 4 14858544 Manhattan 2 nan 15 47 | 5 31397084 Manhattan 10002 2 nan 19 48 | 6 22289683 Manhattan 10031 2 nan 20 49 | 7 7760204 Manhattan 10040 2 nan 22 50 | 8 43292527 Manhattan 10033 2 nan 22 51 | 9 43268040 Manhattan 10033 2 nan 23 52 | . 53 | ``` 54 | 55 | ## Documentation 56 | 57 | https://dataiter.readthedocs.io/ 58 | 59 | If you're familiar with either dplyr (R) or Pandas (Python), the 60 | comparison table in the documentation will give you a quick overview of 61 | the differences and similarities in common operations. 
62 | 63 | https://dataiter.readthedocs.io/en/stable/comparison.html 64 | 65 | ## Development 66 | 67 | To install a virtualenv for development, use 68 | 69 | make venv 70 | 71 | or, for a specific Python version 72 | 73 | make PYTHON=python3.X venv 74 | -------------------------------------------------------------------------------- /benchmark-versions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SCRIPT=benchmark-head.py 4 | SCRIPT_ARGS="$@" 5 | OUT_FILE=benchmark-versions.csv 6 | TMP_FILE=tmp.csv 7 | 8 | benchmark() { 9 | VERSION=$1 10 | printf "\n$VERSION:\n" 11 | git checkout -q $VERSION 12 | ./$SCRIPT -o $TMP_FILE --version=$VERSION $SCRIPT_ARGS || true 13 | tail -n+2 $TMP_FILE >> $OUT_FILE 14 | sed -i 's/"//g' $OUT_FILE 15 | } 16 | 17 | set -e 18 | rm -f $OUT_FILE 19 | echo "name,version,elapsed" > $OUT_FILE 20 | cp -fv benchmark.py $SCRIPT 21 | benchmark 1.0 22 | benchmark master 23 | rm -f $SCRIPT $TMP_FILE 24 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | import functools 6 | import numpy as np 7 | import random 8 | import time 9 | 10 | from dataiter import test 11 | from statistics import mean 12 | from unittest.mock import patch 13 | 14 | @functools.cache 15 | def _data_frame(path, nrow): 16 | data = test.data_frame(path) 17 | n = nrow // data.nrow 18 | data = data.rbind(*([data] * n)) 19 | return data.head(nrow) 20 | 21 | def data_frame(path, nrow=1_000_000): 22 | return _data_frame(path, nrow).deepcopy() 23 | 24 | @functools.cache 25 | def _data_frame_random(nrows, ngroups): 26 | return di.DataFrame(g=np.random.choice(ngroups, nrows, replace=True), 27 | a=np.random.normal(10, 2, nrows)) 28 | 29 | def data_frame_random(nrows, ngroups): 30 | return _data_frame_random(nrows, ngroups).deepcopy() 31 | 32 | def data_frame_aggregate_128(): 33 | data = data_frame("vehicles.csv") 34 | start = time.time() 35 | (data 36 | .group_by("make") 37 | .aggregate( 38 | n=di.count(), 39 | hwy=di.mean("hwy"), 40 | cty=di.mean("cty"))) 41 | return time.time() - start 42 | 43 | def data_frame_aggregate_3264(): 44 | data = data_frame("vehicles.csv") 45 | start = time.time() 46 | (data 47 | .group_by("make", "model") 48 | .aggregate( 49 | n=di.count(), 50 | hwy=di.mean("hwy"), 51 | cty=di.mean("cty"))) 52 | return time.time() - start 53 | 54 | def data_frame_aggregate_14668(): 55 | data = data_frame("vehicles.csv") 56 | start = time.time() 57 | (data 58 | .group_by("make", "model", "year") 59 | .aggregate( 60 | n=di.count(), 61 | hwy=di.mean("hwy"), 62 | cty=di.mean("cty"))) 63 | return time.time() - start 64 | 65 | def data_frame_aggregate_100000_lambda(): 66 | data = data_frame_random(1_000_000, 100_000) 67 | start = time.time() 68 | (data 69 | .group_by("g") 70 | .aggregate( 71 | a_mean=lambda x: np.mean(x.a), 72 | a_std=lambda x: np.std(x.a))) 73 | return time.time() - start 74 | 75 | def data_frame_aggregate_100000_short(): 76 | with patch("dataiter.USE_NUMBA", False): 77 | data = data_frame_random(1_000_000, 100_000) 78 | start = time.time() 79 | (data 80 | .group_by("g") 81 | .aggregate( 82 | a_mean=di.mean("a"), 83 | a_std=di.std("a"))) 84 | return time.time() - start 85 | 86 | def data_frame_aggregate_100000_short_numba(): 87 | with patch("dataiter.USE_NUMBA", True): 88 | data = data_frame_random(1_000_000, 100_000) 89 | start 
= time.time() 90 | (data 91 | .group_by("g") 92 | .aggregate( 93 | a_mean=di.mean("a"), 94 | a_std=di.std("a"))) 95 | return time.time() - start 96 | 97 | def data_frame_full_join(): 98 | data = data_frame("vehicles.csv") 99 | meta = data.select("make", "model").unique() 100 | meta = meta.rbind(meta.modify(model="X")) 101 | meta.random = np.random.random(meta.nrow) 102 | assert meta.anti_join(data, "make", "model").nrow > 0 103 | start = time.time() 104 | data.full_join(meta, "make", "model") 105 | return time.time() - start 106 | 107 | def data_frame_left_join(): 108 | data = data_frame("vehicles.csv") 109 | meta = data.select("make", "model").unique() 110 | meta.random = np.random.random(meta.nrow) 111 | start = time.time() 112 | data.left_join(meta, "make", "model") 113 | return time.time() - start 114 | 115 | def data_frame_read_csv(): 116 | start = time.time() 117 | test.data_frame("vehicles.csv") 118 | return time.time() - start 119 | 120 | def data_frame_read_json(): 121 | start = time.time() 122 | test.data_frame("vehicles.json") 123 | return time.time() - start 124 | 125 | def data_frame_rbind_2(): 126 | # 2 * 500,000 = 1,000,000 127 | data = data_frame("vehicles.csv", 500_000) 128 | start = time.time() 129 | data.rbind(data) 130 | return time.time() - start 131 | 132 | def data_frame_rbind_100(): 133 | # 100 * 10,000 = 1,000,000 134 | data = data_frame("vehicles.csv", 10_000) 135 | start = time.time() 136 | data.rbind(*([data] * (100 - 1))) 137 | return time.time() - start 138 | 139 | def data_frame_rbind_100000(): 140 | # 100,000 * 10 = 1,000,000 141 | data = data_frame("vehicles.csv", 10) 142 | start = time.time() 143 | data.rbind(*([data] * (100_000 - 1))) 144 | return time.time() - start 145 | 146 | def data_frame_sort(): 147 | data = data_frame("vehicles.csv") 148 | start = time.time() 149 | data.sort(make=1, model=1, year=1) 150 | return time.time() - start 151 | 152 | def data_frame_unique(): 153 | data = data_frame("vehicles.csv") 154 | start = time.time() 155 | data.unique("make", "model", "year") 156 | return time.time() - start 157 | 158 | @functools.cache 159 | def _list_of_dicts(path, length): 160 | data = test.list_of_dicts(path) 161 | n = length // len(data) + 1 162 | data = data * n 163 | return data.head(length) 164 | 165 | def list_of_dicts(path, length=100_000): 166 | return _list_of_dicts(path, length).deepcopy() 167 | 168 | def list_of_dicts_aggregate_128(): 169 | data = list_of_dicts("vehicles.json") 170 | start = time.time() 171 | (data 172 | .group_by("make") 173 | .aggregate( 174 | n=len, 175 | hwy=lambda x: mean(x.pluck("hwy")), 176 | cty=lambda x: mean(x.pluck("cty")))) 177 | return time.time() - start 178 | 179 | def list_of_dicts_aggregate_3264(): 180 | data = list_of_dicts("vehicles.json") 181 | start = time.time() 182 | (data 183 | .group_by("make", "model") 184 | .aggregate( 185 | n=len, 186 | hwy=lambda x: mean(x.pluck("hwy")), 187 | cty=lambda x: mean(x.pluck("cty")))) 188 | return time.time() - start 189 | 190 | def list_of_dicts_aggregate_14668(): 191 | data = list_of_dicts("vehicles.json") 192 | start = time.time() 193 | (data 194 | .group_by("make", "model", "year") 195 | .aggregate( 196 | n=len, 197 | hwy=lambda x: mean(x.pluck("hwy")), 198 | cty=lambda x: mean(x.pluck("cty")))) 199 | return time.time() - start 200 | 201 | def list_of_dicts_full_join(): 202 | data = list_of_dicts("vehicles.json") 203 | meta = data.deepcopy().select("make", "model").unique() 204 | meta = meta + meta.deepcopy().modify(model=lambda x: "X") 205 | meta = 
meta.modify(random=lambda x: random.random()) 206 | assert len(meta.anti_join(data, "make", "model")) > 0 207 | start = time.time() 208 | data.full_join(meta, "make", "model") 209 | return time.time() - start 210 | 211 | def list_of_dicts_left_join(): 212 | data = list_of_dicts("vehicles.json") 213 | meta = data.deepcopy().select("make", "model").unique() 214 | meta = meta.deepcopy().modify(random=lambda x: random.random()) 215 | start = time.time() 216 | data.left_join(meta, "make", "model") 217 | return time.time() - start 218 | 219 | def list_of_dicts_read_csv(): 220 | start = time.time() 221 | test.list_of_dicts("vehicles.csv") 222 | return time.time() - start 223 | 224 | def list_of_dicts_read_json(): 225 | start = time.time() 226 | test.list_of_dicts("vehicles.json") 227 | return time.time() - start 228 | 229 | def list_of_dicts_sort(): 230 | data = list_of_dicts("vehicles.csv") 231 | start = time.time() 232 | data.sort(make=1, model=1, year=1) 233 | return time.time() - start 234 | 235 | def vector_fast_list(): 236 | seq = list(range(1_000_000)) 237 | start = time.time() 238 | di.Vector.fast(seq, int) 239 | return time.time() - start 240 | 241 | def vector_fast_np_array(): 242 | seq = list(range(1_000_000)) 243 | seq = np.array(seq) 244 | start = time.time() 245 | di.Vector.fast(seq, int) 246 | return time.time() - start 247 | 248 | def vector_new_list(): 249 | seq = list(range(1_000_000)) 250 | start = time.time() 251 | di.Vector(seq) 252 | return time.time() - start 253 | 254 | def vector_new_np_array(): 255 | seq = list(range(1_000_000)) 256 | seq = np.array(seq) 257 | start = time.time() 258 | di.Vector(seq) 259 | return time.time() - start 260 | 261 | def vector_rank_max(): 262 | data = data_frame("vehicles.csv") 263 | start = time.time() 264 | data.model.rank(method="max") 265 | return time.time() - start 266 | 267 | def vector_rank_min(): 268 | data = data_frame("vehicles.csv") 269 | start = time.time() 270 | data.model.rank(method="min") 271 | return time.time() - start 272 | 273 | def vector_rank_ordinal(): 274 | data = data_frame("vehicles.csv") 275 | start = time.time() 276 | data.model.rank(method="ordinal") 277 | return time.time() - start 278 | 279 | def vector_sort(): 280 | data = data_frame("vehicles.csv") 281 | start = time.time() 282 | data.model.sort() 283 | return time.time() - start 284 | 285 | def vector_unique(): 286 | data = data_frame("vehicles.csv") 287 | start = time.time() 288 | data.model.unique() 289 | return time.time() - start 290 | 291 | def is_benchmark(name): 292 | prefixes = ("data_frame_", "list_of_dicts_", "vector_") 293 | return name.startswith(prefixes) and name != "data_frame_random" 294 | 295 | BENCHMARKS = sorted(filter(is_benchmark, dir()), key=lambda x: ( 296 | [x.zfill(9) if x.isdigit() else x for x in x.split("_")])) 297 | 298 | def run_benchmarks(benchmarks, output, rounds): 299 | width = max(map(len, benchmarks)) + 2 300 | for i, benchmark in enumerate(benchmarks): 301 | print(f"{i+1:2d}/{len(benchmarks)}. 
", end="", flush=True) 302 | print(f"{benchmark+' ':.<{width}} ", end="", flush=True) 303 | try: 304 | f = globals()[benchmark] 305 | elapsed = 1000 * min(f() for i in range(rounds)) 306 | print("{:5.0f} ms".format(elapsed), flush=True) 307 | except Exception as error: 308 | elapsed = -1 309 | print(error.__class__.__name__) 310 | if not output: raise 311 | yield {"name": benchmark, "elapsed": round(elapsed)} 312 | 313 | @click.command() 314 | @click.option("-o", "--output", help="Filename for optional CSV output") 315 | @click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark") 316 | @click.option("--version", default=di.__version__, help="Version number for CSV output") 317 | @click.argument("pattern", nargs=-1) 318 | def main(output, rounds, version, pattern): 319 | pattern = pattern or "_" 320 | f = lambda x: any(y in x for y in pattern) 321 | benchmarks = list(filter(f, BENCHMARKS)) 322 | results = di.ListOfDicts(run_benchmarks(benchmarks, output, rounds)) 323 | results = results.modify(version=lambda x: version) 324 | if output: 325 | assert output.endswith(".csv") 326 | print(f"Writing {output}...") 327 | results.write_csv(output) 328 | 329 | if __name__ == "__main__": 330 | main() 331 | -------------------------------------------------------------------------------- /bin/di-csv2json: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert CSV file to JSON file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".json") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_csv(input) 21 | data.write_json(output) 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /bin/di-format-geojson: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | import shutil 6 | import time 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-i", "--indent", default=2, help="Indent level") 10 | @click.option("-p", "--precision", default=9, help="Coordinate precision") 11 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 12 | def main(indent, precision, file): 13 | """Rewrite GeoJSON file with proper formatting.""" 14 | for path in file: 15 | click.echo(path) 16 | data = di.read_geojson(path) 17 | for i in range(data.nrow): 18 | coords = data.geometry[i].coordinates 19 | data.geometry[i].coordinates = round_recursive(coords, precision) 20 | backup = path + ".bak" + str(int(time.time())) 21 | shutil.copyfile(path, backup) 22 | data.write(path, indent=indent) 23 | 24 | def round_recursive(value, precision): 25 | if isinstance(value, list): 26 | return [round_recursive(x, precision) for x in value] 27 | return round(value, precision) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /bin/di-geojson2csv: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert GeoJSON file to CSV file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".csv") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_geojson(input) 21 | data = data.unselect("geometry") 22 | data.write_csv(output) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /bin/di-json2csv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert JSON file to CSV file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".csv") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_json(input) 21 | data.write_csv(output) 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /bin/di-open: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | uv run --no-project --with "dataiter==1.0,numba,pytz" python3.12 -i -c " 3 | import dataiter as di 4 | import os 5 | import sys 6 | from pathlib import Path 7 | path = Path('$1') 8 | assert path.exists() 9 | os.chdir(path.parent) 10 | path = path.relative_to(path.parent) 11 | suffix = path.suffix.lstrip('.') 12 | read = getattr(di, f'read_{suffix}') 13 | print(f'Reading {str(path)} into data...') 14 | data = read(path) 15 | del os 16 | del Path 17 | del path 18 | del read 19 | del suffix 20 | del sys 21 | " 22 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Test Datasets 2 | ============= 3 | 4 | | Data | Source | 5 | | :--- | :----- | 6 | | downloads | https://pypistats.org/api/packages/urllib3/system | 7 | | listings | http://insideairbnb.com/get-the-data.html | 8 | | neighbourhoods | http://insideairbnb.com/get-the-data.html | 9 | | vehicles | https://github.com/hadley/fueleconomy | 10 | -------------------------------------------------------------------------------- /data/holidays.csv: -------------------------------------------------------------------------------- 1 | date,holiday 2 | 1000-01-01,New Year's Day 3 | 2019-01-01,New Year's Day 4 | 2019-01-06,Epiphany 5 | 2019-04-19,Good Friday 6 | 2019-04-21,Easter Sunday 7 | 2019-04-22,Easter Monday 8 | 2019-05-01,May Day 9 | 2019-05-30,Ascension Day 10 | 2019-06-09,Whit Sunday 11 | 2019-06-21,Midsummer's Eve 12 | 2019-06-22,Midsummer Day 
13 | 2019-11-02,All Saints' Day 14 | 2019-12-06,Independence Day 15 | 2019-12-24,Christmas Eve 16 | 2019-12-25,Christmas Day 17 | 2019-12-26,2nd Day of Christmas 18 | 2020-01-01,New Year's Day 19 | 2020-01-06,Epiphany 20 | 2020-04-10,Good Friday 21 | 2020-04-12,Easter Sunday 22 | 2020-04-13,Easter Monday 23 | 2020-05-01,May Day 24 | 2020-05-21,Ascension Day 25 | 2020-05-31,Whit Sunday 26 | 2020-06-19,Midsummer's Eve 27 | 2020-06-20,Midsummer Day 28 | 2020-10-31,All Saints' Day 29 | 2020-12-06,Independence Day 30 | 2020-12-24,Christmas Eve 31 | 2020-12-25,Christmas Day 32 | 2020-12-26,2nd Day of Christmas 33 | 3000-01-01,New Year's Day 34 | -------------------------------------------------------------------------------- /data/holidays.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "date": "1000-01-01", 4 | "holiday": "New Year's Day" 5 | }, 6 | { 7 | "date": "2019-01-01", 8 | "holiday": "New Year's Day" 9 | }, 10 | { 11 | "date": "2019-01-06", 12 | "holiday": "Epiphany" 13 | }, 14 | { 15 | "date": "2019-04-19", 16 | "holiday": "Good Friday" 17 | }, 18 | { 19 | "date": "2019-04-21", 20 | "holiday": "Easter Sunday" 21 | }, 22 | { 23 | "date": "2019-04-22", 24 | "holiday": "Easter Monday" 25 | }, 26 | { 27 | "date": "2019-05-01", 28 | "holiday": "May Day" 29 | }, 30 | { 31 | "date": "2019-05-30", 32 | "holiday": "Ascension Day" 33 | }, 34 | { 35 | "date": "2019-06-09", 36 | "holiday": "Whit Sunday" 37 | }, 38 | { 39 | "date": "2019-06-21", 40 | "holiday": "Midsummer's Eve" 41 | }, 42 | { 43 | "date": "2019-06-22", 44 | "holiday": "Midsummer Day" 45 | }, 46 | { 47 | "date": "2019-11-02", 48 | "holiday": "All Saints' Day" 49 | }, 50 | { 51 | "date": "2019-12-06", 52 | "holiday": "Independence Day" 53 | }, 54 | { 55 | "date": "2019-12-24", 56 | "holiday": "Christmas Eve" 57 | }, 58 | { 59 | "date": "2019-12-25", 60 | "holiday": "Christmas Day" 61 | }, 62 | { 63 | "date": "2019-12-26", 64 | "holiday": "2nd Day of Christmas" 65 | }, 66 | { 67 | "date": "2020-01-01", 68 | "holiday": "New Year's Day" 69 | }, 70 | { 71 | "date": "2020-01-06", 72 | "holiday": "Epiphany" 73 | }, 74 | { 75 | "date": "2020-04-10", 76 | "holiday": "Good Friday" 77 | }, 78 | { 79 | "date": "2020-04-12", 80 | "holiday": "Easter Sunday" 81 | }, 82 | { 83 | "date": "2020-04-13", 84 | "holiday": "Easter Monday" 85 | }, 86 | { 87 | "date": "2020-05-01", 88 | "holiday": "May Day" 89 | }, 90 | { 91 | "date": "2020-05-21", 92 | "holiday": "Ascension Day" 93 | }, 94 | { 95 | "date": "2020-05-31", 96 | "holiday": "Whit Sunday" 97 | }, 98 | { 99 | "date": "2020-06-19", 100 | "holiday": "Midsummer's Eve" 101 | }, 102 | { 103 | "date": "2020-06-20", 104 | "holiday": "Midsummer Day" 105 | }, 106 | { 107 | "date": "2020-10-31", 108 | "holiday": "All Saints' Day" 109 | }, 110 | { 111 | "date": "2020-12-06", 112 | "holiday": "Independence Day" 113 | }, 114 | { 115 | "date": "2020-12-24", 116 | "holiday": "Christmas Eve" 117 | }, 118 | { 119 | "date": "2020-12-25", 120 | "holiday": "Christmas Day" 121 | }, 122 | { 123 | "date": "2020-12-26", 124 | "holiday": "2nd Day of Christmas" 125 | }, 126 | { 127 | "date": "3000-01-01", 128 | "holiday": "New Year's Day" 129 | } 130 | ] 131 | -------------------------------------------------------------------------------- /data/listings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | 5 | # Same as for documentation. 
6 | di.PRINT_MAX_WIDTH = 72 7 | 8 | def parse_price(price): 9 | return int(float(price.lstrip("$").replace(",", ""))) 10 | 11 | data = ( 12 | di.read_csv("orig/listings.csv") 13 | .select("id", 14 | "neighbourhood_group_cleansed", 15 | "zipcode", 16 | "accommodates", 17 | "square_feet", 18 | "price") 19 | .rename(hood="neighbourhood_group_cleansed") 20 | .rename(guests="accommodates") 21 | .rename(sqft="square_feet") 22 | .modify(price=lambda x: x.price.map(parse_price)) 23 | ) 24 | 25 | print(data.head()) 26 | data.write_csv("listings.csv") 27 | data.write_json("listings.json") 28 | 29 | data = ( 30 | di.read_csv("orig/listings.csv") 31 | .select("id", 32 | "number_of_reviews", 33 | "review_scores_rating") 34 | .rename(reviews="number_of_reviews") 35 | .rename(rating="review_scores_rating") 36 | .filter(lambda x: x.reviews >= 10) 37 | ) 38 | 39 | print(data.head()) 40 | data.write_csv("listings-reviews.csv") 41 | data.write_json("listings-reviews.json") 42 | -------------------------------------------------------------------------------- /dataiter/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import contextlib 24 | import numpy as np 25 | 26 | from dataiter import util 27 | 28 | __version__ = "1.0" 29 | 30 | DEFAULT_PEEK_ELEMENTS = 10 31 | DEFAULT_PEEK_ITEMS = 3 32 | DEFAULT_PEEK_ROWS = 10 33 | PRINT_FLOAT_PRECISION = 6 34 | PRINT_MAX_ELEMENTS = 100 35 | PRINT_MAX_ITEMS = 10 36 | PRINT_MAX_ROWS = 100 37 | 38 | #: Maximum amount of columns to wrap print output to. This is only a fallback 39 | #: in case Python's ``shutil.get_terminal_size`` fails to detect the width of 40 | #: your terminal. By default the detected full width is used. 41 | PRINT_MAX_WIDTH = 80 42 | 43 | #: Thousand separator to use when printing numbers. By default this is blank, 44 | #: meaning no thousand separators are rendered. 45 | PRINT_THOUSAND_SEPARATOR = "" 46 | 47 | #: Maximum width to truncate string columns to in :class:`DataFrame` print 48 | #: output. When this is exceeded, strings will be cut and an ellipsis (``…``) 49 | #: rendered at the cut point. 50 | PRINT_TRUNCATE_WIDTH = 36 51 | 52 | #: ``True`` to use Numba, if available, to speed up :doc:`aggregations 53 | #: <aggregation>`, ``False`` to only use pure Python code.
54 | USE_NUMBA = False 55 | 56 | #: ``True`` to use Numba cache for JIT-compiled :doc:`aggregations 57 | #: `, ``False`` to only keep compiled code in memory for the 58 | #: duration of the session. 59 | USE_NUMBA_CACHE = True 60 | 61 | if not np.__version__.startswith("2."): 62 | raise Exception("NumPy 2.x required") 63 | 64 | with contextlib.suppress(LookupError): 65 | USE_NUMBA_CACHE = util.parse_env_boolean("DATAITER_USE_NUMBA_CACHE") 66 | 67 | try: 68 | # Force Numba on or off if environment variable defined. 69 | USE_NUMBA = util.parse_env_boolean("DATAITER_USE_NUMBA") 70 | except LookupError: 71 | with contextlib.suppress(Exception): 72 | # Use Numba automatically if found 73 | # and calling a trivial function works. 74 | import numba 75 | try: 76 | @numba.njit(cache=USE_NUMBA_CACHE) 77 | def check(x): 78 | return x**2 79 | assert check(10) == 100 80 | USE_NUMBA = True 81 | except Exception as error: 82 | print(f"Numba found, but disabled due to error: {error!s}") 83 | 84 | globals().pop("check", None) 85 | globals().pop("contextlib", None) 86 | globals().pop("np", None) 87 | globals().pop("numba", None) 88 | globals().pop("util", None) 89 | 90 | from dataiter import dtypes # noqa 91 | from dataiter.vector import Vector # noqa 92 | from dataiter.data_frame import DataFrame # noqa 93 | from dataiter.data_frame import DataFrameColumn # noqa 94 | from dataiter.geojson import GeoJSON # noqa 95 | from dataiter.list_of_dicts import ListOfDicts # noqa 96 | from dataiter import dt # noqa 97 | from dataiter import regex # noqa 98 | 99 | from dataiter.aggregate import all # noqa 100 | from dataiter.aggregate import any # noqa 101 | from dataiter.aggregate import count # noqa 102 | from dataiter.aggregate import count_unique # noqa 103 | from dataiter.aggregate import first # noqa 104 | from dataiter.aggregate import last # noqa 105 | from dataiter.aggregate import max # noqa 106 | from dataiter.aggregate import mean # noqa 107 | from dataiter.aggregate import median # noqa 108 | from dataiter.aggregate import min # noqa 109 | from dataiter.aggregate import mode # noqa 110 | from dataiter.aggregate import nth # noqa 111 | from dataiter.aggregate import quantile # noqa 112 | from dataiter.aggregate import std # noqa 113 | from dataiter.aggregate import sum # noqa 114 | from dataiter.aggregate import var # noqa 115 | 116 | from dataiter.io import read_csv # noqa 117 | from dataiter.io import read_geojson # noqa 118 | from dataiter.io import read_json # noqa 119 | from dataiter.io import read_npz # noqa 120 | from dataiter.io import read_parquet # noqa 121 | -------------------------------------------------------------------------------- /dataiter/deco.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2019 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import functools 24 | 25 | def listify(function): 26 | @functools.wraps(function) 27 | def wrapper(*args, **kwargs): 28 | value = function(*args, **kwargs) 29 | return list(value) 30 | return wrapper 31 | 32 | def new_from_generator(function): 33 | @functools.wraps(function) 34 | def wrapper(self, *args, **kwargs): 35 | value = function(self, *args, **kwargs) 36 | return self._new(value) 37 | return wrapper 38 | 39 | def obsoletes(function): 40 | @functools.wraps(function) 41 | def wrapper(self, *args, **kwargs): 42 | value = function(self, *args, **kwargs) 43 | self._mark_obsolete() 44 | return value 45 | return wrapper 46 | 47 | def tuplefy(function): 48 | @functools.wraps(function) 49 | def wrapper(*args, **kwargs): 50 | value = function(*args, **kwargs) 51 | return tuple(value) 52 | return wrapper 53 | -------------------------------------------------------------------------------- /dataiter/dt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import datetime 24 | import numpy as np 25 | 26 | from dataiter import dtypes 27 | from dataiter import util 28 | from dataiter import Vector 29 | from numpy.dtypes import StringDType 30 | 31 | def day(x): 32 | """ 33 | Extract day of the month from datetime `x`. 34 | 35 | >>> x = dt.new(["2022-10-15"]) 36 | >>> dt.day(x) 37 | """ 38 | return _pull_int(x, lambda y: y.day) 39 | 40 | def from_string(x, format): 41 | """ 42 | Initialize a datetime scalar or vector from `x`. 
43 | 44 | `format` uses Python ``strptime`` format codes: 45 | https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes 46 | 47 | >>> x = di.Vector(["15.10.2022"]) 48 | >>> dt.from_string(x, "%d.%m.%Y") 49 | """ 50 | if util.is_scalar(x): 51 | x = Vector([x], str) 52 | return from_string(x, format)[0] 53 | assert isinstance(x, np.ndarray) 54 | assert isinstance(x.dtype, StringDType) 55 | out = np.full_like(x, None, object) 56 | out = Vector.fast(out, object) 57 | na = x == dtypes.string.na_object 58 | f = np.vectorize(lambda x: datetime.datetime.strptime(x, format)) 59 | out[~na] = f(x[~na].astype(object)) 60 | out = out.as_datetime() 61 | if (len(out[~na]) > 0 and 62 | (hour(out[~na]) == 0).all() and 63 | (minute(out[~na]) == 0).all() and 64 | (second(out[~na]) == 0).all()): 65 | out = out.as_date() 66 | return out 67 | 68 | def hour(x): 69 | """ 70 | Extract hour from datetime `x`. 71 | 72 | >>> x = dt.new(["2022-10-15T12:34:56"]) 73 | >>> dt.hour(x) 74 | """ 75 | return _pull_int(x, lambda y: y.hour) 76 | 77 | def isoweek(x): 78 | """ 79 | Extract ISO 8601 week from datetime `x`. 80 | 81 | >>> x = dt.new(["2022-10-15"]) 82 | >>> dt.isoweek(x) 83 | """ 84 | return _pull_int(x, lambda y: y.isocalendar()[1]) 85 | 86 | def isoweekday(x): 87 | """ 88 | Extract day of the week from datetime `x`. 89 | 90 | Day of the week is an integer between 1 and 7, where 1 is Monday and 7 is 91 | Sunday. 92 | 93 | See also: :func:`weekday` 94 | 95 | >>> x = dt.new(["2022-10-15"]) 96 | >>> dt.isoweekday(x) 97 | """ 98 | return _pull_int(x, lambda y: y.isoweekday()) 99 | 100 | def microsecond(x): 101 | """ 102 | Extract microsecond from datetime `x`. 103 | 104 | >>> x = dt.new(["2022-10-15T12:34:56.789"]) 105 | >>> dt.microsecond(x) 106 | """ 107 | return _pull_int(x, lambda y: y.microsecond) 108 | 109 | def minute(x): 110 | """ 111 | Extract minute from datetime `x`. 112 | 113 | >>> x = dt.new(["2022-10-15T12:34:56"]) 114 | >>> dt.minute(x) 115 | """ 116 | return _pull_int(x, lambda y: y.minute) 117 | 118 | def month(x): 119 | """ 120 | Extract month from datetime `x`. 121 | 122 | >>> x = dt.new(["2022-10-15"]) 123 | >>> dt.month(x) 124 | """ 125 | return _pull_int(x, lambda y: y.month) 126 | 127 | def new(x): 128 | """ 129 | Initialize a datetime scalar or vector from `x`. 130 | 131 | >>> dt.new("2022-10-15") 132 | >>> dt.new("2022-10-15T12:00:00") 133 | >>> dt.new(["2022-10-15"]) 134 | >>> dt.new(["2022-10-15T12:00:00"]) 135 | """ 136 | if util.is_scalar(x): 137 | return np.datetime64(x) 138 | return Vector.fast(map(np.datetime64, x), np.datetime64) 139 | 140 | def now(): 141 | """ 142 | Return the current local datetime. 
143 | 144 | >>> dt.now() 145 | """ 146 | return np.datetime64(datetime.datetime.now()) 147 | 148 | def _pull_datetime(x, function): 149 | if util.is_scalar(x): 150 | x = Vector([x], np.datetime64) 151 | return _pull_datetime(x, function)[0] 152 | assert isinstance(x, np.ndarray) 153 | assert np.issubdtype(x.dtype, np.datetime64) 154 | out = np.full_like(x, np.nan) 155 | out = Vector.fast(out, np.datetime64) 156 | na = np.isnat(x) 157 | if na.all(): return out 158 | f = np.vectorize(function) 159 | out[~na] = f(x[~na].astype(object)) 160 | return out 161 | 162 | def _pull_int(x, function): 163 | if util.is_scalar(x): 164 | x = Vector([x], np.datetime64) 165 | return _pull_int(x, function)[0] 166 | assert isinstance(x, np.ndarray) 167 | assert np.issubdtype(x.dtype, np.datetime64) 168 | out = np.full_like(x, np.nan, float) 169 | out = Vector.fast(out, float) 170 | na = np.isnat(x) 171 | if na.all(): return out 172 | f = np.vectorize(function) 173 | out[~na] = f(x[~na].astype(object)) 174 | return out if na.any() else out.as_integer() 175 | 176 | def _pull_str(x, function): 177 | if util.is_scalar(x): 178 | x = Vector([x], np.datetime64) 179 | return _pull_str(x, function)[0] 180 | assert isinstance(x, np.ndarray) 181 | assert np.issubdtype(x.dtype, np.datetime64) 182 | out = np.full_like(x, dtypes.string.na_object, object) 183 | out = Vector.fast(out, object) 184 | na = np.isnat(x) 185 | if na.all(): return out 186 | f = np.vectorize(function) 187 | out[~na] = f(x[~na].astype(object)) 188 | return out.as_string() 189 | 190 | def quarter(x): 191 | """ 192 | Extract quarter from datetime `x`. 193 | 194 | >>> x = dt.new(["2022-10-15"]) 195 | >>> dt.quarter(x) 196 | """ 197 | y = np.ceil(month(x) / 3) 198 | return y if np.isnan(y).any() else y.astype(int) 199 | 200 | def replace(x, year=None, month=None, day=None, hour=None, minute=None, second=None, microsecond=None): 201 | """ 202 | Return datetime `x` with given components replaced. 203 | 204 | >>> x = dt.new(["2022-10-15"]) 205 | >>> dt.replace(x, month=1, day=1) 206 | """ 207 | kwargs = {k: v for k, v in locals().items() if k != "x" and v is not None} 208 | if all(map(util.is_scalar, kwargs.values())): 209 | return _pull_datetime(x, lambda y: y.replace(**kwargs)) 210 | for value in kwargs.values(): 211 | assert util.is_scalar(value) or len(value) == len(x) 212 | scalar_keys = [x for x in kwargs if util.is_scalar(kwargs[x])] 213 | vector_keys = [x for x in kwargs if x not in scalar_keys] 214 | # Like _pull_datetime, but no vectorized function. 215 | assert isinstance(x, np.ndarray) 216 | assert np.issubdtype(x.dtype, np.datetime64) 217 | out = np.full_like(x, np.nan) 218 | out = Vector.fast(out, np.datetime64) 219 | na = np.isnat(x) 220 | xobj = x.astype(object) 221 | kwargs_scalar = {x: kwargs[x] for x in scalar_keys} 222 | for i in np.flatnonzero(~na): 223 | for key in vector_keys: 224 | kwargs_scalar[key] = kwargs[key][i] 225 | out[i] = xobj[i].replace(**kwargs_scalar) 226 | return out 227 | 228 | def second(x): 229 | """ 230 | Extract second from datetime `x`. 231 | 232 | >>> x = dt.new(["2022-10-15T12:34:56"]) 233 | >>> dt.second(x) 234 | """ 235 | return _pull_int(x, lambda y: y.second) 236 | 237 | def to_string(x, format): 238 | """ 239 | Format datetime `x` as string. 
240 | 241 | `format` uses Python ``strftime`` format codes: 242 | https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes 243 | 244 | >>> x = dt.new(["2022-10-15"]) 245 | >>> dt.to_string(x, "%d.%m.%Y") 246 | """ 247 | return _pull_str(x, lambda x: x.strftime(format)) 248 | 249 | def today(): 250 | """ 251 | Return the current local date. 252 | 253 | >>> dt.today() 254 | """ 255 | return np.datetime64(datetime.date.today()) 256 | 257 | def weekday(x): 258 | """ 259 | Extract day of the week from datetime `x`. 260 | 261 | Day of the week is an integer between 0 and 6, where 0 is Monday and 6 is 262 | Sunday. 263 | 264 | See also: :func:`isoweekday` 265 | 266 | >>> x = dt.new(["2022-10-15"]) 267 | >>> dt.weekday(x) 268 | """ 269 | return _pull_int(x, lambda y: y.weekday()) 270 | 271 | def year(x): 272 | """ 273 | Extract year from datetime `x`. 274 | 275 | >>> x = dt.new(["2022-10-15"]) 276 | >>> dt.year(x) 277 | """ 278 | return _pull_int(x, lambda y: y.year) 279 | -------------------------------------------------------------------------------- /dataiter/dtypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2024 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import warnings 24 | 25 | from numpy.dtypes import StringDType 26 | 27 | #: Instance of NumPy variable-width StringDType used 28 | string = StringDType(na_object="") 29 | 30 | # Use a blank string as missing value sentinel (1) because that's what we used 31 | # prior to the NumPy 2.0 StringDType and (2) because in many cases, such as CSV 32 | # input, a distinction between NA and blank cannot usually be made. 33 | # TODO: Consider changing this to something like ':NA:'. 34 | # https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support 35 | 36 | # Ignore pointless warnings about using StringDType in numpy.savez. 
37 | _pattern = "Custom dtypes are saved as python objects using the pickle protocol" 38 | warnings.filterwarnings("ignore", message=_pattern, category=UserWarning) 39 | -------------------------------------------------------------------------------- /dataiter/geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import json 24 | 25 | from attd import AttributeDict 26 | from dataiter import DataFrame 27 | from dataiter import DataFrameColumn 28 | from dataiter import util 29 | from dataiter import Vector 30 | 31 | class GeoJSON(DataFrame): 32 | 33 | """ 34 | A class for GeoJSON data. 35 | 36 | GeoJSON is a simple wrapper class that reads GeoJSON features into a 37 | :class:`.DataFrame`. Any operations on the data are thus done with methods 38 | provided by the data frame class. Geometry is available in the "geometry" 39 | column, but no special geometric operations are supported. All other data 40 | is available in the "metadata" attribute as an ``attd.AttributeDict``. 41 | """ 42 | 43 | # List of names that are actual attributes, not columns 44 | ATTRIBUTES = DataFrame.ATTRIBUTES + ["metadata"] 45 | 46 | # Lists of supported GeoJSON keys and types 47 | FEATURE_KEYS = ["type", "properties", "geometry"] 48 | FEATURE_TYPES = ["Feature"] 49 | PROPERTY_TYPES = [bool, int, float, str, type(None)] 50 | TOP_LEVEL_TYPES = ["FeatureCollection"] 51 | 52 | def __init__(self, *args, **kwargs): 53 | """ 54 | Return a new GeoJSON object. 55 | 56 | `args` and `kwargs` are like for ``dict``. 
57 | 58 | https://docs.python.org/3/library/stdtypes.html#dict 59 | """ 60 | super().__init__(*args, **kwargs) 61 | self.metadata = AttributeDict(type="FeatureCollection") 62 | 63 | @classmethod 64 | def _check_raw_data(cls, data): 65 | if data.type not in cls.TOP_LEVEL_TYPES: 66 | raise TypeError(f"Top-level type {data.type!r} not supported") 67 | warned_feature_keys = [] 68 | for feature in data.features: 69 | cls._check_raw_feature(feature, warned_feature_keys) 70 | 71 | @classmethod 72 | def _check_raw_feature(cls, feature, warned_feature_keys): 73 | if feature.type not in cls.FEATURE_TYPES: 74 | raise TypeError(f"Feature type {feature.type!r} not supported") 75 | for key in set(feature) - set(cls.FEATURE_KEYS): 76 | if key in warned_feature_keys: continue 77 | print(f"Warning: Ignoring feature key {key!r}") 78 | warned_feature_keys.append(key) 79 | for key, value in feature.properties.items(): 80 | if isinstance(value, tuple(cls.PROPERTY_TYPES)): continue 81 | raise TypeError(f"Property type {type(value)} of {key!r} not supported") 82 | 83 | @classmethod 84 | def read(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs): 85 | """ 86 | Return data from GeoJSON file `path`. 87 | 88 | Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``. 89 | `columns` is an optional list of columns to limit to. `dtypes` is an 90 | optional dict mapping column names to NumPy datatypes. `kwargs` are 91 | passed to ``json.load``. 92 | """ 93 | with util.xopen(path, "rt", encoding=encoding) as f: 94 | raw = AttributeDict(json.load(f, **kwargs)) 95 | cls._check_raw_data(raw) 96 | data = {} 97 | for feature in raw.features: 98 | for key in feature.properties: 99 | data.setdefault(key, []) 100 | if columns: 101 | data = {k: v for k, v in data.items() if k in columns} 102 | for feature in raw.features: 103 | for key in data: 104 | value = feature.properties.get(key, None) 105 | data[key].append(value) 106 | data["geometry"] = [x.geometry for x in raw.features] 107 | for name, dtype in dtypes.items(): 108 | data[name] = DataFrameColumn(data[name], dtype) 109 | data = cls(**data) 110 | del raw.features 111 | data.metadata = raw 112 | return data 113 | 114 | def to_data_frame(self, drop_geometry=False): 115 | """ 116 | Return GeoJSON converted to a regular data frame. 117 | """ 118 | data = dict.copy(self) 119 | if drop_geometry: 120 | data.pop("geometry", None) 121 | return DataFrame(**data) 122 | 123 | def to_string(self, *, max_rows=None, max_width=None): 124 | if "geometry" in self.colnames: 125 | geometry = [f"<{x['type']}>" for x in self.geometry] 126 | self = self.modify(geometry=Vector.fast(geometry, object)) 127 | return DataFrame.to_string(self, max_rows=max_rows, max_width=max_width) 128 | 129 | def write(self, path, *, encoding="utf-8", **kwargs): 130 | """ 131 | Write data to GeoJSON file `path`. 132 | 133 | Will automatically compress if `path` ends in ``.bz2|.gz|.xz``. 134 | `kwargs` are passed to ``json.dumps``. 
135 | """ 136 | kwargs.setdefault("default", str) 137 | kwargs.setdefault("ensure_ascii", False) 138 | indent_width = kwargs.pop("indent", 2) or 0 139 | indent1 = " " * indent_width * 1 140 | indent2 = " " * indent_width * 2 141 | if "geometry" not in self: 142 | raise ValueError("Geometry missing") 143 | data = self.to_list_of_dicts() 144 | util.makedirs_for_file(path) 145 | with util.xopen(path, "wt", encoding=encoding) as f: 146 | f.write("{\n") 147 | for key, value in self.metadata.items(): 148 | blob = json.dumps(value, **kwargs) 149 | f.write(f'{indent1}"{key}": {blob},\n') 150 | f.write(f'{indent1}"features": [\n') 151 | for i, item in enumerate(data): 152 | geometry = item.pop("geometry") 153 | blob = {"type": "Feature", "properties": item, "geometry": geometry} 154 | blob = json.dumps(blob, **kwargs) 155 | comma = "," if i < len(data) - 1 else "" 156 | f.write(f"{indent2}{blob}{comma}\n") 157 | f.write(f"{indent1}]\n") 158 | f.write("}\n") 159 | -------------------------------------------------------------------------------- /dataiter/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | from dataiter import DataFrame 24 | from dataiter import GeoJSON 25 | from dataiter import ListOfDicts 26 | from dataiter import util 27 | 28 | def read_csv(path, *, encoding="utf-8", sep=",", header=True, columns=[], dtypes={}): 29 | return DataFrame.read_csv(path, 30 | encoding=encoding, 31 | sep=sep, 32 | header=header, 33 | columns=columns, 34 | dtypes=dtypes) 35 | 36 | def read_geojson(path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs): 37 | return GeoJSON.read(path, 38 | encoding=encoding, 39 | columns=columns, 40 | dtypes=dtypes, 41 | **kwargs) 42 | 43 | def read_json(path, *, encoding="utf-8", keys=[], types={}, **kwargs): 44 | return ListOfDicts.read_json(path, 45 | encoding=encoding, 46 | keys=keys, 47 | types=types, 48 | **kwargs) 49 | 50 | def read_npz(path, *, allow_pickle=True): 51 | return DataFrame.read_npz(path, allow_pickle=allow_pickle) 52 | 53 | def read_parquet(path, *, columns=[], dtypes={}): 54 | return DataFrame.read_parquet(path, columns=columns, dtypes=dtypes) 55 | 56 | read_csv.__doc__ = util.format_alias_doc(read_csv, DataFrame.read_csv) 57 | read_geojson.__doc__ = util.format_alias_doc(read_geojson, GeoJSON.read) 58 | read_json.__doc__ = util.format_alias_doc(read_json, ListOfDicts.read_json) 59 | read_npz.__doc__ = util.format_alias_doc(read_npz, DataFrame.read_npz) 60 | read_parquet.__doc__ = util.format_alias_doc(read_parquet, DataFrame.read_parquet) 61 | -------------------------------------------------------------------------------- /dataiter/regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2025 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import numpy as np 24 | import re 25 | 26 | from dataiter import dtypes 27 | from dataiter import util 28 | from dataiter import Vector 29 | from numpy.dtypes import StringDType 30 | 31 | def _prep(string, dtype, default): 32 | assert isinstance(string, np.ndarray) 33 | assert isinstance(string.dtype, StringDType) 34 | out = np.full_like(string, default, dtype) 35 | na = string == dtypes.string.na_object 36 | return out, na 37 | 38 | def findall(pattern, string, flags=0): 39 | """ 40 | Return a list of matches of `pattern` in `string`.
41 | 42 | https://docs.python.org/3/library/re.html#re.findall 43 | 44 | >>> x = di.Vector(["asdf", "1234"]) 45 | >>> regex.findall(r"[a-z]", x) 46 | """ 47 | if util.is_scalar(string): 48 | return re.findall(pattern, string, flags=flags) 49 | out, na = _prep(string, object, None) 50 | for i in np.flatnonzero(~na): 51 | out[i] = re.findall(pattern, string[i], flags=flags) 52 | return Vector.fast(out, object) 53 | 54 | def fullmatch(pattern, string, flags=0): 55 | """ 56 | Return a ``re.Match`` object or ``None``. 57 | 58 | https://docs.python.org/3/library/re.html#re.fullmatch 59 | 60 | >>> x = di.Vector(["asdf", "1234"]) 61 | >>> regex.fullmatch(r"[a-z]+", x) 62 | """ 63 | if util.is_scalar(string): 64 | return re.fullmatch(pattern, string, flags=flags) 65 | out, na = _prep(string, object, None) 66 | for i in np.flatnonzero(~na): 67 | out[i] = re.fullmatch(pattern, string[i], flags=flags) 68 | return Vector.fast(out, object) 69 | 70 | def match(pattern, string, flags=0): 71 | """ 72 | Return a ``re.Match`` object or ``None``. 73 | 74 | https://docs.python.org/3/library/re.html#re.match 75 | 76 | >>> x = di.Vector(["asdf", "1234"]) 77 | >>> regex.match(r"[a-z]", x) 78 | """ 79 | if util.is_scalar(string): 80 | return re.match(pattern, string, flags=flags) 81 | out, na = _prep(string, object, None) 82 | for i in np.flatnonzero(~na): 83 | out[i] = re.match(pattern, string[i], flags=flags) 84 | return Vector.fast(out, object) 85 | 86 | def search(pattern, string, flags=0): 87 | """ 88 | Return a ``re.Match`` object or ``None``. 89 | 90 | https://docs.python.org/3/library/re.html#re.search 91 | 92 | >>> x = di.Vector(["asdf", "1234"]) 93 | >>> regex.search(r"[a-z]", x) 94 | """ 95 | if util.is_scalar(string): 96 | return re.search(pattern, string, flags=flags) 97 | out, na = _prep(string, object, None) 98 | for i in np.flatnonzero(~na): 99 | out[i] = re.search(pattern, string[i], flags=flags) 100 | return Vector.fast(out, object) 101 | 102 | def split(pattern, string, maxsplit=0, flags=0): 103 | """ 104 | Return a list of `string` split by `pattern`. 105 | 106 | https://docs.python.org/3/library/re.html#re.split 107 | 108 | >>> x = di.Vector(["one two three", "four"]) 109 | >>> regex.split(r" +", x) 110 | """ 111 | if util.is_scalar(string): 112 | return re.split(pattern, string, maxsplit=maxsplit, flags=flags) 113 | out, na = _prep(string, object, None) 114 | for i in np.flatnonzero(~na): 115 | out[i] = re.split(pattern, string[i], maxsplit=maxsplit, flags=flags) 116 | return Vector.fast(out, object) 117 | 118 | def sub(pattern, repl, string, count=0, flags=0): 119 | """ 120 | Return `string` with instances of `pattern` replaced with `repl`. 121 | 122 | https://docs.python.org/3/library/re.html#re.sub 123 | 124 | >>> x = di.Vector(["great", "fantastic"]) 125 | >>> regex.sub(r"$", r"!", x) 126 | """ 127 | if util.is_scalar(string): 128 | return re.sub(pattern, repl, string, count=count, flags=flags) 129 | out, na = _prep(string, dtypes.string, dtypes.string.na_object) 130 | for i in np.flatnonzero(~na): 131 | out[i] = re.sub(pattern, repl, string[i], count=count, flags=flags) 132 | return Vector.fast(out, str) 133 | 134 | def subn(pattern, repl, string, count=0, flags=0): 135 | """ 136 | Return `string`, count of instances of `pattern` replaced with `repl`. 
137 | 138 | https://docs.python.org/3/library/re.html#re.subn 139 | 140 | >>> x = di.Vector(["great", "fantastic"]) 141 | >>> regex.subn(r"$", r"!", x) 142 | """ 143 | if util.is_scalar(string): 144 | return re.subn(pattern, repl, string, count=count, flags=flags) 145 | out, na = _prep(string, object, None) 146 | for i in np.flatnonzero(~na): 147 | out[i] = re.subn(pattern, repl, string[i], count=count, flags=flags) 148 | return Vector.fast(out, object) 149 | -------------------------------------------------------------------------------- /dataiter/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
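# A short sketch of the vectorized regex helpers above: on a Vector, the
# blank string is the missing-value sentinel and the corresponding element
# stays missing in the output; on a scalar, the plain ``re`` result is
# returned. Inputs are illustrative.
#
# >>> import dataiter as di
# >>> from dataiter import regex
# >>> x = di.Vector(["great", "fantastic", ""])
# >>> regex.sub(r"$", r"!", x)         # "great!", "fantastic!", and a missing value
# >>> regex.findall(r"[a-z]", "asdf")  # scalar in, plain list out: ['a', 's', 'd', 'f']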
22 | 23 | import functools 24 | 25 | from dataiter import DataFrame 26 | from dataiter import GeoJSON 27 | from dataiter import ListOfDicts 28 | from pathlib import Path 29 | 30 | def cached(function): 31 | cache = {} 32 | @functools.wraps(function) 33 | def wrapper(path): 34 | if path not in cache: 35 | cache[path] = function(path) 36 | return cache[path].deepcopy() 37 | return wrapper 38 | 39 | @cached 40 | def data_frame(name): 41 | path = get_data_path(name) 42 | extension = path.suffix.lstrip(".") 43 | read = getattr(DataFrame, f"read_{extension}") 44 | return read(path) 45 | 46 | @cached 47 | def geojson(name): 48 | path = get_data_path(name) 49 | return GeoJSON.read(path) 50 | 51 | def get_data_path(name): 52 | for parent in Path(__file__).parents: 53 | path = parent / "data" / name 54 | if path.exists(): 55 | return path 56 | 57 | @cached 58 | def list_of_dicts(name): 59 | path = get_data_path(name) 60 | extension = path.suffix.lstrip(".") 61 | read = getattr(ListOfDicts, f"read_{extension}") 62 | return read(path) 63 | -------------------------------------------------------------------------------- /dataiter/test/test_aggregate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
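# A sketch of how the cached test-data loaders above are meant to be used by
# the test modules that follow. Each loader caches by file name and returns a
# deep copy, so mutating one fixture cannot leak into another test; the file
# and column names below are from the bundled listings data.
#
# >>> from dataiter import test
# >>> a = test.data_frame("listings.csv")
# >>> b = test.data_frame("listings.csv")
# >>> a.price[0] = 0   # only ``a`` changes; ``b`` still holds the original value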
22 | 23 | import dataiter 24 | import datetime 25 | import numpy as np 26 | import pytest 27 | 28 | from dataiter import DataFrame 29 | from dataiter import Vector 30 | from dataiter.aggregate import all 31 | from dataiter.aggregate import any 32 | from dataiter.aggregate import count 33 | from dataiter.aggregate import count_unique 34 | from dataiter.aggregate import first 35 | from dataiter.aggregate import last 36 | from dataiter.aggregate import max 37 | from dataiter.aggregate import mean 38 | from dataiter.aggregate import median 39 | from dataiter.aggregate import min 40 | from dataiter.aggregate import mode 41 | from dataiter.aggregate import nth 42 | from dataiter.aggregate import quantile 43 | from dataiter.aggregate import std 44 | from dataiter.aggregate import sum 45 | from dataiter.aggregate import var 46 | from unittest.mock import patch 47 | 48 | T = True 49 | F = False 50 | 51 | D1 = datetime.date.today() 52 | D2 = D1 + datetime.timedelta(days=1) 53 | D3 = D1 + datetime.timedelta(days=2) 54 | D4 = D1 + datetime.timedelta(days=3) 55 | D5 = D1 + datetime.timedelta(days=4) 56 | D6 = D1 + datetime.timedelta(days=5) 57 | D7 = D1 + datetime.timedelta(days=6) 58 | 59 | NaN = np.nan 60 | NaT = np.datetime64("NaT") 61 | 62 | EMPTY_VECTOR = Vector([], float) 63 | GROUPS = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5] 64 | 65 | nth0 = lambda x: nth(x, 0) 66 | quantile05 = lambda x: quantile(x, 0.5) 67 | 68 | TEST_MATRIX = [ 69 | 70 | # NaNs evaluate to true, because they are not equal to zero. 71 | # https://numpy.org/doc/stable/reference/generated/numpy.all.html 72 | (all, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 73 | (all, [1, 2, 3, 4, 5, 0, 0, 0, 0, 0], [T, T, F, F, F]), 74 | (all, [0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, NaN, NaN], [F, F, T, T, T]), 75 | (all, [D1, D2, D3, D4, D5, NaT, NaT, NaT, NaT, NaT], [T, T, T, T, T]), 76 | (all, ["a", "b", "c", "d", "e", "", "", "", "", ""], [T, T, T, T, T]), 77 | 78 | # NaNs evaluate to true, because they are not equal to zero. 79 | # https://numpy.org/doc/stable/reference/generated/numpy.any.html 80 | (any, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 81 | (any, [1, 2, 3, 4, 5, 0, 0, 0, 0, 0], [T, T, T, F, F]), 82 | (any, [0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, NaN, NaN], [F, T, T, T, T]), 83 | (any, [D1, D2, D3, D4, D5, NaT, NaT, NaT, NaT, NaT], [T, T, T, T, T]), 84 | (any, ["a", "b", "c", "d", "e", "", "", "", "", ""], [T, T, T, T, T]), 85 | 86 | (count, [T, T, T, T, T, F, F, F, F, F], [2, 2, 2, 2, 2]), 87 | (count, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 2, 2, 2, 2]), 88 | (count, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2, 2, 2, 2, 2]), 89 | (count, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [2, 2, 2, 2, 2]), 90 | (count, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], [2, 2, 2, 2, 2]), 91 | 92 | # NaN is not considered equal to itself and thus all are counted here. 
93 | (count_unique, [T, T, T, T, T, F, F, F, F, F], [1, 1, 2, 1, 1]), 94 | (count_unique, [1, 1, 3, 3, 5, 6, 7, 8, 9, 10], [1, 1, 2, 2, 2]), 95 | (count_unique, [1.0, 1.0, 3.0, 3.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1, 1, 2, 2, 2]), 96 | (count_unique, [D1, D1, D3, D3, D5, D6, D7, NaT, NaT, NaT], [1, 1, 2, 2, 2]), 97 | (count_unique, ["a", "a", "c", "c", "e", "f", "g", "", "", ""], [1, 1, 2, 2, 1]), 98 | 99 | (first, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 100 | (first, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 101 | (first, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 102 | (first, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 103 | (first, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 104 | 105 | (last, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 106 | (last, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10]), 107 | (last, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2.0, 4.0, 6.0, NaN, NaN]), 108 | (last, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D2, D4, D6, NaT, NaT]), 109 | (last, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["b", "d", "f", "", ""]), 110 | 111 | (max, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 112 | (max, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10]), 113 | (max, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2.0, 4.0, 6.0, 7.0, NaN]), 114 | (max, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D2, D4, D6, D7, NaT]), 115 | 116 | (mean, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 117 | (mean, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 118 | (mean, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 119 | 120 | (median, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 121 | (median, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 122 | (median, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 123 | 124 | (min, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 125 | (min, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 126 | (min, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 127 | (min, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 128 | 129 | (mode, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 130 | (mode, [1, 1, 3, 3, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 131 | (mode, [1.0, 1.0, 3.0, 3.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 132 | (mode, [D1, D1, D3, D3, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 133 | (mode, ["a", "a", "c", "c", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 134 | 135 | (nth0, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 136 | (nth0, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 137 | (nth0, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 138 | (nth0, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 139 | (nth0, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 140 | 141 | (quantile05, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 142 | (quantile05, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 143 | (quantile05, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 144 | 145 | (std, [T, T, T, T, T, F, F, F, F, F], [0.0, 0.0, 0.5, 0.0, 0.0]), 146 | (std, [1, 1, 2, 3, 5, 7, 8, 12, 13, 21], [0.0, 0.5, 1.0, 2.0, 4.0]), 147 | (std, [1.0, 1.0, 2.0, 3.0, 5.0, 
7.0, 8.0, NaN, NaN, NaN], [0.0, 0.5, 1.0, NaN, NaN]), 148 | 149 | (sum, [T, T, T, T, T, F, F, F, F, F], [2, 2, 1, 0, 0]), 150 | (sum, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [3, 7, 11, 15, 19]), 151 | (sum, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [3.0, 7.0, 11.0, 7.0, 0.0]), 152 | 153 | (var, [T, T, T, T, T, F, F, F, F, F], [0.0, 0.0, 0.25, 0.0, 0.0]), 154 | (var, [1, 1, 2, 3, 5, 7, 8, 12, 13, 21], [0.0, 0.25, 1.0, 4.0, 16.0]), 155 | (var, [1.0, 1.0, 2.0, 3.0, 5.0, 7.0, 8.0, NaN, NaN, NaN], [0.0, 0.25, 1.0, NaN, NaN]), 156 | 157 | ] 158 | 159 | class TestAggregate: 160 | 161 | @pytest.mark.parametrize("use_numba", [False, True]) 162 | @pytest.mark.parametrize("function,input,output", TEST_MATRIX) 163 | def test_aggregate(self, function, input, output, use_numba): 164 | if use_numba and not dataiter.USE_NUMBA: 165 | pytest.skip("No Numba") 166 | with patch("dataiter.USE_NUMBA", use_numba): 167 | data = DataFrame(g=GROUPS, a=input) 168 | stat = data.group_by("g").aggregate(a=function("a")) 169 | expected = Vector(output) 170 | try: 171 | assert stat.a.equal(expected) 172 | except AssertionError: 173 | print("") 174 | print(data) 175 | print("Expected:") 176 | print(expected) 177 | print("Got:") 178 | print(stat.a) 179 | raise 180 | 181 | @pytest.mark.parametrize("use_numba", [False, True]) 182 | def test_aggregate_count(self, use_numba): 183 | if use_numba and not dataiter.USE_NUMBA: 184 | pytest.skip("No Numba") 185 | with patch("dataiter.USE_NUMBA", use_numba): 186 | data = DataFrame(g=GROUPS) 187 | stat = data.group_by("g").aggregate(n=count()) 188 | assert (stat.n == 2).all() 189 | 190 | def test_all(self): 191 | assert all(EMPTY_VECTOR) 192 | assert all(Vector([T, T])) 193 | assert not all(Vector([T, F])) 194 | assert not all(Vector([F, F])) 195 | 196 | def test_any(self): 197 | assert not any(EMPTY_VECTOR) 198 | assert any(Vector([T, T])) 199 | assert any(Vector([T, F])) 200 | assert not any(Vector([F, F])) 201 | 202 | def test_count(self): 203 | assert count(EMPTY_VECTOR) == 0 204 | assert count(Vector([1])) == 1 205 | assert count(Vector([1, 2])) == 2 206 | assert count(Vector([1, 2, NaN])) == 3 207 | assert count(Vector([1, 2, NaN]), drop_na=True) == 2 208 | 209 | def test_count_unique(self): 210 | assert count_unique(EMPTY_VECTOR) == 0 211 | assert count_unique(Vector([1])) == 1 212 | assert count_unique(Vector([1, 1])) == 1 213 | assert count_unique(Vector([1, 1, 2])) == 2 214 | assert count_unique(Vector([1, 1, 2, NaN])) == 3 215 | assert count_unique(Vector([1, 1, 2, NaN]), drop_na=True) == 2 216 | 217 | def test_first(self): 218 | assert first(Vector([1, 2, 3])) == 1 219 | assert first(Vector([NaN, 1, 2]), drop_na=True) == 1 220 | 221 | def test_first_nan(self): 222 | assert np.isnan(first(EMPTY_VECTOR)) 223 | assert np.isnan(first(Vector([NaN, 1, 2]))) 224 | 225 | def test_last(self): 226 | assert last(Vector([1, 2, 3])) == 3 227 | assert last(Vector([1, 2, NaN]), drop_na=True) == 2 228 | 229 | def test_last_nan(self): 230 | assert np.isnan(last(EMPTY_VECTOR)) 231 | assert np.isnan(last(Vector([1, 2, NaN]))) 232 | 233 | def test_max(self): 234 | assert max(Vector([3, 2, 1])) == 3 235 | assert max(Vector([3, 2, NaN])) == 3 236 | 237 | def test_max_nan(self): 238 | assert np.isnan(max(EMPTY_VECTOR)) 239 | assert np.isnan(max(Vector([3, 2, NaN]), drop_na=False)) 240 | 241 | def test_mean(self): 242 | assert np.isclose(mean(Vector([1, 2, 10])), 4.333333) 243 | assert np.isclose(mean(Vector([1, 2, NaN])), 1.5) 244 | 245 | def test_mean_nan(self): 246 | assert 
np.isnan(mean(EMPTY_VECTOR)) 247 | assert np.isnan(mean(Vector([1, 2, NaN]), drop_na=False)) 248 | 249 | def test_median(self): 250 | assert median(Vector([1, 4, 6, 8, 5])) == 5 251 | assert median(Vector([1, 4, 6, NaN, NaN])) == 4 252 | 253 | def test_median_nan(self): 254 | assert np.isnan(median(EMPTY_VECTOR)) 255 | assert np.isnan(median(Vector([1, 4, NaN]), drop_na=False)) 256 | 257 | def test_min(self): 258 | assert min(Vector([3, 2, 1])) == 1 259 | assert min(Vector([3, 2, NaN])) == 2 260 | 261 | def test_min_nan(self): 262 | assert np.isnan(min(EMPTY_VECTOR)) 263 | assert np.isnan(min(Vector([3, 2, NaN]), drop_na=False)) 264 | 265 | def test_mode(self): 266 | assert mode(Vector([1])) == 1 267 | assert mode(Vector([1, 2])) == 1 268 | assert mode(Vector([1, 2, 2])) == 2 269 | assert mode(Vector([1, 2, 2, NaN])) == 2 270 | assert mode(Vector([1, 2, 2, NaN]), drop_na=False) == 2 271 | 272 | def test_mode_nan(self): 273 | assert np.isnan(mode(EMPTY_VECTOR)) 274 | assert np.isnan(mode(Vector([NaN, NaN], float), drop_na=False)) 275 | 276 | def test_nth(self): 277 | assert nth(Vector([1, 2, 3]), 0) == 1 278 | assert nth(Vector([NaN, 1, 2]), 0, drop_na=True) == 1 279 | 280 | def test_nth_nan(self): 281 | assert np.isnan(nth(EMPTY_VECTOR, 0)) 282 | assert np.isnan(nth(Vector([NaN, 1, 2]), 0)) 283 | 284 | def test_quantile(self): 285 | assert quantile(Vector([1, 4, 6, 8, 5]), 0.5) == 5 286 | assert quantile(Vector([1, 4, 6, NaN, NaN]), 0.5) == 4 287 | 288 | def test_quantile_nan(self): 289 | assert np.isnan(quantile(EMPTY_VECTOR, 0.5)) 290 | assert np.isnan(quantile(Vector([1, 4, NaN]), 0.5, drop_na=False)) 291 | 292 | def test_std(self): 293 | assert np.isclose(std(Vector([3, 6, 7])), 1.699673) 294 | assert np.isclose(std(Vector([3, 6, NaN])), 1.5) 295 | 296 | def test_std_nan(self): 297 | assert np.isnan(std(EMPTY_VECTOR)) 298 | assert np.isnan(std(Vector([1]))) 299 | assert np.isnan(std(Vector([3, 6, NaN]), drop_na=False)) 300 | 301 | def test_sum(self): 302 | assert sum(EMPTY_VECTOR) == 0 303 | assert sum(Vector([1])) == 1 304 | assert sum(Vector([1, 2])) == 3 305 | assert sum(Vector([1, 2, NaN])) == 3 306 | 307 | def test_sum_nan(self): 308 | assert np.isnan(sum(Vector([1, 2, NaN]), drop_na=False)) 309 | 310 | def test_var(self): 311 | assert np.isclose(var(Vector([3, 6, 7])), 2.888889) 312 | assert np.isclose(var(Vector([3, 6, NaN])), 2.25) 313 | 314 | def test_var_nan(self): 315 | assert np.isnan(var(EMPTY_VECTOR)) 316 | assert np.isnan(var(Vector([1]))) 317 | assert np.isnan(var(Vector([3, 6, NaN]), drop_na=False)) 318 | -------------------------------------------------------------------------------- /dataiter/test/test_dt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import numpy as np 24 | 25 | from dataiter import dt 26 | from dataiter import Vector 27 | 28 | NaT = np.datetime64("NaT") 29 | 30 | class TestDT: 31 | 32 | def test_day(self): 33 | a = dt.new(["2022-10-15", NaT]) 34 | assert dt.day(a).tolist() == [15, None] 35 | 36 | def test_day_nat(self): 37 | assert np.isnan(dt.day(NaT)) 38 | 39 | def test_day_numpy(self): 40 | a = np.array(["2022-10-15", NaT], np.datetime64) 41 | assert dt.day(a).tolist() == [15, None] 42 | 43 | def test_day_scalar(self): 44 | x = np.datetime64("2022-10-15") 45 | assert dt.day(x) == 15 46 | 47 | def test_from_string_date(self): 48 | a = Vector(["14.11.2022", ""], str) 49 | b = dt.from_string(a, "%d.%m.%Y") 50 | assert b.is_datetime() 51 | assert b[0] == np.datetime64("2022-11-14") 52 | assert np.isnat(b[1]) 53 | 54 | def test_from_string_datetime(self): 55 | a = Vector(["14.11.2022 22:49", ""], str) 56 | b = dt.from_string(a, "%d.%m.%Y %H:%M") 57 | assert b.is_datetime() 58 | assert b[0] == np.datetime64("2022-11-14T22:49:00") 59 | assert np.isnat(b[1]) 60 | 61 | def test_hour(self): 62 | a = dt.new(["2022-10-15T12:34:56", NaT]) 63 | assert dt.hour(a).tolist() == [12, None] 64 | 65 | def test_isoweek(self): 66 | a = dt.new(["2022-10-15", NaT]) 67 | assert dt.isoweek(a).tolist() == [41, None] 68 | 69 | def test_isoweekday(self): 70 | a = dt.new(["2022-10-15", NaT]) 71 | assert dt.isoweekday(a).tolist() == [6, None] 72 | 73 | def test_microsecond(self): 74 | a = dt.new(["2022-10-15T12:34:56.789", NaT]) 75 | assert dt.microsecond(a).tolist() == [789_000, None] 76 | 77 | def test_minute(self): 78 | a = dt.new(["2022-10-15T12:34:56", NaT]) 79 | assert dt.minute(a).tolist() == [34, None] 80 | 81 | def test_month(self): 82 | a = dt.new([NaT, NaT]) 83 | assert dt.month(a).tolist() == [None, None] 84 | a = dt.new(["2022-10-15", NaT]) 85 | assert dt.month(a).tolist() == [10, None] 86 | a = dt.new(["2022-10-15", "2022-11-15"]) 87 | assert dt.month(a).tolist() == [10, 11] 88 | 89 | def test_new_date(self): 90 | a = dt.new(["2022-10-15"]) 91 | b = Vector(["2022-10-15"]).as_date() 92 | assert a.equal(b) 93 | 94 | def test_new_datetime(self): 95 | a = dt.new(["2022-10-15T12:00:00"]) 96 | b = Vector(["2022-10-15T12:00:00"]).as_datetime() 97 | assert a.equal(b) 98 | 99 | def test_new_scalar(self): 100 | a = dt.new("2022-10-15") 101 | b = np.datetime64("2022-10-15") 102 | assert a == b 103 | 104 | def test_now(self): 105 | assert isinstance(dt.now(), np.datetime64) 106 | 107 | def test_quarter(self): 108 | a = dt.new("2022-10-15") 109 | assert dt.quarter(a) == 4 110 | assert np.isnan(dt.quarter(NaT)) 111 | a = dt.new(["2022-10-15"]) 112 | assert dt.quarter(a).tolist() == [4] 113 | a = dt.new(["2022-10-15", NaT]) 114 | assert dt.quarter(a).tolist() == [4, None] 115 | 116 | def test_replace(self): 117 | a = dt.new(["2022-10-15", NaT]) 118 | b = dt.new(["2022-01-01", NaT]) 119 | assert dt.replace(a, month=1, day=1).equal(b) 120 | 121 | def test_replace_vector(self): 122 | a = dt.new(["2023-08-09", "2023-08-10", 
"2023-08-11"]) 123 | b = dt.new(["2023-07-01", "2023-07-02", "2023-07-03"]) 124 | assert dt.replace(a, month=7, day=[1, 2, 3]).equal(b) 125 | 126 | def test_replace_vector_1m(self): 127 | a = np.repeat(dt.new("2023-08-09"), 1_000_000) 128 | month = np.repeat([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 100_000) 129 | assert len(month) == len(a) == 1_000_000 130 | b = dt.replace(a, month=month, day=1) 131 | assert len(b) == len(a) 132 | 133 | def test_second(self): 134 | a = dt.new(["2022-10-15T12:34:56", NaT]) 135 | assert dt.second(a).tolist() == [56, None] 136 | 137 | def test_to_string_date(self): 138 | a = dt.new(["2022-11-14", NaT]) 139 | b = dt.to_string(a, "%d.%m.%Y") 140 | assert b.is_string() 141 | assert b.tolist() == ["14.11.2022", None] 142 | 143 | def test_to_string_datetime(self): 144 | a = dt.new(["2022-11-14T22:49:00", NaT]) 145 | b = dt.to_string(a, "%Y%m%d-%H%M%S") 146 | assert b.is_string() 147 | assert b.tolist() == ["20221114-224900", None] 148 | 149 | def test_today(self): 150 | assert isinstance(dt.today(), np.datetime64) 151 | 152 | def test_weekday(self): 153 | a = dt.new(["2022-10-15", NaT]) 154 | assert dt.weekday(a).tolist() == [5, None] 155 | 156 | def test_year(self): 157 | a = dt.new(["2022-10-15", NaT]) 158 | assert dt.year(a).tolist() == [2022, None] 159 | -------------------------------------------------------------------------------- /dataiter/test/test_geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import tempfile 24 | 25 | from dataiter import GeoJSON 26 | from dataiter import test 27 | from pathlib import Path 28 | 29 | class TestGeoJSON: 30 | 31 | path = "neighbourhoods.geojson" 32 | 33 | def test_read(self): 34 | path = str(test.get_data_path(self.path)) 35 | data = GeoJSON.read(path) 36 | assert data.nrow == 233 37 | assert data.ncol == 3 38 | 39 | def test_read_columns(self): 40 | path = test.get_data_path(self.path) 41 | data = GeoJSON.read(path, columns=["neighbourhood"]) 42 | assert data.colnames == ["neighbourhood", "geometry"] 43 | 44 | def test_read_dtypes(self): 45 | path = test.get_data_path(self.path) 46 | dtypes = {"neighbourhood": object, "neighbourhood_group": object} 47 | data = GeoJSON.read(path, dtypes=dtypes) 48 | assert data.neighbourhood.is_object() 49 | assert data.neighbourhood_group.is_object() 50 | 51 | def test_to_data_frame(self): 52 | orig = test.geojson(self.path) 53 | data = orig.to_data_frame() 54 | assert data.ncol == orig.ncol 55 | assert data.nrow == orig.nrow 56 | assert not isinstance(data, GeoJSON) 57 | 58 | def test_to_data_frame_drop_geometry(self): 59 | orig = test.geojson(self.path) 60 | data = orig.to_data_frame(drop_geometry=True) 61 | assert data.ncol == orig.ncol - 1 62 | assert data.nrow == orig.nrow 63 | assert not isinstance(data, GeoJSON) 64 | assert "geometry" not in data.colnames 65 | 66 | def test_to_string(self): 67 | data = test.geojson(self.path) 68 | assert data.head(0).to_string() 69 | assert data.head(5).to_string() 70 | 71 | def test_to_string_no_geometry(self): 72 | data = test.geojson(self.path) 73 | del data.geometry 74 | assert data.head(0).to_string() 75 | assert data.head(5).to_string() 76 | 77 | def test_write(self): 78 | orig = test.geojson(self.path) 79 | handle, path = tempfile.mkstemp(".geojson") 80 | orig.write(path) 81 | data = GeoJSON.read(path) 82 | assert data == orig 83 | assert data.metadata == orig.metadata 84 | 85 | def test_write_path(self): 86 | orig = test.geojson(self.path) 87 | handle, path = tempfile.mkstemp(".geojson") 88 | orig.write(Path(path)) 89 | -------------------------------------------------------------------------------- /dataiter/test/test_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
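# A small sketch following the GeoJSON tests above: per the read/write
# docstrings, compression is applied or removed automatically based on a
# ``.bz2|.gz|.xz`` suffix. The file names are illustrative.
#
# >>> from dataiter import GeoJSON
# >>> data = GeoJSON.read("neighbourhoods.geojson")
# >>> data.write("neighbourhoods.geojson.gz")    # gzip-compressed on write
# >>> GeoJSON.read("neighbourhoods.geojson.gz")  # decompressed transparently on read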
22 | 23 | import inspect 24 | 25 | from dataiter import DataFrame 26 | from dataiter import GeoJSON 27 | from dataiter import io 28 | from dataiter import ListOfDicts 29 | 30 | class TestIO: 31 | 32 | def test_read_csv(self): 33 | s1 = inspect.signature(io.read_csv) 34 | s2 = inspect.signature(DataFrame.read_csv) 35 | assert s1 == s2 36 | 37 | def test_read_geojson(self): 38 | s1 = inspect.signature(io.read_geojson) 39 | s2 = inspect.signature(GeoJSON.read) 40 | assert s1 == s2 41 | 42 | def test_read_json(self): 43 | s1 = inspect.signature(io.read_json) 44 | s2 = inspect.signature(ListOfDicts.read_json) 45 | assert s1 == s2 46 | 47 | def test_read_npz(self): 48 | s1 = inspect.signature(io.read_npz) 49 | s2 = inspect.signature(DataFrame.read_npz) 50 | assert s1 == s2 51 | 52 | def test_read_parquet(self): 53 | s1 = inspect.signature(io.read_parquet) 54 | s2 = inspect.signature(DataFrame.read_parquet) 55 | assert s1 == s2 56 | -------------------------------------------------------------------------------- /dataiter/test/test_regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2025 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import re 24 | 25 | from dataiter import regex 26 | from dataiter import Vector 27 | 28 | class TestRegex: 29 | 30 | def test_findall(self): 31 | pattern = r"[a-z]" 32 | string = Vector(["asdf", "1234", ""]) 33 | result = regex.findall(pattern, string) 34 | expected = [["a", "s", "d", "f"], [], None] 35 | assert result.tolist() == expected 36 | assert regex.findall(pattern, string[0]) == result[0] 37 | 38 | def test_fullmatch(self): 39 | pattern = r"[a-z]+" 40 | string = Vector(["asdf", "1234", ""]) 41 | result = regex.fullmatch(pattern, string) 42 | assert isinstance(result[0], re.Match) 43 | assert result[1] is None 44 | assert result[2] is None 45 | match = regex.fullmatch(pattern, string[0]) 46 | assert isinstance(match, re.Match) 47 | 48 | def test_match(self): 49 | pattern = r"[a-z]" 50 | string = Vector(["asdf", "1234", ""]) 51 | result = regex.match(pattern, string) 52 | assert isinstance(result[0], re.Match) 53 | assert result[1] is None 54 | assert result[2] is None 55 | match = regex.match(pattern, string[0]) 56 | assert isinstance(match, re.Match) 57 | 58 | def test_search(self): 59 | pattern = r"[a-z]" 60 | string = Vector(["asdf", "1234", ""]) 61 | result = regex.search(pattern, string) 62 | assert isinstance(result[0], re.Match) 63 | assert result[1] is None 64 | assert result[2] is None 65 | match = regex.search(pattern, string[0]) 66 | assert isinstance(match, re.Match) 67 | 68 | def test_split(self): 69 | pattern = r" +" 70 | string = Vector(["one two three", "four", ""]) 71 | result = regex.split(pattern, string) 72 | expected = [["one", "two", "three"], ["four"], None] 73 | assert result.tolist() == expected 74 | assert regex.split(pattern, string[0]) == result[0] 75 | 76 | def test_sub(self): 77 | pattern = r"$" 78 | repl = "!" 79 | string = Vector(["great", "fantastic", ""]) 80 | result = regex.sub(pattern, repl, string) 81 | expected = ["great!", "fantastic!", None] 82 | assert result.tolist() == expected 83 | assert regex.sub(pattern, repl, string[0]) == result[0] 84 | 85 | def test_subn(self): 86 | pattern = r"$" 87 | repl = "!" 88 | string = Vector(["great", "fantastic", ""]) 89 | result = regex.subn(pattern, repl, string) 90 | expected = [("great!", 1), ("fantastic!", 1), None] 91 | assert result.tolist() == expected 92 | assert regex.subn(pattern, repl, string[0]) == result[0] 93 | -------------------------------------------------------------------------------- /dataiter/test/test_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import datetime 24 | import math 25 | import numpy as np 26 | import tempfile 27 | 28 | from dataiter import util 29 | 30 | class TestUtil: 31 | 32 | def test_count_digits(self): 33 | assert util.count_digits(0) == (0, 0) 34 | assert util.count_digits(0.1) == (0, 1) 35 | assert util.count_digits(1.000) == (1, 0) 36 | assert util.count_digits(123.456) == (3, 3) 37 | assert util.count_digits(1e-1) == (0, 1) 38 | assert util.count_digits(1e-10) == (0, 10) 39 | 40 | def test_count_digits_special(self): 41 | assert util.count_digits(np.nan) == (0, 0) 42 | assert util.count_digits(math.inf) == (0, 0) 43 | 44 | def test_format_floats_1(self): 45 | a = [1/1000000000, 1/1000000, 1/1000, np.nan] 46 | b = ["1e-09", "1e-06", "1e-03", "nan"] 47 | assert util.format_floats(a) == b 48 | 49 | def test_format_floats_2(self): 50 | a = [0.000123456, 0.123456, 0, np.nan] 51 | b = ["0.000123", "0.123456", "0.000000", "nan"] 52 | assert util.format_floats(a) == b 53 | 54 | def test_format_floats_3(self): 55 | a = [0.123456, 1, 123.456, np.nan] 56 | b = ["0.123", "1.000", "123.456", "nan"] 57 | assert util.format_floats(a) == b 58 | 59 | def test_format_floats_4(self): 60 | a = [123.456789, 123456.789, 123456789, np.nan] 61 | b = ["123", "123457", "123456789", "nan"] 62 | assert util.format_floats(a) == b 63 | 64 | def test_format_floats_4_ksep(self): 65 | a = [123.456789, 123456.789, 123456789, np.nan] 66 | b = ["123", "123,457", "123,456,789", "nan"] 67 | assert util.format_floats(a, ksep=",") == b 68 | 69 | def test_format_floats_5(self): 70 | a = [12345678, 1234567812345678, 123456781234567812345678, np.nan] 71 | b = ["1.234568e+07", "1.234568e+15", "1.234568e+23", "nan"] 72 | assert util.format_floats(a) == b 73 | 74 | def test_format_floats_6(self): 75 | a = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] 76 | b = ["0.10000", "0.01000", "0.00100", "0.00010", "0.00001"] 77 | assert util.format_floats(a) == b 78 | 79 | def test_format_floats_inf(self): 80 | a = [-math.inf, 0, math.inf, np.nan] 81 | b = ["-inf", "0e+00", "inf", "nan"] 82 | assert util.format_floats(a) == b 83 | 84 | def test_format_floats_integer(self): 85 | a = [1.0, 2.0, 3.0, np.nan] 86 | b = ["1", "2", "3", "nan"] 87 | assert util.format_floats(a) == b 88 | 89 | def test_generate_colnames(self): 90 | colnames = util.generate_colnames(1000) 91 | assert len(colnames) == 1000 92 | assert len(set(colnames)) == 1000 93 | 94 | def test_get_print_width(self): 95 | assert 0 < util.get_print_width() < 1000 96 | 97 | def test_is_scalar(self): 98 | assert util.is_scalar(None) 99 | assert util.is_scalar(b"") 100 | assert util.is_scalar(1.0) 101 | assert util.is_scalar(1) 102 | assert util.is_scalar("") 103 | assert util.is_scalar(datetime.date.today()) 104 | assert util.is_scalar(datetime.datetime.now()) 105 | assert util.is_scalar(datetime.timedelta(days=1)) 106 | assert not util.is_scalar(np.array([1, 2, 3])) 107 | assert not util.is_scalar([1, 2, 3]) 108 | assert not util.is_scalar((1, 2, 3)) 109 | 110 | def test_length(self): 111 | assert util.length(1) == 1 112 | assert util.length([1]) == 1 113 | assert util.length([1, 2]) == 2 114 | 115 | def test_quote(self): 116 | assert util.quote("hello") == '"hello"' 117 | assert util.quote('"hello"') == '"\\"hello\\""' 118 | 119 | 
def test_sequencify(self): 120 | assert util.sequencify(np.array([1])) == np.array([1]) 121 | assert util.sequencify([1]) == [1] 122 | assert util.sequencify((1,)) == (1,) 123 | assert util.sequencify(None) == [None] 124 | assert util.sequencify(1) == [1] 125 | assert util.sequencify(map(math.sqrt, [1, 4, 9])) == [1, 2, 3] 126 | 127 | def test_ulen(self): 128 | assert util.ulen("asdf") == 4 129 | assert util.ulen("asdf\u200b") == 4 130 | assert util.ulen("asdf\u200b\u200b") == 4 131 | 132 | def test_unique_keys(self): 133 | assert util.unique_keys([1, 2, 3]) == [1, 2, 3] 134 | assert util.unique_keys([1, 2, 3, 1]) == [1, 2, 3] 135 | 136 | def test_unique_types(self): 137 | assert util.unique_types([1, 2, 3.3, np.nan, None]) == {int, float} 138 | 139 | def test_upad(self): 140 | assert util.upad(["a", "aa", "aaa"], align="right") == [" a", " aa", "aaa"] 141 | assert util.upad(["a", "aa", "aaa"], align="left") == ["a ", "aa ", "aaa"] 142 | 143 | def test_utruncate(self): 144 | assert util.utruncate("abcdef", 4) == "abcd" 145 | assert util.utruncate("abc\u200bdef", 4) == "abc\u200bd" 146 | assert util.utruncate("abc\u200bdef\u200b", 4) == "abc\u200bd" 147 | 148 | def test_xopen_bz2(self): 149 | text = "test åäö" 150 | handle, path = tempfile.mkstemp(".bz2") 151 | with util.xopen(path, "wt") as f: 152 | f.write(text) 153 | with util.xopen(path, "rt") as f: 154 | assert f.read() == text 155 | 156 | def test_xopen_gz(self): 157 | text = "test åäö" 158 | handle, path = tempfile.mkstemp(".gz") 159 | with util.xopen(path, "wt") as f: 160 | f.write(text) 161 | with util.xopen(path, "rt") as f: 162 | assert f.read() == text 163 | 164 | def test_xopen_txt(self): 165 | text = "test åäö" 166 | handle, path = tempfile.mkstemp(".txt") 167 | with util.xopen(path, "wt") as f: 168 | f.write(text) 169 | with util.xopen(path, "rt") as f: 170 | assert f.read() == text 171 | 172 | def test_xopen_xz(self): 173 | text = "test åäö" 174 | handle, path = tempfile.mkstemp(".xz") 175 | with util.xopen(path, "wt") as f: 176 | f.write(text) 177 | with util.xopen(path, "rt") as f: 178 | assert f.read() == text 179 | -------------------------------------------------------------------------------- /dataiter/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import bz2 24 | import dataiter 25 | import datetime 26 | import gzip 27 | import itertools 28 | import lzma 29 | import math 30 | import numpy as np 31 | import os 32 | import shutil 33 | import string 34 | import wcwidth 35 | 36 | from dataiter import deco 37 | from pathlib import Path 38 | 39 | def count_digits(value): 40 | if np.isnan(value): return 0, 0 41 | if math.isinf(value): return 0, 0 42 | parts = np.format_float_positional(value).split(".") 43 | n = len(parts[0].lstrip("0")) 44 | m = len(parts[1].rstrip("0")) 45 | return n, m 46 | 47 | def format_alias_doc(alias, target): 48 | return f"{target.__doc__}\n\n{' '*8}" + ( 49 | ".. note:: :func:`{}` is a convenience alias for :meth:`{}`." 50 | .format(alias.__name__, target.__qualname__)) 51 | 52 | def format_floats(seq, ksep=None): 53 | precision = dataiter.PRINT_FLOAT_PRECISION 54 | if any(0 < abs(x) < 1/10**precision or abs(x) > 10**16 - 1 for x in seq): 55 | # Format tiny and huge numbers in scientific notation. 56 | f = np.format_float_scientific 57 | return [f(x, precision=precision, trim="-") for x in seq] 58 | if ksep is None: 59 | ksep = dataiter.PRINT_THOUSAND_SEPARATOR 60 | # Format like largest by significant digits. 61 | digits = [count_digits(x) for x in seq] 62 | n = max(x[0] for x in digits) 63 | m = max(x[1] for x in digits) 64 | precision = min(m, max(0, precision - n)) 65 | return [f"{{:,.{precision}f}}".format(x).replace(",", ksep) 66 | for x in seq] 67 | 68 | def generate_colnames(n): 69 | return list(itertools.islice(yield_colnames(), n)) 70 | 71 | def get_print_width(): 72 | return shutil.get_terminal_size((dataiter.PRINT_MAX_WIDTH, 24))[0] - 1 73 | 74 | def is_scalar(value): 75 | # np.isscalar doesn't cover all needed cases. 76 | return (np.isscalar(value) or 77 | value is None or 78 | isinstance(value, (bytes, 79 | bool, 80 | float, 81 | int, 82 | str, 83 | datetime.date, 84 | datetime.datetime, 85 | datetime.timedelta))) 86 | 87 | def length(value): 88 | return 1 if is_scalar(value) else len(value) 89 | 90 | def makedirs_for_file(path): 91 | return Path(path).parent.mkdir(parents=True, exist_ok=True) 92 | 93 | def parse_env_boolean(name): 94 | return { 95 | "1": True, 96 | "t": True, 97 | "true": True, 98 | "y": True, 99 | "yes": True, 100 | "0": False, 101 | "f": False, 102 | "false": False, 103 | "n": False, 104 | "no": False, 105 | }[os.environ[name].strip().lower()] 106 | 107 | def quote(value): 108 | return '"{}"'.format(str(value).replace('"', r'\"')) 109 | 110 | def sequencify(value): 111 | if isinstance(value, (np.ndarray, list, tuple)): 112 | return value 113 | if is_scalar(value): 114 | return [value] 115 | if hasattr(value, "__iter__"): 116 | # Evaluate generator or iterator. 117 | return list(value) 118 | raise ValueError(f"Unexpected type: {type(value)}") 119 | 120 | def unique_keys(keys): 121 | return list(dict.fromkeys(keys)) 122 | 123 | def ulen(string): 124 | # Return the display length of string accounting for 125 | # Unicode characters that have a display width != 1. 126 | length = wcwidth.wcswidth(string) 127 | return length if length >= 0 else 0 128 | 129 | def unique_types(seq): 130 | return set(x.__class__ for x in seq if 131 | x is not None and 132 | not (isinstance(x, float) and np.isnan(x))) 133 | 134 | @deco.listify 135 | def upad(strings, *, align="right"): 136 | # Pad strings for display accounting for 137 | # Unicode characters that have a display width != 1. 
138 | width = max(ulen(x) for x in strings) 139 | for value in strings: 140 | padding = " " * (width - ulen(value)) 141 | yield (padding + value 142 | if align == "right" 143 | else value + padding) 144 | 145 | def utruncate(string, width): 146 | # Truncate string to display width accounting for 147 | # Unicode characters that have a display width != 1. 148 | for i in range(1, len(string)): 149 | if ulen(string[:i]) > width: 150 | return string[:(i-1)] 151 | return string 152 | 153 | def xopen(path, mode="r", **kwargs): 154 | if "b" not in mode: 155 | kwargs.setdefault("encoding", "utf-8") 156 | if str(path).endswith(".bz2"): 157 | kwargs.setdefault("compresslevel", 6) 158 | return bz2.open(path, mode, **kwargs) 159 | if str(path).endswith(".gz"): 160 | kwargs.setdefault("compresslevel", 6) 161 | return gzip.open(path, mode, **kwargs) 162 | if str(path).endswith(".xz"): 163 | return lzma.open(path, mode) 164 | return open(path, mode, **kwargs) 165 | 166 | def yield_colnames(): 167 | # Like Excel: a, b, c, ..., aa, bb, cc, ... 168 | for batch in range(1, 1000): 169 | for letter in string.ascii_lowercase: 170 | yield letter * batch 171 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/aggregation.rst: -------------------------------------------------------------------------------- 1 | Aggregation 2 | =========== 3 | 4 | .. note:: The following applies currently only to the 5 | :class:`.DataFrame` class. Aggregation with a 6 | :class:`.ListOfDicts` is simpler and covered by the 7 | API documentation on :meth:`.ListOfDicts.aggregate`. 8 | 9 | By aggregation, we refer to splitting a data frame into groups based on 10 | the values of one or more columns and then calculating group-wise 11 | summaries, such as the total count or mean of a column. The first step is 12 | called ``group_by`` and the second ``aggregate``, usually written via 13 | method chaining as ``data.group_by(...).aggregate(...)``. 14 | 15 | Below is a simple example of how to calculate the total count and mean 16 | price of Airbnb listings in New York grouped by neighbourhood. The 17 | ``aggregate`` method takes keyword arguments, where each name is the 18 | column for that summary in the output and each value is the function 19 | used to calculate that summary. The return value is a regular data frame. See the following 20 | sections for what kinds of aggregation functions you can use. 21 | 22 | >>> import dataiter as di 23 | >>> data = di.read_csv("data/listings.csv") 24 | >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price")) 25 | .
26 | hood n price 27 | >> import dataiter as di 89 | >>> data = di.read_csv("data/listings.csv") 90 | >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean()) 91 | . 92 | hood n price 93 | `_ code (fast). If you have Numba installed, 107 | then Dataiter will **automatically** use it for aggregation involving 108 | **boolean**, **integer**, **float**, **date**, and **datetime** columns. 109 | If Numba is not available, Dataiter will automatically fall back on the 110 | slower pure Python implementations. The result should be the same, 111 | whether Numba is used or not, excluding some minor rounding or float 112 | precision differences. 113 | 114 | Numba is currently not a hard dependency of Dataiter, so you'll need to 115 | install it separately:: 116 | 117 | pip install -U numba 118 | 119 | When, for a particular version of Dataiter, you first use a 120 | Numba-accelerated aggregation function, the code will be compiled, which 121 | might take a couple seconds. The compiled code is saved in `cache 122 | `_. 123 | After that, using the function from cache will be really fast. In case 124 | you're benchmarking something, note also that on the first use of such a 125 | function in a Python session, the compiled code is loaded from cache on 126 | disk, which takes something like 10–100 ms and further calls will be 127 | faster as there's no more need to load anything. 128 | 129 | .. note:: If you have trouble with Numba, please check the value of 130 | ``di.USE_NUMBA`` to see if Numba has been found. You can also 131 | set ``di.USE_NUMBA = False`` if you have Numba installed, but 132 | it's not working right, or via the environment variable 133 | ``DATAITER_USE_NUMBA=false``. Sometimes it's just the 134 | `caching 135 | `_ 136 | part of Numba that's causing issues. When upgrading you might 137 | sometimes need to delete old caches. If that doesn't help, you 138 | can also turn caching off with ``di.USE_NUMBA_CACHE = False`` 139 | or the environment variable 140 | ``DATAITER_USE_NUMBA_CACHE=false``. 141 | -------------------------------------------------------------------------------- /doc/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | import inspect 5 | 6 | from pathlib import Path 7 | 8 | DOCUMENTED_SPECIALS = [ 9 | "__init__", 10 | ] 11 | 12 | PAGES = { 13 | "dataiter.rst": di, 14 | "data-frame.rst": di.DataFrame, 15 | "data-frame-column.rst": di.DataFrameColumn, 16 | "dt.rst": di.dt, 17 | "dtypes.rst": di.dtypes, 18 | "geojson.rst": di.GeoJSON, 19 | "list-of-dicts.rst": di.ListOfDicts, 20 | "regex.rst": di.regex, 21 | "vector.rst": di.Vector, 22 | } 23 | 24 | SKIP = [ 25 | "DataFrame.clear", 26 | "DataFrame.COLUMN_PLACEHOLDER", 27 | "DataFrame.pop", 28 | "DataFrame.popitem", 29 | "GeoJSON.to_string", 30 | "Vector.to_strings", 31 | ] 32 | 33 | DIRECTORY = Path(__file__).parent 34 | for page, obj in PAGES.items(): 35 | text = (DIRECTORY / page).read_text("utf-8") 36 | source = inspect.getsourcefile(obj) 37 | print(f"Checking {source}...") 38 | for name, value in inspect.getmembers(obj): 39 | if (name.startswith("_") and 40 | name not in DOCUMENTED_SPECIALS): 41 | continue 42 | if not inspect.getmodule(value): continue 43 | module = inspect.getmodule(value) 44 | if inspect.ismodule(obj): 45 | # Skip objects documented separately. 
46 | if inspect.ismodule(value): continue 47 | if inspect.isclass(value): continue 48 | if inspect.isclass(obj): 49 | # Skip base class methods from NumPy etc. 50 | if inspect.getsourcefile(module) != source: continue 51 | full_name = f"{obj.__name__}.{name}" 52 | if full_name in SKIP: continue 53 | print(f"... {full_name}") 54 | if full_name not in text: 55 | raise Exception("Not found") 56 | -------------------------------------------------------------------------------- /doc/comparison.rst: -------------------------------------------------------------------------------- 1 | Comparison 2 | ========== 3 | 4 | If you're familiar with `dplyr `_ (R) or 5 | `Pandas `_ (Python) you might find the below 6 | comparison table useful to get started. Dataiter is heavily inspired by 7 | dplyr, but not an implementation of the dplyr API, rather an adaptation 8 | of mixed influences primarily from dplyr, base R, SQL and base Python. 9 | 10 | `Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas <_static/comparison.html>`_ 11 | -------------------------------------------------------------------------------- /doc/comparison/Makefile: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | build: 4 | ./build.py 5 | 6 | run: 7 | python3 -m http.server 8 | 9 | .PHONY: build run 10 | -------------------------------------------------------------------------------- /doc/comparison/README.md: -------------------------------------------------------------------------------- 1 | Comparison Table dplyr vs. Dataiter vs. Pandas 2 | ============================================== 3 | 4 | ## Development 5 | 6 | Use `make run` and open . 7 | 8 | ## Production 9 | 10 | `index.html` is compiled into `comparison.html`, which is used as a 11 | static file as part of the autogenerated Sphinx documentation. 
12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = E1,E9,F 3 | ignore = E129,F401,F821 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .group_by("year", "month") 3 | .aggregate( 4 | sales_total=di.sum("sales"), 5 | sales_per_day=di.mean("sales"))) 6 | 7 | (data 8 | .group_by("year", "month") 9 | .aggregate( 10 | sales_total=lambda x: x.sales.sum(), 11 | sales_per_day=lambda x: x.sales.mean())) 12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-dplyr.R: -------------------------------------------------------------------------------- 1 | data %>% 2 | group_by(year, month) %>% 3 | summarise( 4 | sales_total=sum(sales), 5 | sales_per_day=mean(sales)) 6 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-pandas.py: -------------------------------------------------------------------------------- 1 | (data 2 | .groupby(["year", "month"], as_index=False) 3 | .agg( 4 | sales_total=("sales", "sum"), 5 | sales_per_day=("sales", "mean"))) 6 | 7 | (data 8 | .groupby(["year", "month"], as_index=False) 9 | .apply(lambda x: pd.Series({ 10 | "sales_total": x["sales"].sum(), 11 | "sales_per_day": x["sales"].mean()}))) 12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.cbind(data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-dplyr.R: -------------------------------------------------------------------------------- 1 | bind_cols(data1, data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-pandas.py: -------------------------------------------------------------------------------- 1 | data1 = data1.reset_index(drop=True) 2 | data2 = data2.reset_index(drop=True) 3 | pd.concat([data1, data2], axis=1) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .filter(year=2021) 3 | .sort(sales=-1) 4 | .head(10)) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-dplyr.R: -------------------------------------------------------------------------------- 1 | data |> 2 | filter(year == 2021) |> 3 | arrange(desc(sales)) |> 4 | head(10) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-pandas.py: -------------------------------------------------------------------------------- 1 | (data 2 | .loc[lambda x: x["year"] == 2021] 3 | .sort_values("sales", ascending=False) 4 | .head(10)) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-dataiter.py: -------------------------------------------------------------------------------- 1 | names = data.colnames 2 | data.colnames = ["a", "b", "c"] 3 | 
-------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-dplyr.R: -------------------------------------------------------------------------------- 1 | names = colnames(data) 2 | colnames(data) = c("a", "b", "c") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-pandas.py: -------------------------------------------------------------------------------- 1 | names = data.columns 2 | data.columns = ["a", "b", "c"] 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-dataiter.py: -------------------------------------------------------------------------------- 1 | data.filter(year=2021) 2 | data.filter(data.year == 2021) 3 | data.filter(lambda x: x.year == 2021) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-dplyr.R: -------------------------------------------------------------------------------- 1 | filter(data, year == 2021) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-pandas.py: -------------------------------------------------------------------------------- 1 | data[data["year"] == 2021] 2 | data.loc[data["year"] == 2021] 3 | data[lambda x: x["year"] == 2021] 4 | data.loc[lambda x: x["year"] == 2021] 5 | data.query("year == 2021") 6 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .group_by("year", "month") 3 | .modify(fraction=lambda x: ( 4 | x.sales / x.sales.sum()))) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-dplyr.R: -------------------------------------------------------------------------------- 1 | data %>% 2 | group_by(year, month) %>% 3 | mutate(fraction=sales/sum(sales)) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-pandas.py: -------------------------------------------------------------------------------- 1 | # No singular operation 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-dataiter.py: -------------------------------------------------------------------------------- 1 | data.head(10) 2 | data.tail(10) 3 | data.sample(10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-dplyr.R: -------------------------------------------------------------------------------- 1 | head(data, 10) 2 | tail(data, 10) 3 | slice_sample(data, n=10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-pandas.py: -------------------------------------------------------------------------------- 1 | data.head(10) 2 | data.tail(10) 3 | data.sample(10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-dataiter.py: -------------------------------------------------------------------------------- 1 | import dataiter as di 2 | import numpy as np 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-dplyr.R: 
-------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | # Avoid hiding print output. 4 | options(pillar.width=1000) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Avoid hiding print output. 4 | pd.set_option("display.max_columns", 1000) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-dataiter.py: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data.x 3 | data["x"] 4 | 5 | # Column by index 6 | data.columns[2] 7 | 8 | # Row by index 9 | data.slice(2) 10 | 11 | # Column element 12 | data.x[2] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-dplyr.R: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data$x 3 | data[["x"]] 4 | 5 | # Column by index 6 | data[[3]] 7 | 8 | # Row by index 9 | data[3,] 10 | 11 | # Column element 12 | data$x[3] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-pandas.py: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data.x 3 | data["x"] 4 | 5 | # Column by index 6 | data.iloc[:,2] 7 | 8 | # Row by index 9 | data.iloc[2,:] 10 | 11 | # Column element 12 | data["x"][2] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-dataiter.py: -------------------------------------------------------------------------------- 1 | data = di.read_npz("data.npz") 2 | data.write_npz("data.npz") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-dplyr.R: -------------------------------------------------------------------------------- 1 | data = read_rds("data.rds") 2 | write_rds(data, "data.rds") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-pandas.py: -------------------------------------------------------------------------------- 1 | data = pd.read_pickle("data.pkl") 2 | data.to_pickle("data.pkl") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-dataiter.py: -------------------------------------------------------------------------------- 1 | data = di.read_csv("data.csv") 2 | data.write_csv("data.csv") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-dplyr.R: -------------------------------------------------------------------------------- 1 | data = read_csv("data.csv") 2 | write_csv(data, "data.csv") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-pandas.py: -------------------------------------------------------------------------------- 1 | data = pd.read_csv("data.csv") 2 | data.to_csv("data.csv", index=False) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.left_join (data2, "id") 2 | data1.inner_join(data2, 
"id") 3 | data1.full_join (data2, "id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-dplyr.R: -------------------------------------------------------------------------------- 1 | left_join (data1, data2, by="id") 2 | inner_join(data1, data2, by="id") 3 | full_join (data1, data2, by="id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-pandas.py: -------------------------------------------------------------------------------- 1 | data1.merge(data2, how="left", on="id") 2 | data1.merge(data2, how="inner", on="id") 3 | data1.merge(data2, how="outer", on="id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-dataiter.py: -------------------------------------------------------------------------------- 1 | data.modify(c=(data.a + data.b)) 2 | data.modify(c=lambda x: x.a + x.b) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-dplyr.R: -------------------------------------------------------------------------------- 1 | mutate(data, c=(a + b)) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-pandas.py: -------------------------------------------------------------------------------- 1 | data.assign(c=(data["a"] + data["b"])) 2 | data.assign(c=lambda x: x["a"] + x["b"]) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.semi_join(data2, "id") 2 | data1.anti_join(data2, "id") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-dplyr.R: -------------------------------------------------------------------------------- 1 | semi_join(data1, data2, by="id") 2 | anti_join(data1, data2, by="id") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-pandas.py: -------------------------------------------------------------------------------- 1 | # No singular operations 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.rbind(data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-dplyr.R: -------------------------------------------------------------------------------- 1 | bind_rows(data1, data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-pandas.py: -------------------------------------------------------------------------------- 1 | pd.concat([data1, data2]) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rename-dataiter.py: -------------------------------------------------------------------------------- 1 | data.rename(to="from") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rename-dplyr.R: -------------------------------------------------------------------------------- 1 | rename(data, to="from") 2 | 
-------------------------------------------------------------------------------- /doc/comparison/blocks/rename-pandas.py: -------------------------------------------------------------------------------- 1 | data.rename(columns={"from": "to"}, errors="raise") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-dataiter.py: -------------------------------------------------------------------------------- 1 | data.select("a", "b", "c") 2 | data.unselect("a", "b", "c") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-dplyr.R: -------------------------------------------------------------------------------- 1 | select(data, a, b, c) 2 | select(data, -a, -b, -c) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-pandas.py: -------------------------------------------------------------------------------- 1 | data[["a", "b", "c"]] 2 | data.drop(columns=["a", "b", "c"]) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-dataiter.py: -------------------------------------------------------------------------------- 1 | data.nrow 2 | data.ncol 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-dplyr.R: -------------------------------------------------------------------------------- 1 | nrow(data) 2 | ncol(data) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-pandas.py: -------------------------------------------------------------------------------- 1 | len(data) 2 | len(data.columns) 3 | nrow, ncol = data.shape 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-dataiter.py: -------------------------------------------------------------------------------- 1 | data.sort(a=1, b=1, c=-1) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-dplyr.R: -------------------------------------------------------------------------------- 1 | arrange(data, a, b, desc(c)) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-pandas.py: -------------------------------------------------------------------------------- 1 | data.sort_values(["a", "b", "c"], ascending=[True, True, False]) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-dataiter.py: -------------------------------------------------------------------------------- 1 | data.unique("a", "b", "c") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-dplyr.R: -------------------------------------------------------------------------------- 1 | distinct(data, a, b, c, .keep_all=TRUE) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-pandas.py: -------------------------------------------------------------------------------- 1 | data.drop_duplicates(["a", "b", "c"]) 2 | -------------------------------------------------------------------------------- /doc/comparison/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | from pathlib import 
Path 6 | 7 | lines = [] 8 | text = Path("index.html").read_text("utf-8").strip() 9 | print("Compiling index.html + blocks → comparison.html...") 10 | print(f"index.html: {len(text)}") 11 | for line in text.splitlines(): 12 | if not line.strip().startswith('
{code}
' 20 | for line in html.splitlines(): 21 | lines.append(line) 22 | 23 | text = "\n".join(lines) + "\n" 24 | print(f"comparison.html: {len(text)}") 25 | Path("comparison.html").write_text(text, "utf-8") 26 | 27 | text = Path("prism.css").read_text("utf-8").strip() 28 | if "font-family:" in text or "font-size:" in text: 29 | # Strip Prism font rules so that they don't override 30 | # Tailwind CSS's better-thought-out default system font stack. 31 | # https://tailwindcss.com/docs/font-family 32 | text_length_prior = len(text) 33 | print("Patching prism.css... ", end="") 34 | text = re.sub(r"font-family:.+?;", "", text) 35 | text = re.sub(r"font-size:.+?;", "", text) 36 | print(len(text) - text_length_prior) 37 | Path("prism.css").write_text(text + "\n", "utf-8") 38 | -------------------------------------------------------------------------------- /doc/comparison/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for ARG; do 3 | ARG="${ARG//_/-}" 4 | touch blocks/$ARG-dplyr.R 5 | touch blocks/$ARG-pandas.py 6 | touch blocks/$ARG-dataiter.py 7 | done 8 | -------------------------------------------------------------------------------- /doc/comparison/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas 8 | 9 | 28 | 29 | 30 | 31 | 32 | 33 |
34 |

Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas

35 | 36 |
37 | 38 |
39 |

dplyr

40 |

Dataiter

41 |

Pandas

42 |
43 | 44 |

Imports & Configuration

45 |
46 |

 47 |       

 48 |       

 49 |       

Some of the below code uses other parts of tidyverse besides dplyr too, such 50 | as readr. For simplicity, you can load them all via the tidyverse metapackage.

51 |

We often need NumPy too for certain calculations.

52 |

53 |
54 | 55 |

Input/Output

56 |
57 |

 58 |       

 59 |       

 60 |     
61 |
62 |

 63 |       

 64 |       

 65 |     
66 |
67 |

68 | All three support multiple binary formats, the above are sensible defaults (assuming you 69 | don't need interoperability) that work out of the box. 70 |

71 |
72 | 73 |

Structure

74 |
75 |

 76 |       

 77 |       

 78 |     
79 | 80 |

Indexing

81 |
82 |

 83 |       

 84 |       

 85 |       

86 |

Attribute access to columns (dot notation) is preferred.

87 |

Attribute access to columns (dot notation) does not work in all contexts, 88 | bracket notation is more common. Pandas uses terms "axis=0" to refer to rows, "axis=1" to 89 | refer to columns, "index" to refer to row names and "labels" to refer to row and column 90 | names. Certain operations use the "index" for implicit joins called "alignment".

91 |
92 | 93 |

Chaining/Piping

94 |
95 |

 96 |       

 97 |       

 98 |       

99 |

100 |

Pandas is not really designed for method chaining but it mostly works these 101 | days. Note also that the "inplace" arguments that many methods take, which if used are 102 | incompatible with method chaining, 103 | are apparently not 104 | useful.

105 |
106 | 107 |

Column Operations

108 |
109 |

110 |       

111 |       

112 |     
113 |
114 |

115 |       

116 |       

117 |     
118 |
119 |

120 |       

121 |       

122 |     
123 |
124 |

125 |       

126 |       

127 |     
128 | 129 |

Sorting

130 |
131 |

132 |       

133 |       

134 |     
135 | 136 |

Subsetting by Row

137 |
138 |

139 |       

140 |       

141 |     
142 |
143 |

144 |       

145 |       

146 |       

147 |

Dataiter also has filter_out as a shorthand to negate the given 148 | condition.
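A minimal sketch of that shorthand, reusing the year column from the filter examples above; it is assumed here that filter_out accepts the same keyword and lambda conditions as filter:
data.filter_out(year=2021)                 # drop rows where year == 2021
data.filter_out(lambda x: x.year == 2021)  # same condition given as a lambda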

149 |

150 |
151 |
152 |

153 |       

154 |       

155 |     
156 | 157 |

Concatenation

158 |
159 |

160 |       

161 |       

162 |       

163 |

164 |

165 |
166 |
167 |

168 |       

169 |       

170 |       

171 |

172 |

Pandas wants to do "alignment" by "index" here. Resetting the indices prior to 173 | concatenation should give the same result as the plain concatenation in dplyr and 174 | Dataiter.

175 |
176 | 177 |

Joins

178 |
179 |

180 |       

181 |       

182 |     
183 |
184 |

185 | dplyr and Pandas follow the SQL convention of joining all matching rows, i.e. if doing a 186 | left join with ten rows on the left side, the result will have ten or more rows – ten if all 187 | keys have zero or one match on the right side, twenty if all have two matches etc. Dataiter 188 | differs by only joining the first match, on account of it usually being more practical and 189 | less liable to produce unpleasant surprises. If Dataiter's a.left_join(b) 190 | doesn't give you all the results you're looking for, you might want instead 191 | either b.left_join(a) or a.full_join(b). SQL-style joins are 192 | currently unsupported, but may be added in the future. 193 |
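A small, hypothetical sketch of that difference; the example frames, their columns and the keyword-argument DataFrame constructor are made up for illustration:
import dataiter as di
import pandas as pd
left = di.DataFrame(id=[1, 2], x=["a", "b"])        # two rows on the left side
right = di.DataFrame(id=[1, 1, 2], y=[10, 11, 20])  # key 1 has two matches on the right
left.left_join(right, "id").nrow                    # 2: Dataiter joins only the first match per key
pd_left = pd.DataFrame({"id": [1, 2], "x": ["a", "b"]})
pd_right = pd.DataFrame({"id": [1, 1, 2], "y": [10, 11, 20]})
len(pd_left.merge(pd_right, how="left", on="id"))   # 3: dplyr/Pandas/SQL keep all matching rows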

194 |
195 |
196 |

197 |       

198 |       

199 |     
200 | 201 |

Grouping & Modification

202 |
203 |

204 |       

205 |       

206 |     
207 | 208 |

Grouping & Aggregation

209 |
210 |

211 |       

212 |       

213 |     
214 |
215 |

Both Dataiter and Pandas have two aggregation forms: 216 | one for fast aggregation limited to common operations with a single column and another for 217 | arbitrary calculation with access to all columns. In Dataiter, these forms are equivalent in 218 | the sense that e.g. di.sum("sales") returns a function that takes data as 219 | argument and calculates the sum of the "sales" column, and also in the sense that, unlike 220 | with Pandas, you can mix and match both forms within the same aggregate call. 221 | Pandas' agg method arguments can take very many forms, the above is called 222 | "named aggregation". Likewise, the apply method can be used very many ways, the 223 | above is one way. The first form is about equally fast in Dataiter and Pandas, the second 224 | form is a lot faster in Dataiter.
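As a sketch of mixing the two forms within one aggregate call, reusing the year/month/sales columns from the aggregation examples above (the sales_range output column is made up for illustration):
(data
 .group_by("year", "month")
 .aggregate(
     sales_total=di.sum("sales"),                           # fast single-column form
     sales_range=lambda x: x.sales.max() - x.sales.min()))  # arbitrary form with access to the whole group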

225 |
226 | 227 | 228 | 229 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /doc/comparison/prism.css: -------------------------------------------------------------------------------- 1 | /* PrismJS 1.25.0 2 | https://prismjs.com/download.html#themes=prism&languages=python+r&plugins=file-highlight */ 3 | code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shadow:0 1px #fff;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}code[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,pre[class*=language-]::-moz-selection{text-shadow:none;background:#b3d4fc}code[class*=language-] ::selection,code[class*=language-]::selection,pre[class*=language-] ::selection,pre[class*=language-]::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.cdata,.token.comment,.token.doctype,.token.prolog{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.boolean,.token.constant,.token.deleted,.token.number,.token.property,.token.symbol,.token.tag{color:#905}.token.attr-name,.token.builtin,.token.char,.token.inserted,.token.selector,.token.string{color:#690}.language-css .token.string,.style .token.string,.token.entity,.token.operator,.token.url{color:#9a6e3a;background:hsla(0,0%,100%,.5)}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.class-name,.token.function{color:#dd4a68}.token.important,.token.regex,.token.variable{color:#e90}.token.bold,.token.important{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help} 4 | -------------------------------------------------------------------------------- /doc/comparison/prism.js: -------------------------------------------------------------------------------- 1 | /* PrismJS 1.25.0 2 | https://prismjs.com/download.html#themes=prism&languages=python+r&plugins=file-highlight */ 3 | var _self="undefined"!=typeof window?window:"undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?self:{},Prism=function(u){var t=/(?:^|\s)lang(?:uage)?-([\w-]+)(?=\s|$)/i,n=0,e={},M={manual:u.Prism&&u.Prism.manual,disableWorkerMessageHandler:u.Prism&&u.Prism.disableWorkerMessageHandler,util:{encode:function e(n){return n instanceof W?new W(n.type,e(n.content),n.alias):Array.isArray(n)?n.map(e):n.replace(/&/g,"&").replace(/=l.reach);y+=m.value.length,m=m.next){var k=m.value;if(t.length>n.length)return;if(!(k instanceof W)){var x,b=1;if(h){if(!(x=z(p,y,n,f))||x.index>=n.length)break;var w=x.index,A=x.index+x[0].length,P=y;for(P+=m.value.length;P<=w;)m=m.next,P+=m.value.length;if(P-=m.value.length,y=P,m.value instanceof W)continue;for(var E=m;E!==t.tail&&(Pl.reach&&(l.reach=j);var C=m.prev;S&&(C=I(t,C,S),y+=S.length),q(t,C,b);var N=new W(o,g?M.tokenize(L,g):L,d,L);if(m=I(t,C,N),O&&I(t,m,O),1l.reach&&(l.reach=_.reach)}}}}}}(e,a,n,a.head,0),function(e){var n=[],t=e.head.next;for(;t!==e.tail;)n.push(t.value),t=t.next;return n}(a)},hooks:{all:{},add:function(e,n){var t=M.hooks.all;t[e]=t[e]||[],t[e].push(n)},run:function(e,n){var 
t=M.hooks.all[e];if(t&&t.length)for(var r,a=0;r=t[a++];)r(n)}},Token:W};function W(e,n,t,r){this.type=e,this.content=n,this.alias=t,this.length=0|(r||"").length}function z(e,n,t,r){e.lastIndex=n;var a=e.exec(t);if(a&&r&&a[1]){var i=a[1].length;a.index+=i,a[0]=a[0].slice(i)}return a}function i(){var e={value:null,prev:null,next:null},n={value:null,prev:e,next:null};e.next=n,this.head=e,this.tail=n,this.length=0}function I(e,n,t){var r=n.next,a={value:t,prev:n,next:r};return n.next=a,r.prev=a,e.length++,a}function q(e,n,t){for(var r=n.next,a=0;a"+a.content+""},!u.document)return u.addEventListener&&(M.disableWorkerMessageHandler||u.addEventListener("message",function(e){var n=JSON.parse(e.data),t=n.language,r=n.code,a=n.immediateClose;u.postMessage(M.highlight(r,M.languages[t],t)),a&&u.close()},!1)),M;var r=M.util.currentScript();function a(){M.manual||M.highlightAll()}if(r&&(M.filename=r.src,r.hasAttribute("data-manual")&&(M.manual=!0)),!M.manual){var l=document.readyState;"loading"===l||"interactive"===l&&r&&r.defer?document.addEventListener("DOMContentLoaded",a):window.requestAnimationFrame?window.requestAnimationFrame(a):window.setTimeout(a,16)}return M}(_self);"undefined"!=typeof module&&module.exports&&(module.exports=Prism),"undefined"!=typeof global&&(global.Prism=Prism); 4 | Prism.languages.python={comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0,greedy:!0},"string-interpolation":{pattern:/(?:f|fr|rf)(?:("""|''')[\s\S]*?\1|("|')(?:\\.|(?!\2)[^\\\r\n])*\2)/i,greedy:!0,inside:{interpolation:{pattern:/((?:^|[^{])(?:\{\{)*)\{(?!\{)(?:[^{}]|\{(?!\{)(?:[^{}]|\{(?!\{)(?:[^{}])+\})+\})+\}/,lookbehind:!0,inside:{"format-spec":{pattern:/(:)[^:(){}]+(?=\}$)/,lookbehind:!0},"conversion-option":{pattern:/![sra](?=[:}]$)/,alias:"punctuation"},rest:null}},string:/[\s\S]+/}},"triple-quoted-string":{pattern:/(?:[rub]|br|rb)?("""|''')[\s\S]*?\1/i,greedy:!0,alias:"string"},string:{pattern:/(?:[rub]|br|rb)?("|')(?:\\.|(?!\1)[^\\\r\n])*\1/i,greedy:!0},function:{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_]\w*(?=\s*\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)\w+/i,lookbehind:!0},decorator:{pattern:/(^[\t ]*)@\w+(?:\.\w+)*/m,lookbehind:!0,alias:["annotation","punctuation"],inside:{punctuation:/\./}},keyword:/\b(?:_(?=\s*:)|and|as|assert|async|await|break|case|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|match|nonlocal|not|or|pass|print|raise|return|try|while|with|yield)\b/,builtin:/\b(?:__import__|abs|all|any|apply|ascii|basestring|bin|bool|buffer|bytearray|bytes|callable|chr|classmethod|cmp|coerce|compile|complex|delattr|dict|dir|divmod|enumerate|eval|execfile|file|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|intern|isinstance|issubclass|iter|len|list|locals|long|map|max|memoryview|min|next|object|oct|open|ord|pow|property|range|raw_input|reduce|reload|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|unichr|unicode|vars|xrange|zip)\b/,boolean:/\b(?:False|None|True)\b/,number:/\b0(?:b(?:_?[01])+|o(?:_?[0-7])+|x(?:_?[a-f0-9])+)\b|(?:\b\d+(?:_\d+)*(?:\.(?:\d+(?:_\d+)*)?)?|\B\.\d+(?:_\d+)*)(?:e[+-]?\d+(?:_\d+)*)?j?(?!\w)/i,operator:/[-+%=]=?|!=|:=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]/,punctuation:/[{}[\];(),.:]/},Prism.languages.python["string-interpolation"].inside.interpolation.inside.rest=Prism.languages.python,Prism.languages.py=Prism.languages.python; 5 | 
Prism.languages.r={comment:/#.*/,string:{pattern:/(['"])(?:\\.|(?!\1)[^\\\r\n])*\1/,greedy:!0},"percent-operator":{pattern:/%[^%\s]*%/,alias:"operator"},boolean:/\b(?:FALSE|TRUE)\b/,ellipsis:/\.\.(?:\.|\d+)/,number:[/\b(?:Inf|NaN)\b/,/(?:\b0x[\dA-Fa-f]+(?:\.\d*)?|\b\d+(?:\.\d*)?|\B\.\d+)(?:[EePp][+-]?\d+)?[iL]?/],keyword:/\b(?:NA|NA_character_|NA_complex_|NA_integer_|NA_real_|NULL|break|else|for|function|if|in|next|repeat|while)\b/,operator:/->?>?|<(?:=|=!]=?|::?|&&?|\|\|?|[+*\/^$@~]/,punctuation:/[(){}\[\],;]/}; 6 | !function(){if("undefined"!=typeof Prism&&"undefined"!=typeof document){Element.prototype.matches||(Element.prototype.matches=Element.prototype.msMatchesSelector||Element.prototype.webkitMatchesSelector);var l={js:"javascript",py:"python",rb:"ruby",ps1:"powershell",psm1:"powershell",sh:"bash",bat:"batch",h:"c",tex:"latex"},o="data-src-status",h="loading",g="loaded",u="pre[data-src]:not(["+o+'="'+g+'"]):not(['+o+'="'+h+'"])';Prism.hooks.add("before-highlightall",function(t){t.selector+=", "+u}),Prism.hooks.add("before-sanity-check",function(t){var r=t.element;if(r.matches(u)){t.code="",r.setAttribute(o,h);var s=r.appendChild(document.createElement("CODE"));s.textContent="Loading…";var e=r.getAttribute("data-src"),i=t.language;if("none"===i){var n=(/\.(\w+)$/.exec(e)||[,"none"])[1];i=l[n]||n}Prism.util.setLanguage(s,i),Prism.util.setLanguage(r,i);var a=Prism.plugins.autoloader;a&&a.loadLanguages(i),function(t,e,i){var n=new XMLHttpRequest;n.open("GET",t,!0),n.onreadystatechange=function(){4==n.readyState&&(n.status<400&&n.responseText?e(n.responseText):400<=n.status?i(function(t,e){return"✖ Error "+t+" while fetching file: "+e}(n.status,n.statusText)):i("✖ Error: File does not exist or is empty"))},n.send(null)}(e,function(t){r.setAttribute(o,g);var e=function(t){var e=/^\s*(\d+)\s*(?:(,)\s*(?:(\d+)\s*)?)?$/.exec(t||"");if(e){var i=Number(e[1]),n=e[2],a=e[3];return n?a?[i,Number(a)]:[i,void 0]:[i,i]}}(r.getAttribute("data-range"));if(e){var i=t.split(/\r\n?|\n/g),n=e[0],a=null==e[1]?i.length:e[1];n<0&&(n+=i.length),n=Math.max(0,Math.min(n-1,i.length)),a<0&&(a+=i.length),a=Math.max(0,Math.min(a,i.length)),t=i.slice(n,a).join("\n"),r.hasAttribute("data-start")||r.setAttribute("data-start",String(n+1))}s.textContent=t,Prism.highlightElement(s)},function(t){r.setAttribute(o,"failed"),s.textContent=t})}});var t=!(Prism.plugins.fileHighlight={highlight:function(t){for(var e,i=(t||document).querySelectorAll(u),n=0;e=i[n++];)Prism.highlightElement(e)}});Prism.fileHighlight=function(){t||(console.warn("Prism.fileHighlight is deprecated. Use `Prism.plugins.fileHighlight.highlight` instead."),t=!0),Prism.plugins.fileHighlight.highlight.apply(this,arguments)}}}(); 7 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('.')) 16 | sys.path.insert(0, os.path.abspath('..')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Dataiter' 22 | copyright = '2020–2024 Osmo Salomaa' 23 | author = 'Osmo Salomaa' 24 | 25 | # The full version, including alpha/beta/rc tags 26 | import dataiter 27 | release = dataiter.__version__ 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | master_doc = 'index' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'output'] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | 53 | import sphinx_rtd_theme 54 | html_theme = 'sphinx_rtd_theme' 55 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 56 | 57 | html_theme_options = { 58 | 'navigation_depth': 3, 59 | } 60 | 61 | html_context = { 62 | 'display_github': True, 63 | } 64 | 65 | rst_prolog = """ 66 | :github_url: https://github.com/otsaloma/dataiter 67 | """ 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = [ 73 | '_static', 74 | 'comparison/prism.css', 75 | 'comparison/prism.js', 76 | 'comparison/comparison.html', 77 | ] 78 | 79 | def setup(app): 80 | # Build comparison/comparison.html. Note that readthedocs.org doesn't 81 | # run the Makefile, so anything there doesn't help in production. 82 | # https://github.com/readthedocs/readthedocs.org/issues/2276#issuecomment-231899567 83 | import subprocess 84 | from pathlib import Path 85 | cwd = Path(__file__).parent.resolve() / 'comparison' 86 | subprocess.run([sys.executable, 'build.py'], cwd=cwd, check=True) 87 | -------------------------------------------------------------------------------- /doc/data-frame-column.rst: -------------------------------------------------------------------------------- 1 | dataiter.DataFrameColumn 2 | ======================== 3 | 4 | :meth:`~dataiter.DataFrameColumn.__init__` 5 | :attr:`~dataiter.DataFrameColumn.nrow` 6 | 7 | .. 
autoclass:: dataiter.DataFrameColumn 8 | :members: 9 | :special-members: __init__ 10 | -------------------------------------------------------------------------------- /doc/data-frame.rst: -------------------------------------------------------------------------------- 1 | dataiter.DataFrame 2 | ================== 3 | 4 | :meth:`~dataiter.DataFrame.__init__` 5 | :meth:`~dataiter.DataFrame.aggregate` 6 | :meth:`~dataiter.DataFrame.anti_join` 7 | :meth:`~dataiter.DataFrame.cbind` 8 | :attr:`~dataiter.DataFrame.colnames` 9 | :attr:`~dataiter.DataFrame.columns` 10 | :meth:`~dataiter.DataFrame.compare` 11 | :meth:`~dataiter.DataFrame.copy` 12 | :meth:`~dataiter.DataFrame.count` 13 | :meth:`~dataiter.DataFrame.deepcopy` 14 | :meth:`~dataiter.DataFrame.drop_na` 15 | :meth:`~dataiter.DataFrame.filter` 16 | :meth:`~dataiter.DataFrame.filter_out` 17 | :meth:`~dataiter.DataFrame.from_arrow` 18 | :meth:`~dataiter.DataFrame.from_json` 19 | :meth:`~dataiter.DataFrame.from_pandas` 20 | :meth:`~dataiter.DataFrame.full_join` 21 | :meth:`~dataiter.DataFrame.group_by` 22 | :meth:`~dataiter.DataFrame.head` 23 | :meth:`~dataiter.DataFrame.inner_join` 24 | :meth:`~dataiter.DataFrame.left_join` 25 | :meth:`~dataiter.DataFrame.map` 26 | :meth:`~dataiter.DataFrame.modify` 27 | :attr:`~dataiter.DataFrame.ncol` 28 | :attr:`~dataiter.DataFrame.nrow` 29 | :meth:`~dataiter.DataFrame.print_` 30 | :meth:`~dataiter.DataFrame.print_memory_use` 31 | :meth:`~dataiter.DataFrame.print_na_counts` 32 | :meth:`~dataiter.DataFrame.rbind` 33 | :meth:`~dataiter.DataFrame.read_csv` 34 | :meth:`~dataiter.DataFrame.read_json` 35 | :meth:`~dataiter.DataFrame.read_npz` 36 | :meth:`~dataiter.DataFrame.read_parquet` 37 | :meth:`~dataiter.DataFrame.read_pickle` 38 | :meth:`~dataiter.DataFrame.rename` 39 | :meth:`~dataiter.DataFrame.sample` 40 | :meth:`~dataiter.DataFrame.select` 41 | :meth:`~dataiter.DataFrame.semi_join` 42 | :meth:`~dataiter.DataFrame.slice` 43 | :meth:`~dataiter.DataFrame.slice_off` 44 | :meth:`~dataiter.DataFrame.sort` 45 | :meth:`~dataiter.DataFrame.split` 46 | :meth:`~dataiter.DataFrame.tail` 47 | :meth:`~dataiter.DataFrame.to_arrow` 48 | :meth:`~dataiter.DataFrame.to_json` 49 | :meth:`~dataiter.DataFrame.to_list_of_dicts` 50 | :meth:`~dataiter.DataFrame.to_pandas` 51 | :meth:`~dataiter.DataFrame.to_string` 52 | :meth:`~dataiter.DataFrame.unique` 53 | :meth:`~dataiter.DataFrame.unselect` 54 | :meth:`~dataiter.DataFrame.update` 55 | :meth:`~dataiter.DataFrame.write_csv` 56 | :meth:`~dataiter.DataFrame.write_json` 57 | :meth:`~dataiter.DataFrame.write_npz` 58 | :meth:`~dataiter.DataFrame.write_parquet` 59 | :meth:`~dataiter.DataFrame.write_pickle` 60 | 61 | .. autoclass:: dataiter.DataFrame 62 | :members: 63 | :special-members: __init__ 64 | -------------------------------------------------------------------------------- /doc/dataiter.rst: -------------------------------------------------------------------------------- 1 | dataiter 2 | ======== 3 | 4 | The following functions are shorthand helpers for use in conjunction 5 | with :meth:`.DataFrame.aggregate`, see the guide on :doc:`aggregation 6 | ` for details. 
7 | 8 | :func:`~dataiter.all` 9 | :func:`~dataiter.any` 10 | :func:`~dataiter.count` 11 | :func:`~dataiter.count_unique` 12 | :func:`~dataiter.first` 13 | :func:`~dataiter.last` 14 | :func:`~dataiter.max` 15 | :func:`~dataiter.mean` 16 | :func:`~dataiter.median` 17 | :func:`~dataiter.min` 18 | :func:`~dataiter.mode` 19 | :func:`~dataiter.nth` 20 | :func:`~dataiter.quantile` 21 | :func:`~dataiter.std` 22 | :func:`~dataiter.sum` 23 | :func:`~dataiter.var` 24 | 25 | The following read functions are convenience aliases to the corresponding 26 | methods of the classes generally most suitable for the particular file 27 | type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet, 28 | :class:`.GeoJSON` for GeoJSON and :class:`.ListOfDicts` for JSON. 29 | 30 | :func:`~dataiter.read_csv` 31 | :func:`~dataiter.read_geojson` 32 | :func:`~dataiter.read_json` 33 | :func:`~dataiter.read_npz` 34 | :func:`~dataiter.read_parquet` 35 | 36 | The following constants can be used to customize certain defaults, such as 37 | formatting and limits for printing. 38 | 39 | :attr:`dataiter.PRINT_MAX_WIDTH` 40 | :attr:`dataiter.PRINT_THOUSAND_SEPARATOR` 41 | :attr:`dataiter.PRINT_TRUNCATE_WIDTH` 42 | :attr:`dataiter.USE_NUMBA` 43 | :attr:`dataiter.USE_NUMBA_CACHE` 44 | 45 | .. automodule:: dataiter 46 | :members: all, 47 | any, 48 | count, 49 | count_unique, 50 | first, 51 | last, 52 | max, 53 | mean, 54 | median, 55 | min, 56 | mode, 57 | nth, 58 | quantile, 59 | read_csv, 60 | read_geojson, 61 | read_json, 62 | read_npz, 63 | read_parquet, 64 | std, 65 | sum, 66 | var, 67 | PRINT_MAX_WIDTH, 68 | PRINT_THOUSAND_SEPARATOR, 69 | PRINT_TRUNCATE_WIDTH, 70 | USE_NUMBA, 71 | USE_NUMBA_CACHE 72 | -------------------------------------------------------------------------------- /doc/dt.rst: -------------------------------------------------------------------------------- 1 | dataiter.dt 2 | =========== 3 | 4 | The ``dt`` module contains vectorized functions for dealing with dates and 5 | datetimes, similar to ``numpy.strings`` for strings. This is mostly a 6 | convenience wrapper around Python's standard library ``datetime`` module, not 7 | any efficient reimplementation. 8 | 9 | :func:`~dataiter.dt.day` 10 | :func:`~dataiter.dt.from_string` 11 | :func:`~dataiter.dt.hour` 12 | :func:`~dataiter.dt.isoweek` 13 | :func:`~dataiter.dt.isoweekday` 14 | :func:`~dataiter.dt.microsecond` 15 | :func:`~dataiter.dt.minute` 16 | :func:`~dataiter.dt.month` 17 | :func:`~dataiter.dt.new` 18 | :func:`~dataiter.dt.now` 19 | :func:`~dataiter.dt.quarter` 20 | :func:`~dataiter.dt.replace` 21 | :func:`~dataiter.dt.second` 22 | :func:`~dataiter.dt.to_string` 23 | :func:`~dataiter.dt.today` 24 | :func:`~dataiter.dt.weekday` 25 | :func:`~dataiter.dt.year` 26 | 27 | .. automodule:: dataiter.dt 28 | :members: 29 | -------------------------------------------------------------------------------- /doc/dtypes.rst: -------------------------------------------------------------------------------- 1 | dataiter.dtypes 2 | =============== 3 | 4 | Custom data types for vectors. 5 | 6 | .. automodule:: dataiter.dtypes 7 | :members: 8 | -------------------------------------------------------------------------------- /doc/geojson.rst: -------------------------------------------------------------------------------- 1 | dataiter.GeoJSON 2 | ================ 3 | 4 | :meth:`~dataiter.GeoJSON.__init__` 5 | :meth:`~dataiter.GeoJSON.read` 6 | :meth:`~dataiter.GeoJSON.to_data_frame` 7 | :meth:`~dataiter.GeoJSON.write` 8 | 9 | ..
autoclass:: dataiter.GeoJSON 10 | :members: read, to_data_frame, write 11 | :special-members: __init__ 12 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Dataiter Documentation 2 | ====================== 3 | 4 | Dataiter's :class:`.DataFrame` is a class for tabular data similar to R's 5 | ``data.frame``, implementing all common operations to manipulate data. It is 6 | under the hood a dictionary of NumPy arrays and thus capable of fast vectorized 7 | operations. You can consider it to be a light-weight alternative to Pandas with 8 | a simple and consistent API. Performance-wise Dataiter relies on NumPy and Numba 9 | and is likely to be at best comparable to Pandas. 10 | 11 | Additionally Dataiter includes :class:`.ListOfDicts`, a class for manipulating 12 | hierarchical data, such as from JSON APIs or document databases, and 13 | :class:`.GeoJSON`, a class for manipulating data from GeoJSON files in a data 14 | frame. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Tutorials 19 | 20 | quick-start 21 | comparison 22 | aggregation 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: API Documentation 27 | 28 | dataiter 29 | data-frame 30 | data-frame-column 31 | geojson 32 | list-of-dicts 33 | vector 34 | dt 35 | dtypes 36 | regex 37 | -------------------------------------------------------------------------------- /doc/list-of-dicts.rst: -------------------------------------------------------------------------------- 1 | dataiter.ListOfDicts 2 | ==================== 3 | 4 | :meth:`~dataiter.ListOfDicts.__init__` 5 | :meth:`~dataiter.ListOfDicts.aggregate` 6 | :meth:`~dataiter.ListOfDicts.anti_join` 7 | :meth:`~dataiter.ListOfDicts.append` 8 | :meth:`~dataiter.ListOfDicts.clear` 9 | :meth:`~dataiter.ListOfDicts.copy` 10 | :meth:`~dataiter.ListOfDicts.deepcopy` 11 | :meth:`~dataiter.ListOfDicts.drop_na` 12 | :meth:`~dataiter.ListOfDicts.extend` 13 | :meth:`~dataiter.ListOfDicts.fill_missing_keys` 14 | :meth:`~dataiter.ListOfDicts.filter` 15 | :meth:`~dataiter.ListOfDicts.filter_out` 16 | :meth:`~dataiter.ListOfDicts.from_json` 17 | :meth:`~dataiter.ListOfDicts.full_join` 18 | :meth:`~dataiter.ListOfDicts.group_by` 19 | :meth:`~dataiter.ListOfDicts.head` 20 | :meth:`~dataiter.ListOfDicts.inner_join` 21 | :meth:`~dataiter.ListOfDicts.insert` 22 | :meth:`~dataiter.ListOfDicts.keys` 23 | :meth:`~dataiter.ListOfDicts.left_join` 24 | :meth:`~dataiter.ListOfDicts.map` 25 | :meth:`~dataiter.ListOfDicts.modify` 26 | :meth:`~dataiter.ListOfDicts.modify_if` 27 | :meth:`~dataiter.ListOfDicts.pluck` 28 | :meth:`~dataiter.ListOfDicts.print_` 29 | :meth:`~dataiter.ListOfDicts.print_memory_use` 30 | :meth:`~dataiter.ListOfDicts.print_na_counts` 31 | :meth:`~dataiter.ListOfDicts.read_csv` 32 | :meth:`~dataiter.ListOfDicts.read_json` 33 | :meth:`~dataiter.ListOfDicts.read_pickle` 34 | :meth:`~dataiter.ListOfDicts.rename` 35 | :meth:`~dataiter.ListOfDicts.reverse` 36 | :meth:`~dataiter.ListOfDicts.sample` 37 | :meth:`~dataiter.ListOfDicts.select` 38 | :meth:`~dataiter.ListOfDicts.semi_join` 39 | :meth:`~dataiter.ListOfDicts.sort` 40 | :meth:`~dataiter.ListOfDicts.split` 41 | :meth:`~dataiter.ListOfDicts.tail` 42 | :meth:`~dataiter.ListOfDicts.to_data_frame` 43 | :meth:`~dataiter.ListOfDicts.to_json` 44 | :meth:`~dataiter.ListOfDicts.to_pandas` 45 | :meth:`~dataiter.ListOfDicts.to_string` 46 | :meth:`~dataiter.ListOfDicts.unique` 47 | 
:meth:`~dataiter.ListOfDicts.unselect` 48 | :meth:`~dataiter.ListOfDicts.write_csv` 49 | :meth:`~dataiter.ListOfDicts.write_json` 50 | :meth:`~dataiter.ListOfDicts.write_pickle` 51 | 52 | .. autoclass:: dataiter.ListOfDicts 53 | :members: 54 | :special-members: __init__ 55 | -------------------------------------------------------------------------------- /doc/output.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import subprocess 4 | 5 | from pathlib import Path 6 | 7 | CODE = """ 8 | import sys 9 | from pathlib import Path 10 | sys.path.insert(0, str(Path("."))) 11 | import dataiter as di 12 | import numpy as np 13 | from dataiter import dt 14 | from dataiter import regex 15 | di.PRINT_MAX_ITEMS = 3 16 | di.PRINT_MAX_ROWS = 10 17 | di.PRINT_MAX_WIDTH = 72 18 | """ 19 | 20 | def get_output(lines): 21 | try: 22 | return subprocess.check_output( 23 | args=["python3", "-c", "\n".join(lines)], 24 | stderr=subprocess.STDOUT, 25 | cwd=Path("..").resolve(), 26 | encoding="utf-8", 27 | errors="replace", 28 | universal_newlines=True, 29 | text=True, 30 | timeout=30, 31 | ).splitlines() 32 | except subprocess.CalledProcessError as e: 33 | return e.output.splitlines() 34 | 35 | def on_autodoc_process_docstring(app, what, name, obj, options, lines): 36 | print(f"Processing {name}...") 37 | # Intercept all ">>>" lines in docstring, run the corresponding code 38 | # and inject any possible output into the docstring. 39 | code = CODE.strip().splitlines() 40 | output = [] 41 | for i, line in enumerate(lines): 42 | if not line.startswith(">>>"): continue 43 | line = line.lstrip("> ") 44 | if line.startswith("#"): continue 45 | # Some docstrings will, on purpose, have lines of code that raise 46 | # errors. Wrap lines in try-except so that all lines will always be 47 | # executed and output from only the last line will be used. 48 | code.append(f"try: {line}\nexcept Exception: pass") 49 | if " = " in line: continue 50 | if line.startswith(("from ", "import ")): continue 51 | blob = get_output(code[:-1] + [f"print({line})"]) 52 | for j in range(len(blob)): 53 | # Avoid a paragraph change on blank lines. 54 | if not blob[j].strip(): 55 | blob[j] = "." 56 | output.append((i + 1, blob)) 57 | for i, blob in reversed(output): 58 | lines[i:i] = blob 59 | 60 | def setup(app): 61 | # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#event-autodoc-process-docstring 62 | app.connect("autodoc-process-docstring", on_autodoc_process_docstring) 63 | return { 64 | "version": "0.1", 65 | "parallel_read_safe": True, 66 | "parallel_write_safe": True, 67 | } 68 | -------------------------------------------------------------------------------- /doc/quick-start.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =========== 3 | 4 | DataFrame 5 | --------- 6 | 7 | >>> import dataiter as di 8 | >>> data = di.read_csv("data/listings.csv") 9 | >>> data.price_per_guest = data.price / data.guests 10 | >>> data.head() 11 | . 12 | id hood zipcode guests sqft price price_per_guest 13 | int64 >> data.filter(hood="Manhattan").filter(guests=2).sort(price=1).head() 27 | . 28 | id hood zipcode guests sqft price price_per_guest 29 | int64 >> import dataiter as di 47 | >>> data = di.read_geojson("data/neighbourhoods.geojson") 48 | >>> data.head() 49 | . 
50 | neighbourhood neighbourhood_group geometry 51 | 54 | 1 Allerton Bronx 55 | 2 City Island Bronx 56 | 3 Ditmars Steinway Queens 57 | 4 Ozone Park Queens 58 | 5 Fordham Bronx 59 | 6 Whitestone Queens 60 | 7 Arden Heights Staten Island 61 | 8 Arrochar Staten Island 62 | 9 Arverne Queens 63 | . 64 | 65 | ListOfDicts 66 | ----------- 67 | 68 | >>> import dataiter as di 69 | >>> data = di.read_json("data/listings.json") 70 | >>> data = data.modify(price_per_guest=lambda x: x.price / x.guests) 71 | >>> data.head() 72 | [ 73 | { 74 | "id": 2060, 75 | "hood": "Manhattan", 76 | "zipcode": "10040", 77 | "guests": 2, 78 | "sqft": null, 79 | "price": 100, 80 | "price_per_guest": 50.0 81 | }, 82 | { 83 | "id": 2595, 84 | "hood": "Manhattan", 85 | "zipcode": "10018", 86 | "guests": 2, 87 | "sqft": null, 88 | "price": 225, 89 | "price_per_guest": 112.5 90 | }, 91 | { 92 | "id": 3831, 93 | "hood": "Brooklyn", 94 | "zipcode": "11238", 95 | "guests": 3, 96 | "sqft": 500.0, 97 | "price": 89, 98 | "price_per_guest": 29.666666666666668 99 | } 100 | ] 101 | >>> data.filter(hood="Manhattan").filter(guests=2).sort(price=1).head() 102 | [ 103 | { 104 | "id": 42279170, 105 | "hood": "Manhattan", 106 | "zipcode": "10013", 107 | "guests": 2, 108 | "sqft": null, 109 | "price": 0, 110 | "price_per_guest": 0.0 111 | }, 112 | { 113 | "id": 42384530, 114 | "hood": "Manhattan", 115 | "zipcode": "10036", 116 | "guests": 2, 117 | "sqft": null, 118 | "price": 0, 119 | "price_per_guest": 0.0 120 | }, 121 | { 122 | "id": 18835820, 123 | "hood": "Manhattan", 124 | "zipcode": "10021", 125 | "guests": 2, 126 | "sqft": null, 127 | "price": 10, 128 | "price_per_guest": 5.0 129 | } 130 | ] 131 | -------------------------------------------------------------------------------- /doc/regex.rst: -------------------------------------------------------------------------------- 1 | dataiter.regex 2 | ============== 3 | 4 | The ``regex`` module contains vectorized versions of regular expression matching 5 | operations, similar to ``numpy.strings`` for string operations. This is a 6 | convenience wrapper around Python's standard library ``re`` module, not any 7 | efficient reimplementation. 8 | 9 | :func:`~dataiter.regex.findall` 10 | :func:`~dataiter.regex.fullmatch` 11 | :func:`~dataiter.regex.match` 12 | :func:`~dataiter.regex.search` 13 | :func:`~dataiter.regex.split` 14 | :func:`~dataiter.regex.sub` 15 | :func:`~dataiter.regex.subn` 16 | 17 | .. 
automodule:: dataiter.regex 18 | :members: 19 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | attd==1.0 2 | jinja2==3.1.3 3 | numpy==2.0.2 4 | pandas==2.2.3 5 | pyarrow==18.1.0 6 | sphinx==7.2.6 7 | sphinx-rtd-theme==2.0.0 8 | wcwidth==0.2.13 9 | -------------------------------------------------------------------------------- /doc/vector.rst: -------------------------------------------------------------------------------- 1 | dataiter.Vector 2 | =============== 3 | 4 | :meth:`~dataiter.Vector.__init__` 5 | :meth:`~dataiter.Vector.as_boolean` 6 | :meth:`~dataiter.Vector.as_bytes` 7 | :meth:`~dataiter.Vector.as_date` 8 | :meth:`~dataiter.Vector.as_datetime` 9 | :meth:`~dataiter.Vector.as_float` 10 | :meth:`~dataiter.Vector.as_integer` 11 | :meth:`~dataiter.Vector.as_object` 12 | :meth:`~dataiter.Vector.as_string` 13 | :meth:`~dataiter.Vector.concat` 14 | :meth:`~dataiter.Vector.drop_na` 15 | :attr:`~dataiter.Vector.dt` 16 | :attr:`~dataiter.Vector.dtype_label` 17 | :meth:`~dataiter.Vector.equal` 18 | :meth:`~dataiter.Vector.fast` 19 | :meth:`~dataiter.Vector.get_memory_use` 20 | :meth:`~dataiter.Vector.head` 21 | :meth:`~dataiter.Vector.is_boolean` 22 | :meth:`~dataiter.Vector.is_bytes` 23 | :meth:`~dataiter.Vector.is_datetime` 24 | :meth:`~dataiter.Vector.is_float` 25 | :meth:`~dataiter.Vector.is_integer` 26 | :meth:`~dataiter.Vector.is_na` 27 | :meth:`~dataiter.Vector.is_number` 28 | :meth:`~dataiter.Vector.is_object` 29 | :meth:`~dataiter.Vector.is_string` 30 | :meth:`~dataiter.Vector.is_timedelta` 31 | :attr:`~dataiter.Vector.length` 32 | :meth:`~dataiter.Vector.map` 33 | :attr:`~dataiter.Vector.na_dtype` 34 | :attr:`~dataiter.Vector.na_value` 35 | :meth:`~dataiter.Vector.range` 36 | :meth:`~dataiter.Vector.rank` 37 | :attr:`~dataiter.Vector.re` 38 | :meth:`~dataiter.Vector.replace_na` 39 | :meth:`~dataiter.Vector.sample` 40 | :meth:`~dataiter.Vector.sort` 41 | :attr:`~dataiter.Vector.str` 42 | :meth:`~dataiter.Vector.tail` 43 | :meth:`~dataiter.Vector.to_string` 44 | :meth:`~dataiter.Vector.tolist` 45 | :meth:`~dataiter.Vector.unique` 46 | 47 | .. 
autoclass:: dataiter.Vector 48 | :members: 49 | :special-members: __init__ 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["build==1.2.2.post1", "hatchling==1.21.1"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "dataiter" 7 | dynamic = ["version"] 8 | description = "Simple, light-weight data frames for Python" 9 | readme = "README.md" 10 | license = "MIT" 11 | requires-python = ">=3.9.0" 12 | authors = [{ name = "Osmo Salomaa", email = "otsaloma@iki.fi" }] 13 | dependencies = ["attd>=0.3", "numpy>=2.0,<3.0", "pyarrow>=2.0", "wcwidth>=0.1"] 14 | 15 | [project.urls] 16 | Homepage = "https://github.com/otsaloma/dataiter" 17 | 18 | [tool.hatch.version] 19 | path = "dataiter/__init__.py" 20 | 21 | [tool.hatch.build.targets.sdist] 22 | include = ["/dataiter"] 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attd==1.0 2 | click==8.1.7 3 | flake8==7.1.1 4 | jinja2==3.1.3 5 | numba==0.60.0 6 | numpy==2.0.2 7 | pandas==2.2.3 8 | pyarrow==18.1.0 9 | pytest==8.3.4 10 | sphinx==7.2.6 11 | sphinx-rtd-theme==2.0.0 12 | wcwidth==0.2.13 13 | -------------------------------------------------------------------------------- /tools/check-missing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | import inspect 5 | 6 | df = di.DataFrame() 7 | ld = di.ListOfDicts() 8 | 9 | base_df = {} 10 | base_ld = [] 11 | 12 | print("") 13 | print("Methods missing from DataFrame:") 14 | for name in sorted(dir(ld)): 15 | if name in dir(df): continue 16 | if name.startswith("_"): continue 17 | if name in dir(base_ld) and name not in dir(base_df): continue 18 | if not inspect.ismethod(getattr(ld, name)): continue 19 | print(f"... {name}") 20 | 21 | print("") 22 | print("Methods missing from ListOfDicts:") 23 | for name in sorted(dir(df)): 24 | if name in dir(ld): continue 25 | if name.startswith("_"): continue 26 | if name in dir(base_df) and name not in dir(base_ld): continue 27 | if not inspect.ismethod(getattr(df, name)): continue 28 | print(f"... {name}") 29 | -------------------------------------------------------------------------------- /tools/release: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Commit changes, tag and push release to GitHub. 3 | cd "$(dirname "$0")/.." 
|| exit 1 4 | VERSION="$(python3 -c "import dataiter; print(dataiter.__version__)")" 5 | echo "Git status:" 6 | git status --porcelain 7 | printf "\nRelease version: $VERSION\n" 8 | read -p "Press Enter to continue or Ctrl+C to abort: " 9 | git commit -a -m "RELEASE $VERSION" 10 | git tag -s -m "RELEASE $VERSION" $VERSION 11 | git push 12 | git push --tags 13 | egrep -B 999 -m2 "^===+" NEWS.md \ 14 | | head -n-3 \ 15 | | tail -n+4 \ 16 | | sed ':a;N;$!ba;s/\n / /g' \ 17 | | gh release create \ 18 | --notes-file - \ 19 | --title $VERSION \ 20 | $VERSION 21 | -------------------------------------------------------------------------------- /validation/generate-df.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | sys.path.insert(0, "..") 5 | 6 | import dataiter as di 7 | 8 | print(f"USE_NUMBA: {di.USE_NUMBA}") 9 | 10 | def read_csv(path): 11 | data = di.read_csv(path) 12 | for name in data.colnames: 13 | # Drop all rows with NAs to avoid upcasting to float 14 | # and differing NA representation in output. 15 | data = data.filter_out(data[name].is_na()) 16 | if data[name].is_string(): 17 | # Use all lower case for strings to avoid differing 18 | # sorting of lower vs. upper case characters. 19 | data[name] = data[name].str.lower() 20 | return data 21 | 22 | # AGGREGATE 23 | (read_csv("../data/vehicles.csv") 24 | .modify(fuel_regular=lambda x: x.fuel == "regular") 25 | .group_by("make", "model") 26 | .aggregate( 27 | all_fuel_regular=di.all("fuel_regular"), 28 | any_fuel_regular=di.any("fuel_regular"), 29 | count=di.count(), 30 | count_unique_cyl=di.count_unique("cyl"), 31 | first_hwy=di.first("hwy"), 32 | last_hwy=di.last("hwy"), 33 | max_hwy=di.max("hwy"), 34 | mean_hwy=di.mean("hwy"), 35 | median_hwy=di.median("hwy"), 36 | min_hwy=di.min("hwy"), 37 | mode_year=di.mode("year"), 38 | nth_id=di.nth("id", 0), 39 | quantile_hwy=di.quantile("hwy", 0.75), 40 | std_hwy=di.std("hwy", ddof=1), 41 | sum_hwy=di.sum("hwy"), 42 | var_hwy=di.var("hwy", ddof=1)) 43 | .modify(mean_hwy=lambda x: x.mean_hwy.round(2)) 44 | .modify(std_hwy =lambda x: x.std_hwy.round(2)) 45 | .modify(var_hwy =lambda x: x.var_hwy.round(2)) 46 | .write_csv("aggregate.df.csv")) 47 | 48 | # ANTI JOIN 49 | reviews = read_csv("../data/listings-reviews.csv") 50 | (read_csv("../data/listings.csv") 51 | .anti_join(reviews, "id") 52 | .write_csv("anti_join.df.csv")) 53 | 54 | # FILTER 55 | (read_csv("../data/vehicles.csv") 56 | .filter(lambda x: x.year < 2000) 57 | .filter(lambda x: x.cyl < 10) 58 | .write_csv("filter.df.csv")) 59 | 60 | # FILTER OUT 61 | (read_csv("../data/vehicles.csv") 62 | .filter_out(lambda x: x.year < 2000) 63 | .filter_out(lambda x: x.cyl < 10) 64 | .write_csv("filter_out.df.csv")) 65 | 66 | # FULL JOIN 67 | reviews = read_csv("../data/listings-reviews.csv") 68 | reviews = reviews.rbind(reviews) 69 | (read_csv("../data/listings.csv") 70 | .full_join(reviews, "id") 71 | .write_csv("full_join.df.csv")) 72 | 73 | # INNER JOIN 74 | reviews = read_csv("../data/listings-reviews.csv") 75 | (read_csv("../data/listings.csv") 76 | .inner_join(reviews, "id") 77 | .write_csv("inner_join.df.csv")) 78 | 79 | # LEFT JOIN 80 | reviews = read_csv("../data/listings-reviews.csv") 81 | (read_csv("../data/listings.csv") 82 | .left_join(reviews, "id") 83 | .write_csv("left_join.df.csv")) 84 | 85 | # SEMI JOIN 86 | reviews = read_csv("../data/listings-reviews.csv") 87 | (read_csv("../data/listings.csv") 88 | .semi_join(reviews, "id") 89 | 
.write_csv("semi_join.df.csv")) 90 | 91 | # SORT 92 | (read_csv("../data/vehicles.csv") 93 | .sort(make=1, model=1, year=-1) 94 | .write_csv("sort.df.csv")) 95 | 96 | # UNIQUE 97 | (read_csv("../data/vehicles.csv") 98 | .unique("make", "model", "year") 99 | .write_csv("unique.df.csv")) 100 | -------------------------------------------------------------------------------- /validation/generate-ld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | sys.path.insert(0, "..") 5 | 6 | import dataiter as di 7 | import statistics 8 | 9 | from statistics import mean 10 | from statistics import median 11 | from statistics import mode 12 | 13 | def read_json(path): 14 | data = di.read_json(path) 15 | for name in list(data[0].keys()): 16 | # Drop all rows with NAs to avoid upcasting to float 17 | # and differing NA representation in output. 18 | data = data.filter_out(lambda x: x[name] is None) 19 | for item in data: 20 | if isinstance(item[name], str): 21 | # Use all lower case for strings to avoid differing 22 | # sorting of lower vs. upper case characters. 23 | item[name] = item[name].lower() 24 | return data 25 | 26 | round2 = lambda x: round(x, 2) if x is not None else None 27 | stdev = lambda x: statistics.stdev(x) if len(x) > 1 else None 28 | variance = lambda x: statistics.variance(x) if len(x) > 1 else None 29 | 30 | # AGGREGATE 31 | (read_json("../data/vehicles.json") 32 | .modify(fuel_regular=lambda x: x.fuel == "regular") 33 | .group_by("make", "model") 34 | .aggregate( 35 | all_fuel_regular=lambda x: all(x.pluck("fuel_regular")), 36 | any_fuel_regular=lambda x: any(x.pluck("fuel_regular")), 37 | count=len, 38 | count_unique_cyl=lambda x: len(set(x.pluck("cyl"))), 39 | first_hwy=lambda x: x[0].hwy, 40 | last_hwy=lambda x: x[-1].hwy, 41 | max_hwy=lambda x: max(x.pluck("hwy")), 42 | mean_hwy=lambda x: mean(x.pluck("hwy")), 43 | median_hwy=lambda x: median(x.pluck("hwy")), 44 | min_hwy=lambda x: min(x.pluck("hwy")), 45 | mode_year=lambda x: mode(x.pluck("year")), 46 | nth_id=lambda x: x[0].id, 47 | quantile_hwy=lambda x: di.quantile(di.Vector(x.pluck("hwy")), 0.75), 48 | std_hwy=lambda x: stdev(x.pluck("hwy")), 49 | sum_hwy=lambda x: sum(x.pluck("hwy")), 50 | var_hwy=lambda x: variance(x.pluck("hwy"))) 51 | .modify(mean_hwy=lambda x: round2(x.mean_hwy)) 52 | .modify(std_hwy =lambda x: round2(x.std_hwy)) 53 | .modify(var_hwy =lambda x: round2(x.var_hwy)) 54 | .write_csv("aggregate.ld.csv")) 55 | 56 | # ANTI JOIN 57 | reviews = read_json("../data/listings-reviews.json") 58 | (read_json("../data/listings.json") 59 | .anti_join(reviews, "id") 60 | .write_csv("anti_join.ld.csv")) 61 | 62 | # FILTER 63 | (read_json("../data/vehicles.json") 64 | .filter(lambda x: x.year < 2000) 65 | .filter(lambda x: x.cyl < 10) 66 | .write_csv("filter.ld.csv")) 67 | 68 | # FILTER OUT 69 | (read_json("../data/vehicles.json") 70 | .filter_out(lambda x: x.year < 2000) 71 | .filter_out(lambda x: x.cyl < 10) 72 | .write_csv("filter_out.ld.csv")) 73 | 74 | # FULL JOIN 75 | reviews = read_json("../data/listings-reviews.json") 76 | reviews = reviews + reviews 77 | (read_json("../data/listings.json") 78 | .full_join(reviews, "id") 79 | .write_csv("full_join.ld.csv")) 80 | 81 | # INNER JOIN 82 | reviews = read_json("../data/listings-reviews.json") 83 | (read_json("../data/listings.json") 84 | .inner_join(reviews, "id") 85 | .write_csv("inner_join.ld.csv")) 86 | 87 | # LEFT JOIN 88 | reviews = read_json("../data/listings-reviews.json") 89 | 
(read_json("../data/listings.json") 90 | .left_join(reviews, "id") 91 | .write_csv("left_join.ld.csv")) 92 | 93 | # SEMI JOIN 94 | reviews = read_json("../data/listings-reviews.json") 95 | (read_json("../data/listings.json") 96 | .semi_join(reviews, "id") 97 | .write_csv("semi_join.ld.csv")) 98 | 99 | # SORT 100 | (read_json("../data/vehicles.json") 101 | .sort(make=1, model=1, year=-1) 102 | .write_csv("sort.ld.csv")) 103 | 104 | # UNIQUE 105 | (read_json("../data/vehicles.json") 106 | .unique("make", "model", "year") 107 | .write_csv("unique.ld.csv")) 108 | -------------------------------------------------------------------------------- /validation/generate.R: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | suppressPackageStartupMessages({ 4 | library(dplyr) 5 | library(readr) 6 | }) 7 | 8 | options(dplyr.summarise.inform=FALSE) 9 | 10 | Mode = function(x) { 11 | # https://stackoverflow.com/q/2547402 12 | ux = unique(x) 13 | return(ux[which.max(tabulate(match(x, ux)))]) 14 | } 15 | 16 | read_csv = function(path) { 17 | data = readr::read_csv(path, show_col_types=FALSE, lazy=FALSE) 18 | for (name in colnames(data)) { 19 | # Drop all rows with NAs to avoid upcasting to float 20 | # and differing NA representation in output. 21 | data = data[!is.na(data[[name]]),] 22 | if (is.character(data[[name]])) 23 | # Use all lower case for strings to avoid differing 24 | # sorting of lower vs. upper case characters. 25 | data[[name]] = tolower(data[[name]]) 26 | } 27 | return(data) 28 | } 29 | 30 | write_csv = function(data, path) { 31 | readr::write_csv(data, path, na="") 32 | } 33 | 34 | # AGGREGATE 35 | read_csv("../data/vehicles.csv") |> 36 | mutate(fuel_regular=(fuel == "regular")) |> 37 | group_by(make, model) |> 38 | summarise( 39 | all_fuel_regular=all(fuel_regular), 40 | any_fuel_regular=any(fuel_regular), 41 | count=n(), 42 | count_unique_cyl=n_distinct(cyl), 43 | first_hwy=first(hwy), 44 | last_hwy=last(hwy), 45 | max_hwy=max(hwy), 46 | mean_hwy=mean(hwy), 47 | median_hwy=median(hwy), 48 | min_hwy=min(hwy), 49 | mode_year=Mode(year), 50 | nth_id=nth(id, 1), 51 | quantile_hwy=quantile(hwy, 0.75, type=7), 52 | std_hwy=sd(hwy), 53 | sum_hwy=sum(hwy), 54 | var_hwy=var(hwy)) |> 55 | mutate(mean_hwy=round(mean_hwy, 2)) |> 56 | mutate(std_hwy=round(std_hwy, 2)) |> 57 | mutate(var_hwy=round(var_hwy, 2)) |> 58 | write_csv("aggregate.R.csv") 59 | 60 | # ANTI JOIN 61 | reviews = read_csv("../data/listings-reviews.csv") 62 | read_csv("../data/listings.csv") |> 63 | anti_join(reviews, by="id") |> 64 | write_csv("anti_join.R.csv") 65 | 66 | # FILTER 67 | read_csv("../data/vehicles.csv") |> 68 | filter(year < 2000) |> 69 | filter(cyl < 10) |> 70 | write_csv("filter.R.csv") 71 | 72 | # FILTER OUT 73 | read_csv("../data/vehicles.csv") |> 74 | filter(!(year < 2000)) |> 75 | filter(!(cyl < 10)) |> 76 | write_csv("filter_out.R.csv") 77 | 78 | # FULL JOIN 79 | reviews = read_csv("../data/listings-reviews.csv") 80 | reviews = bind_rows(reviews, reviews) 81 | read_csv("../data/listings.csv") |> 82 | full_join(reviews, by="id") |> 83 | write_csv("full_join.R.csv") 84 | 85 | # INNER JOIN 86 | reviews = read_csv("../data/listings-reviews.csv") 87 | read_csv("../data/listings.csv") |> 88 | inner_join(reviews, by="id") |> 89 | write_csv("inner_join.R.csv") 90 | 91 | # LEFT JOIN 92 | reviews = read_csv("../data/listings-reviews.csv") 93 | read_csv("../data/listings.csv") |> 94 | left_join(reviews, by="id") |> 95 | write_csv("left_join.R.csv") 96 
| 97 | # SEMI JOIN 98 | reviews = read_csv("../data/listings-reviews.csv") 99 | read_csv("../data/listings.csv") |> 100 | semi_join(reviews, by="id") |> 101 | write_csv("semi_join.R.csv") 102 | 103 | # SORT 104 | read_csv("../data/vehicles.csv") |> 105 | arrange(make, model, desc(year)) |> 106 | write_csv("sort.R.csv") 107 | 108 | # UNIQUE 109 | read_csv("../data/vehicles.csv") |> 110 | distinct(make, model, year, .keep_all=TRUE) |> 111 | write_csv("unique.R.csv") 112 | -------------------------------------------------------------------------------- /validation/validate-df.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -f *.df.csv 3 | rm -f *.R.csv 4 | echo "Generating data..." 5 | python3 generate-df.py 6 | Rscript generate.R 7 | # Remove quotes around strings. 8 | sed -ri 's/"//g' *.csv 9 | # Remove trailing zero decimals. 10 | sed -ri "s/\.0*(,|$)/\1/g" *.csv 11 | # Unify spelling of special values. 12 | sed -ri "s/true/TRUE/gi" *.csv 13 | sed -ri "s/false/FALSE/gi" *.csv 14 | EXIT_STATUS=0 15 | for NUM in $(ls *.df.csv | cut -d. -f1); do 16 | printf "%-23s" "Checking $NUM... " 17 | NLINES=$(diff -y --suppress-common-lines $NUM.df.csv $NUM.R.csv | wc -l) 18 | if [ $NLINES -gt 0 ]; then 19 | echo "$NLINES lines differ" 20 | EXIT_STATUS=1 21 | else 22 | echo "OK" 23 | fi 24 | done 25 | exit $EXIT_STATUS 26 | -------------------------------------------------------------------------------- /validation/validate-ld.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -f *.ld.csv 3 | rm -f *.R.csv 4 | echo "Generating data..." 5 | python3 generate-ld.py 6 | Rscript generate.R 7 | # Remove quotes around strings. 8 | sed -ri 's/"//g' *.csv 9 | # Remove trailing zero decimals. 10 | sed -ri "s/\.0*(,|$)/\1/g" *.csv 11 | # Unify spelling of special values. 12 | sed -ri "s/true/TRUE/gi" *.csv 13 | sed -ri "s/false/FALSE/gi" *.csv 14 | EXIT_STATUS=0 15 | for NUM in $(ls *.ld.csv | cut -d. -f1); do 16 | printf "%-23s" "Checking $NUM... " 17 | NLINES=$(diff -y --suppress-common-lines $NUM.ld.csv $NUM.R.csv | wc -l) 18 | if [ $NLINES -gt 0 ]; then 19 | echo "$NLINES lines differ" 20 | EXIT_STATUS=1 21 | else 22 | echo "OK" 23 | fi 24 | done 25 | exit $EXIT_STATUS 26 | --------------------------------------------------------------------------------