├── .flake8 ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS.md ├── COPYING ├── Makefile ├── NEWS.md ├── README.md ├── benchmark-versions.sh ├── benchmark.py ├── bin ├── di-csv2json ├── di-format-geojson ├── di-geojson2csv ├── di-json2csv └── di-open ├── data ├── README.md ├── downloads.csv ├── downloads.json ├── holidays.csv ├── holidays.json ├── listings-reviews.csv ├── listings-reviews.json ├── listings.csv ├── listings.json ├── listings.py ├── neighbourhoods.geojson ├── vehicles.csv └── vehicles.json ├── dataiter ├── __init__.py ├── aggregate.py ├── data_frame.py ├── deco.py ├── dt.py ├── dtypes.py ├── geojson.py ├── io.py ├── list_of_dicts.py ├── regex.py ├── test │ ├── __init__.py │ ├── test_aggregate.py │ ├── test_data_frame.py │ ├── test_dt.py │ ├── test_geojson.py │ ├── test_io.py │ ├── test_list_of_dicts.py │ ├── test_regex.py │ ├── test_util.py │ └── test_vector.py ├── util.py └── vector.py ├── doc ├── Makefile ├── aggregation.rst ├── check.py ├── comparison.rst ├── comparison │ ├── Makefile │ ├── README.md │ ├── blocks │ │ ├── .flake8 │ │ ├── aggregate-dataiter.py │ │ ├── aggregate-dplyr.R │ │ ├── aggregate-pandas.py │ │ ├── cbind-dataiter.py │ │ ├── cbind-dplyr.R │ │ ├── cbind-pandas.py │ │ ├── chain-dataiter.py │ │ ├── chain-dplyr.R │ │ ├── chain-pandas.py │ │ ├── colnames-dataiter.py │ │ ├── colnames-dplyr.R │ │ ├── colnames-pandas.py │ │ ├── filter-dataiter.py │ │ ├── filter-dplyr.R │ │ ├── filter-pandas.py │ │ ├── grouped-modify-dataiter.py │ │ ├── grouped-modify-dplyr.R │ │ ├── grouped-modify-pandas.py │ │ ├── head-dataiter.py │ │ ├── head-dplyr.R │ │ ├── head-pandas.py │ │ ├── import-dataiter.py │ │ ├── import-dplyr.R │ │ ├── import-pandas.py │ │ ├── index-dataiter.py │ │ ├── index-dplyr.R │ │ ├── index-pandas.py │ │ ├── io-binary-dataiter.py │ │ ├── io-binary-dplyr.R │ │ ├── io-binary-pandas.py │ │ ├── io-csv-dataiter.py │ │ ├── io-csv-dplyr.R │ │ ├── io-csv-pandas.py │ │ ├── join-dataiter.py │ │ ├── join-dplyr.R │ │ ├── join-pandas.py │ │ ├── modify-dataiter.py │ │ ├── modify-dplyr.R │ │ ├── modify-pandas.py │ │ ├── non-join-dataiter.py │ │ ├── non-join-dplyr.R │ │ ├── non-join-pandas.py │ │ ├── rbind-dataiter.py │ │ ├── rbind-dplyr.R │ │ ├── rbind-pandas.py │ │ ├── rename-dataiter.py │ │ ├── rename-dplyr.R │ │ ├── rename-pandas.py │ │ ├── select-dataiter.py │ │ ├── select-dplyr.R │ │ ├── select-pandas.py │ │ ├── size-dataiter.py │ │ ├── size-dplyr.R │ │ ├── size-pandas.py │ │ ├── sort-dataiter.py │ │ ├── sort-dplyr.R │ │ ├── sort-pandas.py │ │ ├── unique-dataiter.py │ │ ├── unique-dplyr.R │ │ └── unique-pandas.py │ ├── build.py │ ├── generate.sh │ ├── index.html │ ├── prism.css │ └── prism.js ├── conf.py ├── data-frame-column.rst ├── data-frame.rst ├── dataiter.rst ├── dt.rst ├── dtypes.rst ├── geojson.rst ├── index.rst ├── list-of-dicts.rst ├── output.py ├── quick-start.rst ├── regex.rst ├── requirements.txt └── vector.rst ├── pyproject.toml ├── requirements.txt ├── tools ├── check-missing.py └── release └── validation ├── generate-df.py ├── generate-ld.py ├── generate.R ├── validate-df.sh └── validate-ld.sh /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = E1,E9,F 3 | ignore = E125,E129 4 | exclude = doc/comparison/blocks,venv 5 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: [push] 3 | jobs: 4 
| test: 5 | runs-on: ubuntu-latest 6 | strategy: 7 | matrix: 8 | python-version: ["3.9", "3.10", "3.11", "3.12"] 9 | steps: 10 | - uses: actions/checkout@v4 11 | - uses: actions/setup-python@v5 12 | with: 13 | python-version: ${{ matrix.python-version }} 14 | - run: pip install -U attd flake8 'numpy>=2.0,<3.0' pandas pyarrow pytest wcwidth 15 | - run: make check 16 | - run: make test 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.lprof 2 | *.prof 3 | .env 4 | .envrc 5 | .pytest_cache 6 | __pycache__ 7 | benchmark-head.py 8 | benchmark-versions.csv 9 | benchmark-versions.ods 10 | build 11 | dataiter.egg-info 12 | dist 13 | doc/_build 14 | doc/comparison/comparison.html 15 | test.py 16 | tmp.csv 17 | validation/*.csv 18 | venv 19 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # https://docs.readthedocs.io/en/stable/config-file/v2.html 2 | version: 2 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | sphinx: 8 | configuration: doc/conf.py 9 | python: 10 | install: 11 | - requirements: doc/requirements.txt 12 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Osmo Salomaa 2 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019-2025 Osmo Salomaa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | # EDITOR must wait! 4 | EDITOR = nano 5 | PREFIX = /usr/local 6 | PYTHON = python3 7 | 8 | check: 9 | flake8 . 10 | flake8 . 
`grep -Fl '#!/usr/bin/env python3' bin/*` 11 | 12 | clean: 13 | rm -rf *.lprof 14 | rm -rf *.prof 15 | rm -rf build 16 | rm -rf dataiter.egg-info 17 | rm -rf dist 18 | rm -rf doc/_build 19 | rm -rf doc/comparison/comparison.html 20 | rm -rf validation/*.csv 21 | rm -rf __pycache__ 22 | rm -rf */__pycache__ 23 | rm -rf */*/__pycache__ 24 | rm -rf .pytest_cache 25 | rm -rf */.pytest_cache 26 | rm -rf */*/.pytest_cache 27 | 28 | doc: 29 | $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build -C doc clean html 30 | 31 | doc-check: 32 | PYTHONPATH=. doc/check.py 33 | 34 | doc-open: 35 | xdg-open doc/_build/html/index.html 36 | 37 | doc-watch: 38 | watchexec -e py,rst --workdir doc $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build html 39 | 40 | install: 41 | pip3 install --break-system-packages . 42 | 43 | # Non-essential scripts, not installed by default. 44 | # Note that these don't go through setuptools rewriting, 45 | # instead they just do a plain unspecified dataiter import. 46 | install-cli: 47 | mkdir -p $(PREFIX)/bin 48 | for X in `ls bin | grep di-`; do \ 49 | cp -fv bin/$$X $(PREFIX)/bin && \ 50 | chmod +x $(PREFIX)/bin/$$X; \ 51 | done 52 | 53 | # Interactive! 54 | publish: 55 | $(MAKE) clean 56 | python3 -m build 57 | test -s dist/dataiter-*-py3-none-any.whl 58 | test -s dist/dataiter-*.tar.gz 59 | ls -l dist 60 | @printf "Press Enter to upload or Ctrl+C to abort: "; read _ 61 | twine upload dist/* 62 | sudo pip3 uninstall --break-system-packages -y dataiter || true 63 | sudo pip3 uninstall --break-system-packages -y dataiter || true 64 | sudo pip3 install --break-system-packages -U dataiter 65 | $(MAKE) test-installed 66 | 67 | # Interactive! 68 | release: 69 | $(MAKE) check doc-check test validate clean 70 | @echo "BUMP VERSION NUMBERS" 71 | $(EDITOR) bin/di-open 72 | $(EDITOR) dataiter/__init__.py 73 | $(EDITOR) benchmark-versions.sh 74 | @echo "ADD RELEASE NOTES" 75 | $(EDITOR) NEWS.md 76 | sudo $(MAKE) install clean 77 | $(MAKE) test-installed 78 | tools/release 79 | 80 | test: 81 | py.test . 82 | 83 | test-installed: 84 | cd && python3 -c "import dataiter; dataiter.DataFrame()" 85 | cd && python3 -c "import dataiter; dataiter.ListOfDicts()" 86 | 87 | validate: 88 | cd validation && DATAITER_USE_NUMBA=false ./validate-df.sh 89 | cd validation && DATAITER_USE_NUMBA=true ./validate-df.sh 90 | cd validation && ./validate-ld.sh 91 | 92 | venv: 93 | rm -rf venv 94 | $(PYTHON) -m venv venv 95 | . venv/bin/activate && \ 96 | pip install -U pip setuptools wheel && \ 97 | pip install -r requirements.txt 98 | 99 | .PHONY: check clean doc doc-check doc-open doc-watch install install-cli publish release test test-installed validate venv 100 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2025-02-07: Dataiter 1.0 2 | ======================== 3 | 4 | * Silence warnings about writing NPZ files with StringDType: 5 | "UserWarning: Custom dtypes are saved as python objects using the 6 | pickle protocol. Loading this file requires allow_pickle=True to be 7 | set." 8 | 9 | Dataiter can now be considered stable. If upgrading from <= 0.51, 10 | please read the release notes for 0.99–0.9999. 
11 | 12 | 2025-01-12: Dataiter 0.9999 13 | =========================== 14 | 15 | * New module `dataiter.regex` for vectorized regular expressions 16 | * Add proxy object `Vector.dt` for `dataiter.dt` 17 | * Add proxy object `Vector.re` for `dataiter.regex` 18 | * Add proxy object `Vector.str` for `numpy.strings` 19 | * Use PyArrow instead of Pandas to read and write CSV files 20 | * Replace Pandas dependency with PyArrow 21 | 22 | This is likely to be a breaking change for some rare, weirdly formatted 23 | CSV files that Pandas and PyArrow might parse differently, resulting in 24 | something like differently guessed data types or differently detected 25 | missing value markers. The note about stability below release 0.99 still 26 | applies. 27 | 28 | 2024-12-15: Dataiter 0.999 29 | ========================== 30 | 31 | * `DataFrame.from_arrow`: Remove `strings_as_object` argument 32 | * `DataFrame.from_pandas`: Remove `strings_as_object` argument 33 | * `DataFrame.read_csv`: Remove `strings_as_object` argument 34 | * `DataFrame.read_parquet`: Remove `strings_as_object` argument 35 | * `GeoJSON.read`: Remove `strings_as_object` argument 36 | * `ListOfDicts.to_data_frame`: Remove `strings_as_object` argument 37 | * `read_csv`: Remove `strings_as_object` argument 38 | * `read_geojson`: Remove `strings_as_object` argument 39 | * `read_parquet`: Remove `strings_as_object` argument 40 | * `Vector.as_string`: Remove `length` argument 41 | * `Vector.is_na`: Fix to work in multidimensional cases where the 42 | elements of an object vector are arrays/vectors 43 | * `Vector.rank`: Change default `method` to "min" 44 | * `Vector.rank`: Remove `method` "average" 45 | 46 | This is a breaking change to switch the string data type from the 47 | fixed-width `str_` a.k.a. `<U` to the variable-width `StringDType` introduced in NumPy 2.0. Note also that, because of the NumPy >= 2.0 requirement, any NPZ or Pickle 53 | files saved cannot be opened using Dataiter < 0.99 and NumPy < 2.0. If 54 | you need that kind of interoperability, consider using the Parquet file 55 | format (see the sketch after the 0.99 entry below). 56 | 57 | 2024-08-17: Dataiter 0.99 58 | ========================= 59 | 60 | * Adapt to changes in NumPy 2.0 61 | * Bump NumPy dependency to >= 2.0 62 | 63 | This is a minimal change to be NumPy 2.0 compatible. In the 0.99+ 64 | releases, we plan to adopt the new NumPy string dtype and fix any 65 | regressions that come up, leading to a 1.0 release when everything looks 66 | to be working reliably (#26). Anyone looking for extreme stability 67 | should consider avoiding the 0.99+ releases and waiting for 1.0.
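The 0.999 entry above points to the Parquet file format when saved files need to stay readable across Dataiter and NumPy versions. Below is a minimal, illustrative sketch of such a round trip using the `write_parquet` method (added in 0.42) and the `read_parquet` function; the column values and the file name `example.parquet` are made up for this example and are not part of the source.

```python
# Illustrative sketch only: round-trip a small data frame through Parquet,
# the format suggested above for cross-version interoperability.
import dataiter as di

data = di.DataFrame(x=[1, 2, 3], y=["a", "b", "c"])
data.write_parquet("example.parquet")  # hypothetical file name
data = di.read_parquet("example.parquet")
print(data)
```

Unlike NPZ or Pickle, a Parquet file is not tied to NumPy's own serialization, which is why the note above recommends it when older Dataiter/NumPy combinations still need to read the data.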
68 | 69 | 2024-06-24: Dataiter 0.51 70 | ========================= 71 | 72 | * Mark NumPy dependency as < 2.0 73 | 74 | 2024-04-06: Dataiter 0.50 75 | ========================= 76 | 77 | * `ListOfDicts.drop_na`: New method 78 | * `ListOfDicts.keys`: New method 79 | * `ListOfDicts.print_memory_use`: New method 80 | * Fix tabular display of Unicode characters with width != 1 81 | * Add dependency on wcwidth: https://pypi.org/project/wcwidth 82 | 83 | 2023-11-08: Dataiter 0.49 84 | ========================= 85 | 86 | * `dt`: Handle all NaT input 87 | * Migrate from `setup.py` to `hatch` and `pyproject.toml` 88 | 89 | 2023-10-08: Dataiter 0.48 90 | ========================= 91 | 92 | * `Vector.as_datetime`: Add `precision` argument 93 | * `Vector.concat`: New method 94 | * `Vector.sort`: Fix sorting object vectors 95 | 96 | 2023-09-09: Dataiter 0.47 97 | ========================= 98 | 99 | * `DataFrame`: Fix column and method name clash errors in certain operations 100 | * `dt.replace`: Allow vector arguments the same length as `x` 101 | 102 | 2023-09-05: Dataiter 0.46 103 | ========================= 104 | 105 | * `DataFrame.count`: New method, shorthand for 106 | `data.group_by(...).aggregate(n=di.count())` 107 | * `Vector.rank`: Handle empty and all-NA vectors 108 | 109 | 2023-06-14: Dataiter 0.45 110 | ========================= 111 | 112 | * `USE_NUMBA_CACHE`: New option, read from environment variable 113 | `DATAITER_USE_NUMBA_CACHE` if exists, defauls to `True` 114 | * Fix a possible issue with Numba caching 115 | 116 | 2023-06-13: Dataiter 0.44 117 | ========================= 118 | 119 | * Use `numba.extending.overload` instead of the deprecated 120 | `numba.generated_jit` 121 | 122 | 2023-06-08: Dataiter 0.43 123 | ========================= 124 | 125 | * `DataFrame`: Don't try to do joins on NA values in `by` columns 126 | * `DataFrame.drop_na`: New method 127 | 128 | 2023-05-30: Dataiter 0.42 129 | ========================= 130 | 131 | * `DataFrame`: Truncate multiline strings when printing 132 | * `DataFrame.from_arrow`: New method 133 | * `DataFrame.read_parquet`: New method 134 | * `DataFrame.to_arrow`: New method 135 | * `DataFrame.write_parquet`: New method 136 | * `read_parquet`: New function 137 | * `Vector.__init__`: Fix type guessing when mixing Python and NumPy 138 | floats or integers and missing values 139 | * Allow using a thousand separator when printing numbers, 140 | off by default, can be set with `dataiter.PRINT_THOUSAND_SEPARATOR` 141 | 142 | 2023-03-11: Dataiter 0.41 143 | ========================= 144 | 145 | * Fix printing really small numbers 146 | 147 | 2023-02-21: Dataiter 0.40.1 148 | =========================== 149 | 150 | * `DataFrame.modify`: Fix grouped modify on unsorted data frame 151 | 152 | 2023-02-20: Dataiter 0.40 153 | ========================= 154 | 155 | * `Vector.map`: Add `dtype` argument 156 | 157 | 2023-02-06: Dataiter 0.39.1 158 | =========================== 159 | 160 | * `ListOfDicts.to_data_frame`: Add `strings_as_object` argument 161 | 162 | 2023-01-21: Dataiter 0.39 163 | ========================= 164 | 165 | * `read_csv`, `read_geojson`, `DataFrame.from_pandas`, 166 | `DataFrame.read_csv`, `GeoJSON.read`: Add `strings_as_object` argument 167 | 168 | 2022-12-15: Dataiter 0.38 169 | ========================= 170 | 171 | * `DataFrame.slice_off`: New method 172 | * `GeoJSON.to_data_frame`: New method 173 | * Fix error with new column placeholder attributes in conjunction with 174 | pop, popitem and clear 175 | 176 | 2022-11-17: Dataiter 0.37 
177 | ========================= 178 | 179 | * `DataFrame`: Add placeholder attributes for columns so that 180 | tab completion of columns as attributes at a shell works 181 | * `dt.from_string`: New function 182 | * `dt.to_string`: New function 183 | * `nrow`: Remove deprecated aggregation function 184 | * Don't use Numba for aggregation involving strings due to bad performance 185 | 186 | 2022-10-16: Dataiter 0.36 187 | ========================= 188 | 189 | * `dt`: New module for dealing with dates and datetimes 190 | 191 | 2022-10-03: Dataiter 0.35 192 | ========================= 193 | 194 | * `DataFrame.from_pandas`: Speed up by avoiding unnecessary conversions 195 | * `DataFrame.full_join`: Fix join and output when `by` is a tuple 196 | * `GeoJSON`: Fix printing object 197 | 198 | 2022-09-17: Dataiter 0.34 199 | ========================= 200 | 201 | * `Vector`: Handle timedeltas correctly for NA checks and printing 202 | * `Vector.is_timedelta`: New method 203 | 204 | 2022-09-03: Dataiter 0.33 205 | ========================= 206 | 207 | * `DataFrame.sort`: Convert object to string for sorting 208 | * `Vector.sort`: Convert object to string for sorting 209 | * Fix conditional Numba use when importing the numba package works, 210 | but caching doesn't 211 | * Add `di-open` cli command (currently not part of the default install, 212 | but can be installed from source using `make install-cli`) 213 | 214 | 2022-04-02: Dataiter 0.32 215 | ========================= 216 | 217 | * `DataFrame.modify`: Add support for grouped modification (#19) 218 | * `DataFrame.split`: New method 219 | * `ListOfDicts.split`: New method 220 | 221 | 2022-02-26: Dataiter 0.31 222 | ========================= 223 | 224 | * `DataFrame.compare`: New experimental method 225 | * `Vector.as_string`: Add `length` argument 226 | * Change the documentation to default to the latest release ("stable") 227 | instead of the development version ("latest") 228 | 229 | 2022-02-19: Dataiter 0.30 230 | ========================= 231 | 232 | * Use keyword-only arguments where appropriate – the general principle 233 | is that mandatory arguments are allowed as positional, but optional 234 | modifiers are keyword only 235 | * Rename all instances of "missing" to "na", such as `Vector.is_missing` 236 | to `Vector.is_na`, the only exception being 237 | `ListOfDicts.fill_missing`, which becomes 238 | `ListOfDicts.fill_missing_keys` 239 | * Truncate data frame object and string columns at 240 | `PRINT_TRUNCATE_WIDTH` (default 32) for printing 241 | 242 | 2022-02-09: Dataiter 0.29.2 243 | =========================== 244 | 245 | * Fix aggregation functions to work with all main data types: 246 | boolean, integer, float, date, datetime and string 247 | * Fix aggregation functions to handle all missing values (NaN, NaT, 248 | blank string) correctly, the same as implemented in Vector 249 | * Rename aggregation functions' `dropna` arguments to `drop_missing` 250 | * `first`, `last`, `nth`: Add `drop_missing` argument 251 | * `Vector.drop_missing`: New method 252 | 253 | 2022-01-30: Dataiter 0.29.1 254 | =========================== 255 | 256 | * `mode`: Fix to return first in case of ties (requires Python >= 3.8) 257 | * `std`, `var`: Add `ddof` argument (defaults to 0 on account of Numba limitations) 258 | * Don't try to dropna for non-float vectors in aggregation functions 259 | 260 | 2022-01-29: Dataiter 0.29 261 | ========================= 262 | 263 | * Add shorthand helper functions for use with `DataFrame.aggregate`, 264 | optionally using 
Numba JIT-compiled code for speed 265 | - https://dataiter.readthedocs.io/en/latest/aggregation.html 266 | - https://dataiter.readthedocs.io/en/latest/data-frame.html#dataiter.DataFrame.aggregate 267 | - https://dataiter.readthedocs.io/en/latest/dataiter.html 268 | * `DataFrame.map`: New method 269 | * `ncol`: Removed 270 | * `nrow`: Deprecated in favor of `dataiter.count` 271 | * `read_csv`: New alias for `DataFrame.read_csv` 272 | * `read_geojson`: New alias for `GeoJSON.read` 273 | * `read_json`: New alias for `ListOfDicts.read_json` 274 | * `read_npz`: New alias for `DataFrame.read_npz` 275 | 276 | 2022-01-09: Dataiter 0.28 277 | ========================= 278 | 279 | * `DataFrame`: Make object columns work in various operations 280 | * `DataFrame.from_json`: Add arguments `columns` and `dtypes` 281 | * `DataFrame.from_pandas`: Add argument `dtypes` 282 | * `DataFrame.full_join`: Speed up 283 | * `DataFrame.read_csv`: Add argument `dtypes` 284 | * `DataFrame.read_json`: Add arguments `columns` and `dtypes` 285 | * `GeoJSON.read`: Add arguments `columns` and `dtypes` 286 | * `ListOfDicts.fill_missing`: New method 287 | * `ListOfDicts.from_json`: Add arguments `keys` and `types` 288 | * `ListOfDicts.full_join`: Speed up 289 | * `ListOfDicts.read_csv`: Add argument `types`, rename `columns` to `keys` 290 | * `ListOfDicts.read_json`: Add arguments `keys` and `types` 291 | 292 | 2022-01-01: Dataiter 0.27 293 | ========================= 294 | 295 | * `DataFrame`: Fix error message when column not found 296 | * `DataFrame.aggregate`: Speed up 297 | * `DataFrame.full_join`: Fix to join all possible columns 298 | * `DataFrame.read_csv`: Try to avoid mixed types 299 | * `ListOfDicts.full_join`: Fix to join all possible keys 300 | * `ListOfDicts.write_csv`: Use minimal quoting 301 | * `Vector.get_memory_use`: New method 302 | * `Vector.rank`: Rewrite, add `method` argument 303 | * `*.read_*`: Rename `fname` argument `path` 304 | * `*.write_*`: Rename `fname` argument `path` 305 | * Add comparison table dplyr vs. Dataiter vs. 
Pandas to documentation: 306 | 307 | 308 | 2021-12-02: Dataiter 0.26 309 | ========================= 310 | 311 | * `DataFrame.read_npz`: New method to read NumPy npz format 312 | * `DataFrame.write_npz`: New method to write NumPy npz format 313 | * `*.read_*`: Decompress `.bz2|.gz|.xz` automatically 314 | * `*.write_*`: Compress `.bz2|.gz|.xz` automatically 315 | 316 | 2021-11-13: Dataiter 0.25 317 | ========================= 318 | 319 | * `DataFrame.print_missing_counts`: Fix when nothing missing 320 | * `Vector.replace_missing`: New method 321 | 322 | 2021-10-27: Dataiter 0.24 323 | ========================= 324 | 325 | * `DataFrame.print_memory_use`: New method 326 | * `ListOfDicts.write_csv`: Use less memory 327 | 328 | 2021-07-08: Dataiter 0.23 329 | ========================= 330 | 331 | * `Vector.is_*`: Change to be methods instead of properties 332 | * Drop deprecated use of `np.int` 333 | * Drop deprecated comparisons against NaN 334 | 335 | 2021-05-13: Dataiter 0.22 336 | ========================= 337 | 338 | * `ListOfDicts.map`: New method 339 | 340 | 2021-03-08: Dataiter 0.21 341 | ========================= 342 | 343 | * `DataFrame.read_csv`: Add `columns` argument 344 | * `ListOfDicts.read_csv`: Add `columns` argument 345 | 346 | 2021-03-06: Dataiter 0.20 347 | ========================= 348 | 349 | * `DataFrame.*_join`: Handle differing by names via tuple argument 350 | * `ListOfDicts.*_join`: Handle differing by names via tuple argument 351 | 352 | 2021-03-04: Dataiter 0.19 353 | ========================= 354 | 355 | * Use terminal window width as maximum print width 356 | * `Vector.__init__`: Handle NaN values in non-float vectors 357 | 358 | 2021-03-03: Dataiter 0.18 359 | ========================= 360 | 361 | * `Vector.__init__`: Accept generators/iterators 362 | * `Vector.map`: New method 363 | 364 | 2021-02-27: Dataiter 0.17 365 | ========================= 366 | 367 | * `DataFrame.print_missing_counts`: New method 368 | * `GeoJSON.read`: Handle properties differing between features 369 | * `ListOfDicts.print_missing_counts`: New method 370 | * `Vector.as_object`: New method 371 | 372 | 2020-10-03: Dataiter 0.16.1 373 | =========================== 374 | 375 | * `GeoJSON.read`: Use warnings, not errors for ignored excess feature keys 376 | 377 | 2020-09-26: Dataiter 0.16 378 | ========================= 379 | 380 | * `GeoJSON`: New class 381 | 382 | 2020-09-12: Dataiter 0.15 383 | ========================= 384 | 385 | * `ListOfDicts.sort`: Handle descending sort for all types 386 | 387 | 2020-08-22: Dataiter 0.14 388 | ========================= 389 | 390 | * `ListOfDicts`: Make obsoletion a warning instead of an error 391 | 392 | 2020-08-15: Dataiter 0.13 393 | ========================= 394 | 395 | * `DataFrame`: Fix error printing blank strings (#8) 396 | 397 | 2020-07-25: Dataiter 0.12 398 | ========================= 399 | 400 | * `DataFrame.filter`: Add `colname_value_pairs` argument 401 | * `DataFrame.filter_out`: Add `colname_value_pairs` argument 402 | * `ListOfDicts.__init__`: Remove arguments not intended for external use 403 | * `ListOfDicts.rename`: Preserve order of keys 404 | * Add documentation: https://dataiter.readthedocs.io/ 405 | 406 | 2020-06-02: Dataiter 0.11 407 | ========================= 408 | 409 | * `Vector.__init__`: Speed up by fixing type deduction 410 | 411 | 2020-05-28: Dataiter 0.10.1 412 | =========================== 413 | 414 | * `ListOfDicts.select`: Fix return value (#7) 415 | 416 | 2020-05-21: Dataiter 0.10 417 | ========================= 
418 | 419 | * `DataFrame.aggregate`: Fix `UnicodeEncodeError` with string columns 420 | * `DataFrame.unique`: Fix `UnicodeEncodeError` with string columns 421 | * `ListOfDicts.select`: Return keys in requested order 422 | * `Vector.__repr__`: Add custom conversion to string for display 423 | * `Vector.__str__`: Add custom conversion to string for display 424 | * `Vector.to_string`: Add custom conversion to string for display 425 | * `Vector.to_strings`: Add custom conversion to string for display 426 | 427 | 2020-05-11: Dataiter 0.9 428 | ======================== 429 | 430 | * `Array`: Rename to `Vector` 431 | * `Vector.head`: New method 432 | * `Vector.range`: New method 433 | * `Vector.sample`: New method 434 | * `Vector.sort`: New method 435 | * `Vector.tail`: New method 436 | * `Vector.unique`: New method 437 | 438 | 2020-05-10: Dataiter 0.8 439 | ======================== 440 | 441 | * `DataFrame`: New class 442 | * `ListOfDicts.__add__`: New method to support the `+` operator 443 | * `ListOfDicts.__init__`: Rename, reorder arguments 444 | * `ListOfDicts.__mul__`: New method to support the `*` operator 445 | * `ListOfDicts.__repr__`: New method, format as JSON 446 | * `ListOfDicts.__rmul__`: New method to support the `*` operator 447 | * `ListOfDicts.__setitem__`: New method, coerce to `AttributeDict` 448 | * `ListOfDicts.__str__`: New method, format as JSON 449 | * `ListOfDicts.aggregate`: Speed up 450 | * `ListOfDicts.anti_join`: New method 451 | * `ListOfDicts.append`: New method 452 | * `ListOfDicts.clear`: New method 453 | * `ListOfDicts.extend`: New method 454 | * `ListOfDicts.full_join`: New method 455 | * `ListOfDicts.head`: New method 456 | * `ListOfDicts.inner_join`: New method 457 | * `ListOfDicts.insert`: New method 458 | * `ListOfDicts.join`: Removed in favor of specific join types 459 | * `ListOfDicts.left_join`: New method 460 | * `ListOfDicts.pluck`: Add argument "default" to handle missing keys 461 | * `ListOfDicts.print_`: New method 462 | * `ListOfDicts.read_csv`: Add explicit arguments 463 | * `ListOfDicts.read_json`: Relay arguments to `json.loads` 464 | * `ListOfDicts.read_pickle`: New method 465 | * `ListOfDicts.reverse`: New method 466 | * `ListOfDicts.sample`: New method 467 | * `ListOfDicts.semi_join`: New method 468 | * `ListOfDicts.sort`: Change arguments to support sort direction better 469 | * `ListOfDicts.tail`: New method 470 | * `ListOfDicts.to_data_frame`: New method 471 | * `ListOfDicts.to_pandas`: New method 472 | * `ListOfDicts.unique`: Return unique by all keys if none given 473 | * `ListOfDicts.write_csv`: Add explicit arguments 474 | * `ListOfDicts.write_pickle`: New method 475 | 476 | 2019-12-03: Dataiter 0.7 477 | ======================== 478 | 479 | * Make `sort` handle `None` values, sorted last 480 | 481 | 2019-11-29: Dataiter 0.6 482 | ======================== 483 | 484 | * Fix `ObsoleteError` after multiple modifying actions 485 | 486 | 2019-11-10: Dataiter 0.5 487 | ======================== 488 | 489 | * Add `read_csv` 490 | * Add `read_json` 491 | * Add `write_csv` 492 | * Add `write_json` 493 | 494 | 2019-11-01: Dataiter 0.4 495 | ======================== 496 | 497 | * Fix `ObsoleteError` with `deepcopy` 498 | * Define `__deepcopy__` so that `copy.deepcopy` works too 499 | * Add `copy` (and `__copy__` for `copy.copy`) 500 | 501 | 2019-11-01: Dataiter 0.3 502 | ======================== 503 | 504 | * Mark `ListOfDicts` object obsolete thus preventing (accidental) use if 505 | a chained successor has modified the shared dicts 506 | * Add 
`modify_if` 507 | 508 | 2019-10-31: Dataiter 0.2 509 | ======================== 510 | 511 | * Speed up, mostly by avoiding copying (methods that modify dicts now do 512 | it in place rather than making a copy) 513 | 514 | 2019-09-29: Dataiter 0.1 515 | ======================== 516 | 517 | * Initial release 518 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple, Light-Weight Data Frames for Python 2 | =========================================== 3 | 4 | [![PyPI](https://img.shields.io/pypi/v/dataiter.svg)](https://pypi.org/project/dataiter) 5 | [![Downloads](https://pepy.tech/badge/dataiter/month)](https://pepy.tech/project/dataiter) 6 | 7 | Dataiter's **`DataFrame`** is a class for tabular data similar to R's 8 | `data.frame`, implementing all common operations to manipulate data. It 9 | is under the hood a dictionary of NumPy arrays and thus capable of fast 10 | vectorized operations. You can consider it to be a light-weight 11 | alternative to Pandas with a simple and consistent API. Performance-wise 12 | Dataiter relies on NumPy and Numba and is likely to be at best 13 | comparable to Pandas. 14 | 15 | ## Installation 16 | 17 | ```bash 18 | # Latest stable version 19 | pip install -U dataiter 20 | 21 | # Latest development version 22 | pip install -U git+https://github.com/otsaloma/dataiter 23 | 24 | # Numba (optional) 25 | pip install -U numba 26 | ``` 27 | 28 | Dataiter optionally uses **Numba** to speed up certain operations. If 29 | you have Numba installed, Dataiter will use it automatically. It's 30 | currently not a hard dependency, so you need to install it separately. 31 | 32 | ## Quick Start 33 | 34 | ```python 35 | >>> import dataiter as di 36 | >>> data = di.read_csv("data/listings.csv") 37 | >>> data.filter(hood="Manhattan", guests=2).sort(price=1).head() 38 | . 39 | id hood zipcode guests sqft price 40 | int64 string string int64 float64 int64 41 | ──────── ───────── ─────── ────── ─────── ───── 42 | 0 42279170 Manhattan 10013 2 nan 0 43 | 1 42384530 Manhattan 10036 2 nan 0 44 | 2 18835820 Manhattan 10021 2 nan 10 45 | 3 20171179 Manhattan 10027 2 nan 10 46 | 4 14858544 Manhattan 2 nan 15 47 | 5 31397084 Manhattan 10002 2 nan 19 48 | 6 22289683 Manhattan 10031 2 nan 20 49 | 7 7760204 Manhattan 10040 2 nan 22 50 | 8 43292527 Manhattan 10033 2 nan 22 51 | 9 43268040 Manhattan 10033 2 nan 23 52 | . 53 | ``` 54 | 55 | ## Documentation 56 | 57 | https://dataiter.readthedocs.io/ 58 | 59 | If you're familiar with either dplyr (R) or Pandas (Python), the 60 | comparison table in the documentation will give you a quick overview of 61 | the differences and similarities in common operations. 
62 | 63 | https://dataiter.readthedocs.io/en/stable/comparison.html 64 | 65 | ## Development 66 | 67 | To install a virtualenv for development, use 68 | 69 | make venv 70 | 71 | or, for a specific Python version 72 | 73 | make PYTHON=python3.X venv 74 | -------------------------------------------------------------------------------- /benchmark-versions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SCRIPT=benchmark-head.py 4 | SCRIPT_ARGS="$@" 5 | OUT_FILE=benchmark-versions.csv 6 | TMP_FILE=tmp.csv 7 | 8 | benchmark() { 9 | VERSION=$1 10 | printf "\n$VERSION:\n" 11 | git checkout -q $VERSION 12 | ./$SCRIPT -o $TMP_FILE --version=$VERSION $SCRIPT_ARGS || true 13 | tail -n+2 $TMP_FILE >> $OUT_FILE 14 | sed -i 's/"//g' $OUT_FILE 15 | } 16 | 17 | set -e 18 | rm -f $OUT_FILE 19 | echo "name,version,elapsed" > $OUT_FILE 20 | cp -fv benchmark.py $SCRIPT 21 | benchmark 1.0 22 | benchmark master 23 | rm -f $SCRIPT $TMP_FILE 24 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | import functools 6 | import numpy as np 7 | import random 8 | import time 9 | 10 | from dataiter import test 11 | from statistics import mean 12 | from unittest.mock import patch 13 | 14 | @functools.cache 15 | def _data_frame(path, nrow): 16 | data = test.data_frame(path) 17 | n = nrow // data.nrow 18 | data = data.rbind(*([data] * n)) 19 | return data.head(nrow) 20 | 21 | def data_frame(path, nrow=1_000_000): 22 | return _data_frame(path, nrow).deepcopy() 23 | 24 | @functools.cache 25 | def _data_frame_random(nrows, ngroups): 26 | return di.DataFrame(g=np.random.choice(ngroups, nrows, replace=True), 27 | a=np.random.normal(10, 2, nrows)) 28 | 29 | def data_frame_random(nrows, ngroups): 30 | return _data_frame_random(nrows, ngroups).deepcopy() 31 | 32 | def data_frame_aggregate_128(): 33 | data = data_frame("vehicles.csv") 34 | start = time.time() 35 | (data 36 | .group_by("make") 37 | .aggregate( 38 | n=di.count(), 39 | hwy=di.mean("hwy"), 40 | cty=di.mean("cty"))) 41 | return time.time() - start 42 | 43 | def data_frame_aggregate_3264(): 44 | data = data_frame("vehicles.csv") 45 | start = time.time() 46 | (data 47 | .group_by("make", "model") 48 | .aggregate( 49 | n=di.count(), 50 | hwy=di.mean("hwy"), 51 | cty=di.mean("cty"))) 52 | return time.time() - start 53 | 54 | def data_frame_aggregate_14668(): 55 | data = data_frame("vehicles.csv") 56 | start = time.time() 57 | (data 58 | .group_by("make", "model", "year") 59 | .aggregate( 60 | n=di.count(), 61 | hwy=di.mean("hwy"), 62 | cty=di.mean("cty"))) 63 | return time.time() - start 64 | 65 | def data_frame_aggregate_100000_lambda(): 66 | data = data_frame_random(1_000_000, 100_000) 67 | start = time.time() 68 | (data 69 | .group_by("g") 70 | .aggregate( 71 | a_mean=lambda x: np.mean(x.a), 72 | a_std=lambda x: np.std(x.a))) 73 | return time.time() - start 74 | 75 | def data_frame_aggregate_100000_short(): 76 | with patch("dataiter.USE_NUMBA", False): 77 | data = data_frame_random(1_000_000, 100_000) 78 | start = time.time() 79 | (data 80 | .group_by("g") 81 | .aggregate( 82 | a_mean=di.mean("a"), 83 | a_std=di.std("a"))) 84 | return time.time() - start 85 | 86 | def data_frame_aggregate_100000_short_numba(): 87 | with patch("dataiter.USE_NUMBA", True): 88 | data = data_frame_random(1_000_000, 100_000) 89 | start 
= time.time() 90 | (data 91 | .group_by("g") 92 | .aggregate( 93 | a_mean=di.mean("a"), 94 | a_std=di.std("a"))) 95 | return time.time() - start 96 | 97 | def data_frame_full_join(): 98 | data = data_frame("vehicles.csv") 99 | meta = data.select("make", "model").unique() 100 | meta = meta.rbind(meta.modify(model="X")) 101 | meta.random = np.random.random(meta.nrow) 102 | assert meta.anti_join(data, "make", "model").nrow > 0 103 | start = time.time() 104 | data.full_join(meta, "make", "model") 105 | return time.time() - start 106 | 107 | def data_frame_left_join(): 108 | data = data_frame("vehicles.csv") 109 | meta = data.select("make", "model").unique() 110 | meta.random = np.random.random(meta.nrow) 111 | start = time.time() 112 | data.left_join(meta, "make", "model") 113 | return time.time() - start 114 | 115 | def data_frame_read_csv(): 116 | start = time.time() 117 | test.data_frame("vehicles.csv") 118 | return time.time() - start 119 | 120 | def data_frame_read_json(): 121 | start = time.time() 122 | test.data_frame("vehicles.json") 123 | return time.time() - start 124 | 125 | def data_frame_rbind_2(): 126 | # 2 * 500,000 = 1,000,000 127 | data = data_frame("vehicles.csv", 500_000) 128 | start = time.time() 129 | data.rbind(data) 130 | return time.time() - start 131 | 132 | def data_frame_rbind_100(): 133 | # 100 * 10,000 = 1,000,000 134 | data = data_frame("vehicles.csv", 10_000) 135 | start = time.time() 136 | data.rbind(*([data] * (100 - 1))) 137 | return time.time() - start 138 | 139 | def data_frame_rbind_100000(): 140 | # 100,000 * 10 = 1,000,000 141 | data = data_frame("vehicles.csv", 10) 142 | start = time.time() 143 | data.rbind(*([data] * (100_000 - 1))) 144 | return time.time() - start 145 | 146 | def data_frame_sort(): 147 | data = data_frame("vehicles.csv") 148 | start = time.time() 149 | data.sort(make=1, model=1, year=1) 150 | return time.time() - start 151 | 152 | def data_frame_unique(): 153 | data = data_frame("vehicles.csv") 154 | start = time.time() 155 | data.unique("make", "model", "year") 156 | return time.time() - start 157 | 158 | @functools.cache 159 | def _list_of_dicts(path, length): 160 | data = test.list_of_dicts(path) 161 | n = length // len(data) + 1 162 | data = data * n 163 | return data.head(length) 164 | 165 | def list_of_dicts(path, length=100_000): 166 | return _list_of_dicts(path, length).deepcopy() 167 | 168 | def list_of_dicts_aggregate_128(): 169 | data = list_of_dicts("vehicles.json") 170 | start = time.time() 171 | (data 172 | .group_by("make") 173 | .aggregate( 174 | n=len, 175 | hwy=lambda x: mean(x.pluck("hwy")), 176 | cty=lambda x: mean(x.pluck("cty")))) 177 | return time.time() - start 178 | 179 | def list_of_dicts_aggregate_3264(): 180 | data = list_of_dicts("vehicles.json") 181 | start = time.time() 182 | (data 183 | .group_by("make", "model") 184 | .aggregate( 185 | n=len, 186 | hwy=lambda x: mean(x.pluck("hwy")), 187 | cty=lambda x: mean(x.pluck("cty")))) 188 | return time.time() - start 189 | 190 | def list_of_dicts_aggregate_14668(): 191 | data = list_of_dicts("vehicles.json") 192 | start = time.time() 193 | (data 194 | .group_by("make", "model", "year") 195 | .aggregate( 196 | n=len, 197 | hwy=lambda x: mean(x.pluck("hwy")), 198 | cty=lambda x: mean(x.pluck("cty")))) 199 | return time.time() - start 200 | 201 | def list_of_dicts_full_join(): 202 | data = list_of_dicts("vehicles.json") 203 | meta = data.deepcopy().select("make", "model").unique() 204 | meta = meta + meta.deepcopy().modify(model=lambda x: "X") 205 | meta = 
meta.modify(random=lambda x: random.random()) 206 | assert len(meta.anti_join(data, "make", "model")) > 0 207 | start = time.time() 208 | data.full_join(meta, "make", "model") 209 | return time.time() - start 210 | 211 | def list_of_dicts_left_join(): 212 | data = list_of_dicts("vehicles.json") 213 | meta = data.deepcopy().select("make", "model").unique() 214 | meta = meta.deepcopy().modify(random=lambda x: random.random()) 215 | start = time.time() 216 | data.left_join(meta, "make", "model") 217 | return time.time() - start 218 | 219 | def list_of_dicts_read_csv(): 220 | start = time.time() 221 | test.list_of_dicts("vehicles.csv") 222 | return time.time() - start 223 | 224 | def list_of_dicts_read_json(): 225 | start = time.time() 226 | test.list_of_dicts("vehicles.json") 227 | return time.time() - start 228 | 229 | def list_of_dicts_sort(): 230 | data = list_of_dicts("vehicles.csv") 231 | start = time.time() 232 | data.sort(make=1, model=1, year=1) 233 | return time.time() - start 234 | 235 | def vector_fast_list(): 236 | seq = list(range(1_000_000)) 237 | start = time.time() 238 | di.Vector.fast(seq, int) 239 | return time.time() - start 240 | 241 | def vector_fast_np_array(): 242 | seq = list(range(1_000_000)) 243 | seq = np.array(seq) 244 | start = time.time() 245 | di.Vector.fast(seq, int) 246 | return time.time() - start 247 | 248 | def vector_new_list(): 249 | seq = list(range(1_000_000)) 250 | start = time.time() 251 | di.Vector(seq) 252 | return time.time() - start 253 | 254 | def vector_new_np_array(): 255 | seq = list(range(1_000_000)) 256 | seq = np.array(seq) 257 | start = time.time() 258 | di.Vector(seq) 259 | return time.time() - start 260 | 261 | def vector_rank_max(): 262 | data = data_frame("vehicles.csv") 263 | start = time.time() 264 | data.model.rank(method="max") 265 | return time.time() - start 266 | 267 | def vector_rank_min(): 268 | data = data_frame("vehicles.csv") 269 | start = time.time() 270 | data.model.rank(method="min") 271 | return time.time() - start 272 | 273 | def vector_rank_ordinal(): 274 | data = data_frame("vehicles.csv") 275 | start = time.time() 276 | data.model.rank(method="ordinal") 277 | return time.time() - start 278 | 279 | def vector_sort(): 280 | data = data_frame("vehicles.csv") 281 | start = time.time() 282 | data.model.sort() 283 | return time.time() - start 284 | 285 | def vector_unique(): 286 | data = data_frame("vehicles.csv") 287 | start = time.time() 288 | data.model.unique() 289 | return time.time() - start 290 | 291 | def is_benchmark(name): 292 | prefixes = ("data_frame_", "list_of_dicts_", "vector_") 293 | return name.startswith(prefixes) and name != "data_frame_random" 294 | 295 | BENCHMARKS = sorted(filter(is_benchmark, dir()), key=lambda x: ( 296 | [x.zfill(9) if x.isdigit() else x for x in x.split("_")])) 297 | 298 | def run_benchmarks(benchmarks, output, rounds): 299 | width = max(map(len, benchmarks)) + 2 300 | for i, benchmark in enumerate(benchmarks): 301 | print(f"{i+1:2d}/{len(benchmarks)}. 
", end="", flush=True) 302 | print(f"{benchmark+' ':.<{width}} ", end="", flush=True) 303 | try: 304 | f = globals()[benchmark] 305 | elapsed = 1000 * min(f() for i in range(rounds)) 306 | print("{:5.0f} ms".format(elapsed), flush=True) 307 | except Exception as error: 308 | elapsed = -1 309 | print(error.__class__.__name__) 310 | if not output: raise 311 | yield {"name": benchmark, "elapsed": round(elapsed)} 312 | 313 | @click.command() 314 | @click.option("-o", "--output", help="Filename for optional CSV output") 315 | @click.option("-r", "--rounds", default=5, help="Number of rounds per benchmark") 316 | @click.option("--version", default=di.__version__, help="Version number for CSV output") 317 | @click.argument("pattern", nargs=-1) 318 | def main(output, rounds, version, pattern): 319 | pattern = pattern or "_" 320 | f = lambda x: any(y in x for y in pattern) 321 | benchmarks = list(filter(f, BENCHMARKS)) 322 | results = di.ListOfDicts(run_benchmarks(benchmarks, output, rounds)) 323 | results = results.modify(version=lambda x: version) 324 | if output: 325 | assert output.endswith(".csv") 326 | print(f"Writing {output}...") 327 | results.write_csv(output) 328 | 329 | if __name__ == "__main__": 330 | main() 331 | -------------------------------------------------------------------------------- /bin/di-csv2json: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert CSV file to JSON file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".json") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_csv(input) 21 | data.write_json(output) 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /bin/di-format-geojson: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | import shutil 6 | import time 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-i", "--indent", default=2, help="Indent level") 10 | @click.option("-p", "--precision", default=9, help="Coordinate precision") 11 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 12 | def main(indent, precision, file): 13 | """Rewrite GeoJSON file with proper formatting.""" 14 | for path in file: 15 | click.echo(path) 16 | data = di.read_geojson(path) 17 | for i in range(data.nrow): 18 | coords = data.geometry[i].coordinates 19 | data.geometry[i].coordinates = round_recursive(coords, precision) 20 | backup = path + ".bak" + str(int(time.time())) 21 | shutil.copyfile(path, backup) 22 | data.write(path, indent=indent) 23 | 24 | def round_recursive(value, precision): 25 | if isinstance(value, list): 26 | return [round_recursive(x, precision) for x in value] 27 | return round(value, precision) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /bin/di-geojson2csv: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert GeoJSON file to CSV file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".csv") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_geojson(input) 21 | data = data.unselect("geometry") 22 | data.write_csv(output) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /bin/di-json2csv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | import dataiter as di 5 | 6 | from pathlib import Path 7 | 8 | @click.command(no_args_is_help=True) 9 | @click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file") 10 | @click.argument("file", nargs=-1, type=click.Path(exists=True)) 11 | def main(force, file): 12 | """Convert JSON file to CSV file.""" 13 | for input in map(Path, file): 14 | output = input.with_suffix(".csv") 15 | click.echo(f"{input} → {output}") 16 | if output.exists() and not force: 17 | raise SystemExit( 18 | f"Output file {output} exists, " 19 | f"use -f/--force to overwrite") 20 | data = di.read_json(input) 21 | data.write_csv(output) 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /bin/di-open: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | uv run --no-project --with "dataiter==1.0,numba,pytz" python3.12 -i -c " 3 | import dataiter as di 4 | import os 5 | import sys 6 | from pathlib import Path 7 | path = Path('$1') 8 | assert path.exists() 9 | os.chdir(path.parent) 10 | path = path.relative_to(path.parent) 11 | suffix = path.suffix.lstrip('.') 12 | read = getattr(di, f'read_{suffix}') 13 | print(f'Reading {str(path)} into data...') 14 | data = read(path) 15 | del os 16 | del Path 17 | del path 18 | del read 19 | del suffix 20 | del sys 21 | " 22 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Test Datasets 2 | ============= 3 | 4 | | Data | Source | 5 | | :--- | :----- | 6 | | downloads | https://pypistats.org/api/packages/urllib3/system | 7 | | listings | http://insideairbnb.com/get-the-data.html | 8 | | neighbourhoods | http://insideairbnb.com/get-the-data.html | 9 | | vehicles | https://github.com/hadley/fueleconomy | 10 | -------------------------------------------------------------------------------- /data/holidays.csv: -------------------------------------------------------------------------------- 1 | date,holiday 2 | 1000-01-01,New Year's Day 3 | 2019-01-01,New Year's Day 4 | 2019-01-06,Epiphany 5 | 2019-04-19,Good Friday 6 | 2019-04-21,Easter Sunday 7 | 2019-04-22,Easter Monday 8 | 2019-05-01,May Day 9 | 2019-05-30,Ascension Day 10 | 2019-06-09,Whit Sunday 11 | 2019-06-21,Midsummer's Eve 12 | 2019-06-22,Midsummer Day 
13 | 2019-11-02,All Saints' Day 14 | 2019-12-06,Independence Day 15 | 2019-12-24,Christmas Eve 16 | 2019-12-25,Christmas Day 17 | 2019-12-26,2nd Day of Christmas 18 | 2020-01-01,New Year's Day 19 | 2020-01-06,Epiphany 20 | 2020-04-10,Good Friday 21 | 2020-04-12,Easter Sunday 22 | 2020-04-13,Easter Monday 23 | 2020-05-01,May Day 24 | 2020-05-21,Ascension Day 25 | 2020-05-31,Whit Sunday 26 | 2020-06-19,Midsummer's Eve 27 | 2020-06-20,Midsummer Day 28 | 2020-10-31,All Saints' Day 29 | 2020-12-06,Independence Day 30 | 2020-12-24,Christmas Eve 31 | 2020-12-25,Christmas Day 32 | 2020-12-26,2nd Day of Christmas 33 | 3000-01-01,New Year's Day 34 | -------------------------------------------------------------------------------- /data/holidays.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "date": "1000-01-01", 4 | "holiday": "New Year's Day" 5 | }, 6 | { 7 | "date": "2019-01-01", 8 | "holiday": "New Year's Day" 9 | }, 10 | { 11 | "date": "2019-01-06", 12 | "holiday": "Epiphany" 13 | }, 14 | { 15 | "date": "2019-04-19", 16 | "holiday": "Good Friday" 17 | }, 18 | { 19 | "date": "2019-04-21", 20 | "holiday": "Easter Sunday" 21 | }, 22 | { 23 | "date": "2019-04-22", 24 | "holiday": "Easter Monday" 25 | }, 26 | { 27 | "date": "2019-05-01", 28 | "holiday": "May Day" 29 | }, 30 | { 31 | "date": "2019-05-30", 32 | "holiday": "Ascension Day" 33 | }, 34 | { 35 | "date": "2019-06-09", 36 | "holiday": "Whit Sunday" 37 | }, 38 | { 39 | "date": "2019-06-21", 40 | "holiday": "Midsummer's Eve" 41 | }, 42 | { 43 | "date": "2019-06-22", 44 | "holiday": "Midsummer Day" 45 | }, 46 | { 47 | "date": "2019-11-02", 48 | "holiday": "All Saints' Day" 49 | }, 50 | { 51 | "date": "2019-12-06", 52 | "holiday": "Independence Day" 53 | }, 54 | { 55 | "date": "2019-12-24", 56 | "holiday": "Christmas Eve" 57 | }, 58 | { 59 | "date": "2019-12-25", 60 | "holiday": "Christmas Day" 61 | }, 62 | { 63 | "date": "2019-12-26", 64 | "holiday": "2nd Day of Christmas" 65 | }, 66 | { 67 | "date": "2020-01-01", 68 | "holiday": "New Year's Day" 69 | }, 70 | { 71 | "date": "2020-01-06", 72 | "holiday": "Epiphany" 73 | }, 74 | { 75 | "date": "2020-04-10", 76 | "holiday": "Good Friday" 77 | }, 78 | { 79 | "date": "2020-04-12", 80 | "holiday": "Easter Sunday" 81 | }, 82 | { 83 | "date": "2020-04-13", 84 | "holiday": "Easter Monday" 85 | }, 86 | { 87 | "date": "2020-05-01", 88 | "holiday": "May Day" 89 | }, 90 | { 91 | "date": "2020-05-21", 92 | "holiday": "Ascension Day" 93 | }, 94 | { 95 | "date": "2020-05-31", 96 | "holiday": "Whit Sunday" 97 | }, 98 | { 99 | "date": "2020-06-19", 100 | "holiday": "Midsummer's Eve" 101 | }, 102 | { 103 | "date": "2020-06-20", 104 | "holiday": "Midsummer Day" 105 | }, 106 | { 107 | "date": "2020-10-31", 108 | "holiday": "All Saints' Day" 109 | }, 110 | { 111 | "date": "2020-12-06", 112 | "holiday": "Independence Day" 113 | }, 114 | { 115 | "date": "2020-12-24", 116 | "holiday": "Christmas Eve" 117 | }, 118 | { 119 | "date": "2020-12-25", 120 | "holiday": "Christmas Day" 121 | }, 122 | { 123 | "date": "2020-12-26", 124 | "holiday": "2nd Day of Christmas" 125 | }, 126 | { 127 | "date": "3000-01-01", 128 | "holiday": "New Year's Day" 129 | } 130 | ] 131 | -------------------------------------------------------------------------------- /data/listings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | 5 | # Same as for documentation. 
6 | di.PRINT_MAX_WIDTH = 72 7 | 8 | def parse_price(price): 9 | return int(float(price.lstrip("$").replace(",", ""))) 10 | 11 | data = ( 12 | di.read_csv("orig/listings.csv") 13 | .select("id", 14 | "neighbourhood_group_cleansed", 15 | "zipcode", 16 | "accommodates", 17 | "square_feet", 18 | "price") 19 | .rename(hood="neighbourhood_group_cleansed") 20 | .rename(guests="accommodates") 21 | .rename(sqft="square_feet") 22 | .modify(price=lambda x: x.price.map(parse_price)) 23 | ) 24 | 25 | print(data.head()) 26 | data.write_csv("listings.csv") 27 | data.write_json("listings.json") 28 | 29 | data = ( 30 | di.read_csv("orig/listings.csv") 31 | .select("id", 32 | "number_of_reviews", 33 | "review_scores_rating") 34 | .rename(reviews="number_of_reviews") 35 | .rename(rating="review_scores_rating") 36 | .filter(lambda x: x.reviews >= 10) 37 | ) 38 | 39 | print(data.head()) 40 | data.write_csv("listings-reviews.csv") 41 | data.write_json("listings-reviews.json") 42 | -------------------------------------------------------------------------------- /dataiter/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import contextlib 24 | import numpy as np 25 | 26 | from dataiter import util 27 | 28 | __version__ = "1.0" 29 | 30 | DEFAULT_PEEK_ELEMENTS = 10 31 | DEFAULT_PEEK_ITEMS = 3 32 | DEFAULT_PEEK_ROWS = 10 33 | PRINT_FLOAT_PRECISION = 6 34 | PRINT_MAX_ELEMENTS = 100 35 | PRINT_MAX_ITEMS = 10 36 | PRINT_MAX_ROWS = 100 37 | 38 | #: Maximum amount of columns to wrap print output to. This is only a fallback 39 | #: in case Python's ``shutil.get_terminal_size`` fails to detect the width of 40 | #: your terminal. By default the detected full width is used. 41 | PRINT_MAX_WIDTH = 80 42 | 43 | #: Thousand separator to use when printing numbers. By default this is blank, 44 | #: meaning no thousand separators are rendered. 45 | PRINT_THOUSAND_SEPARATOR = "" 46 | 47 | #: Maximum width to truncate string columns to in :class:`DataFrame` print 48 | #: output. When this is exceeded, strings will be cut and an ellipsis (``…``) 49 | #: rendered at the cut point. 50 | PRINT_TRUNCATE_WIDTH = 36 51 | 52 | #: ``True`` to use Numba, if available, to speed up :doc:`aggregations 53 | #: <aggregation>`, ``False`` to only use pure Python code.
54 | USE_NUMBA = False 55 | 56 | #: ``True`` to use Numba cache for JIT-compiled :doc:`aggregations 57 | #: `, ``False`` to only keep compiled code in memory for the 58 | #: duration of the session. 59 | USE_NUMBA_CACHE = True 60 | 61 | if not np.__version__.startswith("2."): 62 | raise Exception("NumPy 2.x required") 63 | 64 | with contextlib.suppress(LookupError): 65 | USE_NUMBA_CACHE = util.parse_env_boolean("DATAITER_USE_NUMBA_CACHE") 66 | 67 | try: 68 | # Force Numba on or off if environment variable defined. 69 | USE_NUMBA = util.parse_env_boolean("DATAITER_USE_NUMBA") 70 | except LookupError: 71 | with contextlib.suppress(Exception): 72 | # Use Numba automatically if found 73 | # and calling a trivial function works. 74 | import numba 75 | try: 76 | @numba.njit(cache=USE_NUMBA_CACHE) 77 | def check(x): 78 | return x**2 79 | assert check(10) == 100 80 | USE_NUMBA = True 81 | except Exception as error: 82 | print(f"Numba found, but disabled due to error: {error!s}") 83 | 84 | globals().pop("check", None) 85 | globals().pop("contextlib", None) 86 | globals().pop("np", None) 87 | globals().pop("numba", None) 88 | globals().pop("util", None) 89 | 90 | from dataiter import dtypes # noqa 91 | from dataiter.vector import Vector # noqa 92 | from dataiter.data_frame import DataFrame # noqa 93 | from dataiter.data_frame import DataFrameColumn # noqa 94 | from dataiter.geojson import GeoJSON # noqa 95 | from dataiter.list_of_dicts import ListOfDicts # noqa 96 | from dataiter import dt # noqa 97 | from dataiter import regex # noqa 98 | 99 | from dataiter.aggregate import all # noqa 100 | from dataiter.aggregate import any # noqa 101 | from dataiter.aggregate import count # noqa 102 | from dataiter.aggregate import count_unique # noqa 103 | from dataiter.aggregate import first # noqa 104 | from dataiter.aggregate import last # noqa 105 | from dataiter.aggregate import max # noqa 106 | from dataiter.aggregate import mean # noqa 107 | from dataiter.aggregate import median # noqa 108 | from dataiter.aggregate import min # noqa 109 | from dataiter.aggregate import mode # noqa 110 | from dataiter.aggregate import nth # noqa 111 | from dataiter.aggregate import quantile # noqa 112 | from dataiter.aggregate import std # noqa 113 | from dataiter.aggregate import sum # noqa 114 | from dataiter.aggregate import var # noqa 115 | 116 | from dataiter.io import read_csv # noqa 117 | from dataiter.io import read_geojson # noqa 118 | from dataiter.io import read_json # noqa 119 | from dataiter.io import read_npz # noqa 120 | from dataiter.io import read_parquet # noqa 121 | -------------------------------------------------------------------------------- /dataiter/deco.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2019 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import functools 24 | 25 | def listify(function): 26 | @functools.wraps(function) 27 | def wrapper(*args, **kwargs): 28 | value = function(*args, **kwargs) 29 | return list(value) 30 | return wrapper 31 | 32 | def new_from_generator(function): 33 | @functools.wraps(function) 34 | def wrapper(self, *args, **kwargs): 35 | value = function(self, *args, **kwargs) 36 | return self._new(value) 37 | return wrapper 38 | 39 | def obsoletes(function): 40 | @functools.wraps(function) 41 | def wrapper(self, *args, **kwargs): 42 | value = function(self, *args, **kwargs) 43 | self._mark_obsolete() 44 | return value 45 | return wrapper 46 | 47 | def tuplefy(function): 48 | @functools.wraps(function) 49 | def wrapper(*args, **kwargs): 50 | value = function(*args, **kwargs) 51 | return tuple(value) 52 | return wrapper 53 | -------------------------------------------------------------------------------- /dataiter/dt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import datetime 24 | import numpy as np 25 | 26 | from dataiter import dtypes 27 | from dataiter import util 28 | from dataiter import Vector 29 | from numpy.dtypes import StringDType 30 | 31 | def day(x): 32 | """ 33 | Extract day of the month from datetime `x`. 34 | 35 | >>> x = dt.new(["2022-10-15"]) 36 | >>> dt.day(x) 37 | """ 38 | return _pull_int(x, lambda y: y.day) 39 | 40 | def from_string(x, format): 41 | """ 42 | Initialize a datetime scalar or vector from `x`. 
43 | 44 | `format` uses Python ``strptime`` format codes: 45 | https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes 46 | 47 | >>> x = di.Vector(["15.10.2022"]) 48 | >>> dt.from_string(x, "%d.%m.%Y") 49 | """ 50 | if util.is_scalar(x): 51 | x = Vector([x], str) 52 | return from_string(x, format)[0] 53 | assert isinstance(x, np.ndarray) 54 | assert isinstance(x.dtype, StringDType) 55 | out = np.full_like(x, None, object) 56 | out = Vector.fast(out, object) 57 | na = x == dtypes.string.na_object 58 | f = np.vectorize(lambda x: datetime.datetime.strptime(x, format)) 59 | out[~na] = f(x[~na].astype(object)) 60 | out = out.as_datetime() 61 | if (len(out[~na]) > 0 and 62 | (hour(out[~na]) == 0).all() and 63 | (minute(out[~na]) == 0).all() and 64 | (second(out[~na]) == 0).all()): 65 | out = out.as_date() 66 | return out 67 | 68 | def hour(x): 69 | """ 70 | Extract hour from datetime `x`. 71 | 72 | >>> x = dt.new(["2022-10-15T12:34:56"]) 73 | >>> dt.hour(x) 74 | """ 75 | return _pull_int(x, lambda y: y.hour) 76 | 77 | def isoweek(x): 78 | """ 79 | Extract ISO 8601 week from datetime `x`. 80 | 81 | >>> x = dt.new(["2022-10-15"]) 82 | >>> dt.isoweek(x) 83 | """ 84 | return _pull_int(x, lambda y: y.isocalendar()[1]) 85 | 86 | def isoweekday(x): 87 | """ 88 | Extract day of the week from datetime `x`. 89 | 90 | Day of the week is an integer between 1 and 7, where 1 is Monday and 7 is 91 | Sunday. 92 | 93 | See also: :func:`weekday` 94 | 95 | >>> x = dt.new(["2022-10-15"]) 96 | >>> dt.isoweekday(x) 97 | """ 98 | return _pull_int(x, lambda y: y.isoweekday()) 99 | 100 | def microsecond(x): 101 | """ 102 | Extract microsecond from datetime `x`. 103 | 104 | >>> x = dt.new(["2022-10-15T12:34:56.789"]) 105 | >>> dt.microsecond(x) 106 | """ 107 | return _pull_int(x, lambda y: y.microsecond) 108 | 109 | def minute(x): 110 | """ 111 | Extract minute from datetime `x`. 112 | 113 | >>> x = dt.new(["2022-10-15T12:34:56"]) 114 | >>> dt.minute(x) 115 | """ 116 | return _pull_int(x, lambda y: y.minute) 117 | 118 | def month(x): 119 | """ 120 | Extract month from datetime `x`. 121 | 122 | >>> x = dt.new(["2022-10-15"]) 123 | >>> dt.month(x) 124 | """ 125 | return _pull_int(x, lambda y: y.month) 126 | 127 | def new(x): 128 | """ 129 | Initialize a datetime scalar or vector from `x`. 130 | 131 | >>> dt.new("2022-10-15") 132 | >>> dt.new("2022-10-15T12:00:00") 133 | >>> dt.new(["2022-10-15"]) 134 | >>> dt.new(["2022-10-15T12:00:00"]) 135 | """ 136 | if util.is_scalar(x): 137 | return np.datetime64(x) 138 | return Vector.fast(map(np.datetime64, x), np.datetime64) 139 | 140 | def now(): 141 | """ 142 | Return the current local datetime. 
143 | 144 | >>> dt.now() 145 | """ 146 | return np.datetime64(datetime.datetime.now()) 147 | 148 | def _pull_datetime(x, function): 149 | if util.is_scalar(x): 150 | x = Vector([x], np.datetime64) 151 | return _pull_datetime(x, function)[0] 152 | assert isinstance(x, np.ndarray) 153 | assert np.issubdtype(x.dtype, np.datetime64) 154 | out = np.full_like(x, np.nan) 155 | out = Vector.fast(out, np.datetime64) 156 | na = np.isnat(x) 157 | if na.all(): return out 158 | f = np.vectorize(function) 159 | out[~na] = f(x[~na].astype(object)) 160 | return out 161 | 162 | def _pull_int(x, function): 163 | if util.is_scalar(x): 164 | x = Vector([x], np.datetime64) 165 | return _pull_int(x, function)[0] 166 | assert isinstance(x, np.ndarray) 167 | assert np.issubdtype(x.dtype, np.datetime64) 168 | out = np.full_like(x, np.nan, float) 169 | out = Vector.fast(out, float) 170 | na = np.isnat(x) 171 | if na.all(): return out 172 | f = np.vectorize(function) 173 | out[~na] = f(x[~na].astype(object)) 174 | return out if na.any() else out.as_integer() 175 | 176 | def _pull_str(x, function): 177 | if util.is_scalar(x): 178 | x = Vector([x], np.datetime64) 179 | return _pull_str(x, function)[0] 180 | assert isinstance(x, np.ndarray) 181 | assert np.issubdtype(x.dtype, np.datetime64) 182 | out = np.full_like(x, dtypes.string.na_object, object) 183 | out = Vector.fast(out, object) 184 | na = np.isnat(x) 185 | if na.all(): return out 186 | f = np.vectorize(function) 187 | out[~na] = f(x[~na].astype(object)) 188 | return out.as_string() 189 | 190 | def quarter(x): 191 | """ 192 | Extract quarter from datetime `x`. 193 | 194 | >>> x = dt.new(["2022-10-15"]) 195 | >>> dt.quarter(x) 196 | """ 197 | y = np.ceil(month(x) / 3) 198 | return y if np.isnan(y).any() else y.astype(int) 199 | 200 | def replace(x, year=None, month=None, day=None, hour=None, minute=None, second=None, microsecond=None): 201 | """ 202 | Return datetime `x` with given components replaced. 203 | 204 | >>> x = dt.new(["2022-10-15"]) 205 | >>> dt.replace(x, month=1, day=1) 206 | """ 207 | kwargs = {k: v for k, v in locals().items() if k != "x" and v is not None} 208 | if all(map(util.is_scalar, kwargs.values())): 209 | return _pull_datetime(x, lambda y: y.replace(**kwargs)) 210 | for value in kwargs.values(): 211 | assert util.is_scalar(value) or len(value) == len(x) 212 | scalar_keys = [x for x in kwargs if util.is_scalar(kwargs[x])] 213 | vector_keys = [x for x in kwargs if x not in scalar_keys] 214 | # Like _pull_datetime, but no vectorized function. 215 | assert isinstance(x, np.ndarray) 216 | assert np.issubdtype(x.dtype, np.datetime64) 217 | out = np.full_like(x, np.nan) 218 | out = Vector.fast(out, np.datetime64) 219 | na = np.isnat(x) 220 | xobj = x.astype(object) 221 | kwargs_scalar = {x: kwargs[x] for x in scalar_keys} 222 | for i in np.flatnonzero(~na): 223 | for key in vector_keys: 224 | kwargs_scalar[key] = kwargs[key][i] 225 | out[i] = xobj[i].replace(**kwargs_scalar) 226 | return out 227 | 228 | def second(x): 229 | """ 230 | Extract second from datetime `x`. 231 | 232 | >>> x = dt.new(["2022-10-15T12:34:56"]) 233 | >>> dt.second(x) 234 | """ 235 | return _pull_int(x, lambda y: y.second) 236 | 237 | def to_string(x, format): 238 | """ 239 | Format datetime `x` as string. 
240 | 241 | `format` uses Python ``strftime`` format codes: 242 | https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes 243 | 244 | >>> x = dt.new(["2022-10-15"]) 245 | >>> dt.to_string(x, "%d.%m.%Y") 246 | """ 247 | return _pull_str(x, lambda x: x.strftime(format)) 248 | 249 | def today(): 250 | """ 251 | Return the current local date. 252 | 253 | >>> dt.today() 254 | """ 255 | return np.datetime64(datetime.date.today()) 256 | 257 | def weekday(x): 258 | """ 259 | Extract day of the week from datetime `x`. 260 | 261 | Day of the week is an integer between 0 and 6, where 0 is Monday and 6 is 262 | Sunday. 263 | 264 | See also: :func:`isoweekday` 265 | 266 | >>> x = dt.new(["2022-10-15"]) 267 | >>> dt.weekday(x) 268 | """ 269 | return _pull_int(x, lambda y: y.weekday()) 270 | 271 | def year(x): 272 | """ 273 | Extract year from datetime `x`. 274 | 275 | >>> x = dt.new(["2022-10-15"]) 276 | >>> dt.year(x) 277 | """ 278 | return _pull_int(x, lambda y: y.year) 279 | -------------------------------------------------------------------------------- /dataiter/dtypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2024 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import warnings 24 | 25 | from numpy.dtypes import StringDType 26 | 27 | #: Instance of NumPy variable-width StringDType used 28 | string = StringDType(na_object="") 29 | 30 | # Use a blank string as missing value sentinel (1) because that's what we used 31 | # prior to the NumPy 2.0 StringDType and (2) because in many cases, such as CSV 32 | # input, a distinction between NA and blank cannot usually be made. 33 | # TODO: Consider changing this to something like ':NA:'. 34 | # https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support 35 | 36 | # Ignore pointless warnings about using StringDType in numpy.savez. 
37 | _pattern = "Custom dtypes are saved as python objects using the pickle protocol" 38 | warnings.filterwarnings("ignore", message=_pattern, category=UserWarning) 39 | -------------------------------------------------------------------------------- /dataiter/geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import json 24 | 25 | from attd import AttributeDict 26 | from dataiter import DataFrame 27 | from dataiter import DataFrameColumn 28 | from dataiter import util 29 | from dataiter import Vector 30 | 31 | class GeoJSON(DataFrame): 32 | 33 | """ 34 | A class for GeoJSON data. 35 | 36 | GeoJSON is a simple wrapper class that reads GeoJSON features into a 37 | :class:`.DataFrame`. Any operations on the data are thus done with methods 38 | provided by the data frame class. Geometry is available in the "geometry" 39 | column, but no special geometric operations are supported. All other data 40 | is available in the "metadata" attribute as an ``attd.AttributeDict``. 41 | """ 42 | 43 | # List of names that are actual attributes, not columns 44 | ATTRIBUTES = DataFrame.ATTRIBUTES + ["metadata"] 45 | 46 | # Lists of supported GeoJSON keys and types 47 | FEATURE_KEYS = ["type", "properties", "geometry"] 48 | FEATURE_TYPES = ["Feature"] 49 | PROPERTY_TYPES = [bool, int, float, str, type(None)] 50 | TOP_LEVEL_TYPES = ["FeatureCollection"] 51 | 52 | def __init__(self, *args, **kwargs): 53 | """ 54 | Return a new GeoJSON object. 55 | 56 | `args` and `kwargs` are like for ``dict``. 
57 | 58 | https://docs.python.org/3/library/stdtypes.html#dict 59 | """ 60 | super().__init__(*args, **kwargs) 61 | self.metadata = AttributeDict(type="FeatureCollection") 62 | 63 | @classmethod 64 | def _check_raw_data(cls, data): 65 | if data.type not in cls.TOP_LEVEL_TYPES: 66 | raise TypeError(f"Top-level type {data.type!r} not supported") 67 | warned_feature_keys = [] 68 | for feature in data.features: 69 | cls._check_raw_feature(feature, warned_feature_keys) 70 | 71 | @classmethod 72 | def _check_raw_feature(cls, feature, warned_feature_keys): 73 | if feature.type not in cls.FEATURE_TYPES: 74 | raise TypeError(f"Feature type {feature.type!r} not supported") 75 | for key in set(feature) - set(cls.FEATURE_KEYS): 76 | if key in warned_feature_keys: continue 77 | print(f"Warning: Ignoring feature key {key!r}") 78 | warned_feature_keys.append(key) 79 | for key, value in feature.properties.items(): 80 | if isinstance(value, tuple(cls.PROPERTY_TYPES)): continue 81 | raise TypeError(f"Property type {type(value)} of {key!r} not supported") 82 | 83 | @classmethod 84 | def read(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs): 85 | """ 86 | Return data from GeoJSON file `path`. 87 | 88 | Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``. 89 | `columns` is an optional list of columns to limit to. `dtypes` is an 90 | optional dict mapping column names to NumPy datatypes. `kwargs` are 91 | passed to ``json.load``. 92 | """ 93 | with util.xopen(path, "rt", encoding=encoding) as f: 94 | raw = AttributeDict(json.load(f, **kwargs)) 95 | cls._check_raw_data(raw) 96 | data = {} 97 | for feature in raw.features: 98 | for key in feature.properties: 99 | data.setdefault(key, []) 100 | if columns: 101 | data = {k: v for k, v in data.items() if k in columns} 102 | for feature in raw.features: 103 | for key in data: 104 | value = feature.properties.get(key, None) 105 | data[key].append(value) 106 | data["geometry"] = [x.geometry for x in raw.features] 107 | for name, dtype in dtypes.items(): 108 | data[name] = DataFrameColumn(data[name], dtype) 109 | data = cls(**data) 110 | del raw.features 111 | data.metadata = raw 112 | return data 113 | 114 | def to_data_frame(self, drop_geometry=False): 115 | """ 116 | Return GeoJSON converted to a regular data frame. 117 | """ 118 | data = dict.copy(self) 119 | if drop_geometry: 120 | data.pop("geometry", None) 121 | return DataFrame(**data) 122 | 123 | def to_string(self, *, max_rows=None, max_width=None): 124 | if "geometry" in self.colnames: 125 | geometry = [f"<{x['type']}>" for x in self.geometry] 126 | self = self.modify(geometry=Vector.fast(geometry, object)) 127 | return DataFrame.to_string(self, max_rows=max_rows, max_width=max_width) 128 | 129 | def write(self, path, *, encoding="utf-8", **kwargs): 130 | """ 131 | Write data to GeoJSON file `path`. 132 | 133 | Will automatically compress if `path` ends in ``.bz2|.gz|.xz``. 134 | `kwargs` are passed to ``json.dumps``. 
135 | """ 136 | kwargs.setdefault("default", str) 137 | kwargs.setdefault("ensure_ascii", False) 138 | indent_width = kwargs.pop("indent", 2) or 0 139 | indent1 = " " * indent_width * 1 140 | indent2 = " " * indent_width * 2 141 | if "geometry" not in self: 142 | raise ValueError("Geometry missing") 143 | data = self.to_list_of_dicts() 144 | util.makedirs_for_file(path) 145 | with util.xopen(path, "wt", encoding=encoding) as f: 146 | f.write("{\n") 147 | for key, value in self.metadata.items(): 148 | blob = json.dumps(value, **kwargs) 149 | f.write(f'{indent1}"{key}": {blob},\n') 150 | f.write(f'{indent1}"features": [\n') 151 | for i, item in enumerate(data): 152 | geometry = item.pop("geometry") 153 | blob = {"type": "Feature", "properties": item, "geometry": geometry} 154 | blob = json.dumps(blob, **kwargs) 155 | comma = "," if i < len(data) - 1 else "" 156 | f.write(f"{indent2}{blob}{comma}\n") 157 | f.write(f"{indent1}]\n") 158 | f.write("}\n") 159 | -------------------------------------------------------------------------------- /dataiter/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | from dataiter import DataFrame 24 | from dataiter import GeoJSON 25 | from dataiter import ListOfDicts 26 | from dataiter import util 27 | 28 | def read_csv(path, *, encoding="utf-8", sep=",", header=True, columns=[], dtypes={}): 29 | return DataFrame.read_csv(path, 30 | encoding=encoding, 31 | sep=sep, 32 | header=header, 33 | columns=columns, 34 | dtypes=dtypes) 35 | 36 | def read_geojson(path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs): 37 | return GeoJSON.read(path, 38 | encoding=encoding, 39 | columns=columns, 40 | dtypes=dtypes, 41 | **kwargs) 42 | 43 | def read_json(path, *, encoding="utf-8", keys=[], types={}, **kwargs): 44 | return ListOfDicts.read_json(path, 45 | encoding=encoding, 46 | keys=keys, 47 | types=types, 48 | **kwargs) 49 | 50 | def read_npz(path, *, allow_pickle=True): 51 | return DataFrame.read_npz(path, allow_pickle=allow_pickle) 52 | 53 | def read_parquet(path, *, columns=[], dtypes={}): 54 | return DataFrame.read_parquet(path, columns=columns, dtypes=dtypes) 55 | 56 | read_csv.__doc__ = util.format_alias_doc(read_csv, DataFrame.read_csv) 57 | read_geojson.__doc__ = util.format_alias_doc(read_geojson, GeoJSON.read) 58 | read_json.__doc__ = util.format_alias_doc(read_json, ListOfDicts.read_json) 59 | read_npz.__doc__ = util.format_alias_doc(read_npz, DataFrame.read_npz) 60 | read_parquet.__doc__ = util.format_alias_doc(read_parquet, DataFrame.read_parquet) 61 | -------------------------------------------------------------------------------- /dataiter/regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2025 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import numpy as np 24 | import re 25 | 26 | from dataiter import dtypes 27 | from dataiter import util 28 | from dataiter import Vector 29 | from numpy.dtypes import StringDType 30 | 31 | def _prep(string, dtype, default): 32 | assert isinstance(string, np.ndarray) 33 | assert isinstance(string.dtype, StringDType) 34 | out = np.full_like(string, default, dtype) 35 | na = string == dtypes.string.na_object 36 | return out, na 37 | 38 | def findall(pattern, string, flags=0): 39 | """ 40 | Return a list of matches of `pattern` in `string`.
41 | 42 | https://docs.python.org/3/library/re.html#re.findall 43 | 44 | >>> x = di.Vector(["asdf", "1234"]) 45 | >>> regex.findall(r"[a-z]", x) 46 | """ 47 | if util.is_scalar(string): 48 | return re.findall(pattern, string, flags=flags) 49 | out, na = _prep(string, object, None) 50 | for i in np.flatnonzero(~na): 51 | out[i] = re.findall(pattern, string[i], flags=flags) 52 | return Vector.fast(out, object) 53 | 54 | def fullmatch(pattern, string, flags=0): 55 | """ 56 | Return a ``re.Match`` object or ``None``. 57 | 58 | https://docs.python.org/3/library/re.html#re.fullmatch 59 | 60 | >>> x = di.Vector(["asdf", "1234"]) 61 | >>> regex.fullmatch(r"[a-z]+", x) 62 | """ 63 | if util.is_scalar(string): 64 | return re.fullmatch(pattern, string, flags=flags) 65 | out, na = _prep(string, object, None) 66 | for i in np.flatnonzero(~na): 67 | out[i] = re.fullmatch(pattern, string[i], flags=flags) 68 | return Vector.fast(out, object) 69 | 70 | def match(pattern, string, flags=0): 71 | """ 72 | Return a ``re.Match`` object or ``None``. 73 | 74 | https://docs.python.org/3/library/re.html#re.match 75 | 76 | >>> x = di.Vector(["asdf", "1234"]) 77 | >>> regex.match(r"[a-z]", x) 78 | """ 79 | if util.is_scalar(string): 80 | return re.match(pattern, string, flags=flags) 81 | out, na = _prep(string, object, None) 82 | for i in np.flatnonzero(~na): 83 | out[i] = re.match(pattern, string[i], flags=flags) 84 | return Vector.fast(out, object) 85 | 86 | def search(pattern, string, flags=0): 87 | """ 88 | Return a ``re.Match`` object or ``None``. 89 | 90 | https://docs.python.org/3/library/re.html#re.search 91 | 92 | >>> x = di.Vector(["asdf", "1234"]) 93 | >>> regex.search(r"[a-z]", x) 94 | """ 95 | if util.is_scalar(string): 96 | return re.search(pattern, string, flags=flags) 97 | out, na = _prep(string, object, None) 98 | for i in np.flatnonzero(~na): 99 | out[i] = re.search(pattern, string[i], flags=flags) 100 | return Vector.fast(out, object) 101 | 102 | def split(pattern, string, maxsplit=0, flags=0): 103 | """ 104 | Return a list of `string` split by `pattern`. 105 | 106 | https://docs.python.org/3/library/re.html#re.split 107 | 108 | >>> x = di.Vector(["one two three", "four"]) 109 | >>> regex.split(r" +", x) 110 | """ 111 | if util.is_scalar(string): 112 | return re.split(pattern, string, maxsplit=maxsplit, flags=flags) 113 | out, na = _prep(string, object, None) 114 | for i in np.flatnonzero(~na): 115 | out[i] = re.split(pattern, string[i], maxsplit=maxsplit, flags=flags) 116 | return Vector.fast(out, object) 117 | 118 | def sub(pattern, repl, string, count=0, flags=0): 119 | """ 120 | Return `string` with instances of `pattern` replaced with `repl`. 121 | 122 | https://docs.python.org/3/library/re.html#re.sub 123 | 124 | >>> x = di.Vector(["great", "fantastic"]) 125 | >>> regex.sub(r"$", r"!", x) 126 | """ 127 | if util.is_scalar(string): 128 | return re.sub(pattern, repl, string, count=count, flags=flags) 129 | out, na = _prep(string, dtypes.string, dtypes.string.na_object) 130 | for i in np.flatnonzero(~na): 131 | out[i] = re.sub(pattern, repl, string[i], count=count, flags=flags) 132 | return Vector.fast(out, str) 133 | 134 | def subn(pattern, repl, string, count=0, flags=0): 135 | """ 136 | Return `string`, count of instances of `pattern` replaced with `repl`. 
137 | 138 | https://docs.python.org/3/library/re.html#re.subn 139 | 140 | >>> x = di.Vector(["great", "fantastic"]) 141 | >>> regex.subn(r"$", r"!", x) 142 | """ 143 | if util.is_scalar(string): 144 | return re.subn(pattern, repl, string, count=count, flags=flags) 145 | out, na = _prep(string, object, None) 146 | for i in np.flatnonzero(~na): 147 | out[i] = re.subn(pattern, repl, string[i], count=count, flags=flags) 148 | return Vector.fast(out, object) 149 | -------------------------------------------------------------------------------- /dataiter/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
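# A short sketch of the vectorized regex helpers above: on a Vector, the
# blank string is the missing-value sentinel and the corresponding element
# stays missing in the output; on a scalar, the plain ``re`` result is
# returned. Inputs are illustrative.
#
# >>> import dataiter as di
# >>> from dataiter import regex
# >>> x = di.Vector(["great", "fantastic", ""])
# >>> regex.sub(r"$", r"!", x)         # "great!", "fantastic!", and a missing value
# >>> regex.findall(r"[a-z]", "asdf")  # scalar in, plain list out: ['a', 's', 'd', 'f']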
22 | 23 | import functools 24 | 25 | from dataiter import DataFrame 26 | from dataiter import GeoJSON 27 | from dataiter import ListOfDicts 28 | from pathlib import Path 29 | 30 | def cached(function): 31 | cache = {} 32 | @functools.wraps(function) 33 | def wrapper(path): 34 | if path not in cache: 35 | cache[path] = function(path) 36 | return cache[path].deepcopy() 37 | return wrapper 38 | 39 | @cached 40 | def data_frame(name): 41 | path = get_data_path(name) 42 | extension = path.suffix.lstrip(".") 43 | read = getattr(DataFrame, f"read_{extension}") 44 | return read(path) 45 | 46 | @cached 47 | def geojson(name): 48 | path = get_data_path(name) 49 | return GeoJSON.read(path) 50 | 51 | def get_data_path(name): 52 | for parent in Path(__file__).parents: 53 | path = parent / "data" / name 54 | if path.exists(): 55 | return path 56 | 57 | @cached 58 | def list_of_dicts(name): 59 | path = get_data_path(name) 60 | extension = path.suffix.lstrip(".") 61 | read = getattr(ListOfDicts, f"read_{extension}") 62 | return read(path) 63 | -------------------------------------------------------------------------------- /dataiter/test/test_aggregate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
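# A sketch of how the cached test-data loaders above are meant to be used by
# the test modules that follow. Each loader caches by file name and returns a
# deep copy, so mutating one fixture cannot leak into another test; the file
# and column names below are from the bundled listings data.
#
# >>> from dataiter import test
# >>> a = test.data_frame("listings.csv")
# >>> b = test.data_frame("listings.csv")
# >>> a.price[0] = 0   # only ``a`` changes; ``b`` still holds the original value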
22 | 23 | import dataiter 24 | import datetime 25 | import numpy as np 26 | import pytest 27 | 28 | from dataiter import DataFrame 29 | from dataiter import Vector 30 | from dataiter.aggregate import all 31 | from dataiter.aggregate import any 32 | from dataiter.aggregate import count 33 | from dataiter.aggregate import count_unique 34 | from dataiter.aggregate import first 35 | from dataiter.aggregate import last 36 | from dataiter.aggregate import max 37 | from dataiter.aggregate import mean 38 | from dataiter.aggregate import median 39 | from dataiter.aggregate import min 40 | from dataiter.aggregate import mode 41 | from dataiter.aggregate import nth 42 | from dataiter.aggregate import quantile 43 | from dataiter.aggregate import std 44 | from dataiter.aggregate import sum 45 | from dataiter.aggregate import var 46 | from unittest.mock import patch 47 | 48 | T = True 49 | F = False 50 | 51 | D1 = datetime.date.today() 52 | D2 = D1 + datetime.timedelta(days=1) 53 | D3 = D1 + datetime.timedelta(days=2) 54 | D4 = D1 + datetime.timedelta(days=3) 55 | D5 = D1 + datetime.timedelta(days=4) 56 | D6 = D1 + datetime.timedelta(days=5) 57 | D7 = D1 + datetime.timedelta(days=6) 58 | 59 | NaN = np.nan 60 | NaT = np.datetime64("NaT") 61 | 62 | EMPTY_VECTOR = Vector([], float) 63 | GROUPS = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5] 64 | 65 | nth0 = lambda x: nth(x, 0) 66 | quantile05 = lambda x: quantile(x, 0.5) 67 | 68 | TEST_MATRIX = [ 69 | 70 | # NaNs evaluate to true, because they are not equal to zero. 71 | # https://numpy.org/doc/stable/reference/generated/numpy.all.html 72 | (all, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 73 | (all, [1, 2, 3, 4, 5, 0, 0, 0, 0, 0], [T, T, F, F, F]), 74 | (all, [0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, NaN, NaN], [F, F, T, T, T]), 75 | (all, [D1, D2, D3, D4, D5, NaT, NaT, NaT, NaT, NaT], [T, T, T, T, T]), 76 | (all, ["a", "b", "c", "d", "e", "", "", "", "", ""], [T, T, T, T, T]), 77 | 78 | # NaNs evaluate to true, because they are not equal to zero. 79 | # https://numpy.org/doc/stable/reference/generated/numpy.any.html 80 | (any, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 81 | (any, [1, 2, 3, 4, 5, 0, 0, 0, 0, 0], [T, T, T, F, F]), 82 | (any, [0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, NaN, NaN, NaN], [F, T, T, T, T]), 83 | (any, [D1, D2, D3, D4, D5, NaT, NaT, NaT, NaT, NaT], [T, T, T, T, T]), 84 | (any, ["a", "b", "c", "d", "e", "", "", "", "", ""], [T, T, T, T, T]), 85 | 86 | (count, [T, T, T, T, T, F, F, F, F, F], [2, 2, 2, 2, 2]), 87 | (count, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 2, 2, 2, 2]), 88 | (count, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2, 2, 2, 2, 2]), 89 | (count, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [2, 2, 2, 2, 2]), 90 | (count, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], [2, 2, 2, 2, 2]), 91 | 92 | # NaN is not considered equal to itself and thus all are counted here. 
93 | (count_unique, [T, T, T, T, T, F, F, F, F, F], [1, 1, 2, 1, 1]), 94 | (count_unique, [1, 1, 3, 3, 5, 6, 7, 8, 9, 10], [1, 1, 2, 2, 2]), 95 | (count_unique, [1.0, 1.0, 3.0, 3.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1, 1, 2, 2, 2]), 96 | (count_unique, [D1, D1, D3, D3, D5, D6, D7, NaT, NaT, NaT], [1, 1, 2, 2, 2]), 97 | (count_unique, ["a", "a", "c", "c", "e", "f", "g", "", "", ""], [1, 1, 2, 2, 1]), 98 | 99 | (first, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 100 | (first, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 101 | (first, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 102 | (first, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 103 | (first, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 104 | 105 | (last, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 106 | (last, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10]), 107 | (last, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2.0, 4.0, 6.0, NaN, NaN]), 108 | (last, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D2, D4, D6, NaT, NaT]), 109 | (last, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["b", "d", "f", "", ""]), 110 | 111 | (max, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 112 | (max, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8, 10]), 113 | (max, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [2.0, 4.0, 6.0, 7.0, NaN]), 114 | (max, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D2, D4, D6, D7, NaT]), 115 | 116 | (mean, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 117 | (mean, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 118 | (mean, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 119 | 120 | (median, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 121 | (median, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 122 | (median, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 123 | 124 | (min, [T, T, T, T, T, F, F, F, F, F], [T, T, F, F, F]), 125 | (min, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 126 | (min, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 127 | (min, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 128 | 129 | (mode, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 130 | (mode, [1, 1, 3, 3, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 131 | (mode, [1.0, 1.0, 3.0, 3.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 132 | (mode, [D1, D1, D3, D3, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 133 | (mode, ["a", "a", "c", "c", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 134 | 135 | (nth0, [T, T, T, T, T, F, F, F, F, F], [T, T, T, F, F]), 136 | (nth0, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 3, 5, 7, 9]), 137 | (nth0, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.0, 3.0, 5.0, 7.0, NaN]), 138 | (nth0, [D1, D2, D3, D4, D5, D6, D7, NaT, NaT, NaT], [D1, D3, D5, D7, NaT]), 139 | (nth0, ["a", "b", "c", "d", "e", "f", "g", "", "", ""], ["a", "c", "e", "g", ""]), 140 | 141 | (quantile05, [T, T, T, T, T, F, F, F, F, F], [1.0, 1.0, 0.5, 0.0, 0.0]), 142 | (quantile05, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1.5, 3.5, 5.5, 7.5, 9.5]), 143 | (quantile05, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [1.5, 3.5, 5.5, 7.0, NaN]), 144 | 145 | (std, [T, T, T, T, T, F, F, F, F, F], [0.0, 0.0, 0.5, 0.0, 0.0]), 146 | (std, [1, 1, 2, 3, 5, 7, 8, 12, 13, 21], [0.0, 0.5, 1.0, 2.0, 4.0]), 147 | (std, [1.0, 1.0, 2.0, 3.0, 5.0, 
7.0, 8.0, NaN, NaN, NaN], [0.0, 0.5, 1.0, NaN, NaN]), 148 | 149 | (sum, [T, T, T, T, T, F, F, F, F, F], [2, 2, 1, 0, 0]), 150 | (sum, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [3, 7, 11, 15, 19]), 151 | (sum, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, NaN, NaN, NaN], [3.0, 7.0, 11.0, 7.0, 0.0]), 152 | 153 | (var, [T, T, T, T, T, F, F, F, F, F], [0.0, 0.0, 0.25, 0.0, 0.0]), 154 | (var, [1, 1, 2, 3, 5, 7, 8, 12, 13, 21], [0.0, 0.25, 1.0, 4.0, 16.0]), 155 | (var, [1.0, 1.0, 2.0, 3.0, 5.0, 7.0, 8.0, NaN, NaN, NaN], [0.0, 0.25, 1.0, NaN, NaN]), 156 | 157 | ] 158 | 159 | class TestAggregate: 160 | 161 | @pytest.mark.parametrize("use_numba", [False, True]) 162 | @pytest.mark.parametrize("function,input,output", TEST_MATRIX) 163 | def test_aggregate(self, function, input, output, use_numba): 164 | if use_numba and not dataiter.USE_NUMBA: 165 | pytest.skip("No Numba") 166 | with patch("dataiter.USE_NUMBA", use_numba): 167 | data = DataFrame(g=GROUPS, a=input) 168 | stat = data.group_by("g").aggregate(a=function("a")) 169 | expected = Vector(output) 170 | try: 171 | assert stat.a.equal(expected) 172 | except AssertionError: 173 | print("") 174 | print(data) 175 | print("Expected:") 176 | print(expected) 177 | print("Got:") 178 | print(stat.a) 179 | raise 180 | 181 | @pytest.mark.parametrize("use_numba", [False, True]) 182 | def test_aggregate_count(self, use_numba): 183 | if use_numba and not dataiter.USE_NUMBA: 184 | pytest.skip("No Numba") 185 | with patch("dataiter.USE_NUMBA", use_numba): 186 | data = DataFrame(g=GROUPS) 187 | stat = data.group_by("g").aggregate(n=count()) 188 | assert (stat.n == 2).all() 189 | 190 | def test_all(self): 191 | assert all(EMPTY_VECTOR) 192 | assert all(Vector([T, T])) 193 | assert not all(Vector([T, F])) 194 | assert not all(Vector([F, F])) 195 | 196 | def test_any(self): 197 | assert not any(EMPTY_VECTOR) 198 | assert any(Vector([T, T])) 199 | assert any(Vector([T, F])) 200 | assert not any(Vector([F, F])) 201 | 202 | def test_count(self): 203 | assert count(EMPTY_VECTOR) == 0 204 | assert count(Vector([1])) == 1 205 | assert count(Vector([1, 2])) == 2 206 | assert count(Vector([1, 2, NaN])) == 3 207 | assert count(Vector([1, 2, NaN]), drop_na=True) == 2 208 | 209 | def test_count_unique(self): 210 | assert count_unique(EMPTY_VECTOR) == 0 211 | assert count_unique(Vector([1])) == 1 212 | assert count_unique(Vector([1, 1])) == 1 213 | assert count_unique(Vector([1, 1, 2])) == 2 214 | assert count_unique(Vector([1, 1, 2, NaN])) == 3 215 | assert count_unique(Vector([1, 1, 2, NaN]), drop_na=True) == 2 216 | 217 | def test_first(self): 218 | assert first(Vector([1, 2, 3])) == 1 219 | assert first(Vector([NaN, 1, 2]), drop_na=True) == 1 220 | 221 | def test_first_nan(self): 222 | assert np.isnan(first(EMPTY_VECTOR)) 223 | assert np.isnan(first(Vector([NaN, 1, 2]))) 224 | 225 | def test_last(self): 226 | assert last(Vector([1, 2, 3])) == 3 227 | assert last(Vector([1, 2, NaN]), drop_na=True) == 2 228 | 229 | def test_last_nan(self): 230 | assert np.isnan(last(EMPTY_VECTOR)) 231 | assert np.isnan(last(Vector([1, 2, NaN]))) 232 | 233 | def test_max(self): 234 | assert max(Vector([3, 2, 1])) == 3 235 | assert max(Vector([3, 2, NaN])) == 3 236 | 237 | def test_max_nan(self): 238 | assert np.isnan(max(EMPTY_VECTOR)) 239 | assert np.isnan(max(Vector([3, 2, NaN]), drop_na=False)) 240 | 241 | def test_mean(self): 242 | assert np.isclose(mean(Vector([1, 2, 10])), 4.333333) 243 | assert np.isclose(mean(Vector([1, 2, NaN])), 1.5) 244 | 245 | def test_mean_nan(self): 246 | assert 
np.isnan(mean(EMPTY_VECTOR)) 247 | assert np.isnan(mean(Vector([1, 2, NaN]), drop_na=False)) 248 | 249 | def test_median(self): 250 | assert median(Vector([1, 4, 6, 8, 5])) == 5 251 | assert median(Vector([1, 4, 6, NaN, NaN])) == 4 252 | 253 | def test_median_nan(self): 254 | assert np.isnan(median(EMPTY_VECTOR)) 255 | assert np.isnan(median(Vector([1, 4, NaN]), drop_na=False)) 256 | 257 | def test_min(self): 258 | assert min(Vector([3, 2, 1])) == 1 259 | assert min(Vector([3, 2, NaN])) == 2 260 | 261 | def test_min_nan(self): 262 | assert np.isnan(min(EMPTY_VECTOR)) 263 | assert np.isnan(min(Vector([3, 2, NaN]), drop_na=False)) 264 | 265 | def test_mode(self): 266 | assert mode(Vector([1])) == 1 267 | assert mode(Vector([1, 2])) == 1 268 | assert mode(Vector([1, 2, 2])) == 2 269 | assert mode(Vector([1, 2, 2, NaN])) == 2 270 | assert mode(Vector([1, 2, 2, NaN]), drop_na=False) == 2 271 | 272 | def test_mode_nan(self): 273 | assert np.isnan(mode(EMPTY_VECTOR)) 274 | assert np.isnan(mode(Vector([NaN, NaN], float), drop_na=False)) 275 | 276 | def test_nth(self): 277 | assert nth(Vector([1, 2, 3]), 0) == 1 278 | assert nth(Vector([NaN, 1, 2]), 0, drop_na=True) == 1 279 | 280 | def test_nth_nan(self): 281 | assert np.isnan(nth(EMPTY_VECTOR, 0)) 282 | assert np.isnan(nth(Vector([NaN, 1, 2]), 0)) 283 | 284 | def test_quantile(self): 285 | assert quantile(Vector([1, 4, 6, 8, 5]), 0.5) == 5 286 | assert quantile(Vector([1, 4, 6, NaN, NaN]), 0.5) == 4 287 | 288 | def test_quantile_nan(self): 289 | assert np.isnan(quantile(EMPTY_VECTOR, 0.5)) 290 | assert np.isnan(quantile(Vector([1, 4, NaN]), 0.5, drop_na=False)) 291 | 292 | def test_std(self): 293 | assert np.isclose(std(Vector([3, 6, 7])), 1.699673) 294 | assert np.isclose(std(Vector([3, 6, NaN])), 1.5) 295 | 296 | def test_std_nan(self): 297 | assert np.isnan(std(EMPTY_VECTOR)) 298 | assert np.isnan(std(Vector([1]))) 299 | assert np.isnan(std(Vector([3, 6, NaN]), drop_na=False)) 300 | 301 | def test_sum(self): 302 | assert sum(EMPTY_VECTOR) == 0 303 | assert sum(Vector([1])) == 1 304 | assert sum(Vector([1, 2])) == 3 305 | assert sum(Vector([1, 2, NaN])) == 3 306 | 307 | def test_sum_nan(self): 308 | assert np.isnan(sum(Vector([1, 2, NaN]), drop_na=False)) 309 | 310 | def test_var(self): 311 | assert np.isclose(var(Vector([3, 6, 7])), 2.888889) 312 | assert np.isclose(var(Vector([3, 6, NaN])), 2.25) 313 | 314 | def test_var_nan(self): 315 | assert np.isnan(var(EMPTY_VECTOR)) 316 | assert np.isnan(var(Vector([1]))) 317 | assert np.isnan(var(Vector([3, 6, NaN]), drop_na=False)) 318 | -------------------------------------------------------------------------------- /dataiter/test/test_dt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import numpy as np 24 | 25 | from dataiter import dt 26 | from dataiter import Vector 27 | 28 | NaT = np.datetime64("NaT") 29 | 30 | class TestDT: 31 | 32 | def test_day(self): 33 | a = dt.new(["2022-10-15", NaT]) 34 | assert dt.day(a).tolist() == [15, None] 35 | 36 | def test_day_nat(self): 37 | assert np.isnan(dt.day(NaT)) 38 | 39 | def test_day_numpy(self): 40 | a = np.array(["2022-10-15", NaT], np.datetime64) 41 | assert dt.day(a).tolist() == [15, None] 42 | 43 | def test_day_scalar(self): 44 | x = np.datetime64("2022-10-15") 45 | assert dt.day(x) == 15 46 | 47 | def test_from_string_date(self): 48 | a = Vector(["14.11.2022", ""], str) 49 | b = dt.from_string(a, "%d.%m.%Y") 50 | assert b.is_datetime() 51 | assert b[0] == np.datetime64("2022-11-14") 52 | assert np.isnat(b[1]) 53 | 54 | def test_from_string_datetime(self): 55 | a = Vector(["14.11.2022 22:49", ""], str) 56 | b = dt.from_string(a, "%d.%m.%Y %H:%M") 57 | assert b.is_datetime() 58 | assert b[0] == np.datetime64("2022-11-14T22:49:00") 59 | assert np.isnat(b[1]) 60 | 61 | def test_hour(self): 62 | a = dt.new(["2022-10-15T12:34:56", NaT]) 63 | assert dt.hour(a).tolist() == [12, None] 64 | 65 | def test_isoweek(self): 66 | a = dt.new(["2022-10-15", NaT]) 67 | assert dt.isoweek(a).tolist() == [41, None] 68 | 69 | def test_isoweekday(self): 70 | a = dt.new(["2022-10-15", NaT]) 71 | assert dt.isoweekday(a).tolist() == [6, None] 72 | 73 | def test_microsecond(self): 74 | a = dt.new(["2022-10-15T12:34:56.789", NaT]) 75 | assert dt.microsecond(a).tolist() == [789_000, None] 76 | 77 | def test_minute(self): 78 | a = dt.new(["2022-10-15T12:34:56", NaT]) 79 | assert dt.minute(a).tolist() == [34, None] 80 | 81 | def test_month(self): 82 | a = dt.new([NaT, NaT]) 83 | assert dt.month(a).tolist() == [None, None] 84 | a = dt.new(["2022-10-15", NaT]) 85 | assert dt.month(a).tolist() == [10, None] 86 | a = dt.new(["2022-10-15", "2022-11-15"]) 87 | assert dt.month(a).tolist() == [10, 11] 88 | 89 | def test_new_date(self): 90 | a = dt.new(["2022-10-15"]) 91 | b = Vector(["2022-10-15"]).as_date() 92 | assert a.equal(b) 93 | 94 | def test_new_datetime(self): 95 | a = dt.new(["2022-10-15T12:00:00"]) 96 | b = Vector(["2022-10-15T12:00:00"]).as_datetime() 97 | assert a.equal(b) 98 | 99 | def test_new_scalar(self): 100 | a = dt.new("2022-10-15") 101 | b = np.datetime64("2022-10-15") 102 | assert a == b 103 | 104 | def test_now(self): 105 | assert isinstance(dt.now(), np.datetime64) 106 | 107 | def test_quarter(self): 108 | a = dt.new("2022-10-15") 109 | assert dt.quarter(a) == 4 110 | assert np.isnan(dt.quarter(NaT)) 111 | a = dt.new(["2022-10-15"]) 112 | assert dt.quarter(a).tolist() == [4] 113 | a = dt.new(["2022-10-15", NaT]) 114 | assert dt.quarter(a).tolist() == [4, None] 115 | 116 | def test_replace(self): 117 | a = dt.new(["2022-10-15", NaT]) 118 | b = dt.new(["2022-01-01", NaT]) 119 | assert dt.replace(a, month=1, day=1).equal(b) 120 | 121 | def test_replace_vector(self): 122 | a = dt.new(["2023-08-09", "2023-08-10", 
"2023-08-11"]) 123 | b = dt.new(["2023-07-01", "2023-07-02", "2023-07-03"]) 124 | assert dt.replace(a, month=7, day=[1, 2, 3]).equal(b) 125 | 126 | def test_replace_vector_1m(self): 127 | a = np.repeat(dt.new("2023-08-09"), 1_000_000) 128 | month = np.repeat([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 100_000) 129 | assert len(month) == len(a) == 1_000_000 130 | b = dt.replace(a, month=month, day=1) 131 | assert len(b) == len(a) 132 | 133 | def test_second(self): 134 | a = dt.new(["2022-10-15T12:34:56", NaT]) 135 | assert dt.second(a).tolist() == [56, None] 136 | 137 | def test_to_string_date(self): 138 | a = dt.new(["2022-11-14", NaT]) 139 | b = dt.to_string(a, "%d.%m.%Y") 140 | assert b.is_string() 141 | assert b.tolist() == ["14.11.2022", None] 142 | 143 | def test_to_string_datetime(self): 144 | a = dt.new(["2022-11-14T22:49:00", NaT]) 145 | b = dt.to_string(a, "%Y%m%d-%H%M%S") 146 | assert b.is_string() 147 | assert b.tolist() == ["20221114-224900", None] 148 | 149 | def test_today(self): 150 | assert isinstance(dt.today(), np.datetime64) 151 | 152 | def test_weekday(self): 153 | a = dt.new(["2022-10-15", NaT]) 154 | assert dt.weekday(a).tolist() == [5, None] 155 | 156 | def test_year(self): 157 | a = dt.new(["2022-10-15", NaT]) 158 | assert dt.year(a).tolist() == [2022, None] 159 | -------------------------------------------------------------------------------- /dataiter/test/test_geojson.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import tempfile 24 | 25 | from dataiter import GeoJSON 26 | from dataiter import test 27 | from pathlib import Path 28 | 29 | class TestGeoJSON: 30 | 31 | path = "neighbourhoods.geojson" 32 | 33 | def test_read(self): 34 | path = str(test.get_data_path(self.path)) 35 | data = GeoJSON.read(path) 36 | assert data.nrow == 233 37 | assert data.ncol == 3 38 | 39 | def test_read_columns(self): 40 | path = test.get_data_path(self.path) 41 | data = GeoJSON.read(path, columns=["neighbourhood"]) 42 | assert data.colnames == ["neighbourhood", "geometry"] 43 | 44 | def test_read_dtypes(self): 45 | path = test.get_data_path(self.path) 46 | dtypes = {"neighbourhood": object, "neighbourhood_group": object} 47 | data = GeoJSON.read(path, dtypes=dtypes) 48 | assert data.neighbourhood.is_object() 49 | assert data.neighbourhood_group.is_object() 50 | 51 | def test_to_data_frame(self): 52 | orig = test.geojson(self.path) 53 | data = orig.to_data_frame() 54 | assert data.ncol == orig.ncol 55 | assert data.nrow == orig.nrow 56 | assert not isinstance(data, GeoJSON) 57 | 58 | def test_to_data_frame_drop_geometry(self): 59 | orig = test.geojson(self.path) 60 | data = orig.to_data_frame(drop_geometry=True) 61 | assert data.ncol == orig.ncol - 1 62 | assert data.nrow == orig.nrow 63 | assert not isinstance(data, GeoJSON) 64 | assert "geometry" not in data.colnames 65 | 66 | def test_to_string(self): 67 | data = test.geojson(self.path) 68 | assert data.head(0).to_string() 69 | assert data.head(5).to_string() 70 | 71 | def test_to_string_no_geometry(self): 72 | data = test.geojson(self.path) 73 | del data.geometry 74 | assert data.head(0).to_string() 75 | assert data.head(5).to_string() 76 | 77 | def test_write(self): 78 | orig = test.geojson(self.path) 79 | handle, path = tempfile.mkstemp(".geojson") 80 | orig.write(path) 81 | data = GeoJSON.read(path) 82 | assert data == orig 83 | assert data.metadata == orig.metadata 84 | 85 | def test_write_path(self): 86 | orig = test.geojson(self.path) 87 | handle, path = tempfile.mkstemp(".geojson") 88 | orig.write(Path(path)) 89 | -------------------------------------------------------------------------------- /dataiter/test/test_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2022 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
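# A small sketch following the GeoJSON tests above: per the read/write
# docstrings, compression is applied or removed automatically based on a
# ``.bz2|.gz|.xz`` suffix. The file names are illustrative.
#
# >>> from dataiter import GeoJSON
# >>> data = GeoJSON.read("neighbourhoods.geojson")
# >>> data.write("neighbourhoods.geojson.gz")    # gzip-compressed on write
# >>> GeoJSON.read("neighbourhoods.geojson.gz")  # decompressed transparently on read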
22 | 23 | import inspect 24 | 25 | from dataiter import DataFrame 26 | from dataiter import GeoJSON 27 | from dataiter import io 28 | from dataiter import ListOfDicts 29 | 30 | class TestIO: 31 | 32 | def test_read_csv(self): 33 | s1 = inspect.signature(io.read_csv) 34 | s2 = inspect.signature(DataFrame.read_csv) 35 | assert s1 == s2 36 | 37 | def test_read_geojson(self): 38 | s1 = inspect.signature(io.read_geojson) 39 | s2 = inspect.signature(GeoJSON.read) 40 | assert s1 == s2 41 | 42 | def test_read_json(self): 43 | s1 = inspect.signature(io.read_json) 44 | s2 = inspect.signature(ListOfDicts.read_json) 45 | assert s1 == s2 46 | 47 | def test_read_npz(self): 48 | s1 = inspect.signature(io.read_npz) 49 | s2 = inspect.signature(DataFrame.read_npz) 50 | assert s1 == s2 51 | 52 | def test_read_parquet(self): 53 | s1 = inspect.signature(io.read_parquet) 54 | s2 = inspect.signature(DataFrame.read_parquet) 55 | assert s1 == s2 56 | -------------------------------------------------------------------------------- /dataiter/test/test_regex.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2025 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import re 24 | 25 | from dataiter import regex 26 | from dataiter import Vector 27 | 28 | class TestRegex: 29 | 30 | def test_findall(self): 31 | pattern = r"[a-z]" 32 | string = Vector(["asdf", "1234", ""]) 33 | result = regex.findall(pattern, string) 34 | expected = [["a", "s", "d", "f"], [], None] 35 | assert result.tolist() == expected 36 | assert regex.findall(pattern, string[0]) == result[0] 37 | 38 | def test_fullmatch(self): 39 | pattern = r"[a-z]+" 40 | string = Vector(["asdf", "1234", ""]) 41 | result = regex.fullmatch(pattern, string) 42 | assert isinstance(result[0], re.Match) 43 | assert result[1] is None 44 | assert result[2] is None 45 | match = regex.fullmatch(pattern, string[0]) 46 | assert isinstance(match, re.Match) 47 | 48 | def test_match(self): 49 | pattern = r"[a-z]" 50 | string = Vector(["asdf", "1234", ""]) 51 | result = regex.match(pattern, string) 52 | assert isinstance(result[0], re.Match) 53 | assert result[1] is None 54 | assert result[2] is None 55 | match = regex.match(pattern, string[0]) 56 | assert isinstance(match, re.Match) 57 | 58 | def test_search(self): 59 | pattern = r"[a-z]" 60 | string = Vector(["asdf", "1234", ""]) 61 | result = regex.search(pattern, string) 62 | assert isinstance(result[0], re.Match) 63 | assert result[1] is None 64 | assert result[2] is None 65 | match = regex.search(pattern, string[0]) 66 | assert isinstance(match, re.Match) 67 | 68 | def test_split(self): 69 | pattern = r" +" 70 | string = Vector(["one two three", "four", ""]) 71 | result = regex.split(pattern, string) 72 | expected = [["one", "two", "three"], ["four"], None] 73 | assert result.tolist() == expected 74 | assert regex.split(pattern, string[0]) == result[0] 75 | 76 | def test_sub(self): 77 | pattern = r"$" 78 | repl = "!" 79 | string = Vector(["great", "fantastic", ""]) 80 | result = regex.sub(pattern, repl, string) 81 | expected = ["great!", "fantastic!", None] 82 | assert result.tolist() == expected 83 | assert regex.sub(pattern, repl, string[0]) == result[0] 84 | 85 | def test_subn(self): 86 | pattern = r"$" 87 | repl = "!" 88 | string = Vector(["great", "fantastic", ""]) 89 | result = regex.subn(pattern, repl, string) 90 | expected = [("great!", 1), ("fantastic!", 1), None] 91 | assert result.tolist() == expected 92 | assert regex.subn(pattern, repl, string[0]) == result[0] 93 | -------------------------------------------------------------------------------- /dataiter/test/test_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import datetime 24 | import math 25 | import numpy as np 26 | import tempfile 27 | 28 | from dataiter import util 29 | 30 | class TestUtil: 31 | 32 | def test_count_digits(self): 33 | assert util.count_digits(0) == (0, 0) 34 | assert util.count_digits(0.1) == (0, 1) 35 | assert util.count_digits(1.000) == (1, 0) 36 | assert util.count_digits(123.456) == (3, 3) 37 | assert util.count_digits(1e-1) == (0, 1) 38 | assert util.count_digits(1e-10) == (0, 10) 39 | 40 | def test_count_digits_special(self): 41 | assert util.count_digits(np.nan) == (0, 0) 42 | assert util.count_digits(math.inf) == (0, 0) 43 | 44 | def test_format_floats_1(self): 45 | a = [1/1000000000, 1/1000000, 1/1000, np.nan] 46 | b = ["1e-09", "1e-06", "1e-03", "nan"] 47 | assert util.format_floats(a) == b 48 | 49 | def test_format_floats_2(self): 50 | a = [0.000123456, 0.123456, 0, np.nan] 51 | b = ["0.000123", "0.123456", "0.000000", "nan"] 52 | assert util.format_floats(a) == b 53 | 54 | def test_format_floats_3(self): 55 | a = [0.123456, 1, 123.456, np.nan] 56 | b = ["0.123", "1.000", "123.456", "nan"] 57 | assert util.format_floats(a) == b 58 | 59 | def test_format_floats_4(self): 60 | a = [123.456789, 123456.789, 123456789, np.nan] 61 | b = ["123", "123457", "123456789", "nan"] 62 | assert util.format_floats(a) == b 63 | 64 | def test_format_floats_4_ksep(self): 65 | a = [123.456789, 123456.789, 123456789, np.nan] 66 | b = ["123", "123,457", "123,456,789", "nan"] 67 | assert util.format_floats(a, ksep=",") == b 68 | 69 | def test_format_floats_5(self): 70 | a = [12345678, 1234567812345678, 123456781234567812345678, np.nan] 71 | b = ["1.234568e+07", "1.234568e+15", "1.234568e+23", "nan"] 72 | assert util.format_floats(a) == b 73 | 74 | def test_format_floats_6(self): 75 | a = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] 76 | b = ["0.10000", "0.01000", "0.00100", "0.00010", "0.00001"] 77 | assert util.format_floats(a) == b 78 | 79 | def test_format_floats_inf(self): 80 | a = [-math.inf, 0, math.inf, np.nan] 81 | b = ["-inf", "0e+00", "inf", "nan"] 82 | assert util.format_floats(a) == b 83 | 84 | def test_format_floats_integer(self): 85 | a = [1.0, 2.0, 3.0, np.nan] 86 | b = ["1", "2", "3", "nan"] 87 | assert util.format_floats(a) == b 88 | 89 | def test_generate_colnames(self): 90 | colnames = util.generate_colnames(1000) 91 | assert len(colnames) == 1000 92 | assert len(set(colnames)) == 1000 93 | 94 | def test_get_print_width(self): 95 | assert 0 < util.get_print_width() < 1000 96 | 97 | def test_is_scalar(self): 98 | assert util.is_scalar(None) 99 | assert util.is_scalar(b"") 100 | assert util.is_scalar(1.0) 101 | assert util.is_scalar(1) 102 | assert util.is_scalar("") 103 | assert util.is_scalar(datetime.date.today()) 104 | assert util.is_scalar(datetime.datetime.now()) 105 | assert util.is_scalar(datetime.timedelta(days=1)) 106 | assert not util.is_scalar(np.array([1, 2, 3])) 107 | assert not util.is_scalar([1, 2, 3]) 108 | assert not util.is_scalar((1, 2, 3)) 109 | 110 | def test_length(self): 111 | assert util.length(1) == 1 112 | assert util.length([1]) == 1 113 | assert util.length([1, 2]) == 2 114 | 115 | def test_quote(self): 116 | assert util.quote("hello") == '"hello"' 117 | assert util.quote('"hello"') == '"\\"hello\\""' 118 | 119 | 
def test_sequencify(self): 120 | assert util.sequencify(np.array([1])) == np.array([1]) 121 | assert util.sequencify([1]) == [1] 122 | assert util.sequencify((1,)) == (1,) 123 | assert util.sequencify(None) == [None] 124 | assert util.sequencify(1) == [1] 125 | assert util.sequencify(map(math.sqrt, [1, 4, 9])) == [1, 2, 3] 126 | 127 | def test_ulen(self): 128 | assert util.ulen("asdf") == 4 129 | assert util.ulen("asdf\u200b") == 4 130 | assert util.ulen("asdf\u200b\u200b") == 4 131 | 132 | def test_unique_keys(self): 133 | assert util.unique_keys([1, 2, 3]) == [1, 2, 3] 134 | assert util.unique_keys([1, 2, 3, 1]) == [1, 2, 3] 135 | 136 | def test_unique_types(self): 137 | assert util.unique_types([1, 2, 3.3, np.nan, None]) == {int, float} 138 | 139 | def test_upad(self): 140 | assert util.upad(["a", "aa", "aaa"], align="right") == [" a", " aa", "aaa"] 141 | assert util.upad(["a", "aa", "aaa"], align="left") == ["a ", "aa ", "aaa"] 142 | 143 | def test_utruncate(self): 144 | assert util.utruncate("abcdef", 4) == "abcd" 145 | assert util.utruncate("abc\u200bdef", 4) == "abc\u200bd" 146 | assert util.utruncate("abc\u200bdef\u200b", 4) == "abc\u200bd" 147 | 148 | def test_xopen_bz2(self): 149 | text = "test åäö" 150 | handle, path = tempfile.mkstemp(".bz2") 151 | with util.xopen(path, "wt") as f: 152 | f.write(text) 153 | with util.xopen(path, "rt") as f: 154 | assert f.read() == text 155 | 156 | def test_xopen_gz(self): 157 | text = "test åäö" 158 | handle, path = tempfile.mkstemp(".gz") 159 | with util.xopen(path, "wt") as f: 160 | f.write(text) 161 | with util.xopen(path, "rt") as f: 162 | assert f.read() == text 163 | 164 | def test_xopen_txt(self): 165 | text = "test åäö" 166 | handle, path = tempfile.mkstemp(".txt") 167 | with util.xopen(path, "wt") as f: 168 | f.write(text) 169 | with util.xopen(path, "rt") as f: 170 | assert f.read() == text 171 | 172 | def test_xopen_xz(self): 173 | text = "test åäö" 174 | handle, path = tempfile.mkstemp(".xz") 175 | with util.xopen(path, "wt") as f: 176 | f.write(text) 177 | with util.xopen(path, "rt") as f: 178 | assert f.read() == text 179 | -------------------------------------------------------------------------------- /dataiter/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2020 Osmo Salomaa 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 
22 | 23 | import bz2 24 | import dataiter 25 | import datetime 26 | import gzip 27 | import itertools 28 | import lzma 29 | import math 30 | import numpy as np 31 | import os 32 | import shutil 33 | import string 34 | import wcwidth 35 | 36 | from dataiter import deco 37 | from pathlib import Path 38 | 39 | def count_digits(value): 40 | if np.isnan(value): return 0, 0 41 | if math.isinf(value): return 0, 0 42 | parts = np.format_float_positional(value).split(".") 43 | n = len(parts[0].lstrip("0")) 44 | m = len(parts[1].rstrip("0")) 45 | return n, m 46 | 47 | def format_alias_doc(alias, target): 48 | return f"{target.__doc__}\n\n{' '*8}" + ( 49 | ".. note:: :func:`{}` is a convenience alias for :meth:`{}`." 50 | .format(alias.__name__, target.__qualname__)) 51 | 52 | def format_floats(seq, ksep=None): 53 | precision = dataiter.PRINT_FLOAT_PRECISION 54 | if any(0 < abs(x) < 1/10**precision or abs(x) > 10**16 - 1 for x in seq): 55 | # Format tiny and huge numbers in scientific notation. 56 | f = np.format_float_scientific 57 | return [f(x, precision=precision, trim="-") for x in seq] 58 | if ksep is None: 59 | ksep = dataiter.PRINT_THOUSAND_SEPARATOR 60 | # Format like largest by significant digits. 61 | digits = [count_digits(x) for x in seq] 62 | n = max(x[0] for x in digits) 63 | m = max(x[1] for x in digits) 64 | precision = min(m, max(0, precision - n)) 65 | return [f"{{:,.{precision}f}}".format(x).replace(",", ksep) 66 | for x in seq] 67 | 68 | def generate_colnames(n): 69 | return list(itertools.islice(yield_colnames(), n)) 70 | 71 | def get_print_width(): 72 | return shutil.get_terminal_size((dataiter.PRINT_MAX_WIDTH, 24))[0] - 1 73 | 74 | def is_scalar(value): 75 | # np.isscalar doesn't cover all needed cases. 76 | return (np.isscalar(value) or 77 | value is None or 78 | isinstance(value, (bytes, 79 | bool, 80 | float, 81 | int, 82 | str, 83 | datetime.date, 84 | datetime.datetime, 85 | datetime.timedelta))) 86 | 87 | def length(value): 88 | return 1 if is_scalar(value) else len(value) 89 | 90 | def makedirs_for_file(path): 91 | return Path(path).parent.mkdir(parents=True, exist_ok=True) 92 | 93 | def parse_env_boolean(name): 94 | return { 95 | "1": True, 96 | "t": True, 97 | "true": True, 98 | "y": True, 99 | "yes": True, 100 | "0": False, 101 | "f": False, 102 | "false": False, 103 | "n": False, 104 | "no": False, 105 | }[os.environ[name].strip().lower()] 106 | 107 | def quote(value): 108 | return '"{}"'.format(str(value).replace('"', r'\"')) 109 | 110 | def sequencify(value): 111 | if isinstance(value, (np.ndarray, list, tuple)): 112 | return value 113 | if is_scalar(value): 114 | return [value] 115 | if hasattr(value, "__iter__"): 116 | # Evaluate generator or iterator. 117 | return list(value) 118 | raise ValueError(f"Unexpected type: {type(value)}") 119 | 120 | def unique_keys(keys): 121 | return list(dict.fromkeys(keys)) 122 | 123 | def ulen(string): 124 | # Return the display length of string accounting for 125 | # Unicode characters that have a display width != 1. 126 | length = wcwidth.wcswidth(string) 127 | return length if length >= 0 else 0 128 | 129 | def unique_types(seq): 130 | return set(x.__class__ for x in seq if 131 | x is not None and 132 | not (isinstance(x, float) and np.isnan(x))) 133 | 134 | @deco.listify 135 | def upad(strings, *, align="right"): 136 | # Pad strings for display accounting for 137 | # Unicode characters that have a display width != 1. 
138 | width = max(ulen(x) for x in strings) 139 | for value in strings: 140 | padding = " " * (width - ulen(value)) 141 | yield (padding + value 142 | if align == "right" 143 | else value + padding) 144 | 145 | def utruncate(string, width): 146 | # Truncate string to display width accounting for 147 | # Unicode characters that have a display width != 1. 148 | for i in range(1, len(string)): 149 | if ulen(string[:i]) > width: 150 | return string[:(i-1)] 151 | return string 152 | 153 | def xopen(path, mode="r", **kwargs): 154 | if "b" not in mode: 155 | kwargs.setdefault("encoding", "utf-8") 156 | if str(path).endswith(".bz2"): 157 | kwargs.setdefault("compresslevel", 6) 158 | return bz2.open(path, mode, **kwargs) 159 | if str(path).endswith(".gz"): 160 | kwargs.setdefault("compresslevel", 6) 161 | return gzip.open(path, mode, **kwargs) 162 | if str(path).endswith(".xz"): 163 | return lzma.open(path, mode) 164 | return open(path, mode, **kwargs) 165 | 166 | def yield_colnames(): 167 | # Like Excel: a, b, c, ..., aa, bb, cc, ... 168 | for batch in range(1, 1000): 169 | for letter in string.ascii_lowercase: 170 | yield letter * batch 171 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/aggregation.rst: -------------------------------------------------------------------------------- 1 | Aggregation 2 | =========== 3 | 4 | .. note:: The following applies currently only to the 5 | :class:`.DataFrame` class. Aggregation with a 6 | :class:`.ListOfDicts` is simpler and covered by the 7 | API documentation on :meth:`.ListOfDicts.aggregate`. 8 | 9 | By aggregation, we refer to splitting a data frame into groups based on 10 | the values of one or more columns and then calculating group-wise 11 | summaries, such as the total count or mean of a column. The first step is 12 | called ``group_by`` and the second ``aggregate``, usually written via 13 | method chaining as ``data.group_by(...).aggregate(...)``. 14 | 15 | Below is a simple example of how to calculate the total count and mean 16 | price of Airbnb listings in New York grouped by neighbourhood. The 17 | ``aggregate`` method takes keyword arguments, where each name is the 18 | column for that summary in the output and each value is the function 19 | used to calculate that summary. The return value is a regular data frame. See the following 20 | sections for what kinds of aggregation functions you can use. 21 | 22 | >>> import dataiter as di 23 | >>> data = di.read_csv("data/listings.csv") 24 | >>> data.group_by("hood").aggregate(n=di.count(), price=di.mean("price")) 25 | .
26 | hood n price 27 | >> import dataiter as di 89 | >>> data = di.read_csv("data/listings.csv") 90 | >>> data.group_by("hood").aggregate(n=lambda x: x.nrow, price=lambda x: x.price.mean()) 91 | . 92 | hood n price 93 | `_ code (fast). If you have Numba installed, 107 | then Dataiter will **automatically** use it for aggregation involving 108 | **boolean**, **integer**, **float**, **date**, and **datetime** columns. 109 | If Numba is not available, Dataiter will automatically fall back on the 110 | slower pure Python implementations. The result should be the same, 111 | whether Numba is used or not, excluding some minor rounding or float 112 | precision differences. 113 | 114 | Numba is currently not a hard dependency of Dataiter, so you'll need to 115 | install it separately:: 116 | 117 | pip install -U numba 118 | 119 | When, for a particular version of Dataiter, you first use a 120 | Numba-accelerated aggregation function, the code will be compiled, which 121 | might take a couple seconds. The compiled code is saved in `cache 122 | `_. 123 | After that, using the function from cache will be really fast. In case 124 | you're benchmarking something, note also that on the first use of such a 125 | function in a Python session, the compiled code is loaded from cache on 126 | disk, which takes something like 10–100 ms and further calls will be 127 | faster as there's no more need to load anything. 128 | 129 | .. note:: If you have trouble with Numba, please check the value of 130 | ``di.USE_NUMBA`` to see if Numba has been found. You can also 131 | set ``di.USE_NUMBA = False`` if you have Numba installed, but 132 | it's not working right, or via the environment variable 133 | ``DATAITER_USE_NUMBA=false``. Sometimes it's just the 134 | `caching 135 | `_ 136 | part of Numba that's causing issues. When upgrading you might 137 | sometimes need to delete old caches. If that doesn't help, you 138 | can also turn caching off with ``di.USE_NUMBA_CACHE = False`` 139 | or the environment variable 140 | ``DATAITER_USE_NUMBA_CACHE=false``. 141 | -------------------------------------------------------------------------------- /doc/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | import inspect 5 | 6 | from pathlib import Path 7 | 8 | DOCUMENTED_SPECIALS = [ 9 | "__init__", 10 | ] 11 | 12 | PAGES = { 13 | "dataiter.rst": di, 14 | "data-frame.rst": di.DataFrame, 15 | "data-frame-column.rst": di.DataFrameColumn, 16 | "dt.rst": di.dt, 17 | "dtypes.rst": di.dtypes, 18 | "geojson.rst": di.GeoJSON, 19 | "list-of-dicts.rst": di.ListOfDicts, 20 | "regex.rst": di.regex, 21 | "vector.rst": di.Vector, 22 | } 23 | 24 | SKIP = [ 25 | "DataFrame.clear", 26 | "DataFrame.COLUMN_PLACEHOLDER", 27 | "DataFrame.pop", 28 | "DataFrame.popitem", 29 | "GeoJSON.to_string", 30 | "Vector.to_strings", 31 | ] 32 | 33 | DIRECTORY = Path(__file__).parent 34 | for page, obj in PAGES.items(): 35 | text = (DIRECTORY / page).read_text("utf-8") 36 | source = inspect.getsourcefile(obj) 37 | print(f"Checking {source}...") 38 | for name, value in inspect.getmembers(obj): 39 | if (name.startswith("_") and 40 | name not in DOCUMENTED_SPECIALS): 41 | continue 42 | if not inspect.getmodule(value): continue 43 | module = inspect.getmodule(value) 44 | if inspect.ismodule(obj): 45 | # Skip objects documented separately. 
46 | if inspect.ismodule(value): continue 47 | if inspect.isclass(value): continue 48 | if inspect.isclass(obj): 49 | # Skip base class methods from NumPy etc. 50 | if inspect.getsourcefile(module) != source: continue 51 | full_name = f"{obj.__name__}.{name}" 52 | if full_name in SKIP: continue 53 | print(f"... {full_name}") 54 | if full_name not in text: 55 | raise Exception("Not found") 56 | -------------------------------------------------------------------------------- /doc/comparison.rst: -------------------------------------------------------------------------------- 1 | Comparison 2 | ========== 3 | 4 | If you're familiar with `dplyr `_ (R) or 5 | `Pandas `_ (Python) you might find the below 6 | comparison table useful to get started. Dataiter is heavily inspired by 7 | dplyr, but not an implementation of the dplyr API, rather an adaptation 8 | of mixed influences primarily from dplyr, base R, SQL and base Python. 9 | 10 | `Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas <_static/comparison.html>`_ 11 | -------------------------------------------------------------------------------- /doc/comparison/Makefile: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | build: 4 | ./build.py 5 | 6 | run: 7 | python3 -m http.server 8 | 9 | .PHONY: build run 10 | -------------------------------------------------------------------------------- /doc/comparison/README.md: -------------------------------------------------------------------------------- 1 | Comparison Table dplyr vs. Dataiter vs. Pandas 2 | ============================================== 3 | 4 | ## Development 5 | 6 | Use `make run` and open . 7 | 8 | ## Production 9 | 10 | `index.html` is compiled into `comparison.html`, which is used as a 11 | static file as part of the autogenerated Sphinx documentation. 
12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = E1,E9,F 3 | ignore = E129,F401,F821 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .group_by("year", "month") 3 | .aggregate( 4 | sales_total=di.sum("sales"), 5 | sales_per_day=di.mean("sales"))) 6 | 7 | (data 8 | .group_by("year", "month") 9 | .aggregate( 10 | sales_total=lambda x: x.sales.sum(), 11 | sales_per_day=lambda x: x.sales.mean())) 12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-dplyr.R: -------------------------------------------------------------------------------- 1 | data %>% 2 | group_by(year, month) %>% 3 | summarise( 4 | sales_total=sum(sales), 5 | sales_per_day=mean(sales)) 6 | -------------------------------------------------------------------------------- /doc/comparison/blocks/aggregate-pandas.py: -------------------------------------------------------------------------------- 1 | (data 2 | .groupby(["year", "month"], as_index=False) 3 | .agg( 4 | sales_total=("sales", "sum"), 5 | sales_per_day=("sales", "mean"))) 6 | 7 | (data 8 | .groupby(["year", "month"], as_index=False) 9 | .apply(lambda x: pd.Series({ 10 | "sales_total": x["sales"].sum(), 11 | "sales_per_day": x["sales"].mean()}))) 12 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.cbind(data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-dplyr.R: -------------------------------------------------------------------------------- 1 | bind_cols(data1, data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/cbind-pandas.py: -------------------------------------------------------------------------------- 1 | data1 = data1.reset_index(drop=True) 2 | data2 = data2.reset_index(drop=True) 3 | pd.concat([data1, data2], axis=1) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .filter(year=2021) 3 | .sort(sales=-1) 4 | .head(10)) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-dplyr.R: -------------------------------------------------------------------------------- 1 | data |> 2 | filter(year == 2021) |> 3 | arrange(desc(sales)) |> 4 | head(10) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/chain-pandas.py: -------------------------------------------------------------------------------- 1 | (data 2 | .loc[lambda x: x["year"] == 2021] 3 | .sort_values("sales", ascending=False) 4 | .head(10)) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-dataiter.py: -------------------------------------------------------------------------------- 1 | names = data.colnames 2 | data.colnames = ["a", "b", "c"] 3 | 
-------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-dplyr.R: -------------------------------------------------------------------------------- 1 | names = colnames(data) 2 | colnames(data) = c("a", "b", "c") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/colnames-pandas.py: -------------------------------------------------------------------------------- 1 | names = data.columns 2 | data.columns = ["a", "b", "c"] 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-dataiter.py: -------------------------------------------------------------------------------- 1 | data.filter(year=2021) 2 | data.filter(data.year == 2021) 3 | data.filter(lambda x: x.year == 2021) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-dplyr.R: -------------------------------------------------------------------------------- 1 | filter(data, year == 2021) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/filter-pandas.py: -------------------------------------------------------------------------------- 1 | data[data["year"] == 2021] 2 | data.loc[data["year"] == 2021] 3 | data[lambda x: x["year"] == 2021] 4 | data.loc[lambda x: x["year"] == 2021] 5 | data.query("year == 2021") 6 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-dataiter.py: -------------------------------------------------------------------------------- 1 | (data 2 | .group_by("year", "month") 3 | .modify(fraction=lambda x: ( 4 | x.sales / x.sales.sum()))) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-dplyr.R: -------------------------------------------------------------------------------- 1 | data %>% 2 | group_by(year, month) %>% 3 | mutate(fraction=sales/sum(sales)) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/grouped-modify-pandas.py: -------------------------------------------------------------------------------- 1 | # No singular operation 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-dataiter.py: -------------------------------------------------------------------------------- 1 | data.head(10) 2 | data.tail(10) 3 | data.sample(10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-dplyr.R: -------------------------------------------------------------------------------- 1 | head(data, 10) 2 | tail(data, 10) 3 | slice_sample(data, n=10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/head-pandas.py: -------------------------------------------------------------------------------- 1 | data.head(10) 2 | data.tail(10) 3 | data.sample(10) 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-dataiter.py: -------------------------------------------------------------------------------- 1 | import dataiter as di 2 | import numpy as np 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-dplyr.R: 
-------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | # Avoid hiding print output. 4 | options(pillar.width=1000) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/import-pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Avoid hiding print output. 4 | pd.set_option("display.max_columns", 1000) 5 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-dataiter.py: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data.x 3 | data["x"] 4 | 5 | # Column by index 6 | data.columns[2] 7 | 8 | # Row by index 9 | data.slice(2) 10 | 11 | # Column element 12 | data.x[2] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-dplyr.R: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data$x 3 | data[["x"]] 4 | 5 | # Column by index 6 | data[[3]] 7 | 8 | # Row by index 9 | data[3,] 10 | 11 | # Column element 12 | data$x[3] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/index-pandas.py: -------------------------------------------------------------------------------- 1 | # Column by name 2 | data.x 3 | data["x"] 4 | 5 | # Column by index 6 | data.iloc[:,2] 7 | 8 | # Row by index 9 | data.iloc[2,:] 10 | 11 | # Column element 12 | data["x"][2] 13 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-dataiter.py: -------------------------------------------------------------------------------- 1 | data = di.read_npz("data.npz") 2 | data.write_npz("data.npz") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-dplyr.R: -------------------------------------------------------------------------------- 1 | data = read_rds("data.rds") 2 | write_rds(data, "data.rds") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-binary-pandas.py: -------------------------------------------------------------------------------- 1 | data = pd.read_pickle("data.pkl") 2 | data.to_pickle("data.pkl") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-dataiter.py: -------------------------------------------------------------------------------- 1 | data = di.read_csv("data.csv") 2 | data.write_csv("data.csv") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-dplyr.R: -------------------------------------------------------------------------------- 1 | data = read_csv("data.csv") 2 | write_csv(data, "data.csv") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/io-csv-pandas.py: -------------------------------------------------------------------------------- 1 | data = pd.read_csv("data.csv") 2 | data.to_csv("data.csv", index=False) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.left_join (data2, "id") 2 | data1.inner_join(data2, 
"id") 3 | data1.full_join (data2, "id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-dplyr.R: -------------------------------------------------------------------------------- 1 | left_join (data1, data2, by="id") 2 | inner_join(data1, data2, by="id") 3 | full_join (data1, data2, by="id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/join-pandas.py: -------------------------------------------------------------------------------- 1 | data1.merge(data2, how="left", on="id") 2 | data1.merge(data2, how="inner", on="id") 3 | data1.merge(data2, how="outer", on="id") 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-dataiter.py: -------------------------------------------------------------------------------- 1 | data.modify(c=(data.a + data.b)) 2 | data.modify(c=lambda x: x.a + x.b) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-dplyr.R: -------------------------------------------------------------------------------- 1 | mutate(data, c=(a + b)) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/modify-pandas.py: -------------------------------------------------------------------------------- 1 | data.assign(c=(data["a"] + data["b"])) 2 | data.assign(c=lambda x: x["a"] + x["b"]) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.semi_join(data2, "id") 2 | data1.anti_join(data2, "id") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-dplyr.R: -------------------------------------------------------------------------------- 1 | semi_join(data1, data2, by="id") 2 | anti_join(data1, data2, by="id") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/non-join-pandas.py: -------------------------------------------------------------------------------- 1 | # No singular operations 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-dataiter.py: -------------------------------------------------------------------------------- 1 | data1.rbind(data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-dplyr.R: -------------------------------------------------------------------------------- 1 | bind_rows(data1, data2) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rbind-pandas.py: -------------------------------------------------------------------------------- 1 | pd.concat([data1, data2]) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rename-dataiter.py: -------------------------------------------------------------------------------- 1 | data.rename(to="from") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/rename-dplyr.R: -------------------------------------------------------------------------------- 1 | rename(data, to="from") 2 | 
-------------------------------------------------------------------------------- /doc/comparison/blocks/rename-pandas.py: -------------------------------------------------------------------------------- 1 | data.rename(columns={"from": "to"}, errors="raise") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-dataiter.py: -------------------------------------------------------------------------------- 1 | data.select("a", "b", "c") 2 | data.unselect("a", "b", "c") 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-dplyr.R: -------------------------------------------------------------------------------- 1 | select(data, a, b, c) 2 | select(data, -a, -b, -c) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/select-pandas.py: -------------------------------------------------------------------------------- 1 | data[["a", "b", "c"]] 2 | data.drop(columns=["a", "b", "c"]) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-dataiter.py: -------------------------------------------------------------------------------- 1 | data.nrow 2 | data.ncol 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-dplyr.R: -------------------------------------------------------------------------------- 1 | nrow(data) 2 | ncol(data) 3 | -------------------------------------------------------------------------------- /doc/comparison/blocks/size-pandas.py: -------------------------------------------------------------------------------- 1 | len(data) 2 | len(data.columns) 3 | nrow, ncol = data.shape 4 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-dataiter.py: -------------------------------------------------------------------------------- 1 | data.sort(a=1, b=1, c=-1) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-dplyr.R: -------------------------------------------------------------------------------- 1 | arrange(data, a, b, desc(c)) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/sort-pandas.py: -------------------------------------------------------------------------------- 1 | data.sort_values(["a", "b", "c"], ascending=[True, True, False]) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-dataiter.py: -------------------------------------------------------------------------------- 1 | data.unique("a", "b", "c") 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-dplyr.R: -------------------------------------------------------------------------------- 1 | distinct(data, a, b, c, .keep_all=TRUE) 2 | -------------------------------------------------------------------------------- /doc/comparison/blocks/unique-pandas.py: -------------------------------------------------------------------------------- 1 | data.drop_duplicates(["a", "b", "c"]) 2 | -------------------------------------------------------------------------------- /doc/comparison/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | from pathlib import 
Path 6 | 7 | lines = [] 8 | text = Path("index.html").read_text("utf-8").strip() 9 | print("Compiling index.html + blocks → comparison.html...") 10 | print(f"index.html: {len(text)}") 11 | for line in text.splitlines(): 12 | if not line.strip().startswith('
{code}
' 20 | for line in html.splitlines(): 21 | lines.append(line) 22 | 23 | text = "\n".join(lines) + "\n" 24 | print(f"comparison.html: {len(text)}") 25 | Path("comparison.html").write_text(text, "utf-8") 26 | 27 | text = Path("prism.css").read_text("utf-8").strip() 28 | if "font-family:" in text or "font-size:" in text: 29 | # Strip Prism font rules so that they don't override 30 | # Tailwind CSS's better-thought-out default system font stack. 31 | # https://tailwindcss.com/docs/font-family 32 | text_length_prior = len(text) 33 | print("Patching prism.css... ", end="") 34 | text = re.sub(r"font-family:.+?;", "", text) 35 | text = re.sub(r"font-size:.+?;", "", text) 36 | print(len(text) - text_length_prior) 37 | Path("prism.css").write_text(text + "\n", "utf-8") 38 | -------------------------------------------------------------------------------- /doc/comparison/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for ARG; do 3 | ARG="${ARG//_/-}" 4 | touch blocks/$ARG-dplyr.R 5 | touch blocks/$ARG-pandas.py 6 | touch blocks/$ARG-dataiter.py 7 | done 8 | -------------------------------------------------------------------------------- /doc/comparison/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas 8 | 9 | 28 | 29 | 30 | 31 | 32 | 33 |
34 |

Comparison Table of Basic Data Frame Operations in dplyr vs. Dataiter vs. Pandas

35 | 36 |
37 | 38 |
39 |

dplyr

40 |

Dataiter

41 |

Pandas

42 |
43 | 44 |

Imports & Configuration

45 |
46 |

 47 |       

 48 |       

 49 |       

Some of the below code uses other parts of tidyverse besides dplyr too, such 50 | as readr. For simplicity, you can load them all via the tidyverse metapackage.

51 |

We often need NumPy too for certain calculations.

52 |

53 |
54 | 55 |

Input/Output

56 |
57 |

 58 |       

 59 |       

 60 |     
61 |
62 |

 63 |       

 64 |       

 65 |     
66 |
67 |

68 | All three support multiple binary formats, the above are sensible defaults (assuming you 69 | don't need interoperability) that work out of the box. 70 |

71 |
72 | 73 |

Structure

74 |
75 |

 76 |       

 77 |       

 78 |     
79 | 80 |

Indexing

81 |
82 |

 83 |       

 84 |       

 85 |       

86 |

Attribute access to columns (dot notation) is preferred.

87 |

Attribute access to columns (dot notation) does not work in all contexts, 88 | bracket notation is more common. Pandas uses terms "axis=0" to refer to rows, "axis=1" to 89 | refer to columns, "index" to refer to row names and "labels" to refer to row and column 90 | names. Certain operations use the "index" for implicit joins called "alignment".

91 |
92 | 93 |

Chaining/Piping

94 |
95 |

 96 |       

 97 |       

 98 |       

99 |

100 |

Pandas is not really designed for method chaining but it mostly works these 101 | days. Note also that the "inplace" arguments that many methods take, which if used are 102 | incompatible with method chaining, 103 | are apparently not 104 | useful.

105 |
106 | 107 |

Column Operations

108 |
109 |

110 |       

111 |       

112 |     
113 |
114 |

115 |       

116 |       

117 |     
118 |
119 |

120 |       

121 |       

122 |     
123 |
124 |

125 |       

126 |       

127 |     
128 | 129 |

Sorting

130 |
131 |

132 |       

133 |       

134 |     
135 | 136 |

Subsetting by Row

137 |
138 |

139 |       

140 |       

141 |     
142 |
143 |

144 |       

145 |       

146 |       

147 |

Dataiter also has filter_out as a shorthand to negate the given 148 | condition.
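A minimal sketch of that shorthand, reusing the year column from the filter examples above; it is assumed here that filter_out accepts the same keyword and lambda conditions as filter:
data.filter_out(year=2021)                 # drop rows where year == 2021
data.filter_out(lambda x: x.year == 2021)  # same condition given as a lambda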

149 |

150 |
151 |
152 |

153 |       

154 |       

155 |     
156 | 157 |

Concatenation

158 |
159 |

160 |       

161 |       

162 |       

163 |

164 |

165 |
166 |
167 |

168 |       

169 |       

170 |       

171 |

172 |

Pandas wants to do "alignment" by "index" here. Resetting the indices prior to 173 | concatenation should give the same result as the plain concatenation in dplyr and 174 | Dataiter.

175 |
176 | 177 |

Joins

178 |
179 |

180 |       

181 |       

182 |     
183 |
184 |

185 | dplyr and Pandas follow the SQL convention of joining all matching rows, i.e. if doing a 186 | left join with ten rows on the left side, the result will have ten or more rows – ten if all 187 | keys have zero or one match on the right side, twenty if all have two matches etc. Dataiter 188 | differs by only joining the first match, on account of it usually being more practical and 189 | less liable to produce unpleasant surprises. If Dataiter's a.left_join(b) 190 | doesn't give you all the results you're looking for, you might want instead 191 | either b.left_join(a) or a.full_join(b). SQL-style joins are 192 | currently unsupported, but may be added in the future. 193 |
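A small, hypothetical sketch of that difference; the example frames, their columns and the keyword-argument DataFrame constructor are made up for illustration:
import dataiter as di
import pandas as pd
left = di.DataFrame(id=[1, 2], x=["a", "b"])        # two rows on the left side
right = di.DataFrame(id=[1, 1, 2], y=[10, 11, 20])  # key 1 has two matches on the right
left.left_join(right, "id").nrow                    # 2: Dataiter joins only the first match per key
pd_left = pd.DataFrame({"id": [1, 2], "x": ["a", "b"]})
pd_right = pd.DataFrame({"id": [1, 1, 2], "y": [10, 11, 20]})
len(pd_left.merge(pd_right, how="left", on="id"))   # 3: dplyr/Pandas/SQL keep all matching rows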

194 |
195 |
196 |

197 |       

198 |       

199 |     
200 | 201 |

Grouping & Modification

202 |
203 |

204 |       

205 |       

206 |     
207 | 208 |

Grouping & Aggregation

209 |
210 |

211 |       

212 |       

213 |     
214 |
215 |

Both Dataiter and Pandas have two aggregation forms: 216 | one for fast aggregation limited to common operations with a single column and another for 217 | arbitrary calculation with access to all columns. In Dataiter, these forms are equivalent in 218 | the sense that e.g. di.sum("sales") returns a function that takes data as 219 | argument and calculates the sum of the "sales" column, and also in the sense that, unlike 220 | with Pandas, you can mix and match both forms within the same aggregate call. 221 | Pandas' agg method arguments can take very many forms, the above is called 222 | "named aggregation". Likewise, the apply method can be used very many ways, the 223 | above is one way. The first form is about equally fast in Dataiter and Pandas, the second 224 | form is a lot faster in Dataiter.
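As a sketch of mixing the two forms within one aggregate call, reusing the year/month/sales columns from the aggregation examples above (the sales_range output column is made up for illustration):
(data
 .group_by("year", "month")
 .aggregate(
     sales_total=di.sum("sales"),                           # fast single-column form
     sales_range=lambda x: x.sales.max() - x.sales.min()))  # arbitrary form with access to the whole group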

225 |
226 | 227 | 228 | 229 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /doc/comparison/prism.css: -------------------------------------------------------------------------------- 1 | /* PrismJS 1.25.0 2 | https://prismjs.com/download.html#themes=prism&languages=python+r&plugins=file-highlight */ 3 | code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shadow:0 1px #fff;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}code[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,pre[class*=language-]::-moz-selection{text-shadow:none;background:#b3d4fc}code[class*=language-] ::selection,code[class*=language-]::selection,pre[class*=language-] ::selection,pre[class*=language-]::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.cdata,.token.comment,.token.doctype,.token.prolog{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.boolean,.token.constant,.token.deleted,.token.number,.token.property,.token.symbol,.token.tag{color:#905}.token.attr-name,.token.builtin,.token.char,.token.inserted,.token.selector,.token.string{color:#690}.language-css .token.string,.style .token.string,.token.entity,.token.operator,.token.url{color:#9a6e3a;background:hsla(0,0%,100%,.5)}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.class-name,.token.function{color:#dd4a68}.token.important,.token.regex,.token.variable{color:#e90}.token.bold,.token.important{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help} 4 | -------------------------------------------------------------------------------- /doc/comparison/prism.js: -------------------------------------------------------------------------------- 1 | /* PrismJS 1.25.0 2 | https://prismjs.com/download.html#themes=prism&languages=python+r&plugins=file-highlight */ 3 | var _self="undefined"!=typeof window?window:"undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?self:{},Prism=function(u){var t=/(?:^|\s)lang(?:uage)?-([\w-]+)(?=\s|$)/i,n=0,e={},M={manual:u.Prism&&u.Prism.manual,disableWorkerMessageHandler:u.Prism&&u.Prism.disableWorkerMessageHandler,util:{encode:function e(n){return n instanceof W?new W(n.type,e(n.content),n.alias):Array.isArray(n)?n.map(e):n.replace(/&/g,"&").replace(/=l.reach);y+=m.value.length,m=m.next){var k=m.value;if(t.length>n.length)return;if(!(k instanceof W)){var x,b=1;if(h){if(!(x=z(p,y,n,f))||x.index>=n.length)break;var w=x.index,A=x.index+x[0].length,P=y;for(P+=m.value.length;P<=w;)m=m.next,P+=m.value.length;if(P-=m.value.length,y=P,m.value instanceof W)continue;for(var E=m;E!==t.tail&&(Pl.reach&&(l.reach=j);var C=m.prev;S&&(C=I(t,C,S),y+=S.length),q(t,C,b);var N=new W(o,g?M.tokenize(L,g):L,d,L);if(m=I(t,C,N),O&&I(t,m,O),1l.reach&&(l.reach=_.reach)}}}}}}(e,a,n,a.head,0),function(e){var n=[],t=e.head.next;for(;t!==e.tail;)n.push(t.value),t=t.next;return n}(a)},hooks:{all:{},add:function(e,n){var t=M.hooks.all;t[e]=t[e]||[],t[e].push(n)},run:function(e,n){var 
t=M.hooks.all[e];if(t&&t.length)for(var r,a=0;r=t[a++];)r(n)}},Token:W};function W(e,n,t,r){this.type=e,this.content=n,this.alias=t,this.length=0|(r||"").length}function z(e,n,t,r){e.lastIndex=n;var a=e.exec(t);if(a&&r&&a[1]){var i=a[1].length;a.index+=i,a[0]=a[0].slice(i)}return a}function i(){var e={value:null,prev:null,next:null},n={value:null,prev:e,next:null};e.next=n,this.head=e,this.tail=n,this.length=0}function I(e,n,t){var r=n.next,a={value:t,prev:n,next:r};return n.next=a,r.prev=a,e.length++,a}function q(e,n,t){for(var r=n.next,a=0;a"+a.content+""},!u.document)return u.addEventListener&&(M.disableWorkerMessageHandler||u.addEventListener("message",function(e){var n=JSON.parse(e.data),t=n.language,r=n.code,a=n.immediateClose;u.postMessage(M.highlight(r,M.languages[t],t)),a&&u.close()},!1)),M;var r=M.util.currentScript();function a(){M.manual||M.highlightAll()}if(r&&(M.filename=r.src,r.hasAttribute("data-manual")&&(M.manual=!0)),!M.manual){var l=document.readyState;"loading"===l||"interactive"===l&&r&&r.defer?document.addEventListener("DOMContentLoaded",a):window.requestAnimationFrame?window.requestAnimationFrame(a):window.setTimeout(a,16)}return M}(_self);"undefined"!=typeof module&&module.exports&&(module.exports=Prism),"undefined"!=typeof global&&(global.Prism=Prism); 4 | Prism.languages.python={comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0,greedy:!0},"string-interpolation":{pattern:/(?:f|fr|rf)(?:("""|''')[\s\S]*?\1|("|')(?:\\.|(?!\2)[^\\\r\n])*\2)/i,greedy:!0,inside:{interpolation:{pattern:/((?:^|[^{])(?:\{\{)*)\{(?!\{)(?:[^{}]|\{(?!\{)(?:[^{}]|\{(?!\{)(?:[^{}])+\})+\})+\}/,lookbehind:!0,inside:{"format-spec":{pattern:/(:)[^:(){}]+(?=\}$)/,lookbehind:!0},"conversion-option":{pattern:/![sra](?=[:}]$)/,alias:"punctuation"},rest:null}},string:/[\s\S]+/}},"triple-quoted-string":{pattern:/(?:[rub]|br|rb)?("""|''')[\s\S]*?\1/i,greedy:!0,alias:"string"},string:{pattern:/(?:[rub]|br|rb)?("|')(?:\\.|(?!\1)[^\\\r\n])*\1/i,greedy:!0},function:{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_]\w*(?=\s*\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)\w+/i,lookbehind:!0},decorator:{pattern:/(^[\t ]*)@\w+(?:\.\w+)*/m,lookbehind:!0,alias:["annotation","punctuation"],inside:{punctuation:/\./}},keyword:/\b(?:_(?=\s*:)|and|as|assert|async|await|break|case|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|match|nonlocal|not|or|pass|print|raise|return|try|while|with|yield)\b/,builtin:/\b(?:__import__|abs|all|any|apply|ascii|basestring|bin|bool|buffer|bytearray|bytes|callable|chr|classmethod|cmp|coerce|compile|complex|delattr|dict|dir|divmod|enumerate|eval|execfile|file|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|intern|isinstance|issubclass|iter|len|list|locals|long|map|max|memoryview|min|next|object|oct|open|ord|pow|property|range|raw_input|reduce|reload|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|unichr|unicode|vars|xrange|zip)\b/,boolean:/\b(?:False|None|True)\b/,number:/\b0(?:b(?:_?[01])+|o(?:_?[0-7])+|x(?:_?[a-f0-9])+)\b|(?:\b\d+(?:_\d+)*(?:\.(?:\d+(?:_\d+)*)?)?|\B\.\d+(?:_\d+)*)(?:e[+-]?\d+(?:_\d+)*)?j?(?!\w)/i,operator:/[-+%=]=?|!=|:=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]/,punctuation:/[{}[\];(),.:]/},Prism.languages.python["string-interpolation"].inside.interpolation.inside.rest=Prism.languages.python,Prism.languages.py=Prism.languages.python; 5 | 
Prism.languages.r={comment:/#.*/,string:{pattern:/(['"])(?:\\.|(?!\1)[^\\\r\n])*\1/,greedy:!0},"percent-operator":{pattern:/%[^%\s]*%/,alias:"operator"},boolean:/\b(?:FALSE|TRUE)\b/,ellipsis:/\.\.(?:\.|\d+)/,number:[/\b(?:Inf|NaN)\b/,/(?:\b0x[\dA-Fa-f]+(?:\.\d*)?|\b\d+(?:\.\d*)?|\B\.\d+)(?:[EePp][+-]?\d+)?[iL]?/],keyword:/\b(?:NA|NA_character_|NA_complex_|NA_integer_|NA_real_|NULL|break|else|for|function|if|in|next|repeat|while)\b/,operator:/->?>?|<(?:=|=!]=?|::?|&&?|\|\|?|[+*\/^$@~]/,punctuation:/[(){}\[\],;]/}; 6 | !function(){if("undefined"!=typeof Prism&&"undefined"!=typeof document){Element.prototype.matches||(Element.prototype.matches=Element.prototype.msMatchesSelector||Element.prototype.webkitMatchesSelector);var l={js:"javascript",py:"python",rb:"ruby",ps1:"powershell",psm1:"powershell",sh:"bash",bat:"batch",h:"c",tex:"latex"},o="data-src-status",h="loading",g="loaded",u="pre[data-src]:not(["+o+'="'+g+'"]):not(['+o+'="'+h+'"])';Prism.hooks.add("before-highlightall",function(t){t.selector+=", "+u}),Prism.hooks.add("before-sanity-check",function(t){var r=t.element;if(r.matches(u)){t.code="",r.setAttribute(o,h);var s=r.appendChild(document.createElement("CODE"));s.textContent="Loading…";var e=r.getAttribute("data-src"),i=t.language;if("none"===i){var n=(/\.(\w+)$/.exec(e)||[,"none"])[1];i=l[n]||n}Prism.util.setLanguage(s,i),Prism.util.setLanguage(r,i);var a=Prism.plugins.autoloader;a&&a.loadLanguages(i),function(t,e,i){var n=new XMLHttpRequest;n.open("GET",t,!0),n.onreadystatechange=function(){4==n.readyState&&(n.status<400&&n.responseText?e(n.responseText):400<=n.status?i(function(t,e){return"✖ Error "+t+" while fetching file: "+e}(n.status,n.statusText)):i("✖ Error: File does not exist or is empty"))},n.send(null)}(e,function(t){r.setAttribute(o,g);var e=function(t){var e=/^\s*(\d+)\s*(?:(,)\s*(?:(\d+)\s*)?)?$/.exec(t||"");if(e){var i=Number(e[1]),n=e[2],a=e[3];return n?a?[i,Number(a)]:[i,void 0]:[i,i]}}(r.getAttribute("data-range"));if(e){var i=t.split(/\r\n?|\n/g),n=e[0],a=null==e[1]?i.length:e[1];n<0&&(n+=i.length),n=Math.max(0,Math.min(n-1,i.length)),a<0&&(a+=i.length),a=Math.max(0,Math.min(a,i.length)),t=i.slice(n,a).join("\n"),r.hasAttribute("data-start")||r.setAttribute("data-start",String(n+1))}s.textContent=t,Prism.highlightElement(s)},function(t){r.setAttribute(o,"failed"),s.textContent=t})}});var t=!(Prism.plugins.fileHighlight={highlight:function(t){for(var e,i=(t||document).querySelectorAll(u),n=0;e=i[n++];)Prism.highlightElement(e)}});Prism.fileHighlight=function(){t||(console.warn("Prism.fileHighlight is deprecated. Use `Prism.plugins.fileHighlight.highlight` instead."),t=!0),Prism.plugins.fileHighlight.highlight.apply(this,arguments)}}}(); 7 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('.')) 16 | sys.path.insert(0, os.path.abspath('..')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Dataiter' 22 | copyright = '2020–2024 Osmo Salomaa' 23 | author = 'Osmo Salomaa' 24 | 25 | # The full version, including alpha/beta/rc tags 26 | import dataiter 27 | release = dataiter.__version__ 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | master_doc = 'index' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'output'] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | 53 | import sphinx_rtd_theme 54 | html_theme = 'sphinx_rtd_theme' 55 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 56 | 57 | html_theme_options = { 58 | 'navigation_depth': 3, 59 | } 60 | 61 | html_context = { 62 | 'display_github': True, 63 | } 64 | 65 | rst_prolog = """ 66 | :github_url: https://github.com/otsaloma/dataiter 67 | """ 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = [ 73 | '_static', 74 | 'comparison/prism.css', 75 | 'comparison/prism.js', 76 | 'comparison/comparison.html', 77 | ] 78 | 79 | def setup(app): 80 | # Build comparison/comparison.html. Note that readthedocs.org doesn't 81 | # run the Makefile, so anything there doesn't help in production. 82 | # https://github.com/readthedocs/readthedocs.org/issues/2276#issuecomment-231899567 83 | import subprocess 84 | from pathlib import Path 85 | cwd = Path(__file__).parent.resolve() / 'comparison' 86 | subprocess.run([sys.executable, 'build.py'], cwd=cwd, check=True) 87 | -------------------------------------------------------------------------------- /doc/data-frame-column.rst: -------------------------------------------------------------------------------- 1 | dataiter.DataFrameColumn 2 | ======================== 3 | 4 | :meth:`~dataiter.DataFrameColumn.__init__` 5 | :attr:`~dataiter.DataFrameColumn.nrow` 6 | 7 | .. 
autoclass:: dataiter.DataFrameColumn 8 | :members: 9 | :special-members: __init__ 10 | -------------------------------------------------------------------------------- /doc/data-frame.rst: -------------------------------------------------------------------------------- 1 | dataiter.DataFrame 2 | ================== 3 | 4 | :meth:`~dataiter.DataFrame.__init__` 5 | :meth:`~dataiter.DataFrame.aggregate` 6 | :meth:`~dataiter.DataFrame.anti_join` 7 | :meth:`~dataiter.DataFrame.cbind` 8 | :attr:`~dataiter.DataFrame.colnames` 9 | :attr:`~dataiter.DataFrame.columns` 10 | :meth:`~dataiter.DataFrame.compare` 11 | :meth:`~dataiter.DataFrame.copy` 12 | :meth:`~dataiter.DataFrame.count` 13 | :meth:`~dataiter.DataFrame.deepcopy` 14 | :meth:`~dataiter.DataFrame.drop_na` 15 | :meth:`~dataiter.DataFrame.filter` 16 | :meth:`~dataiter.DataFrame.filter_out` 17 | :meth:`~dataiter.DataFrame.from_arrow` 18 | :meth:`~dataiter.DataFrame.from_json` 19 | :meth:`~dataiter.DataFrame.from_pandas` 20 | :meth:`~dataiter.DataFrame.full_join` 21 | :meth:`~dataiter.DataFrame.group_by` 22 | :meth:`~dataiter.DataFrame.head` 23 | :meth:`~dataiter.DataFrame.inner_join` 24 | :meth:`~dataiter.DataFrame.left_join` 25 | :meth:`~dataiter.DataFrame.map` 26 | :meth:`~dataiter.DataFrame.modify` 27 | :attr:`~dataiter.DataFrame.ncol` 28 | :attr:`~dataiter.DataFrame.nrow` 29 | :meth:`~dataiter.DataFrame.print_` 30 | :meth:`~dataiter.DataFrame.print_memory_use` 31 | :meth:`~dataiter.DataFrame.print_na_counts` 32 | :meth:`~dataiter.DataFrame.rbind` 33 | :meth:`~dataiter.DataFrame.read_csv` 34 | :meth:`~dataiter.DataFrame.read_json` 35 | :meth:`~dataiter.DataFrame.read_npz` 36 | :meth:`~dataiter.DataFrame.read_parquet` 37 | :meth:`~dataiter.DataFrame.read_pickle` 38 | :meth:`~dataiter.DataFrame.rename` 39 | :meth:`~dataiter.DataFrame.sample` 40 | :meth:`~dataiter.DataFrame.select` 41 | :meth:`~dataiter.DataFrame.semi_join` 42 | :meth:`~dataiter.DataFrame.slice` 43 | :meth:`~dataiter.DataFrame.slice_off` 44 | :meth:`~dataiter.DataFrame.sort` 45 | :meth:`~dataiter.DataFrame.split` 46 | :meth:`~dataiter.DataFrame.tail` 47 | :meth:`~dataiter.DataFrame.to_arrow` 48 | :meth:`~dataiter.DataFrame.to_json` 49 | :meth:`~dataiter.DataFrame.to_list_of_dicts` 50 | :meth:`~dataiter.DataFrame.to_pandas` 51 | :meth:`~dataiter.DataFrame.to_string` 52 | :meth:`~dataiter.DataFrame.unique` 53 | :meth:`~dataiter.DataFrame.unselect` 54 | :meth:`~dataiter.DataFrame.update` 55 | :meth:`~dataiter.DataFrame.write_csv` 56 | :meth:`~dataiter.DataFrame.write_json` 57 | :meth:`~dataiter.DataFrame.write_npz` 58 | :meth:`~dataiter.DataFrame.write_parquet` 59 | :meth:`~dataiter.DataFrame.write_pickle` 60 | 61 | .. autoclass:: dataiter.DataFrame 62 | :members: 63 | :special-members: __init__ 64 | -------------------------------------------------------------------------------- /doc/dataiter.rst: -------------------------------------------------------------------------------- 1 | dataiter 2 | ======== 3 | 4 | The following functions are shorthand helpers for use in conjunction 5 | with :meth:`.DataFrame.aggregate`, see the guide on :doc:`aggregation 6 | ` for details. 
7 | 8 | :func:`~dataiter.all` 9 | :func:`~dataiter.any` 10 | :func:`~dataiter.count` 11 | :func:`~dataiter.count_unique` 12 | :func:`~dataiter.first` 13 | :func:`~dataiter.last` 14 | :func:`~dataiter.max` 15 | :func:`~dataiter.mean` 16 | :func:`~dataiter.median` 17 | :func:`~dataiter.min` 18 | :func:`~dataiter.mode` 19 | :func:`~dataiter.nth` 20 | :func:`~dataiter.quantile` 21 | :func:`~dataiter.std` 22 | :func:`~dataiter.sum` 23 | :func:`~dataiter.var` 24 | 25 | The following read functions are convenience aliases to the corresponding 26 | methods of the classes generally most suitable for the particular file 27 | type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet, 28 | :class:`.GeoJSON` for GeoJSON and :class:`.ListOfDicts` for JSON. 29 | 30 | :func:`~dataiter.read_csv` 31 | :func:`~dataiter.read_geojson` 32 | :func:`~dataiter.read_json` 33 | :func:`~dataiter.read_npz` 34 | :func:`~dataiter.read_parquet` 35 | 36 | The following constants can be used to customize certain defaults, such as 37 | formatting and limits for printing. 38 | 39 | :attr:`dataiter.PRINT_MAX_WIDTH` 40 | :attr:`dataiter.PRINT_THOUSAND_SEPARATOR` 41 | :attr:`dataiter.PRINT_TRUNCATE_WIDTH` 42 | :attr:`dataiter.USE_NUMBA` 43 | :attr:`dataiter.USE_NUMBA_CACHE` 44 | 45 | .. automodule:: dataiter 46 | :members: all, 47 | any, 48 | count, 49 | count_unique, 50 | first, 51 | last, 52 | max, 53 | mean, 54 | median, 55 | min, 56 | mode, 57 | nth, 58 | quantile, 59 | read_csv, 60 | read_geojson, 61 | read_json, 62 | read_npz, 63 | read_parquet, 64 | std, 65 | sum, 66 | var, 67 | PRINT_MAX_WIDTH, 68 | PRINT_THOUSAND_SEPARATOR, 69 | PRINT_TRUNCATE_WIDTH, 70 | USE_NUMBA, 71 | USE_NUMBA_CACHE 72 | -------------------------------------------------------------------------------- /doc/dt.rst: -------------------------------------------------------------------------------- 1 | dataiter.dt 2 | =========== 3 | 4 | The ``dt`` module contains vectorized functions for dealing with dates and 5 | datetimes, similar to ``numpy.strings`` for strings. This is mostly a 6 | convenience wrapper around Python's standard library ``datetime`` module, not 7 | any efficient reimplementation. 8 | 9 | :func:`~dataiter.dt.day` 10 | :func:`~dataiter.dt.from_string` 11 | :func:`~dataiter.dt.hour` 12 | :func:`~dataiter.dt.isoweek` 13 | :func:`~dataiter.dt.isoweekday` 14 | :func:`~dataiter.dt.microsecond` 15 | :func:`~dataiter.dt.minute` 16 | :func:`~dataiter.dt.month` 17 | :func:`~dataiter.dt.new` 18 | :func:`~dataiter.dt.now` 19 | :func:`~dataiter.dt.quarter` 20 | :func:`~dataiter.dt.replace` 21 | :func:`~dataiter.dt.second` 22 | :func:`~dataiter.dt.to_string` 23 | :func:`~dataiter.dt.today` 24 | :func:`~dataiter.dt.weekday` 25 | :func:`~dataiter.dt.year` 26 | 27 | .. automodule:: dataiter.dt 28 | :members: 29 | -------------------------------------------------------------------------------- /doc/dtypes.rst: -------------------------------------------------------------------------------- 1 | dataiter.dtypes 2 | =============== 3 | 4 | Custom data types for vectors. 5 | 6 | .. automodule:: dataiter.dtypes 7 | :members: 8 | -------------------------------------------------------------------------------- /doc/geojson.rst: -------------------------------------------------------------------------------- 1 | dataiter.GeoJSON 2 | ================ 3 | 4 | :meth:`~dataiter.GeoJSON.__init__` 5 | :meth:`~dataiter.GeoJSON.read` 6 | :meth:`~dataiter.GeoJSON.to_data_frame` 7 | :meth:`~dataiter.GeoJSON.write` 8 | 9 | ..
autoclass:: dataiter.GeoJSON 10 | :members: read, to_data_frame, write 11 | :special-members: __init__ 12 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Dataiter Documentation 2 | ====================== 3 | 4 | Dataiter's :class:`.DataFrame` is a class for tabular data similar to R's 5 | ``data.frame``, implementing all common operations to manipulate data. It is 6 | under the hood a dictionary of NumPy arrays and thus capable of fast vectorized 7 | operations. You can consider it to be a light-weight alternative to Pandas with 8 | a simple and consistent API. Performance-wise Dataiter relies on NumPy and Numba 9 | and is likely to be at best comparable to Pandas. 10 | 11 | Additionally Dataiter includes :class:`.ListOfDicts`, a class for manipulating 12 | hierarchical data, such as from JSON APIs or document databases, and 13 | :class:`.GeoJSON`, a class for manipulating data from GeoJSON files in a data 14 | frame. 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Tutorials 19 | 20 | quick-start 21 | comparison 22 | aggregation 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: API Documentation 27 | 28 | dataiter 29 | data-frame 30 | data-frame-column 31 | geojson 32 | list-of-dicts 33 | vector 34 | dt 35 | dtypes 36 | regex 37 | -------------------------------------------------------------------------------- /doc/list-of-dicts.rst: -------------------------------------------------------------------------------- 1 | dataiter.ListOfDicts 2 | ==================== 3 | 4 | :meth:`~dataiter.ListOfDicts.__init__` 5 | :meth:`~dataiter.ListOfDicts.aggregate` 6 | :meth:`~dataiter.ListOfDicts.anti_join` 7 | :meth:`~dataiter.ListOfDicts.append` 8 | :meth:`~dataiter.ListOfDicts.clear` 9 | :meth:`~dataiter.ListOfDicts.copy` 10 | :meth:`~dataiter.ListOfDicts.deepcopy` 11 | :meth:`~dataiter.ListOfDicts.drop_na` 12 | :meth:`~dataiter.ListOfDicts.extend` 13 | :meth:`~dataiter.ListOfDicts.fill_missing_keys` 14 | :meth:`~dataiter.ListOfDicts.filter` 15 | :meth:`~dataiter.ListOfDicts.filter_out` 16 | :meth:`~dataiter.ListOfDicts.from_json` 17 | :meth:`~dataiter.ListOfDicts.full_join` 18 | :meth:`~dataiter.ListOfDicts.group_by` 19 | :meth:`~dataiter.ListOfDicts.head` 20 | :meth:`~dataiter.ListOfDicts.inner_join` 21 | :meth:`~dataiter.ListOfDicts.insert` 22 | :meth:`~dataiter.ListOfDicts.keys` 23 | :meth:`~dataiter.ListOfDicts.left_join` 24 | :meth:`~dataiter.ListOfDicts.map` 25 | :meth:`~dataiter.ListOfDicts.modify` 26 | :meth:`~dataiter.ListOfDicts.modify_if` 27 | :meth:`~dataiter.ListOfDicts.pluck` 28 | :meth:`~dataiter.ListOfDicts.print_` 29 | :meth:`~dataiter.ListOfDicts.print_memory_use` 30 | :meth:`~dataiter.ListOfDicts.print_na_counts` 31 | :meth:`~dataiter.ListOfDicts.read_csv` 32 | :meth:`~dataiter.ListOfDicts.read_json` 33 | :meth:`~dataiter.ListOfDicts.read_pickle` 34 | :meth:`~dataiter.ListOfDicts.rename` 35 | :meth:`~dataiter.ListOfDicts.reverse` 36 | :meth:`~dataiter.ListOfDicts.sample` 37 | :meth:`~dataiter.ListOfDicts.select` 38 | :meth:`~dataiter.ListOfDicts.semi_join` 39 | :meth:`~dataiter.ListOfDicts.sort` 40 | :meth:`~dataiter.ListOfDicts.split` 41 | :meth:`~dataiter.ListOfDicts.tail` 42 | :meth:`~dataiter.ListOfDicts.to_data_frame` 43 | :meth:`~dataiter.ListOfDicts.to_json` 44 | :meth:`~dataiter.ListOfDicts.to_pandas` 45 | :meth:`~dataiter.ListOfDicts.to_string` 46 | :meth:`~dataiter.ListOfDicts.unique` 47 | 
:meth:`~dataiter.ListOfDicts.unselect` 48 | :meth:`~dataiter.ListOfDicts.write_csv` 49 | :meth:`~dataiter.ListOfDicts.write_json` 50 | :meth:`~dataiter.ListOfDicts.write_pickle` 51 | 52 | .. autoclass:: dataiter.ListOfDicts 53 | :members: 54 | :special-members: __init__ 55 | -------------------------------------------------------------------------------- /doc/output.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import subprocess 4 | 5 | from pathlib import Path 6 | 7 | CODE = """ 8 | import sys 9 | from pathlib import Path 10 | sys.path.insert(0, str(Path("."))) 11 | import dataiter as di 12 | import numpy as np 13 | from dataiter import dt 14 | from dataiter import regex 15 | di.PRINT_MAX_ITEMS = 3 16 | di.PRINT_MAX_ROWS = 10 17 | di.PRINT_MAX_WIDTH = 72 18 | """ 19 | 20 | def get_output(lines): 21 | try: 22 | return subprocess.check_output( 23 | args=["python3", "-c", "\n".join(lines)], 24 | stderr=subprocess.STDOUT, 25 | cwd=Path("..").resolve(), 26 | encoding="utf-8", 27 | errors="replace", 28 | universal_newlines=True, 29 | text=True, 30 | timeout=30, 31 | ).splitlines() 32 | except subprocess.CalledProcessError as e: 33 | return e.output.splitlines() 34 | 35 | def on_autodoc_process_docstring(app, what, name, obj, options, lines): 36 | print(f"Processing {name}...") 37 | # Intercept all ">>>" lines in docstring, run the corresponding code 38 | # and inject any possible output into the docstring. 39 | code = CODE.strip().splitlines() 40 | output = [] 41 | for i, line in enumerate(lines): 42 | if not line.startswith(">>>"): continue 43 | line = line.lstrip("> ") 44 | if line.startswith("#"): continue 45 | # Some docstrings will, on purpose, have lines of code that raise 46 | # errors. Wrap lines in try-except so that all lines will always be 47 | # executed and output from only the last line will be used. 48 | code.append(f"try: {line}\nexcept Exception: pass") 49 | if " = " in line: continue 50 | if line.startswith(("from ", "import ")): continue 51 | blob = get_output(code[:-1] + [f"print({line})"]) 52 | for j in range(len(blob)): 53 | # Avoid a paragraph change on blank lines. 54 | if not blob[j].strip(): 55 | blob[j] = "." 56 | output.append((i + 1, blob)) 57 | for i, blob in reversed(output): 58 | lines[i:i] = blob 59 | 60 | def setup(app): 61 | # https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#event-autodoc-process-docstring 62 | app.connect("autodoc-process-docstring", on_autodoc_process_docstring) 63 | return { 64 | "version": "0.1", 65 | "parallel_read_safe": True, 66 | "parallel_write_safe": True, 67 | } 68 | -------------------------------------------------------------------------------- /doc/quick-start.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =========== 3 | 4 | DataFrame 5 | --------- 6 | 7 | >>> import dataiter as di 8 | >>> data = di.read_csv("data/listings.csv") 9 | >>> data.price_per_guest = data.price / data.guests 10 | >>> data.head() 11 | . 12 | id hood zipcode guests sqft price price_per_guest 13 | int64 >> data.filter(hood="Manhattan").filter(guests=2).sort(price=1).head() 27 | . 28 | id hood zipcode guests sqft price price_per_guest 29 | int64 >> import dataiter as di 47 | >>> data = di.read_geojson("data/neighbourhoods.geojson") 48 | >>> data.head() 49 | . 
50 | neighbourhood neighbourhood_group geometry 51 | 54 | 1 Allerton Bronx 55 | 2 City Island Bronx 56 | 3 Ditmars Steinway Queens 57 | 4 Ozone Park Queens 58 | 5 Fordham Bronx 59 | 6 Whitestone Queens 60 | 7 Arden Heights Staten Island 61 | 8 Arrochar Staten Island 62 | 9 Arverne Queens 63 | . 64 | 65 | ListOfDicts 66 | ----------- 67 | 68 | >>> import dataiter as di 69 | >>> data = di.read_json("data/listings.json") 70 | >>> data = data.modify(price_per_guest=lambda x: x.price / x.guests) 71 | >>> data.head() 72 | [ 73 | { 74 | "id": 2060, 75 | "hood": "Manhattan", 76 | "zipcode": "10040", 77 | "guests": 2, 78 | "sqft": null, 79 | "price": 100, 80 | "price_per_guest": 50.0 81 | }, 82 | { 83 | "id": 2595, 84 | "hood": "Manhattan", 85 | "zipcode": "10018", 86 | "guests": 2, 87 | "sqft": null, 88 | "price": 225, 89 | "price_per_guest": 112.5 90 | }, 91 | { 92 | "id": 3831, 93 | "hood": "Brooklyn", 94 | "zipcode": "11238", 95 | "guests": 3, 96 | "sqft": 500.0, 97 | "price": 89, 98 | "price_per_guest": 29.666666666666668 99 | } 100 | ] 101 | >>> data.filter(hood="Manhattan").filter(guests=2).sort(price=1).head() 102 | [ 103 | { 104 | "id": 42279170, 105 | "hood": "Manhattan", 106 | "zipcode": "10013", 107 | "guests": 2, 108 | "sqft": null, 109 | "price": 0, 110 | "price_per_guest": 0.0 111 | }, 112 | { 113 | "id": 42384530, 114 | "hood": "Manhattan", 115 | "zipcode": "10036", 116 | "guests": 2, 117 | "sqft": null, 118 | "price": 0, 119 | "price_per_guest": 0.0 120 | }, 121 | { 122 | "id": 18835820, 123 | "hood": "Manhattan", 124 | "zipcode": "10021", 125 | "guests": 2, 126 | "sqft": null, 127 | "price": 10, 128 | "price_per_guest": 5.0 129 | } 130 | ] 131 | -------------------------------------------------------------------------------- /doc/regex.rst: -------------------------------------------------------------------------------- 1 | dataiter.regex 2 | ============== 3 | 4 | The ``regex`` module contains vectorized versions of regular expression matching 5 | operations, similar to ``numpy.strings`` for string operations. This is a 6 | convenience wrapper around Python's standard library ``re`` module, not any 7 | efficient reimplementation. 8 | 9 | :func:`~dataiter.regex.findall` 10 | :func:`~dataiter.regex.fullmatch` 11 | :func:`~dataiter.regex.match` 12 | :func:`~dataiter.regex.search` 13 | :func:`~dataiter.regex.split` 14 | :func:`~dataiter.regex.sub` 15 | :func:`~dataiter.regex.subn` 16 | 17 | .. 
automodule:: dataiter.regex 18 | :members: 19 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | attd==1.0 2 | jinja2==3.1.3 3 | numpy==2.0.2 4 | pandas==2.2.3 5 | pyarrow==18.1.0 6 | sphinx==7.2.6 7 | sphinx-rtd-theme==2.0.0 8 | wcwidth==0.2.13 9 | -------------------------------------------------------------------------------- /doc/vector.rst: -------------------------------------------------------------------------------- 1 | dataiter.Vector 2 | =============== 3 | 4 | :meth:`~dataiter.Vector.__init__` 5 | :meth:`~dataiter.Vector.as_boolean` 6 | :meth:`~dataiter.Vector.as_bytes` 7 | :meth:`~dataiter.Vector.as_date` 8 | :meth:`~dataiter.Vector.as_datetime` 9 | :meth:`~dataiter.Vector.as_float` 10 | :meth:`~dataiter.Vector.as_integer` 11 | :meth:`~dataiter.Vector.as_object` 12 | :meth:`~dataiter.Vector.as_string` 13 | :meth:`~dataiter.Vector.concat` 14 | :meth:`~dataiter.Vector.drop_na` 15 | :attr:`~dataiter.Vector.dt` 16 | :attr:`~dataiter.Vector.dtype_label` 17 | :meth:`~dataiter.Vector.equal` 18 | :meth:`~dataiter.Vector.fast` 19 | :meth:`~dataiter.Vector.get_memory_use` 20 | :meth:`~dataiter.Vector.head` 21 | :meth:`~dataiter.Vector.is_boolean` 22 | :meth:`~dataiter.Vector.is_bytes` 23 | :meth:`~dataiter.Vector.is_datetime` 24 | :meth:`~dataiter.Vector.is_float` 25 | :meth:`~dataiter.Vector.is_integer` 26 | :meth:`~dataiter.Vector.is_na` 27 | :meth:`~dataiter.Vector.is_number` 28 | :meth:`~dataiter.Vector.is_object` 29 | :meth:`~dataiter.Vector.is_string` 30 | :meth:`~dataiter.Vector.is_timedelta` 31 | :attr:`~dataiter.Vector.length` 32 | :meth:`~dataiter.Vector.map` 33 | :attr:`~dataiter.Vector.na_dtype` 34 | :attr:`~dataiter.Vector.na_value` 35 | :meth:`~dataiter.Vector.range` 36 | :meth:`~dataiter.Vector.rank` 37 | :attr:`~dataiter.Vector.re` 38 | :meth:`~dataiter.Vector.replace_na` 39 | :meth:`~dataiter.Vector.sample` 40 | :meth:`~dataiter.Vector.sort` 41 | :attr:`~dataiter.Vector.str` 42 | :meth:`~dataiter.Vector.tail` 43 | :meth:`~dataiter.Vector.to_string` 44 | :meth:`~dataiter.Vector.tolist` 45 | :meth:`~dataiter.Vector.unique` 46 | 47 | .. 
autoclass:: dataiter.Vector 48 | :members: 49 | :special-members: __init__ 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["build==1.2.2.post1", "hatchling==1.21.1"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "dataiter" 7 | dynamic = ["version"] 8 | description = "Simple, light-weight data frames for Python" 9 | readme = "README.md" 10 | license = "MIT" 11 | requires-python = ">=3.9.0" 12 | authors = [{ name = "Osmo Salomaa", email = "otsaloma@iki.fi" }] 13 | dependencies = ["attd>=0.3", "numpy>=2.0,<3.0", "pyarrow>=2.0", "wcwidth>=0.1"] 14 | 15 | [project.urls] 16 | Homepage = "https://github.com/otsaloma/dataiter" 17 | 18 | [tool.hatch.version] 19 | path = "dataiter/__init__.py" 20 | 21 | [tool.hatch.build.targets.sdist] 22 | include = ["/dataiter"] 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attd==1.0 2 | click==8.1.7 3 | flake8==7.1.1 4 | jinja2==3.1.3 5 | numba==0.60.0 6 | numpy==2.0.2 7 | pandas==2.2.3 8 | pyarrow==18.1.0 9 | pytest==8.3.4 10 | sphinx==7.2.6 11 | sphinx-rtd-theme==2.0.0 12 | wcwidth==0.2.13 13 | -------------------------------------------------------------------------------- /tools/check-missing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dataiter as di 4 | import inspect 5 | 6 | df = di.DataFrame() 7 | ld = di.ListOfDicts() 8 | 9 | base_df = {} 10 | base_ld = [] 11 | 12 | print("") 13 | print("Methods missing from DataFrame:") 14 | for name in sorted(dir(ld)): 15 | if name in dir(df): continue 16 | if name.startswith("_"): continue 17 | if name in dir(base_ld) and name not in dir(base_df): continue 18 | if not inspect.ismethod(getattr(ld, name)): continue 19 | print(f"... {name}") 20 | 21 | print("") 22 | print("Methods missing from ListOfDicts:") 23 | for name in sorted(dir(df)): 24 | if name in dir(ld): continue 25 | if name.startswith("_"): continue 26 | if name in dir(base_df) and name not in dir(base_ld): continue 27 | if not inspect.ismethod(getattr(df, name)): continue 28 | print(f"... {name}") 29 | -------------------------------------------------------------------------------- /tools/release: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Commit changes, tag and push release to GitHub. 3 | cd "$(dirname "$0")/.." 
|| exit 1 4 | VERSION="$(python3 -c "import dataiter; print(dataiter.__version__)")" 5 | echo "Git status:" 6 | git status --porcelain 7 | printf "\nRelease version: $VERSION\n" 8 | read -p "Press Enter to continue or Ctrl+C to abort: " 9 | git commit -a -m "RELEASE $VERSION" 10 | git tag -s -m "RELEASE $VERSION" $VERSION 11 | git push 12 | git push --tags 13 | egrep -B 999 -m2 "^===+" NEWS.md \ 14 | | head -n-3 \ 15 | | tail -n+4 \ 16 | | sed ':a;N;$!ba;s/\n / /g' \ 17 | | gh release create \ 18 | --notes-file - \ 19 | --title $VERSION \ 20 | $VERSION 21 | -------------------------------------------------------------------------------- /validation/generate-df.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | sys.path.insert(0, "..") 5 | 6 | import dataiter as di 7 | 8 | print(f"USE_NUMBA: {di.USE_NUMBA}") 9 | 10 | def read_csv(path): 11 | data = di.read_csv(path) 12 | for name in data.colnames: 13 | # Drop all rows with NAs to avoid upcasting to float 14 | # and differing NA representation in output. 15 | data = data.filter_out(data[name].is_na()) 16 | if data[name].is_string(): 17 | # Use all lower case for strings to avoid differing 18 | # sorting of lower vs. upper case characters. 19 | data[name] = data[name].str.lower() 20 | return data 21 | 22 | # AGGREGATE 23 | (read_csv("../data/vehicles.csv") 24 | .modify(fuel_regular=lambda x: x.fuel == "regular") 25 | .group_by("make", "model") 26 | .aggregate( 27 | all_fuel_regular=di.all("fuel_regular"), 28 | any_fuel_regular=di.any("fuel_regular"), 29 | count=di.count(), 30 | count_unique_cyl=di.count_unique("cyl"), 31 | first_hwy=di.first("hwy"), 32 | last_hwy=di.last("hwy"), 33 | max_hwy=di.max("hwy"), 34 | mean_hwy=di.mean("hwy"), 35 | median_hwy=di.median("hwy"), 36 | min_hwy=di.min("hwy"), 37 | mode_year=di.mode("year"), 38 | nth_id=di.nth("id", 0), 39 | quantile_hwy=di.quantile("hwy", 0.75), 40 | std_hwy=di.std("hwy", ddof=1), 41 | sum_hwy=di.sum("hwy"), 42 | var_hwy=di.var("hwy", ddof=1)) 43 | .modify(mean_hwy=lambda x: x.mean_hwy.round(2)) 44 | .modify(std_hwy =lambda x: x.std_hwy.round(2)) 45 | .modify(var_hwy =lambda x: x.var_hwy.round(2)) 46 | .write_csv("aggregate.df.csv")) 47 | 48 | # ANTI JOIN 49 | reviews = read_csv("../data/listings-reviews.csv") 50 | (read_csv("../data/listings.csv") 51 | .anti_join(reviews, "id") 52 | .write_csv("anti_join.df.csv")) 53 | 54 | # FILTER 55 | (read_csv("../data/vehicles.csv") 56 | .filter(lambda x: x.year < 2000) 57 | .filter(lambda x: x.cyl < 10) 58 | .write_csv("filter.df.csv")) 59 | 60 | # FILTER OUT 61 | (read_csv("../data/vehicles.csv") 62 | .filter_out(lambda x: x.year < 2000) 63 | .filter_out(lambda x: x.cyl < 10) 64 | .write_csv("filter_out.df.csv")) 65 | 66 | # FULL JOIN 67 | reviews = read_csv("../data/listings-reviews.csv") 68 | reviews = reviews.rbind(reviews) 69 | (read_csv("../data/listings.csv") 70 | .full_join(reviews, "id") 71 | .write_csv("full_join.df.csv")) 72 | 73 | # INNER JOIN 74 | reviews = read_csv("../data/listings-reviews.csv") 75 | (read_csv("../data/listings.csv") 76 | .inner_join(reviews, "id") 77 | .write_csv("inner_join.df.csv")) 78 | 79 | # LEFT JOIN 80 | reviews = read_csv("../data/listings-reviews.csv") 81 | (read_csv("../data/listings.csv") 82 | .left_join(reviews, "id") 83 | .write_csv("left_join.df.csv")) 84 | 85 | # SEMI JOIN 86 | reviews = read_csv("../data/listings-reviews.csv") 87 | (read_csv("../data/listings.csv") 88 | .semi_join(reviews, "id") 89 | 
.write_csv("semi_join.df.csv")) 90 | 91 | # SORT 92 | (read_csv("../data/vehicles.csv") 93 | .sort(make=1, model=1, year=-1) 94 | .write_csv("sort.df.csv")) 95 | 96 | # UNIQUE 97 | (read_csv("../data/vehicles.csv") 98 | .unique("make", "model", "year") 99 | .write_csv("unique.df.csv")) 100 | -------------------------------------------------------------------------------- /validation/generate-ld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | sys.path.insert(0, "..") 5 | 6 | import dataiter as di 7 | import statistics 8 | 9 | from statistics import mean 10 | from statistics import median 11 | from statistics import mode 12 | 13 | def read_json(path): 14 | data = di.read_json(path) 15 | for name in list(data[0].keys()): 16 | # Drop all rows with NAs to avoid upcasting to float 17 | # and differing NA representation in output. 18 | data = data.filter_out(lambda x: x[name] is None) 19 | for item in data: 20 | if isinstance(item[name], str): 21 | # Use all lower case for strings to avoid differing 22 | # sorting of lower vs. upper case characters. 23 | item[name] = item[name].lower() 24 | return data 25 | 26 | round2 = lambda x: round(x, 2) if x is not None else None 27 | stdev = lambda x: statistics.stdev(x) if len(x) > 1 else None 28 | variance = lambda x: statistics.variance(x) if len(x) > 1 else None 29 | 30 | # AGGREGATE 31 | (read_json("../data/vehicles.json") 32 | .modify(fuel_regular=lambda x: x.fuel == "regular") 33 | .group_by("make", "model") 34 | .aggregate( 35 | all_fuel_regular=lambda x: all(x.pluck("fuel_regular")), 36 | any_fuel_regular=lambda x: any(x.pluck("fuel_regular")), 37 | count=len, 38 | count_unique_cyl=lambda x: len(set(x.pluck("cyl"))), 39 | first_hwy=lambda x: x[0].hwy, 40 | last_hwy=lambda x: x[-1].hwy, 41 | max_hwy=lambda x: max(x.pluck("hwy")), 42 | mean_hwy=lambda x: mean(x.pluck("hwy")), 43 | median_hwy=lambda x: median(x.pluck("hwy")), 44 | min_hwy=lambda x: min(x.pluck("hwy")), 45 | mode_year=lambda x: mode(x.pluck("year")), 46 | nth_id=lambda x: x[0].id, 47 | quantile_hwy=lambda x: di.quantile(di.Vector(x.pluck("hwy")), 0.75), 48 | std_hwy=lambda x: stdev(x.pluck("hwy")), 49 | sum_hwy=lambda x: sum(x.pluck("hwy")), 50 | var_hwy=lambda x: variance(x.pluck("hwy"))) 51 | .modify(mean_hwy=lambda x: round2(x.mean_hwy)) 52 | .modify(std_hwy =lambda x: round2(x.std_hwy)) 53 | .modify(var_hwy =lambda x: round2(x.var_hwy)) 54 | .write_csv("aggregate.ld.csv")) 55 | 56 | # ANTI JOIN 57 | reviews = read_json("../data/listings-reviews.json") 58 | (read_json("../data/listings.json") 59 | .anti_join(reviews, "id") 60 | .write_csv("anti_join.ld.csv")) 61 | 62 | # FILTER 63 | (read_json("../data/vehicles.json") 64 | .filter(lambda x: x.year < 2000) 65 | .filter(lambda x: x.cyl < 10) 66 | .write_csv("filter.ld.csv")) 67 | 68 | # FILTER OUT 69 | (read_json("../data/vehicles.json") 70 | .filter_out(lambda x: x.year < 2000) 71 | .filter_out(lambda x: x.cyl < 10) 72 | .write_csv("filter_out.ld.csv")) 73 | 74 | # FULL JOIN 75 | reviews = read_json("../data/listings-reviews.json") 76 | reviews = reviews + reviews 77 | (read_json("../data/listings.json") 78 | .full_join(reviews, "id") 79 | .write_csv("full_join.ld.csv")) 80 | 81 | # INNER JOIN 82 | reviews = read_json("../data/listings-reviews.json") 83 | (read_json("../data/listings.json") 84 | .inner_join(reviews, "id") 85 | .write_csv("inner_join.ld.csv")) 86 | 87 | # LEFT JOIN 88 | reviews = read_json("../data/listings-reviews.json") 89 | 
(read_json("../data/listings.json") 90 | .left_join(reviews, "id") 91 | .write_csv("left_join.ld.csv")) 92 | 93 | # SEMI JOIN 94 | reviews = read_json("../data/listings-reviews.json") 95 | (read_json("../data/listings.json") 96 | .semi_join(reviews, "id") 97 | .write_csv("semi_join.ld.csv")) 98 | 99 | # SORT 100 | (read_json("../data/vehicles.json") 101 | .sort(make=1, model=1, year=-1) 102 | .write_csv("sort.ld.csv")) 103 | 104 | # UNIQUE 105 | (read_json("../data/vehicles.json") 106 | .unique("make", "model", "year") 107 | .write_csv("unique.ld.csv")) 108 | -------------------------------------------------------------------------------- /validation/generate.R: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-unix -*- 2 | 3 | suppressPackageStartupMessages({ 4 | library(dplyr) 5 | library(readr) 6 | }) 7 | 8 | options(dplyr.summarise.inform=FALSE) 9 | 10 | Mode = function(x) { 11 | # https://stackoverflow.com/q/2547402 12 | ux = unique(x) 13 | return(ux[which.max(tabulate(match(x, ux)))]) 14 | } 15 | 16 | read_csv = function(path) { 17 | data = readr::read_csv(path, show_col_types=FALSE, lazy=FALSE) 18 | for (name in colnames(data)) { 19 | # Drop all rows with NAs to avoid upcasting to float 20 | # and differing NA representation in output. 21 | data = data[!is.na(data[[name]]),] 22 | if (is.character(data[[name]])) 23 | # Use all lower case for strings to avoid differing 24 | # sorting of lower vs. upper case characters. 25 | data[[name]] = tolower(data[[name]]) 26 | } 27 | return(data) 28 | } 29 | 30 | write_csv = function(data, path) { 31 | readr::write_csv(data, path, na="") 32 | } 33 | 34 | # AGGREGATE 35 | read_csv("../data/vehicles.csv") |> 36 | mutate(fuel_regular=(fuel == "regular")) |> 37 | group_by(make, model) |> 38 | summarise( 39 | all_fuel_regular=all(fuel_regular), 40 | any_fuel_regular=any(fuel_regular), 41 | count=n(), 42 | count_unique_cyl=n_distinct(cyl), 43 | first_hwy=first(hwy), 44 | last_hwy=last(hwy), 45 | max_hwy=max(hwy), 46 | mean_hwy=mean(hwy), 47 | median_hwy=median(hwy), 48 | min_hwy=min(hwy), 49 | mode_year=Mode(year), 50 | nth_id=nth(id, 1), 51 | quantile_hwy=quantile(hwy, 0.75, type=7), 52 | std_hwy=sd(hwy), 53 | sum_hwy=sum(hwy), 54 | var_hwy=var(hwy)) |> 55 | mutate(mean_hwy=round(mean_hwy, 2)) |> 56 | mutate(std_hwy=round(std_hwy, 2)) |> 57 | mutate(var_hwy=round(var_hwy, 2)) |> 58 | write_csv("aggregate.R.csv") 59 | 60 | # ANTI JOIN 61 | reviews = read_csv("../data/listings-reviews.csv") 62 | read_csv("../data/listings.csv") |> 63 | anti_join(reviews, by="id") |> 64 | write_csv("anti_join.R.csv") 65 | 66 | # FILTER 67 | read_csv("../data/vehicles.csv") |> 68 | filter(year < 2000) |> 69 | filter(cyl < 10) |> 70 | write_csv("filter.R.csv") 71 | 72 | # FILTER OUT 73 | read_csv("../data/vehicles.csv") |> 74 | filter(!(year < 2000)) |> 75 | filter(!(cyl < 10)) |> 76 | write_csv("filter_out.R.csv") 77 | 78 | # FULL JOIN 79 | reviews = read_csv("../data/listings-reviews.csv") 80 | reviews = bind_rows(reviews, reviews) 81 | read_csv("../data/listings.csv") |> 82 | full_join(reviews, by="id") |> 83 | write_csv("full_join.R.csv") 84 | 85 | # INNER JOIN 86 | reviews = read_csv("../data/listings-reviews.csv") 87 | read_csv("../data/listings.csv") |> 88 | inner_join(reviews, by="id") |> 89 | write_csv("inner_join.R.csv") 90 | 91 | # LEFT JOIN 92 | reviews = read_csv("../data/listings-reviews.csv") 93 | read_csv("../data/listings.csv") |> 94 | left_join(reviews, by="id") |> 95 | write_csv("left_join.R.csv") 96 
| 97 | # SEMI JOIN 98 | reviews = read_csv("../data/listings-reviews.csv") 99 | read_csv("../data/listings.csv") |> 100 | semi_join(reviews, by="id") |> 101 | write_csv("semi_join.R.csv") 102 | 103 | # SORT 104 | read_csv("../data/vehicles.csv") |> 105 | arrange(make, model, desc(year)) |> 106 | write_csv("sort.R.csv") 107 | 108 | # UNIQUE 109 | read_csv("../data/vehicles.csv") |> 110 | distinct(make, model, year, .keep_all=TRUE) |> 111 | write_csv("unique.R.csv") 112 | -------------------------------------------------------------------------------- /validation/validate-df.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -f *.df.csv 3 | rm -f *.R.csv 4 | echo "Generating data..." 5 | python3 generate-df.py 6 | Rscript generate.R 7 | # Remove quotes around strings. 8 | sed -ri 's/"//g' *.csv 9 | # Remove trailing zero decimals. 10 | sed -ri "s/\.0*(,|$)/\1/g" *.csv 11 | # Unify spelling of special values. 12 | sed -ri "s/true/TRUE/gi" *.csv 13 | sed -ri "s/false/FALSE/gi" *.csv 14 | EXIT_STATUS=0 15 | for NUM in $(ls *.df.csv | cut -d. -f1); do 16 | printf "%-23s" "Checking $NUM... " 17 | NLINES=$(diff -y --suppress-common-lines $NUM.df.csv $NUM.R.csv | wc -l) 18 | if [ $NLINES -gt 0 ]; then 19 | echo "$NLINES lines differ" 20 | EXIT_STATUS=1 21 | else 22 | echo "OK" 23 | fi 24 | done 25 | exit $EXIT_STATUS 26 | -------------------------------------------------------------------------------- /validation/validate-ld.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -f *.ld.csv 3 | rm -f *.R.csv 4 | echo "Generating data..." 5 | python3 generate-ld.py 6 | Rscript generate.R 7 | # Remove quotes around strings. 8 | sed -ri 's/"//g' *.csv 9 | # Remove trailing zero decimals. 10 | sed -ri "s/\.0*(,|$)/\1/g" *.csv 11 | # Unify spelling of special values. 12 | sed -ri "s/true/TRUE/gi" *.csv 13 | sed -ri "s/false/FALSE/gi" *.csv 14 | EXIT_STATUS=0 15 | for NUM in $(ls *.ld.csv | cut -d. -f1); do 16 | printf "%-23s" "Checking $NUM... " 17 | NLINES=$(diff -y --suppress-common-lines $NUM.ld.csv $NUM.R.csv | wc -l) 18 | if [ $NLINES -gt 0 ]; then 19 | echo "$NLINES lines differ" 20 | EXIT_STATUS=1 21 | else 22 | echo "OK" 23 | fi 24 | done 25 | exit $EXIT_STATUS 26 | --------------------------------------------------------------------------------