├── .gitattributes
├── .github
│   └── workflows
│       ├── ci.yaml
│       └── pypi-release.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE.txt
├── README.md
├── ci
│   └── environment.yml
├── conftest.py
├── diagrams
│   ├── aggregate.png
│   ├── aggregate_dims.png
│   ├── aggregate_dims.svg
│   ├── diagram.docx
│   ├── label_contiguous_1d.png
│   ├── multi_arange.png
│   └── multi_cumsum.png
├── numpy_groupies
│   ├── __init__.py
│   ├── aggregate_numba.py
│   ├── aggregate_numpy.py
│   ├── aggregate_numpy_ufunc.py
│   ├── aggregate_pandas.py
│   ├── aggregate_purepy.py
│   ├── benchmarks
│   │   ├── __init__.py
│   │   ├── generic.py
│   │   └── simple.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_compare.py
│   │   ├── test_generic.py
│   │   ├── test_indices.py
│   │   └── test_utils.py
│   └── utils.py
└── pyproject.toml

/.gitattributes:
--------------------------------------------------------------------------------
1 | numpy_groupies/_version.py export-subst
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 |   push:
4 |     branches:
5 |       - "master"
6 |   pull_request:
7 |     branches:
8 |       - "*"
9 |   schedule:
10 |     - cron: "0 0 * * *" # Daily “At 00:00”
11 |   workflow_dispatch: # allows you to trigger manually
12 |
13 | concurrency:
14 |   group: ${{ github.workflow }}-${{ github.ref }}
15 |   cancel-in-progress: true
16 |
17 | jobs:
18 |   build:
19 |     name: Build (${{ matrix.python-version }}, ${{ matrix.os }})
20 |     runs-on: ${{ matrix.os }}
21 |     defaults:
22 |       run:
23 |         shell: bash -l {0}
24 |     strategy:
25 |       fail-fast: false
26 |       matrix:
27 |         os: ["ubuntu-latest"]
28 |         python-version: ["3.9", "3.10", "3.11", "3.12"]
29 |     steps:
30 |       - uses: actions/checkout@v4
31 |         with:
32 |           fetch-depth: 1
33 |       - name: Set environment variables
34 |         run: |
35 |           echo "CONDA_ENV_FILE=ci/environment.yml" >> $GITHUB_ENV
36 |           echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV
37 |
38 |       - name: Setup micromamba
39 |         uses: mamba-org/setup-micromamba@v1
40 |         with:
41 |           environment-file: ${{ env.CONDA_ENV_FILE }}
42 |           environment-name: npg-tests
43 |           cache-environment: true
44 |           cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{matrix.python-version}}-${{hashFiles(env.CONDA_ENV_FILE)}}"
45 |           create-args: >-
46 |             python=${{matrix.python-version}}
47 |             conda
48 |
49 |       # We only want to install this on one run, because otherwise we'll have
50 |       # duplicate annotations.
51 |       - name: Install error reporter
52 |         if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11'
53 |         run: |
54 |           python -m pip install pytest-github-actions-annotate-failures
55 |
56 |       - name: Set up conda environment
57 |         shell: bash -l {0}
58 |         run: |
59 |           python -m pip install -e .[dev]
60 |           conda list
61 |
62 |       - name: Run Tests
63 |         shell: bash -l {0}
64 |         run: |
65 |           pytest
66 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-release.yaml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 | on:
3 |   workflow_dispatch:
4 |   release:
5 |     types:
6 |       - published
7 |
8 | jobs:
9 |   build-artifacts:
10 |     runs-on: ubuntu-latest
11 |     if: github.repository == 'ml31415/numpy-groupies'
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |         with:
15 |           fetch-depth: 1
16 |       - uses: actions/setup-python@v5
17 |         name: Install Python
18 |         with:
19 |           python-version: "3.11"
20 |       - name: Build tarball and wheels
21 |         run: |
22 |           git clean -xdf
23 |           git restore -SW .
24 |           pipx run build
25 |       - name: List contents of built dist
26 |         run: |
27 |           ls -ltrh
28 |           ls -ltrh dist
29 |       - name: Check built artifacts
30 |         run: |
31 |           pipx run twine check --strict dist/*
32 |           pwd
33 |           if [ -f dist/numpy_groupies-0.0.0.tar.gz ]; then
34 |             echo "❌ INVALID VERSION NUMBER"
35 |             exit 1
36 |           else
37 |             echo "✅ Looks good"
38 |           fi
39 |       - name: Test artifact installation
40 |         run: |
41 |           python -m pip install --upgrade pip
42 |           python -m pip install dist/*.tar.gz
43 |       - uses: actions/upload-artifact@v4
44 |         with:
45 |           name: release
46 |           path: dist
47 |
48 |   test-built-dist:
49 |     needs: build-artifacts
50 |     runs-on: ubuntu-latest
51 |     steps:
52 |       - uses: actions/download-artifact@v4
53 |         with:
54 |           name: release
55 |           path: dist
56 |       - name: Publish package to TestPyPI
57 |         uses: pypa/gh-action-pypi-publish@release/v1
58 |         with:
59 |           password: ${{ secrets.TESTPYPI_TOKEN }}
60 |           repository-url: https://test.pypi.org/legacy/
61 |           skip-existing: true
62 |
63 |   upload-to-pypi:
64 |     needs: test-built-dist
65 |     if: github.event_name == 'release'
66 |     runs-on: ubuntu-latest
67 |     steps:
68 |       - uses: actions/download-artifact@v4
69 |         with:
70 |           name: release
71 |           path: dist
72 |       - name: Publish package to PyPI
73 |         uses: pypa/gh-action-pypi-publish@release/v1
74 |         with:
75 |           password: ${{ secrets.PYPI_TOKEN }}
76 |           skip-existing: true
77 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 |
21 | # Installer logs
22 | pip-log.txt
23 |
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 |
29 | # Translations
30 | *.mo
31 |
32 | # Dev stuff
33 | .mr.developer.cfg
34 | .idea
35 | .project
36 | .pydevproject
37 | .settings/
38 | .cache/
39 | __pycache__/
40 | .eggs/
41 | .hypothesis/
42 | *~
43 | *.ini
44 |
45 | # Dynamic versioning
46 | numpy_groupies/_version.py
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
3 |     rev: v4.5.0
4
| hooks: 5 | - id: debug-statements 6 | - id: detect-private-key 7 | - id: check-builtin-literals 8 | - id: check-case-conflict 9 | - id: check-executables-have-shebangs 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-symlinks 13 | - id: check-toml 14 | - id: check-xml 15 | - id: check-yaml 16 | exclude: (.pre-commit-config\.yaml) 17 | 18 | - repo: https://github.com/astral-sh/ruff-pre-commit 19 | rev: v0.3.7 20 | hooks: 21 | - id: ruff 22 | - id: ruff-format 23 | 24 | - repo: https://github.com/codespell-project/codespell 25 | rev: v2.2.6 26 | hooks: 27 | - id: codespell 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, numpy-groupies developers 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub Workflow CI Status](https://img.shields.io/github/actions/workflow/status/ml31415/numpy-groupies/ci.yaml?branch=master&logo=github&style=flat)](https://github.com/ml31415/numpy-groupies/actions) 2 | [![PyPI](https://img.shields.io/pypi/v/numpy-groupies.svg?style=flat)](https://pypi.org/project/numpy-groupies/) 3 | [![Conda-forge](https://img.shields.io/conda/vn/conda-forge/numpy_groupies.svg?style=flat)](https://anaconda.org/conda-forge/numpy_groupies) 4 | ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fml31415%2Fnumpy-groupies%2Fmaster%2Fpyproject.toml) 5 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/numpy-groupies) 6 | 7 | # numpy-groupies 8 | 9 | This package consists of a small library of optimised tools for doing things that can roughly 10 | be considered "group-indexing operations". The most prominent tool is `aggregate`, which is 11 | described in detail further down the page. 
12 |
13 |
14 | ## Installation
15 | If you have `pip`, then simply:
16 | ```
17 | pip install numpy_groupies
18 | ```
19 | Note that `numpy_groupies` doesn't have any compulsory dependencies (even `numpy` is optional)
20 | so you should be able to install it fairly easily even without a package manager. If you just
21 | want one particular implementation of `aggregate` (e.g. `aggregate_numpy.py`), you can download
22 | that one file, and copy-paste the contents of `utils.py` into the top of that file (replacing
23 | the `from .utils import (...)` line).
24 |
25 |
26 | ## aggregate
27 |
28 | ![aggregate_diagram](/diagrams/aggregate.png)
29 | ```python
30 | import numpy as np
31 | import numpy_groupies as npg
32 | group_idx = np.array([ 3, 0, 0, 1, 0, 3, 5, 5, 0, 4])
33 | a = np.array([13.2, 3.5, 3.5,-8.2, 3.0,13.4,99.2,-7.1, 0.0,53.7])
34 | npg.aggregate(group_idx, a, func='sum', fill_value=0)
35 | # >>> array([10.0, -8.2, 0.0, 26.6, 53.7, 92.1])
36 | ```
37 | `aggregate` takes an array of values, and an array giving the group number for each of those values.
38 | It then returns the sum (or mean, or std, or any, ...etc.) of the values in each group. You have
39 | probably come across this idea before - see [Matlab's `accumarray` function](http://uk.mathworks.com/help/matlab/ref/accumarray.html?refresh=true), or
40 | [`pandas` groupby concept](http://pandas.pydata.org/pandas-docs/dev/groupby.html), or
41 | [MapReduce paradigm](http://en.wikipedia.org/wiki/MapReduce), or simply the [basic histogram](https://en.wikipedia.org/wiki/Histogram).
42 |
43 | A few of the implemented functions do not reduce the data; instead they calculate values cumulatively
44 | while iterating over the data, or permute them. The output size matches the input size.
45 |
46 | ```python
47 | group_idx = np.array([4, 3, 3, 4, 4, 1, 1, 1, 7, 8, 7, 4, 3, 3, 1, 1])
48 | a = np.array([3, 4, 1, 3, 9, 9, 6, 7, 7, 0, 8, 2, 1, 8, 9, 8])
49 | npg.aggregate(group_idx, a, func='cumsum')
50 | # >>> array([3, 4, 5, 6,15, 9,15,22, 7, 0,15,17, 6,14,31,39])
51 | ```
52 |
53 |
54 | ### Inputs
55 | The function accepts several combinations of inputs, producing different shapes of output.
56 | We give a brief description of the general meaning of the inputs and then go over the different combinations
57 | in more detail:
58 |
59 | * `group_idx` - array of non-negative integers to be used as the "labels" with which to group the values in `a`.
60 | * `a` - array of values to be aggregated.
61 | * `func='sum'` - the function to use for aggregation. See the section below for more details.
62 | * `size=None` - the shape of the output array. If `None`, the maximum value in `group_idx` will set the size of the output.
63 | * `fill_value=0` - value to use for output groups that do not appear anywhere in the `group_idx` input array.
64 | * `order='C'` - for multidimensional output, this controls the layout in memory, can be `'F'` for fortran-style.
65 | * `dtype=None` - the `dtype` of the output. `None` means choose a sensible type for the given `a`, `func`, and `fill_value`.
66 | * `axis=None` - explained below.
67 | * `ddof=0` - passed through into calculations of variance and standard deviation (see section on functions).
68 |
69 | ![aggregate_dims_diagram](/diagrams/aggregate_dims.png)
70 |
71 | * Form 1 is the simplest, taking `group_idx` and `a` of matching 1D lengths, and producing a 1D output.
72 | * Form 2 is similar to Form 1, but takes a scalar `a`, which is broadcast out to the length of `group_idx`. Note that this is generally not that useful.
73 | * Form 3 is more complicated. `group_idx` is the same length as `a.shape[axis]`. The groups are broadcast out along the other axis/axes of `a`, thus the output is of shape `n_groups x a.shape[0] x ... x a.shape[axis-1] x a.shape[axis+1] x ... x a.shape[-1]`, i.e. the output has two or more dimensions.
74 | * Form 4 also produces output with two or more dimensions, but for very different reasons than Form 3. Here `a` is 1D and `group_idx` is exactly `2D`, whereas in Form 3 `a` is `ND`, `group_idx` is `1D`, and we provide a value for `axis`. The length of `a` must match `group_idx.shape[1]`, and the value of `group_idx.shape[0]` determines the number of dimensions in the output, i.e. `group_idx[:,99]` gives the `(x,y,z)` group indices for `a[99]`.
75 | * Form 5 is the same as Form 4 but with scalar `a`. As with Form 2, this is rarely that helpful.
76 |
77 | **Note on performance.** The `order` of the output is unlikely to affect the performance of `aggregate` (although it may affect your downstream usage of that output); however, the order of multidimensional `a` or `group_idx` can affect performance: in Form 4 it is best if columns are contiguous in memory within `group_idx`, i.e. `group_idx[:, 99]` corresponds to a contiguous chunk of memory; in Form 3 it's best if all the data in `a` for `group_idx[i]` is contiguous, e.g. if `axis=1` then we want `a[:, 55]` to be contiguous.
78 |
79 |
80 | ### Available functions
81 | By default, `aggregate` assumes you want to sum the values within each group; however, you can specify another
82 | function using the `func` kwarg. This `func` can be any custom callable, but you will likely want one of
83 | the following optimized functions. Note that not all functions are provided by all implementations.
84 |
85 | * `'sum'` - sum of items within each group (see example above).
86 | * `'prod'` - product of items within each group.
87 | * `'mean'` - mean of items within each group.
88 | * `'var'` - variance of items within each group. Use the `ddof` kwarg for degrees of freedom. The divisor used in calculations is `N - ddof`, where `N` represents the number of elements. By default `ddof` is zero.
89 | * `'std'` - standard deviation of items within each group. Use the `ddof` kwarg for degrees of freedom (see `var` above).
90 | * `'min'` - minimum value of items within each group.
91 | * `'max'` - maximum value of items within each group.
92 | * `'first'` - first item in `a` from each group.
93 | * `'last'` - last item in `a` from each group.
94 | * `'argmax'` - the index in `a` of the maximum value in each group.
95 | * `'argmin'` - the index in `a` of the minimum value in each group.
96 |
97 | The above functions also have a `nan`-form, which skips the `nan` values instead of propagating them to the result of the calculation:
98 | * `'nansum'`, `'nanprod'`, `'nanmean'`, `'nanvar'`, `'nanstd'`, `'nanmin'`, `'nanmax'`, `'nanfirst'`, `'nanlast'`, `'nanargmax'`, `'nanargmin'`
99 |
100 | The following functions are slightly different in that they always return boolean values. Their treatment of nans also differs from the above:
101 | * `'all'` - `True` if all items within a group are truthy. Note that `np.all(nan)` is `True`, i.e. `nan` is actually truthy.
102 | * `'any'` - `True` if any items within a group are truthy.
103 | * `'allnan'` - `True` if all items within a group are `nan`.
104 | * `'anynan'` - `True` if any items within a group are `nan`.
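
For example, here is a small illustration of the `nan`-handling described above (outputs shown as comments; note that an all-`nan` group falls back to `fill_value` for `'nansum'`):
```python
import numpy as np
import numpy_groupies as npg

group_idx = np.array([0, 0, 1, 1, 2])
a = np.array([1.0, np.nan, 2.0, 3.0, np.nan])
npg.aggregate(group_idx, a, func='sum')     # array([nan,  5., nan])
npg.aggregate(group_idx, a, func='nansum')  # array([1., 5., 0.])
npg.aggregate(group_idx, a, func='anynan')  # array([ True, False,  True])
```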
105 |
106 | The following functions don't reduce the data, but instead produce an output matching the size of the input:
107 | * `'cumsum'` - cumulative sum of items within each group.
108 | * `'cumprod'` - cumulative product of items within each group. (numba only)
109 | * `'cummin'` - cumulative minimum of items within each group. (numba only)
110 | * `'cummax'` - cumulative maximum of items within each group. (numba only)
111 | * `'sort'` - sort the items within each group in ascending order; use `reverse=True` to invert the order.
112 |
113 | Finally, there is one function which doesn't reduce each group to a single value, but instead returns the full
114 | set of items within the group:
115 | * `'array'` - simply returns the grouped items, using the same order as they appeared in `a`. (numpy only)
116 |
117 |
118 | ### Examples
119 | Compute sums of consecutive integers, and then compute products of those consecutive integers.
120 | ```python
121 | group_idx = np.arange(5).repeat(3)
122 | # group_idx: array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])
123 | a = np.arange(group_idx.size)
124 | # a: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
125 | x = npg.aggregate(group_idx, a) # sum is default
126 | # x: array([ 3, 12, 21, 30, 39])
127 | x = npg.aggregate(group_idx, a, 'prod')
128 | # x: array([ 0, 60, 336, 990, 2184])
129 | ```
130 |
131 | Get the variance ignoring nans, setting all-nan groups to `nan`.
132 | ```python
133 | x = npg.aggregate(group_idx, a, func='nanvar', fill_value=np.nan)
134 | ```
135 |
136 | Count the number of elements in each group. Note that this is equivalent to doing `np.bincount(group_idx)`;
137 | indeed, that is how the numpy implementation does it.
138 | ```python
139 | x = npg.aggregate(group_idx, 1)
140 | ```
141 |
142 | Sum 1000 values into a three-dimensional cube of size 15x15x15. Note that in this example all three dimensions
143 | have the same size, but that doesn't have to be the case.
144 | ```python
145 | group_idx = np.random.randint(0, 15, size=(3, 1000))
146 | a = np.random.random(group_idx.shape[1])
147 | x = npg.aggregate(group_idx, a, func="sum", size=(15,15,15), order="F")
148 | # x.shape: (15, 15, 15)
149 | # np.isfortran(x): True
150 | ```
151 |
152 | Use a custom function to generate some strings.
153 | ```python
154 | group_idx = np.array([1, 0, 1, 4, 1])
155 | a = np.array([12.0, 3.2, -15, 88, 12.9])
156 | x = npg.aggregate(group_idx, a,
157 |                   func=lambda g: ' or maybe '.join(str(gg) for gg in g), fill_value='')
158 | # x: ['3.2', '12.0 or maybe -15.0 or maybe 12.9', '', '', '88.0']
159 | ```
160 |
161 | Use the `axis` arg in order to do a sum-aggregation on three rows simultaneously.
162 | ```python
163 | a = np.array([[99, 2, 11, 14, 20],
164 |               [33, 76, 12, 100, 71],
165 |               [67, 10, -8, 1, 9]])
166 | group_idx = np.array([[3, 3, 7, 0, 0]])
167 | x = npg.aggregate(group_idx, a, axis=1)
168 | # x : [[ 34, 0, 0, 101, 0, 0, 0, 11],
169 | #      [171, 0, 0, 109, 0, 0, 0, 12],
170 | #      [ 10, 0, 0, 77, 0, 0, 0, -8]]
171 | ```
172 |
173 |
174 | ### Multiple implementations
175 | There are multiple implementations of `aggregate` provided. If you use `from numpy_groupies import aggregate`,
176 | the best available implementation will automatically be selected. Otherwise, you can pick a specific version directly,
177 | like `from numpy_groupies import aggregate_nb as aggregate`, or by importing `aggregate` from the implementing module,
178 | e.g. `from numpy_groupies.aggregate_numpy import aggregate`.
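
For instance, to prefer the fast `numba` implementation while falling back gracefully when `numba` isn't installed, something like the following sketch works (it relies on `numpy_groupies/__init__.py` setting `aggregate_nb` to `None` in that case):
```python
import numpy_groupies as npg

# Use the numba implementation when it could be imported,
# otherwise fall back to whatever `npg.aggregate` resolved to.
aggregate = npg.aggregate_nb if npg.aggregate_nb is not None else npg.aggregate
```
When `numba` is importable, `npg.aggregate` already points at the numba version, so this pattern is mostly useful for making the preference explicit.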
179 |
180 | Currently the following implementations exist:
181 | * **numpy** - This is the default implementation. It uses plain `numpy`, mainly relying on `np.bincount` and basic indexing magic. It comes without other dependencies except `numpy` and shows reasonable performance for occasional usage.
182 | * **numba** - This is the most performant implementation, based on jit compilation provided by numba and LLVM.
183 | * **pure python** - This implementation has no dependencies and uses only the standard library. It's horribly slow and should only be used if numpy is not available.
184 | * **numpy ufunc** - *Only for benchmarking.* This implementation uses the `.at` method of numpy's `ufunc`s (e.g. `add.at`), which would appear to be designed for performing exactly the same calculation that `aggregate` executes; however, the numpy implementation is rather incomplete.
185 | * **pandas** - *Only for reference.* Pandas' `groupby` concept is the same as the task performed by `aggregate`. However, `pandas` is not actually faster than the default `numpy` implementation. Also, note that there may be room for improvement in the way that `pandas` is utilized here. Most notably, when computing multiple aggregations of the same data (e.g. `'min'` and `'max'`), pandas could potentially be used more efficiently.
186 |
187 | All implementations have the same calling syntax and produce the same outputs, to within some floating-point error.
188 | However, some implementations only support a subset of the valid inputs and will sometimes throw `NotImplementedError`.
189 |
190 |
191 | ### Benchmarks
192 | Scripts for testing and benchmarking are included in this repository. For benchmarking, run
193 | `python -m numpy_groupies.benchmarks.generic` from the root of this repository.
194 |
195 | Below we are using `500,000` indices uniformly picked from `[0, 1000)`. The values of `a` are uniformly picked from
196 | the interval `[0,1)`, with anything less than `0.2` then set to 0 (in order to serve as falsy values in boolean operations).
197 | For `nan-` operations another 20% of the values are set to `nan`, leaving the remainder on the interval `[0.2,0.8)`.
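
In code, this setup looks roughly as follows (a sketch for orientation only; the actual benchmark setup lives in `numpy_groupies/benchmarks/generic.py` and may differ in detail):
```python
import numpy as np

rng = np.random.default_rng(0)  # seed chosen arbitrarily for this sketch
group_idx = rng.integers(0, 1000, size=500_000)
a = rng.random(500_000)
a[a < 0.2] = 0  # falsy values for the boolean operations
a_nan = a.copy()
a_nan[a_nan > 0.8] = np.nan  # roughly another 20% of values, for the nan- operations
```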
198 |
199 | The benchmarking results are given in ms for an i7-7560U running at 2.40GHz:
200 |
201 | | function  | ufunc   | numpy   | numba   | pandas  |
202 | |-----------|---------|---------|---------|---------|
203 | | sum       | 1.950   | 1.728   | 0.708   | 11.832  |
204 | | prod      | 2.279   | 2.349   | 0.709   | 11.649  |
205 | | min       | 2.472   | 2.489   | 0.716   | 11.686  |
206 | | max       | 2.457   | 2.480   | 0.745   | 11.598  |
207 | | len       | 1.481   | 1.270   | 0.635   | 10.932  |
208 | | all       | 37.186  | 3.054   | 0.892   | 12.587  |
209 | | any       | 35.278  | 5.157   | 0.890   | 12.845  |
210 | | anynan    | 5.783   | 2.126   | 0.762   | 144.740 |
211 | | allnan    | 7.971   | 4.367   | 0.774   | 144.507 |
212 | | mean      | ----    | 2.500   | 0.825   | 13.284  |
213 | | std       | ----    | 4.528   | 0.965   | 12.193  |
214 | | var       | ----    | 4.269   | 0.969   | 12.657  |
215 | | first     | ----    | 1.847   | 0.811   | 11.584  |
216 | | last      | ----    | 1.309   | 0.581   | 11.842  |
217 | | argmax    | ----    | 3.504   | 1.411   | 293.640 |
218 | | argmin    | ----    | 6.996   | 1.347   | 290.977 |
219 | | nansum    | ----    | 5.388   | 1.569   | 15.239  |
220 | | nanprod   | ----    | 5.707   | 1.546   | 15.004  |
221 | | nanmin    | ----    | 5.831   | 1.700   | 14.292  |
222 | | nanmax    | ----    | 5.847   | 1.731   | 14.927  |
223 | | nanlen    | ----    | 3.170   | 1.529   | 14.529  |
224 | | nanall    | ----    | 6.499   | 1.640   | 15.931  |
225 | | nanany    | ----    | 8.041   | 1.656   | 15.839  |
226 | | nanmean   | ----    | 5.636   | 1.583   | 15.185  |
227 | | nanvar    | ----    | 7.514   | 1.682   | 15.643  |
228 | | nanstd    | ----    | 7.292   | 1.666   | 15.104  |
229 | | nanfirst  | ----    | 5.318   | 2.096   | 14.432  |
230 | | nanlast   | ----    | 4.943   | 1.473   | 14.637  |
231 | | nanargmin | ----    | 7.977   | 1.779   | 298.911 |
232 | | nanargmax | ----    | 5.869   | 1.802   | 301.022 |
233 | | cumsum    | ----    | 71.713  | 1.119   | 8.864   |
234 | | cumprod   | ----    | ----    | 1.123   | 12.100  |
235 | | cummax    | ----    | ----    | 1.062   | 12.133  |
236 | | cummin    | ----    | ----    | 0.973   | 11.908  |
237 | | arbitrary | ----    | 147.853 | 46.690  | 129.779 |
238 | | sort      | ----    | 167.699 | ----    | ----    |
239 |
240 | _Linux(x86_64), Python 3.10.12, Numpy 1.25.2, Numba 0.58.0, Pandas 2.0.2_
241 |
242 | ## Development
243 | This project was started by @ml31415, and the `numba` and `weave` implementations are by him. The pure
244 | python and `numpy` implementations were written by @d1manson.
245 |
246 | The authors hope that `numpy`'s `ufunc.at` methods or some other implementation of `aggregate` within
247 | `numpy` or `scipy` will eventually be fast enough to make this package redundant. Numpy 1.25 actually
248 | contained major [improvements on ufunc speed](https://numpy.org/doc/stable/release/1.25.0-notes.html),
249 | which reduced the speed gap between numpy and the numba implementation a lot.
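
To check how that gap looks on your own numpy version, here is a rough sum-only comparison of the two underlying approaches (a standalone sketch, not part of the package; absolute timings vary by machine):
```python
import timeit
import numpy as np

group_idx = np.random.randint(0, 1000, size=500_000)
a = np.random.random(500_000)

def with_ufunc_at():
    out = np.zeros(1000)
    np.add.at(out, group_idx, a)  # the route the "numpy ufunc" backend takes
    return out

def with_bincount():
    return np.bincount(group_idx, weights=a, minlength=1000)  # the numpy backend's route

print(timeit.timeit(with_ufunc_at, number=10))
print(timeit.timeit(with_bincount, number=10))
```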
250 | -------------------------------------------------------------------------------- /ci/environment.yml: -------------------------------------------------------------------------------- 1 | name: npg-tests 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - numpy 7 | - pandas 8 | - numba 9 | - pytest 10 | - numpy_groupies 11 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | pytest configuration to silently discard test items with invalid parameter combinations 3 | See: https://github.com/pytest-dev/pytest/issues/3730#issuecomment-567142496 4 | """ 5 | 6 | 7 | def pytest_configure(config): 8 | config.addinivalue_line( 9 | "markers", "deselect_if(func): function to deselect tests from parametrization" 10 | ) 11 | 12 | 13 | def pytest_collection_modifyitems(config, items): 14 | removed = [] 15 | kept = [] 16 | for item in items: 17 | m = item.get_closest_marker("deselect_if") 18 | if m: 19 | func = m.kwargs["func"] 20 | if func(**item.callspec.params): 21 | removed.append(item) 22 | continue 23 | kept.append(item) 24 | if removed: 25 | config.hook.pytest_deselected(items=removed) 26 | items[:] = kept 27 | -------------------------------------------------------------------------------- /diagrams/aggregate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/aggregate.png -------------------------------------------------------------------------------- /diagrams/aggregate_dims.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/aggregate_dims.png -------------------------------------------------------------------------------- /diagrams/aggregate_dims.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 22 | 24 | 29 | 34 | 39 | 44 | 49 | 54 | 59 | 64 | 69 | 76 | 81 | 86 | 91 | 96 | 101 | 106 | 111 | 116 | 121 | 126 | 131 | 136 | 141 | 146 | 151 | 156 | 164 | 171 | 178 | 179 | 180 | 202 | 204 | 205 | 207 | image/svg+xml 208 | 210 | 211 | 212 | 213 | 214 | 219 | 235 | 246 | 257 | group_idx a 295 | 303 | result 317 | 327 | 328 | 339 | 350 | group_idx a, axis=1 395 | result 421 | 427 | 435 | 445 | group_idx 472 | a result 507 | 518 | 530 | 542 | 543 | 544 | 550 | 558 | 568 | group_idx a result 619 | 630 | 642 | 654 | 655 | 666 | 667 | 673 | 683 | 694 | group_idx 721 | a 748 | result form 1 781 | form 2 form 3 form 4 form 5 note ndim(a) can be >2 857 | and ndim(result) = ndim(a) 869 | 870 | 871 | -------------------------------------------------------------------------------- /diagrams/diagram.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/diagram.docx -------------------------------------------------------------------------------- /diagrams/label_contiguous_1d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/label_contiguous_1d.png -------------------------------------------------------------------------------- 
/diagrams/multi_arange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/multi_arange.png -------------------------------------------------------------------------------- /diagrams/multi_cumsum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/diagrams/multi_cumsum.png -------------------------------------------------------------------------------- /numpy_groupies/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregate_purepy import aggregate as aggregate_py 2 | 3 | 4 | def dummy_no_impl(*args, **kwargs): 5 | raise NotImplementedError( 6 | "You may need to install another package (numpy or numba) to access a working implementation." 7 | ) 8 | 9 | 10 | aggregate = aggregate_py 11 | 12 | try: 13 | import numpy as np 14 | except ImportError: 15 | aggregate_np = aggregate_ufunc = dummy_no_impl 16 | multi_arange = multi_cumsum = label_contiguous_1d = dummy_no_impl 17 | else: 18 | from .aggregate_numpy import aggregate 19 | 20 | aggregate_np = aggregate 21 | from .aggregate_numpy_ufunc import aggregate as aggregate_ufunc 22 | from .utils import ( 23 | label_contiguous_1d, 24 | multi_arange, 25 | relabel_groups_masked, 26 | relabel_groups_unique, 27 | unpack, 28 | ) 29 | 30 | 31 | try: 32 | import numba 33 | except ImportError: 34 | aggregate_nb = None 35 | else: 36 | from .aggregate_numba import aggregate as aggregate_nb 37 | from .aggregate_numba import step_count, step_indices 38 | 39 | aggregate = aggregate_nb 40 | 41 | 42 | def uaggregate(group_idx, a, **kwargs): 43 | return unpack(group_idx, aggregate(group_idx, a, **kwargs)) 44 | 45 | 46 | try: 47 | # Version is added only when packaged 48 | from ._version import __version__ 49 | except ImportError: 50 | try: 51 | from setuptools_scm import get_version 52 | except ImportError: 53 | __version__ = "0.0.0" 54 | else: 55 | __version__ = get_version(root="..", relative_to=__file__) 56 | del get_version 57 | -------------------------------------------------------------------------------- /numpy_groupies/aggregate_numba.py: -------------------------------------------------------------------------------- 1 | import numba as nb 2 | import numpy as np 3 | 4 | from .utils import ( 5 | aggregate_common_doc, 6 | aliasing, 7 | check_dtype, 8 | check_fill_value, 9 | funcs_no_separate_nan, 10 | get_func, 11 | input_validation, 12 | ) 13 | 14 | 15 | class AggregateOp(object): 16 | """ 17 | Every subclass of AggregateOp handles a different aggregation operation. There are 18 | several private class methods that need to be overwritten by the subclasses 19 | in order to implement different functionality. 20 | 21 | On object instantiation, all necessary static methods are compiled together into 22 | two jitted callables, one for scalar arguments, and one for arrays. Calling the 23 | instantiated object picks the right cached callable, does some further preprocessing 24 | and then executes the actual aggregation operation. 
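    Subclasses override a small set of private hooks: ``_inner`` consumes one
    (group, value) pair per iteration, two-pass operations such as mean, std
    and var additionally implement ``_2pass_inner``, and N-to-N operations
    write per-input results through ``_outersetter``.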
25 | """ 26 | 27 | forced_fill_value = None 28 | counter_fill_value = 1 29 | counter_dtype = bool 30 | mean_fill_value = None 31 | mean_dtype = np.float64 32 | outer = False 33 | reverse = False 34 | nans = False 35 | 36 | def __init__(self, func=None, **kwargs): 37 | if func is None: 38 | func = type(self).__name__.lower() 39 | self.func = func 40 | self.__dict__.update(kwargs) 41 | # Cache the compiled functions, so they don't have to be recompiled on every call 42 | self._jit_scalar = self.callable(self.nans, self.reverse, scalar=True) 43 | self._jit_non_scalar = self.callable(self.nans, self.reverse, scalar=False) 44 | 45 | def __call__( 46 | self, 47 | group_idx, 48 | a, 49 | size=None, 50 | fill_value=0, 51 | order="C", 52 | dtype=None, 53 | axis=None, 54 | ddof=0, 55 | ): 56 | iv = input_validation( 57 | group_idx, 58 | a, 59 | size=size, 60 | order=order, 61 | axis=axis, 62 | check_bounds=False, 63 | func=self.func, 64 | ) 65 | group_idx, a, flat_size, ndim_idx, size, unravel_shape = iv 66 | 67 | # TODO: The typecheck should be done by the class itself, not by check_dtype 68 | dtype = check_dtype(dtype, self.func, a, len(group_idx)) 69 | check_fill_value(fill_value, dtype, func=self.func) 70 | input_dtype = type(a) if np.isscalar(a) else a.dtype 71 | ret, counter, mean, outer = self._initialize( 72 | flat_size, fill_value, dtype, input_dtype, group_idx.size 73 | ) 74 | group_idx = np.ascontiguousarray(group_idx) 75 | 76 | if not np.isscalar(a): 77 | a = np.ascontiguousarray(a) 78 | jitfunc = self._jit_non_scalar 79 | else: 80 | jitfunc = self._jit_scalar 81 | jitfunc(group_idx, a, ret, counter, mean, outer, fill_value, ddof) 82 | self._finalize(ret, counter, fill_value) 83 | 84 | if self.outer: 85 | ret = outer 86 | 87 | # Deal with ndimensional indexing 88 | if ndim_idx > 1: 89 | if unravel_shape is not None: 90 | # argreductions only 91 | mask = ret == fill_value 92 | ret[mask] = 0 93 | ret = np.unravel_index(ret, unravel_shape)[axis] 94 | ret[mask] = fill_value 95 | ret = ret.reshape(size, order=order) 96 | return ret 97 | 98 | @classmethod 99 | def _initialize(cls, flat_size, fill_value, dtype, input_dtype, input_size): 100 | if cls.forced_fill_value is None: 101 | ret = np.full(flat_size, fill_value, dtype=dtype) 102 | else: 103 | ret = np.full(flat_size, cls.forced_fill_value, dtype=dtype) 104 | 105 | counter = mean = outer = None 106 | if cls.counter_fill_value is not None: 107 | counter = np.full_like(ret, cls.counter_fill_value, dtype=cls.counter_dtype) 108 | if cls.mean_fill_value is not None: 109 | dtype = cls.mean_dtype if cls.mean_dtype else input_dtype 110 | mean = np.full_like(ret, cls.mean_fill_value, dtype=dtype) 111 | if cls.outer: 112 | outer = np.full(input_size, fill_value, dtype=dtype) 113 | 114 | return ret, counter, mean, outer 115 | 116 | @classmethod 117 | def _finalize(cls, ret, counter, fill_value): 118 | if cls.forced_fill_value is not None and fill_value != cls.forced_fill_value: 119 | if cls.counter_dtype == bool: 120 | ret[counter] = fill_value 121 | else: 122 | ret[~counter.astype(bool)] = fill_value 123 | 124 | @classmethod 125 | def callable(cls, nans=False, reverse=False, scalar=False): 126 | """Compile a jitted function doing the hard part of the job""" 127 | _valgetter = cls._valgetter_scalar if scalar else cls._valgetter 128 | valgetter = nb.njit(_valgetter) 129 | outersetter = nb.njit(cls._outersetter) 130 | 131 | if not nans: 132 | inner = nb.njit(cls._inner) 133 | else: 134 | cls_inner = nb.njit(cls._inner) 135 | cls_nan_check = 
nb.njit(cls._nan_check) 136 | 137 | @nb.njit 138 | def inner(ri, val, ret, counter, mean, fill_value): 139 | if not cls_nan_check(val): 140 | cls_inner(ri, val, ret, counter, mean, fill_value) 141 | 142 | @nb.njit 143 | def loop(group_idx, a, ret, counter, mean, outer, fill_value, ddof): 144 | # ddof needs to be present for being exchangeable with loop_2pass 145 | size = len(ret) 146 | rng = ( 147 | range(len(group_idx) - 1, -1, -1) if reverse else range(len(group_idx)) 148 | ) 149 | for i in rng: 150 | ri = group_idx[i] 151 | if ri < 0: 152 | raise ValueError("negative indices not supported") 153 | if ri >= size: 154 | raise ValueError("one or more indices in group_idx are too large") 155 | val = valgetter(a, i) 156 | inner(ri, val, ret, counter, mean, fill_value) 157 | outersetter(outer, i, ret[ri]) 158 | 159 | return loop 160 | 161 | @staticmethod 162 | def _valgetter(a, i): 163 | return a[i] 164 | 165 | @staticmethod 166 | def _valgetter_scalar(a, i): 167 | return a 168 | 169 | @staticmethod 170 | def _nan_check(val): 171 | return val != val 172 | 173 | @staticmethod 174 | def _inner(ri, val, ret, counter, mean, fill_value): 175 | raise NotImplementedError("subclasses need to overwrite _inner") 176 | 177 | @staticmethod 178 | def _outersetter(outer, i, val): 179 | pass 180 | 181 | 182 | class Aggregate2pass(AggregateOp): 183 | """Base class for everything that needs to process the data twice like mean, var and std.""" 184 | 185 | @classmethod 186 | def callable(cls, nans=False, reverse=False, scalar=False): 187 | # Careful, cls needs to be passed, so that the overwritten methods remain available in 188 | # AggregateOp.callable 189 | loop_1st = super().callable(nans=nans, reverse=reverse, scalar=scalar) 190 | 191 | _2pass_inner = nb.njit(cls._2pass_inner) 192 | 193 | @nb.njit 194 | def loop_2nd(ret, counter, mean, fill_value, ddof): 195 | for ri in range(len(ret)): 196 | if counter[ri] > ddof: 197 | ret[ri] = _2pass_inner(ri, ret, counter, mean, ddof) 198 | else: 199 | ret[ri] = fill_value 200 | 201 | @nb.njit 202 | def loop_2pass(group_idx, a, ret, counter, mean, outer, fill_value, ddof): 203 | loop_1st(group_idx, a, ret, counter, mean, outer, fill_value, ddof) 204 | loop_2nd(ret, counter, mean, fill_value, ddof) 205 | 206 | return loop_2pass 207 | 208 | @staticmethod 209 | def _2pass_inner(ri, ret, counter, mean, ddof): 210 | raise NotImplementedError("subclasses need to overwrite _2pass_inner") 211 | 212 | @classmethod 213 | def _finalize(cls, ret, counter, fill_value): 214 | """Copying the fill value is already done in the 2nd pass""" 215 | pass 216 | 217 | 218 | class AggregateNtoN(AggregateOp): 219 | """Base class for cumulative functions, where the output size matches the input size.""" 220 | 221 | outer = True 222 | 223 | @staticmethod 224 | def _outersetter(outer, i, val): 225 | outer[i] = val 226 | 227 | 228 | class AggregateGeneric(AggregateOp): 229 | """Base class for jitting arbitrary functions.""" 230 | 231 | counter_fill_value = None 232 | 233 | def __init__(self, func, **kwargs): 234 | self.func = func 235 | self.__dict__.update(kwargs) 236 | self._jitfunc = self.callable(self.nans) 237 | 238 | def __call__( 239 | self, 240 | group_idx, 241 | a, 242 | size=None, 243 | fill_value=0, 244 | order="C", 245 | dtype=None, 246 | axis=None, 247 | ddof=0, 248 | ): 249 | iv = input_validation( 250 | group_idx, a, size=size, order=order, axis=axis, check_bounds=False 251 | ) 252 | group_idx, a, flat_size, ndim_idx, size, _ = iv 253 | 254 | # TODO: The typecheck should be done by 
the class itself, not by check_dtype 255 | dtype = check_dtype(dtype, self.func, a, len(group_idx)) 256 | check_fill_value(fill_value, dtype, func=self.func) 257 | input_dtype = type(a) if np.isscalar(a) else a.dtype 258 | ret, _, _, _ = self._initialize( 259 | flat_size, fill_value, dtype, input_dtype, group_idx.size 260 | ) 261 | group_idx = np.ascontiguousarray(group_idx) 262 | 263 | sortidx = np.argsort(group_idx, kind="mergesort") 264 | self._jitfunc(sortidx, group_idx, a, ret) 265 | 266 | # Deal with ndimensional indexing 267 | if ndim_idx > 1: 268 | ret = ret.reshape(size, order=order) 269 | return ret 270 | 271 | def callable(self, nans=False): 272 | """Compile a jitted function and loop it over the sorted data.""" 273 | func = nb.njit(self.func) 274 | 275 | @nb.njit 276 | def loop(sortidx, group_idx, a, ret): 277 | size = len(ret) 278 | group_idx_srt = group_idx[sortidx] 279 | a_srt = a[sortidx] 280 | 281 | indices = step_indices(group_idx_srt) 282 | for i in range(len(indices) - 1): 283 | start_idx, stop_idx = indices[i], indices[i + 1] 284 | ri = group_idx_srt[start_idx] 285 | if ri < 0: 286 | raise ValueError("negative indices not supported") 287 | if ri >= size: 288 | raise ValueError("one or more indices in group_idx are too large") 289 | ret[ri] = func(a_srt[start_idx:stop_idx]) 290 | 291 | return loop 292 | 293 | 294 | class Sum(AggregateOp): 295 | forced_fill_value = 0 296 | 297 | @staticmethod 298 | def _inner(ri, val, ret, counter, mean, fill_value): 299 | counter[ri] = 0 300 | ret[ri] += val 301 | 302 | 303 | class Prod(AggregateOp): 304 | forced_fill_value = 1 305 | 306 | @staticmethod 307 | def _inner(ri, val, ret, counter, mean, fill_value): 308 | counter[ri] = 0 309 | ret[ri] *= val 310 | 311 | 312 | class Len(AggregateOp): 313 | forced_fill_value = 0 314 | 315 | @staticmethod 316 | def _inner(ri, val, ret, counter, mean, fill_value): 317 | counter[ri] = 0 318 | ret[ri] += 1 319 | 320 | 321 | class All(AggregateOp): 322 | forced_fill_value = 1 323 | 324 | @staticmethod 325 | def _inner(ri, val, ret, counter, mean, fill_value): 326 | counter[ri] = 0 327 | ret[ri] &= bool(val) 328 | 329 | 330 | class Any(AggregateOp): 331 | forced_fill_value = 0 332 | 333 | @staticmethod 334 | def _inner(ri, val, ret, counter, mean, fill_value): 335 | counter[ri] = 0 336 | ret[ri] |= bool(val) 337 | 338 | 339 | class Last(AggregateOp): 340 | counter_fill_value = None 341 | 342 | @staticmethod 343 | def _inner(ri, val, ret, counter, mean, fill_value): 344 | ret[ri] = val 345 | 346 | 347 | class First(Last): 348 | reverse = True 349 | 350 | 351 | class AllNan(AggregateOp): 352 | forced_fill_value = 1 353 | 354 | @staticmethod 355 | def _inner(ri, val, ret, counter, mean, fill_value): 356 | counter[ri] = 0 357 | ret[ri] &= val != val 358 | 359 | 360 | class AnyNan(AggregateOp): 361 | forced_fill_value = 0 362 | 363 | @staticmethod 364 | def _inner(ri, val, ret, counter, mean, fill_value): 365 | counter[ri] = 0 366 | ret[ri] |= val != val 367 | 368 | 369 | class Max(AggregateOp): 370 | @staticmethod 371 | def _inner(ri, val, ret, counter, mean, fill_value): 372 | if counter[ri]: 373 | ret[ri] = val 374 | counter[ri] = 0 375 | elif ret[ri] < val: 376 | ret[ri] = val 377 | 378 | 379 | class Min(AggregateOp): 380 | @staticmethod 381 | def _inner(ri, val, ret, counter, mean, fill_value): 382 | if counter[ri]: 383 | ret[ri] = val 384 | counter[ri] = 0 385 | elif ret[ri] > val: 386 | ret[ri] = val 387 | 388 | 389 | class ArgMax(AggregateOp): 390 | mean_fill_value = np.nan 391 | 392 | 
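    # Note: the `mean` working array is reused here to hold the running best
    # comparison value of each group, while `ret` holds that value's index.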
@staticmethod 393 | def _valgetter(a, i): 394 | return a[i], i 395 | 396 | @staticmethod 397 | def _nan_check(val): 398 | return val[0] != val[0] 399 | 400 | @staticmethod 401 | def _inner(ri, val, ret, counter, mean, fill_value): 402 | cmp_val, arg = val 403 | if counter[ri]: 404 | # start of a new group 405 | counter[ri] = 0 406 | mean[ri] = cmp_val 407 | if cmp_val == cmp_val: 408 | # Don't point on nans 409 | ret[ri] = arg 410 | elif mean[ri] < cmp_val: 411 | # larger valid value found 412 | mean[ri] = cmp_val 413 | ret[ri] = arg 414 | elif cmp_val != cmp_val: 415 | # nan found, reset group 416 | mean[ri] = cmp_val 417 | ret[ri] = fill_value 418 | 419 | 420 | class ArgMin(ArgMax): 421 | @staticmethod 422 | def _inner(ri, val, ret, counter, mean, fill_value): 423 | cmp_val, arg = val 424 | if counter[ri]: 425 | # start of a new group 426 | counter[ri] = 0 427 | mean[ri] = cmp_val 428 | if cmp_val == cmp_val: 429 | # Don't point on nans 430 | ret[ri] = arg 431 | elif mean[ri] > cmp_val: 432 | # larger valid value found 433 | mean[ri] = cmp_val 434 | ret[ri] = arg 435 | elif cmp_val != cmp_val: 436 | # nan found, reset group 437 | mean[ri] = cmp_val 438 | ret[ri] = fill_value 439 | 440 | 441 | class SumOfSquares(AggregateOp): 442 | forced_fill_value = 0 443 | 444 | @staticmethod 445 | def _inner(ri, val, ret, counter, mean, fill_value): 446 | counter[ri] = 0 447 | ret[ri] += val * val 448 | 449 | 450 | class Mean(Aggregate2pass): 451 | forced_fill_value = 0 452 | counter_fill_value = 0 453 | counter_dtype = int 454 | 455 | @staticmethod 456 | def _inner(ri, val, ret, counter, mean, fill_value): 457 | counter[ri] += 1 458 | ret[ri] += val 459 | 460 | @staticmethod 461 | def _2pass_inner(ri, ret, counter, mean, ddof): 462 | return ret[ri] / counter[ri] 463 | 464 | 465 | class Std(Mean): 466 | mean_fill_value = 0 467 | 468 | @staticmethod 469 | def _inner(ri, val, ret, counter, mean, fill_value): 470 | counter[ri] += 1 471 | mean[ri] += val 472 | ret[ri] += val * val 473 | 474 | @staticmethod 475 | def _2pass_inner(ri, ret, counter, mean, ddof): 476 | mean2 = mean[ri] * mean[ri] 477 | return np.sqrt((ret[ri] - mean2 / counter[ri]) / (counter[ri] - ddof)) 478 | 479 | 480 | class Var(Std): 481 | @staticmethod 482 | def _2pass_inner(ri, ret, counter, mean, ddof): 483 | mean2 = mean[ri] * mean[ri] 484 | return (ret[ri] - mean2 / counter[ri]) / (counter[ri] - ddof) 485 | 486 | 487 | class CumSum(AggregateNtoN, Sum): 488 | pass 489 | 490 | 491 | class CumProd(AggregateNtoN, Prod): 492 | pass 493 | 494 | 495 | class CumMax(AggregateNtoN, Max): 496 | pass 497 | 498 | 499 | class CumMin(AggregateNtoN, Min): 500 | pass 501 | 502 | 503 | def get_funcs(): 504 | funcs = {} 505 | for op in ( 506 | Sum, 507 | Prod, 508 | Len, 509 | All, 510 | Any, 511 | Last, 512 | First, 513 | AllNan, 514 | AnyNan, 515 | Min, 516 | Max, 517 | ArgMin, 518 | ArgMax, 519 | Mean, 520 | Std, 521 | Var, 522 | SumOfSquares, 523 | CumSum, 524 | CumProd, 525 | CumMax, 526 | CumMin, 527 | ): 528 | funcname = op.__name__.lower() 529 | funcs[funcname] = op(funcname) 530 | if funcname not in funcs_no_separate_nan: 531 | funcname = "nan" + funcname 532 | funcs[funcname] = op(funcname, nans=True) 533 | return funcs 534 | 535 | 536 | _impl_dict = get_funcs() 537 | _default_cache = {} 538 | 539 | 540 | def aggregate( 541 | group_idx, 542 | a, 543 | func="sum", 544 | size=None, 545 | fill_value=0, 546 | order="C", 547 | dtype=None, 548 | axis=None, 549 | cache=True, 550 | **kwargs, 551 | ): 552 | func = get_func(func, aliasing, _impl_dict) 
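    # `func` is now either the canonical name of a builtin operation (a str)
    # or the user-supplied callable, which gets jitted via AggregateGeneric.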
553 | if not isinstance(func, str): 554 | if cache in (None, False): 555 | # Keep None and False in order to accept empty dictionaries 556 | aggregate_op = AggregateGeneric(func) 557 | else: 558 | if cache is True: 559 | cache = _default_cache 560 | aggregate_op = cache.setdefault(func, AggregateGeneric(func)) 561 | return aggregate_op( 562 | group_idx, a, size, fill_value, order, dtype, axis, **kwargs 563 | ) 564 | else: 565 | func = _impl_dict[func] 566 | return func(group_idx, a, size, fill_value, order, dtype, axis, **kwargs) 567 | 568 | 569 | aggregate.__doc__ = ( 570 | """ 571 | This is the numba implementation of aggregate. 572 | """ 573 | + aggregate_common_doc 574 | ) 575 | 576 | 577 | @nb.njit 578 | def step_count(group_idx): 579 | """Return the amount of index changes within group_idx.""" 580 | cmp_pos = 0 581 | steps = 1 582 | if len(group_idx) < 1: 583 | return 0 584 | for i in range(len(group_idx)): 585 | if group_idx[cmp_pos] != group_idx[i]: 586 | cmp_pos = i 587 | steps += 1 588 | return steps 589 | 590 | 591 | @nb.njit 592 | def step_indices(group_idx): 593 | """Return the edges of areas within group_idx, which are filled with the same value.""" 594 | ilen = step_count(group_idx) + 1 595 | indices = np.empty(ilen, np.int64) 596 | indices[0] = 0 597 | indices[-1] = group_idx.size 598 | cmp_pos = 0 599 | ri = 1 600 | for i in range(len(group_idx)): 601 | if group_idx[cmp_pos] != group_idx[i]: 602 | cmp_pos = i 603 | indices[ri] = i 604 | ri += 1 605 | return indices 606 | -------------------------------------------------------------------------------- /numpy_groupies/aggregate_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .utils import ( 4 | aggregate_common_doc, 5 | aliasing, 6 | check_boolean, 7 | check_dtype, 8 | check_fill_value, 9 | funcs_no_separate_nan, 10 | get_func, 11 | input_validation, 12 | iscomplexobj, 13 | maxval, 14 | minimum_dtype, 15 | minimum_dtype_scalar, 16 | minval, 17 | ) 18 | 19 | 20 | def _sum(group_idx, a, size, fill_value, dtype=None): 21 | dtype = minimum_dtype_scalar(fill_value, dtype, a) 22 | 23 | if np.ndim(a) == 0: 24 | ret = np.bincount(group_idx, minlength=size).astype(dtype, copy=False) 25 | if a != 1: 26 | ret *= a 27 | else: 28 | if iscomplexobj(a): 29 | ret = np.empty(size, dtype=dtype) 30 | ret.real = np.bincount(group_idx, weights=a.real, minlength=size) 31 | ret.imag = np.bincount(group_idx, weights=a.imag, minlength=size) 32 | else: 33 | ret = np.bincount(group_idx, weights=a, minlength=size).astype( 34 | dtype, copy=False 35 | ) 36 | 37 | if fill_value != 0: 38 | _fill_untouched(group_idx, ret, fill_value) 39 | return ret 40 | 41 | 42 | def _prod(group_idx, a, size, fill_value, dtype=None): 43 | dtype = minimum_dtype_scalar(fill_value, dtype, a) 44 | ret = np.full(size, fill_value, dtype=dtype) 45 | if fill_value != 1: 46 | ret[group_idx] = 1 # product starts from 1 47 | np.multiply.at(ret, group_idx, a) 48 | return ret 49 | 50 | 51 | def _len(group_idx, a, size, fill_value, dtype=None): 52 | return _sum(group_idx, 1, size, fill_value, dtype=int) 53 | 54 | 55 | def _last(group_idx, a, size, fill_value, dtype=None): 56 | dtype = minimum_dtype(fill_value, dtype or a.dtype) 57 | ret = np.full(size, fill_value, dtype=dtype) 58 | # repeated indexing gives last value, see: 59 | # the phrase "leaving behind the last value" on this page: 60 | # http://wiki.scipy.org/Tentative_NumPy_Tutorial 61 | ret[group_idx] = a 62 | return ret 63 | 64 | 65 | def 
_first(group_idx, a, size, fill_value, dtype=None): 66 | dtype = minimum_dtype(fill_value, dtype or a.dtype) 67 | ret = np.full(size, fill_value, dtype=dtype) 68 | ret[group_idx[::-1]] = a[::-1] # same trick as _last, but in reverse 69 | return ret 70 | 71 | 72 | def _all(group_idx, a, size, fill_value, dtype=None): 73 | check_boolean(fill_value) 74 | ret = np.full(size, fill_value, dtype=bool) 75 | if not fill_value: 76 | ret[group_idx] = True 77 | ret[group_idx.compress(np.logical_not(a))] = False 78 | return ret 79 | 80 | 81 | def _any(group_idx, a, size, fill_value, dtype=None): 82 | check_boolean(fill_value) 83 | ret = np.full(size, fill_value, dtype=bool) 84 | if fill_value: 85 | ret[group_idx] = False 86 | ret[group_idx.compress(a)] = True 87 | return ret 88 | 89 | 90 | def _min(group_idx, a, size, fill_value, dtype=None): 91 | dtype = minimum_dtype(fill_value, dtype or a.dtype) 92 | dmax = maxval(fill_value, dtype) 93 | with np.errstate(invalid="ignore"): 94 | ret = np.full(size, fill_value, dtype=dtype) 95 | if fill_value != dmax: 96 | ret[group_idx] = dmax # min starts from maximum 97 | with np.errstate(invalid="ignore"): 98 | np.minimum.at(ret, group_idx, a) 99 | return ret 100 | 101 | 102 | def _max(group_idx, a, size, fill_value, dtype=None): 103 | dtype = minimum_dtype(fill_value, dtype or a.dtype) 104 | dmin = minval(fill_value, dtype) 105 | with np.errstate(invalid="ignore"): 106 | ret = np.full(size, fill_value, dtype=dtype) 107 | if fill_value != dmin: 108 | ret[group_idx] = dmin # max starts from minimum 109 | with np.errstate(invalid="ignore"): 110 | np.maximum.at(ret, group_idx, a) 111 | return ret 112 | 113 | 114 | def _argmax(group_idx, a, size, fill_value, dtype=int, _nansqueeze=False): 115 | a_ = np.where(np.isnan(a), -np.inf, a) if _nansqueeze else a 116 | group_max = _max(group_idx, a_, size, np.nan) 117 | # nan should never be maximum, so use a and not a_ 118 | is_max = a == group_max[group_idx] 119 | ret = np.full(size, fill_value, dtype=dtype) 120 | group_idx_max = group_idx[is_max] 121 | (argmax,) = is_max.nonzero() 122 | ret[group_idx_max[::-1]] = argmax[ 123 | ::-1 124 | ] # reverse to ensure first value for each group wins 125 | return ret 126 | 127 | 128 | def _argmin(group_idx, a, size, fill_value, dtype=int, _nansqueeze=False): 129 | a_ = np.where(np.isnan(a), np.inf, a) if _nansqueeze else a 130 | group_min = _min(group_idx, a_, size, np.nan) 131 | # nan should never be minimum, so use a and not a_ 132 | is_min = a == group_min[group_idx] 133 | ret = np.full(size, fill_value, dtype=dtype) 134 | group_idx_min = group_idx[is_min] 135 | (argmin,) = is_min.nonzero() 136 | ret[group_idx_min[::-1]] = argmin[ 137 | ::-1 138 | ] # reverse to ensure first value for each group wins 139 | return ret 140 | 141 | 142 | def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): 143 | if np.ndim(a) == 0: 144 | raise ValueError("cannot take mean with scalar a") 145 | counts = np.bincount(group_idx, minlength=size) 146 | if iscomplexobj(a): 147 | dtype = a.dtype # TODO: this is a bit clumsy 148 | sums = np.empty(size, dtype=dtype) 149 | sums.real = np.bincount(group_idx, weights=a.real, minlength=size) 150 | sums.imag = np.bincount(group_idx, weights=a.imag, minlength=size) 151 | else: 152 | sums = np.bincount(group_idx, weights=a, minlength=size) 153 | 154 | with np.errstate(divide="ignore", invalid="ignore"): 155 | ret = sums / counts 156 | if not np.isnan(fill_value): 157 | ret[counts == 0] = fill_value 158 | if iscomplexobj(a): 159 | return ret 160 | 
else: 161 | return ret.astype(dtype, copy=False) 162 | 163 | 164 | def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)): 165 | ret = np.bincount(group_idx, weights=a * a, minlength=size) 166 | if fill_value != 0: 167 | counts = np.bincount(group_idx, minlength=size) 168 | ret[counts == 0] = fill_value 169 | if iscomplexobj(a): 170 | return ret 171 | else: 172 | return ret.astype(dtype, copy=False) 173 | 174 | 175 | def _var( 176 | group_idx, a, size, fill_value, dtype=np.dtype(np.float64), sqrt=False, ddof=0 177 | ): 178 | if np.ndim(a) == 0: 179 | raise ValueError("cannot take variance with scalar a") 180 | counts = np.bincount(group_idx, minlength=size) 181 | sums = np.bincount(group_idx, weights=a, minlength=size) 182 | with np.errstate(divide="ignore", invalid="ignore"): 183 | means = sums / counts 184 | counts = np.where(counts > ddof, counts - ddof, 0) 185 | ret = ( 186 | np.bincount(group_idx, (a - means[group_idx]) ** 2, minlength=size) / counts 187 | ) 188 | if sqrt: 189 | ret = np.sqrt(ret) # this is now std not var 190 | if not np.isnan(fill_value): 191 | ret[counts == 0] = fill_value 192 | if iscomplexobj(a): 193 | return ret 194 | else: 195 | return ret.astype(dtype, copy=False) 196 | 197 | 198 | def _std(group_idx, a, size, fill_value, dtype=np.dtype(np.float64), ddof=0): 199 | return _var(group_idx, a, size, fill_value, dtype=dtype, sqrt=True, ddof=ddof) 200 | 201 | 202 | def _allnan(group_idx, a, size, fill_value, dtype=bool): 203 | return _all(group_idx, np.isnan(a), size, fill_value=fill_value, dtype=dtype) 204 | 205 | 206 | def _anynan(group_idx, a, size, fill_value, dtype=bool): 207 | return _any(group_idx, np.isnan(a), size, fill_value=fill_value, dtype=dtype) 208 | 209 | 210 | def _sort(group_idx, a, size=None, fill_value=None, dtype=None, reverse=False): 211 | sortidx = np.lexsort((-a if reverse else a, group_idx)) 212 | # Reverse sorting back to into grouped order, but preserving groupwise sorting 213 | revidx = np.argsort(np.argsort(group_idx, kind="mergesort"), kind="mergesort") 214 | return a[sortidx][revidx] 215 | 216 | 217 | def _array(group_idx, a, size, fill_value, dtype=None): 218 | """groups a into separate arrays, keeping the order intact.""" 219 | if fill_value is not None and not (np.isscalar(fill_value) or len(fill_value) == 0): 220 | raise ValueError("fill_value must be None, a scalar or an empty sequence") 221 | order_group_idx = np.argsort(group_idx, kind="mergesort") 222 | counts = np.bincount(group_idx, minlength=size) 223 | ret = np.split(a[order_group_idx], np.cumsum(counts)[:-1]) 224 | ret = np.asanyarray(ret, dtype="object") 225 | if fill_value is None or np.isscalar(fill_value): 226 | _fill_untouched(group_idx, ret, fill_value) 227 | return ret 228 | 229 | 230 | def _generic_callable( 231 | group_idx, a, size, fill_value, dtype=None, func=lambda g: g, **kwargs 232 | ): 233 | """groups a by inds, and then applies foo to each group in turn, placing 234 | the results in an array.""" 235 | groups = _array(group_idx, a, size, ()) 236 | ret = np.full(size, fill_value, dtype=dtype or np.float64) 237 | 238 | for i, grp in enumerate(groups): 239 | if np.ndim(grp) == 1 and len(grp) > 0: 240 | ret[i] = func(grp) 241 | return ret 242 | 243 | 244 | def _cumsum(group_idx, a, size, fill_value=None, dtype=None): 245 | """ 246 | N to N aggregate operation of cumsum. Perform cumulative sum for each group. 
247 | 248 | group_idx = np.array([4, 3, 3, 4, 4, 1, 1, 1, 7, 8, 7, 4, 3, 3, 1, 1]) 249 | a = np.array([3, 4, 1, 3, 9, 9, 6, 7, 7, 0, 8, 2, 1, 8, 9, 8]) 250 | _cumsum(group_idx, a, np.max(group_idx) + 1) 251 | >>> array([ 3, 4, 5, 6, 15, 9, 15, 22, 7, 0, 15, 17, 6, 14, 31, 39]) 252 | """ 253 | sortidx = np.argsort(group_idx, kind="mergesort") 254 | invsortidx = np.argsort(sortidx, kind="mergesort") 255 | group_idx_srt = group_idx[sortidx] 256 | 257 | a_srt = a[sortidx] 258 | a_srt_cumsum = np.cumsum(a_srt, dtype=dtype) 259 | 260 | increasing = np.arange(len(a), dtype=int) 261 | group_starts = _min(group_idx_srt, increasing, size, fill_value=0)[group_idx_srt] 262 | # First subtract large numbers 263 | a_srt_cumsum -= a_srt_cumsum[group_starts] 264 | # Then add potentially small numbers 265 | a_srt_cumsum += a_srt[group_starts] 266 | return a_srt_cumsum[invsortidx] 267 | 268 | 269 | def _nancumsum(group_idx, a, size, fill_value=None, dtype=None): 270 | a_nonans = np.where(np.isnan(a), 0, a) 271 | group_idx_nonans = np.where( 272 | np.isnan(group_idx), np.nanmax(group_idx) + 1, group_idx 273 | ) 274 | return _cumsum(group_idx_nonans, a_nonans, size, fill_value=fill_value, dtype=dtype) 275 | 276 | 277 | _impl_dict = dict( 278 | min=_min, 279 | max=_max, 280 | sum=_sum, 281 | prod=_prod, 282 | last=_last, 283 | first=_first, 284 | all=_all, 285 | any=_any, 286 | mean=_mean, 287 | std=_std, 288 | var=_var, 289 | anynan=_anynan, 290 | allnan=_allnan, 291 | sort=_sort, 292 | array=_array, 293 | argmax=_argmax, 294 | argmin=_argmin, 295 | len=_len, 296 | cumsum=_cumsum, 297 | sumofsquares=_sum_of_squres, 298 | generic=_generic_callable, 299 | ) 300 | _impl_dict.update( 301 | ("nan" + k, v) 302 | for k, v in list(_impl_dict.items()) 303 | if k not in funcs_no_separate_nan 304 | ) 305 | _impl_dict["nancumsum"] = _nancumsum 306 | 307 | 308 | def _aggregate_base( 309 | group_idx, 310 | a, 311 | func="sum", 312 | size=None, 313 | fill_value=0, 314 | order="C", 315 | dtype=None, 316 | axis=None, 317 | _impl_dict=_impl_dict, 318 | is_pandas=False, 319 | **kwargs, 320 | ): 321 | iv = input_validation(group_idx, a, size=size, order=order, axis=axis, func=func) 322 | group_idx, a, flat_size, ndim_idx, size, unravel_shape = iv 323 | 324 | if group_idx.dtype == np.dtype("uint64"): 325 | # Force conversion to signed int, to avoid issues with bincount etc later 326 | group_idx = group_idx.astype(int) 327 | 328 | func = get_func(func, aliasing, _impl_dict) 329 | if not isinstance(func, str): 330 | # do simple grouping and execute function in loop 331 | ret = _impl_dict.get("generic", _generic_callable)( 332 | group_idx, a, flat_size, fill_value, func=func, dtype=dtype, **kwargs 333 | ) 334 | else: 335 | # deal with nans and find the function 336 | if func.startswith("nan"): 337 | if np.ndim(a) == 0: 338 | raise ValueError("nan-version not supported for scalar input.") 339 | if "nan" in func: 340 | if "arg" in func: 341 | kwargs["_nansqueeze"] = True 342 | elif "cum" in func: 343 | pass 344 | else: 345 | good = ~np.isnan(a) 346 | if "len" not in func or is_pandas: 347 | # a is not needed for len, nanlen! 
348 | a = a[good] 349 | group_idx = group_idx[good] 350 | 351 | dtype = check_dtype(dtype, func, a, flat_size) 352 | check_fill_value(fill_value, dtype, func=func) 353 | func = _impl_dict[func] 354 | ret = func( 355 | group_idx, a, flat_size, fill_value=fill_value, dtype=dtype, **kwargs 356 | ) 357 | 358 | # deal with ndimensional indexing 359 | if ndim_idx > 1: 360 | if unravel_shape is not None: 361 | # A negative fill_value cannot, and should not, be unraveled. 362 | mask = ret == fill_value 363 | ret[mask] = 0 364 | ret = np.unravel_index(ret, unravel_shape)[axis] 365 | ret[mask] = fill_value 366 | ret = ret.reshape(size, order=order) 367 | return ret 368 | 369 | 370 | def aggregate( 371 | group_idx, 372 | a, 373 | func="sum", 374 | size=None, 375 | fill_value=0, 376 | order="C", 377 | dtype=None, 378 | axis=None, 379 | **kwargs, 380 | ): 381 | return _aggregate_base( 382 | group_idx, 383 | a, 384 | size=size, 385 | fill_value=fill_value, 386 | order=order, 387 | dtype=dtype, 388 | func=func, 389 | axis=axis, 390 | _impl_dict=_impl_dict, 391 | **kwargs, 392 | ) 393 | 394 | 395 | aggregate.__doc__ = ( 396 | """ 397 | This is the pure numpy implementation of aggregate. 398 | """ 399 | + aggregate_common_doc 400 | ) 401 | 402 | 403 | def _fill_untouched(idx, ret, fill_value): 404 | """any elements of ret not indexed by idx are set to fill_value.""" 405 | untouched = np.ones_like(ret, dtype=bool) 406 | untouched[idx] = False 407 | ret[untouched] = fill_value 408 | -------------------------------------------------------------------------------- /numpy_groupies/aggregate_numpy_ufunc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .aggregate_numpy import _aggregate_base 4 | from .utils import ( 5 | aggregate_common_doc, 6 | aliasing, 7 | check_boolean, 8 | get_func, 9 | maxval, 10 | minimum_dtype, 11 | minimum_dtype_scalar, 12 | minval, 13 | ) 14 | 15 | 16 | def _anynan(group_idx, a, size, fill_value, dtype=None): 17 | return _any(group_idx, np.isnan(a), size, fill_value=fill_value, dtype=dtype) 18 | 19 | 20 | def _allnan(group_idx, a, size, fill_value, dtype=None): 21 | return _all(group_idx, np.isnan(a), size, fill_value=fill_value, dtype=dtype) 22 | 23 | 24 | def _any(group_idx, a, size, fill_value, dtype=None): 25 | check_boolean(fill_value) 26 | ret = np.full(size, fill_value, dtype=bool) 27 | if fill_value: 28 | ret[group_idx] = False # any-test should start from False 29 | np.logical_or.at(ret, group_idx, a) 30 | return ret 31 | 32 | 33 | def _all(group_idx, a, size, fill_value, dtype=None): 34 | check_boolean(fill_value) 35 | ret = np.full(size, fill_value, dtype=bool) 36 | if not fill_value: 37 | ret[group_idx] = True # all-test should start from True 38 | np.logical_and.at(ret, group_idx, a) 39 | return ret 40 | 41 | 42 | def _sum(group_idx, a, size, fill_value, dtype=None): 43 | dtype = minimum_dtype_scalar(fill_value, dtype, a) 44 | ret = np.full(size, fill_value, dtype=dtype) 45 | if fill_value != 0: 46 | ret[group_idx] = 0 # sums should start at 0 47 | np.add.at(ret, group_idx, a) 48 | return ret 49 | 50 | 51 | def _len(group_idx, a, size, fill_value, dtype=None): 52 | return _sum(group_idx, 1, size, fill_value, dtype=int) 53 | 54 | 55 | def _prod(group_idx, a, size, fill_value, dtype=None): 56 | """Same as aggregate_numpy.py""" 57 | dtype = minimum_dtype_scalar(fill_value, dtype, a) 58 | ret = np.full(size, fill_value, dtype=dtype) 59 | if fill_value != 1: 60 | ret[group_idx] = 1 # product should start 
from 1
61 |     np.multiply.at(ret, group_idx, a)
62 |     return ret
63 | 
64 | 
65 | def _min(group_idx, a, size, fill_value, dtype=None):
66 |     """Same as aggregate_numpy.py"""
67 |     dtype = minimum_dtype(fill_value, dtype or a.dtype)
68 |     dmax = maxval(fill_value, dtype)
69 |     ret = np.full(size, fill_value, dtype=dtype)
70 |     if fill_value != dmax:
71 |         ret[group_idx] = dmax  # min starts from maximum
72 |     np.minimum.at(ret, group_idx, a)
73 |     return ret
74 | 
75 | 
76 | def _max(group_idx, a, size, fill_value, dtype=None):
77 |     """Same as aggregate_numpy.py"""
78 |     dtype = minimum_dtype(fill_value, dtype or a.dtype)
79 |     dmin = minval(fill_value, dtype)
80 |     ret = np.full(size, fill_value, dtype=dtype)
81 |     if fill_value != dmin:
82 |         ret[group_idx] = dmin  # max starts from minimum
83 |     np.maximum.at(ret, group_idx, a)
84 |     return ret
85 | 
86 | 
87 | _impl_dict = dict(
88 |     min=_min,
89 |     max=_max,
90 |     sum=_sum,
91 |     prod=_prod,
92 |     all=_all,
93 |     any=_any,
94 |     allnan=_allnan,
95 |     anynan=_anynan,
96 |     len=_len,
97 | )
98 | 
99 | 
100 | def aggregate(
101 |     group_idx,
102 |     a,
103 |     func="sum",
104 |     size=None,
105 |     fill_value=0,
106 |     order="C",
107 |     dtype=None,
108 |     axis=None,
109 |     **kwargs,
110 | ):
111 |     func = get_func(func, aliasing, _impl_dict)
112 |     if not isinstance(func, str):
113 |         raise NotImplementedError("No such ufunc available")
114 |     return _aggregate_base(
115 |         group_idx,
116 |         a,
117 |         size=size,
118 |         fill_value=fill_value,
119 |         order=order,
120 |         dtype=dtype,
121 |         func=func,
122 |         axis=axis,
123 |         _impl_dict=_impl_dict,
124 |         **kwargs,
125 |     )
126 | 
127 | 
128 | aggregate.__doc__ = (
129 |     """
130 |     Unlike ``aggregate_numpy``, which in most cases does some custom
131 |     optimisations, this version simply uses ``numpy``'s ``ufunc.at``.
132 | 
133 |     As of ``numpy`` version 1.14, this gives fairly poor performance. There
134 |     should normally be no need to use this version; it is intended only for
135 |     testing and benchmarking.
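
    A quick sanity check of this backend against the optimised one (an
    illustrative sketch; both entry points are re-exported by the package):

    >>> import numpy as np
    >>> from numpy_groupies import aggregate_np, aggregate_ufunc
    >>> idx = np.array([0, 0, 1, 1, 1])
    >>> vals = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    >>> bool(np.allclose(aggregate_ufunc(idx, vals, func="sum"),
    ...                  aggregate_np(idx, vals, func="sum")))
    True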
136 |     """
137 |     + aggregate_common_doc
138 | )
139 | 
--------------------------------------------------------------------------------
/numpy_groupies/aggregate_pandas.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | 
6 | from .aggregate_numpy import _aggregate_base
7 | from .utils import (
8 |     aggregate_common_doc,
9 |     allnan,
10 |     anynan,
11 |     check_dtype,
12 |     funcs_no_separate_nan,
13 | )
14 | 
15 | 
16 | def _wrapper(group_idx, a, size, fill_value, func="sum", dtype=None, ddof=0, **kwargs):
17 |     funcname = func.__name__ if callable(func) else func
18 |     kwargs = {}
19 |     if funcname in ("var", "std"):
20 |         kwargs["ddof"] = ddof
21 |     df = pd.DataFrame({"group_idx": group_idx, "a": a})
22 |     if func == "sort":
23 |         grouped = df.groupby("group_idx", sort=True)
24 |     else:
25 |         grouped = df.groupby("group_idx", sort=False).aggregate(func, **kwargs)
26 | 
27 |     dtype = check_dtype(dtype, getattr(func, "__name__", funcname), a, size)
28 |     if funcname.startswith("cum"):
29 |         ret = grouped.values[:, 0]
30 |     else:
31 |         ret = np.full(size, fill_value, dtype=dtype)
32 |         with np.errstate(invalid="ignore"):
33 |             ret[grouped.index] = grouped.values[:, 0]
34 |     return ret
35 | 
36 | 
37 | _supported_funcs = "sum prod all any min max mean var std first last cumsum cumprod cummax cummin".split()
38 | _impl_dict = {fn: partial(_wrapper, func=fn) for fn in _supported_funcs}
39 | _impl_dict.update(
40 |     ("nan" + fn, partial(_wrapper, func=fn))
41 |     for fn in _supported_funcs
42 |     if fn not in funcs_no_separate_nan
43 | )
44 | _impl_dict.update(
45 |     allnan=partial(_wrapper, func=allnan),
46 |     anynan=partial(_wrapper, func=anynan),
47 |     len=partial(_wrapper, func="count"),
48 |     nanlen=partial(_wrapper, func="count"),
49 |     argmax=partial(_wrapper, func="idxmax"),
50 |     argmin=partial(_wrapper, func="idxmin"),
51 |     nanargmax=partial(_wrapper, func="idxmax"),
52 |     nanargmin=partial(_wrapper, func="idxmin"),
53 |     generic=_wrapper,
54 | )
55 | 
56 | 
57 | def aggregate(
58 |     group_idx,
59 |     a,
60 |     func="sum",
61 |     size=None,
62 |     fill_value=0,
63 |     order="C",
64 |     dtype=None,
65 |     axis=None,
66 |     **kwargs,
67 | ):
68 |     return _aggregate_base(
69 |         group_idx,
70 |         a,
71 |         size=size,
72 |         fill_value=fill_value,
73 |         order=order,
74 |         dtype=dtype,
75 |         func=func,
76 |         axis=axis,
77 |         _impl_dict=_impl_dict,
78 |         is_pandas=True,
79 |         **kwargs,
80 |     )
81 | 
82 | 
83 | aggregate.__doc__ = (
84 |     """
85 |     This is the pandas implementation of aggregate. It makes use of
86 |     `pandas`'s groupby machinery and is mainly used for reference
87 |     and benchmarking.
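
    A minimal sketch of a call through this backend (illustrative):

    >>> import numpy as np
    >>> from numpy_groupies.aggregate_pandas import aggregate
    >>> bool(np.allclose(aggregate(np.array([0, 0, 1]),
    ...                            np.array([1.0, 3.0, 5.0]), func="mean"),
    ...                  [2.0, 5.0]))
    True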
88 | """ 89 | + aggregate_common_doc 90 | ) 91 | -------------------------------------------------------------------------------- /numpy_groupies/aggregate_purepy.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import operator 4 | 5 | from .utils import aggregate_common_doc 6 | from .utils import aliasing_py as aliasing 7 | from .utils import funcs_no_separate_nan, get_func 8 | 9 | # min, max, sum, all, any - builtin 10 | 11 | 12 | def _last(x): 13 | return x[-1] 14 | 15 | 16 | def _first(x): 17 | return x[0] 18 | 19 | 20 | def _array(x): 21 | return x 22 | 23 | 24 | def _mean(x): 25 | return sum(x) / len(x) 26 | 27 | 28 | def _var(x, ddof=0): 29 | mean = _mean(x) 30 | return sum((xx - mean) ** 2 for xx in x) / (len(x) - ddof) 31 | 32 | 33 | def _std(x, ddof=0): 34 | return math.sqrt(_var(x, ddof=ddof)) 35 | 36 | 37 | def _prod(x): 38 | r = x[0] 39 | for xx in x[1:]: 40 | r *= xx 41 | return r 42 | 43 | 44 | def _anynan(x): 45 | return any(math.isnan(xx) for xx in x) 46 | 47 | 48 | def _allnan(x): 49 | return all(math.isnan(xx) for xx in x) 50 | 51 | 52 | def _argmax(x_and_idx): 53 | return max(x_and_idx, key=operator.itemgetter(1))[0] 54 | 55 | 56 | _argmax.x_and_idx = True # tell aggregate what to use as first arg 57 | 58 | 59 | def _argmin(x_and_idx): 60 | return min(x_and_idx, key=operator.itemgetter(1))[0] 61 | 62 | 63 | _argmin.x_and_idx = True # tell aggregate what to use as first arg 64 | 65 | 66 | def _sort(group_idx, a, reverse=False): 67 | def _argsort(unordered): 68 | return sorted(range(len(unordered)), key=lambda k: unordered[k]) 69 | 70 | sortidx = _argsort( 71 | list((gi, aj) for gi, aj in zip(group_idx, -a if reverse else a)) 72 | ) 73 | revidx = _argsort(_argsort(group_idx)) 74 | a_srt = [a[si] for si in sortidx] 75 | return [a_srt[ri] for ri in revidx] 76 | 77 | 78 | _impl_dict = dict( 79 | min=min, 80 | max=max, 81 | sum=sum, 82 | prod=_prod, 83 | last=_last, 84 | first=_first, 85 | all=all, 86 | any=any, 87 | mean=_mean, 88 | std=_std, 89 | var=_var, 90 | anynan=_anynan, 91 | allnan=_allnan, 92 | sort=_sort, 93 | array=_array, 94 | argmax=_argmax, 95 | argmin=_argmin, 96 | len=len, 97 | ) 98 | _impl_dict.update( 99 | ("nan" + k, v) 100 | for k, v in list(_impl_dict.items()) 101 | if k not in funcs_no_separate_nan 102 | ) 103 | 104 | 105 | def aggregate( 106 | group_idx, 107 | a, 108 | func="sum", 109 | size=None, 110 | fill_value=0, 111 | order=None, 112 | dtype=None, 113 | axis=None, 114 | **kwargs, 115 | ): 116 | if axis is not None: 117 | raise NotImplementedError("axis arg not supported in purepy implementation.") 118 | 119 | # Check for 2d group_idx 120 | if size is None: 121 | try: 122 | size = 1 + int(max(group_idx)) 123 | except (TypeError, ValueError): 124 | raise NotImplementedError( 125 | "pure python implementation doesn't accept ndim idx input." 126 | ) 127 | 128 | for i in group_idx: 129 | try: 130 | i = int(i) 131 | except (TypeError, ValueError): 132 | if isinstance(i, (list, tuple)): 133 | raise NotImplementedError( 134 | "pure python implementation doesn't accept ndim idx input." 135 | ) 136 | else: 137 | try: 138 | len(i) 139 | except TypeError: 140 | raise ValueError(f"invalid value found in group_idx: {i}") 141 | else: 142 | raise NotImplementedError( 143 | "pure python implementation doesn't accept ndim indexed input." 
144 | ) 145 | else: 146 | if i < 0: 147 | raise ValueError("group_idx contains negative value") 148 | 149 | func = get_func(func, aliasing, _impl_dict) 150 | if isinstance(a, (int, float)): 151 | if func not in ("sum", "prod", "len"): 152 | raise ValueError( 153 | "scalar inputs are supported only for 'sum', 'prod' and 'len'" 154 | ) 155 | a = [a] * len(group_idx) 156 | elif len(group_idx) != len(a): 157 | raise ValueError("group_idx and a must be of the same length") 158 | 159 | if isinstance(func, str): 160 | if func.startswith("nan"): 161 | func = func[3:] 162 | # remove nans 163 | group_idx, a = zip( 164 | *((ix, val) for ix, val in zip(group_idx, a) if not math.isnan(val)) 165 | ) 166 | 167 | func = _impl_dict[func] 168 | if func is _sort: 169 | return _sort(group_idx, a, reverse=kwargs.get("reverse", False)) 170 | 171 | # sort data and evaluate function on groups 172 | ret = [fill_value] * size 173 | if not getattr(func, "x_and_idx", False): 174 | data = sorted(zip(group_idx, a), key=operator.itemgetter(0)) 175 | for ix, group in itertools.groupby(data, key=operator.itemgetter(0)): 176 | ret[ix] = func(list(val for _, val in group), **kwargs) 177 | else: 178 | data = sorted(zip(range(len(a)), group_idx, a), key=operator.itemgetter(1)) 179 | for ix, group in itertools.groupby(data, key=operator.itemgetter(1)): 180 | ret[ix] = func(list((val_idx, val) for val_idx, _, val in group), **kwargs) 181 | 182 | return ret 183 | 184 | 185 | aggregate.__doc__ = ( 186 | """ 187 | This is the pure python implementation of aggregate. It is terribly slow. 188 | Using the numpy version is highly recommended. 189 | """ 190 | + aggregate_common_doc 191 | ) 192 | -------------------------------------------------------------------------------- /numpy_groupies/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml31415/numpy-groupies/1dd8c3dfcc55235ae3753ce0db264da53bae4a80/numpy_groupies/benchmarks/__init__.py -------------------------------------------------------------------------------- /numpy_groupies/benchmarks/generic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -B 2 | 3 | import platform 4 | import sys 5 | import timeit 6 | from operator import itemgetter 7 | 8 | import numpy as np 9 | 10 | from numpy_groupies.tests import _implementations, aggregate_numpy 11 | from numpy_groupies.utils import allnan, anynan, nanfirst, nanlast 12 | 13 | 14 | def aggregate_grouploop(*args, **kwargs): 15 | """wraps func in lambda which prevents aggregate_numpy from 16 | recognising and optimising it. 
Instead it groups and loops.""" 17 | extrafuncs = { 18 | "allnan": allnan, 19 | "anynan": anynan, 20 | "first": itemgetter(0), 21 | "last": itemgetter(-1), 22 | "nanfirst": nanfirst, 23 | "nanlast": nanlast, 24 | } 25 | func = kwargs.pop("func") 26 | func = extrafuncs.get(func, func) 27 | if isinstance(func, str): 28 | raise NotImplementedError("Grouploop needs to be called with a function") 29 | return aggregate_numpy.aggregate(*args, func=lambda x: func(x), **kwargs) 30 | 31 | 32 | def arbitrary(iterator): 33 | tmp = 0 34 | for i, x in enumerate(iterator, 1): 35 | tmp += x**i 36 | return tmp 37 | 38 | 39 | func_list = ( 40 | np.sum, 41 | np.prod, 42 | np.min, 43 | np.max, 44 | len, 45 | np.all, 46 | np.any, 47 | "anynan", 48 | "allnan", 49 | np.mean, 50 | np.std, 51 | np.var, 52 | "first", 53 | "last", 54 | "argmax", 55 | "argmin", 56 | np.nansum, 57 | np.nanprod, 58 | np.nanmin, 59 | np.nanmax, 60 | "nanlen", 61 | "nanall", 62 | "nanany", 63 | np.nanmean, 64 | np.nanvar, 65 | np.nanstd, 66 | "nanfirst", 67 | "nanlast", 68 | "nanargmin", 69 | "nanargmax", 70 | "cumsum", 71 | "cumprod", 72 | "cummax", 73 | "cummin", 74 | arbitrary, 75 | "sort", 76 | ) 77 | 78 | 79 | def benchmark_data(size=5e5, seed=100): 80 | rnd = np.random.RandomState(seed=seed) 81 | group_idx = rnd.randint(0, int(1e3), int(size)) 82 | a = rnd.random_sample(group_idx.size) 83 | a[a > 0.8] = 0 84 | nana = a.copy() 85 | nana[(nana < 0.2) & (nana != 0)] = np.nan 86 | nan_share = np.mean(np.isnan(nana)) 87 | assert 0.15 < nan_share < 0.25, f"{nan_share * 100:3f}% nans" 88 | return a, nana, group_idx 89 | 90 | 91 | def benchmark(implementations, repeat=5, size=5e5, seed=100, raise_errors=False): 92 | a, nana, group_idx = benchmark_data(size=size, seed=seed) 93 | 94 | print( 95 | "function" 96 | + "".join(impl.__name__.rsplit("_", 1)[1].rjust(14) for impl in implementations) 97 | ) 98 | print("-" * (9 + 14 * len(implementations))) 99 | for func in func_list: 100 | func_name = getattr(func, "__name__", func) 101 | print(func_name.ljust(9), end="") 102 | results = [] 103 | used_a = nana if "nan" in func_name else a 104 | 105 | for impl in implementations: 106 | if impl is None: 107 | print("----".rjust(14), end="") 108 | continue 109 | aggregatefunc = impl.aggregate 110 | 111 | try: 112 | res = aggregatefunc(group_idx, used_a, func=func) 113 | except NotImplementedError: 114 | print("----".rjust(14), end="") 115 | continue 116 | except Exception: 117 | if raise_errors: 118 | raise 119 | print("ERROR".rjust(14), end="") 120 | else: 121 | results.append(res) 122 | try: 123 | np.testing.assert_array_almost_equal(res, results[0]) 124 | except AssertionError: 125 | print("FAIL".rjust(14), end="") 126 | else: 127 | t0 = min( 128 | timeit.Timer( 129 | lambda: aggregatefunc(group_idx, used_a, func=func) 130 | ).repeat(repeat=repeat, number=1) 131 | ) 132 | print(f"{t0 * 1000:.3f}".rjust(14), end="") 133 | sys.stdout.flush() 134 | print() 135 | 136 | implementation_names = [impl.__name__.rsplit("_", 1)[1] for impl in implementations] 137 | postfix = "" 138 | if "numba" in implementation_names: 139 | import numba 140 | 141 | postfix += f", Numba {numba.__version__}" 142 | if "pandas" in implementation_names: 143 | import pandas 144 | 145 | postfix += f", Pandas {pandas.__version__}" 146 | print( 147 | f"{platform.system()}({platform.machine()}), Python {sys.version.split()[0]}, Numpy {np.version.version}" 148 | f"{postfix}" 149 | ) 150 | 151 | 152 | if __name__ == "__main__": 153 | implementations = ( 154 | _implementations if 
"--purepy" in sys.argv else _implementations[1:] 155 | ) 156 | implementations = ( 157 | implementations if "--pandas" in sys.argv else implementations[:-1] 158 | ) 159 | benchmark(implementations, raise_errors=False) 160 | -------------------------------------------------------------------------------- /numpy_groupies/benchmarks/simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -B 2 | 3 | import timeit 4 | 5 | import numpy as np 6 | 7 | from numpy_groupies import aggregate_np, aggregate_py, aggregate_ufunc 8 | from numpy_groupies.aggregate_pandas import aggregate as aggregate_pd 9 | from numpy_groupies.utils import aliasing 10 | 11 | 12 | def aggregate_group_loop(*args, **kwargs): 13 | """wraps func in lambda which prevents aggregate_numpy from 14 | recognising and optimising it. Instead it groups and loops.""" 15 | func = kwargs["func"] 16 | del kwargs["func"] 17 | return aggregate_np(*args, func=lambda x: func(x), **kwargs) 18 | 19 | 20 | print("-----simple examples----------") 21 | test_a = np.array([12.0, 3.2, -15, 88, 12.9]) 22 | test_group_idx = np.array([1, 0, 1, 4, 1]) 23 | print("test_a: ", test_a) 24 | print("test_group_idx: ", test_group_idx) 25 | print("aggregate(test_group_idx, test_a):") 26 | print(aggregate_np(test_group_idx, test_a)) # group vals by idx and sum 27 | # array([3.2, 9.9, 0., 0., 88.]) 28 | print("aggregate(test_group_idx, test_a, sz=8, func='min', fill_value=np.nan):") 29 | print(aggregate_np(test_group_idx, test_a, size=8, func="min", fill_value=np.nan)) 30 | # array([3.2, -15., nan, 88., nan, nan, nan, nan]) 31 | print( 32 | "aggregate_py(test_group_idx, test_a, sz=5, func=lambda x: ' + '.join(str(xx) for xx in x),fill_value='')" 33 | ) 34 | print( 35 | aggregate_py( 36 | test_group_idx, 37 | test_a, 38 | size=5, 39 | func=lambda x: " + ".join(str(xx) for xx in x), 40 | fill_value="", 41 | ) 42 | ) 43 | 44 | 45 | print("") 46 | print("---------testing--------------") 47 | print("compare against group-and-loop with numpy") 48 | testable_funcs = { 49 | aliasing[f]: f 50 | for f in (np.sum, np.prod, np.any, np.all, np.min, np.max, np.std, np.var, np.mean) 51 | } 52 | test_group_idx = np.random.randint(0, int(1e3), int(1e5)) 53 | test_a = np.random.rand(int(1e5)) * 100 - 50 54 | test_a[test_a > 25] = 0 # for use with bool functions 55 | for name, f in testable_funcs.items(): 56 | numpy_loop_group = aggregate_group_loop(test_group_idx, test_a, func=f) 57 | 58 | for acc_func, acc_name in [ 59 | (aggregate_np, "np-optimised"), 60 | (aggregate_ufunc, "np-ufunc-at"), 61 | (aggregate_py, "purepy"), 62 | (aggregate_pd, "pandas"), 63 | ]: 64 | try: 65 | test_out = acc_func(test_group_idx, test_a, func=name) 66 | test_out = np.asarray(test_out) 67 | if not np.allclose(test_out, numpy_loop_group.astype(test_out.dtype)): 68 | print( 69 | name, 70 | acc_name, 71 | "FAILED test, output: [" + acc_name + "; correct]...", 72 | ) 73 | print(np.vstack((test_out, numpy_loop_group))) 74 | else: 75 | print(name, acc_name, "PASSED test") 76 | except NotImplementedError: 77 | print(name, acc_name, "NOT IMPLEMENTED") 78 | 79 | print("") 80 | print("----------benchmarking-------------") 81 | print( 82 | "Note that the actual observed speedup depends on a variety of properties of the input." 
83 | ) 84 | print("Here we are using 100,000 indices uniformly picked from [0, 1000).") 85 | print("Specifically, about 25% of the values are 0 (for use with bool operations),") 86 | print("the remainder are uniformly distributed on [-50,25).") 87 | print("Times are scaled to 10 repetitions (actual number of reps used may not be 10).") 88 | 89 | print( 90 | "".join( 91 | [ 92 | "function".rjust(8), 93 | "pure-py".rjust(14), 94 | "np-grouploop".rjust(14), 95 | "np-ufuncat".rjust(14), 96 | "np-optimised".rjust(14), 97 | "pandas".rjust(14), 98 | "ratio".rjust(15), 99 | ] 100 | ) 101 | ) 102 | 103 | for name, f in testable_funcs.items(): 104 | print(name.rjust(8), end="") 105 | times = [None] * 5 106 | for ii, acc_func in enumerate( 107 | [ 108 | aggregate_py, 109 | aggregate_group_loop, 110 | aggregate_ufunc, 111 | aggregate_np, 112 | aggregate_pd, 113 | ] 114 | ): 115 | try: 116 | func = f if acc_func is aggregate_group_loop else name 117 | reps = 3 if acc_func is aggregate_py else 20 118 | times[ii] = ( 119 | timeit.Timer( 120 | lambda: acc_func(test_group_idx, test_a, func=func) 121 | ).timeit(number=reps) 122 | / reps 123 | * 10 124 | ) 125 | print(f"{times[ii] * 1000:.1f}ms".rjust(13), end="") 126 | except NotImplementedError: 127 | print("no-impl".rjust(13), end="") 128 | 129 | denom = min(t for t in times if t is not None) 130 | ratios = [ 131 | ("-".center(4) if t is None else str(round(t / denom, 1))).center(5) 132 | for t in times 133 | ] 134 | print(" ", (":".join(ratios))) 135 | -------------------------------------------------------------------------------- /numpy_groupies/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import pytest 4 | 5 | from .. import aggregate_numpy, aggregate_numpy_ufunc, aggregate_purepy 6 | 7 | try: 8 | from .. import aggregate_numba 9 | except ImportError: 10 | aggregate_numba = None 11 | try: 12 | from .. 
import aggregate_pandas
13 | except ImportError:
14 |     aggregate_pandas = None
15 | 
16 | _implementations = [
17 |     aggregate_purepy,
18 |     aggregate_numpy_ufunc,
19 |     aggregate_numpy,
20 |     aggregate_numba,
21 |     aggregate_pandas,
22 | ]
23 | _implementations = [i for i in _implementations if i is not None]
24 | 
25 | 
26 | def _impl_name(impl):
27 |     if not impl or type(impl).__name__ == "NotSetType":
28 |         return
29 |     return impl.__name__.rsplit("aggregate_", 1)[1].rsplit("_", 1)[-1]
30 | 
31 | 
32 | _implemented_by_impl_name = {
33 |     "numpy": {"not_implemented": ("cumprod", "cummax", "cummin")},
34 |     "purepy": {
35 |         "not_implemented": ("cumsum", "cumprod", "cummax", "cummin", "sumofsquares")
36 |     },
37 |     "numba": {"not_implemented": ("array", "list", "sort")},
38 |     "pandas": {
39 |         "not_implemented": ("array", "list", "sort", "sumofsquares", "nansumofsquares")
40 |     },
41 |     "ufunc": {
42 |         "implemented": (
43 |             "sum",
44 |             "prod",
45 |             "min",
46 |             "max",
47 |             "len",
48 |             "all",
49 |             "any",
50 |             "anynan",
51 |             "allnan",
52 |         )
53 |     },
54 | }
55 | 
56 | 
57 | def _is_implemented(impl_name, funcname):
58 |     func_description = _implemented_by_impl_name[impl_name]
59 |     not_implemented = func_description.get("not_implemented", [])
60 |     implemented = func_description.get("implemented", [])
61 |     if impl_name == "purepy" and funcname.startswith("nan"):
62 |         return False
63 |     if funcname in not_implemented:
64 |         return False
65 |     if implemented and funcname not in implemented:
66 |         return False
67 |     return True
68 | 
69 | 
70 | def _wrap_notimplemented_skip(impl, name=None):
71 |     """Some implementations lack some functionality. That's ok, let's skip that instead of raising errors."""
72 | 
73 |     @wraps(impl)
74 |     def try_skip(*args, **kwargs):
75 |         try:
76 |             return impl(*args, **kwargs)
77 |         except NotImplementedError:
78 |             impl_name = impl.__module__.split("_")[-1]
79 |             func = kwargs.pop("func", None)
80 |             if callable(func):
81 |                 func = func.__name__
82 |             if not _is_implemented(impl_name, func):
83 |                 pytest.skip("Functionality not implemented")
84 | 
85 |     if name:
86 |         try_skip.__name__ = name
87 |     return try_skip
88 | 
89 | 
90 | func_list = (
91 |     "sum",
92 |     "prod",
93 |     "min",
94 |     "max",
95 |     "all",
96 |     "any",
97 |     "mean",
98 |     "std",
99 |     "var",
100 |     "len",
101 |     "argmin",
102 |     "argmax",
103 |     "anynan",
104 |     "allnan",
105 |     "cumsum",
106 |     "sumofsquares",
107 |     "nansum",
108 |     "nanprod",
109 |     "nanmin",
110 |     "nanmax",
111 |     "nanmean",
112 |     "nanstd",
113 |     "nanvar",
114 |     "nanlen",
115 |     "nanargmin",
116 |     "nanargmax",
117 |     "nansumofsquares",
118 | )
119 | 
--------------------------------------------------------------------------------
/numpy_groupies/tests/test_compare.py:
--------------------------------------------------------------------------------
1 | """
2 | In this test, aggregate_numpy is taken as a reference implementation and its
3 | results are compared against the results of the other implementations. Implementations
4 | may raise NotImplementedError in order to show missing functionality without causing
5 | test errors.
6 | """
7 | 
8 | from itertools import product
9 | 
10 | import numpy as np
11 | import pytest
12 | 
13 | from . import (
14 |     _impl_name,
15 |     _is_implemented,
16 |     aggregate_numba,
17 |     aggregate_numpy,
18 |     aggregate_numpy_ufunc,
19 |     aggregate_pandas,
20 |     aggregate_purepy,
21 |     func_list,
22 | )
23 | 
24 | 
25 | class AttrDict(dict):
26 |     __getattr__ = dict.__getitem__
27 | 
28 | 
29 | TEST_PAIRS = ["np/py", "ufunc/np", "numba/np", "pandas/np"]
30 | 
31 | 
32 | @pytest.fixture(params=TEST_PAIRS, scope="module")
33 | def aggregate_cmp(request, seed=100):
34 |     test_pair = request.param
35 |     if test_pair == "np/py":
36 |         # Some functions in purepy are not implemented
37 |         func_ref = aggregate_purepy.aggregate
38 |         func = aggregate_numpy.aggregate
39 |         group_cnt = 100
40 |     else:
41 |         group_cnt = 1000
42 |         func_ref = aggregate_numpy.aggregate
43 |         if "ufunc" in request.param:
44 |             impl = aggregate_numpy_ufunc
45 |         elif "numba" in request.param:
46 |             impl = aggregate_numba
47 |         elif "pandas" in request.param:
48 |             impl = aggregate_pandas
49 |         else:
50 |             impl = None
51 | 
52 |         if not impl:
53 |             pytest.skip("Implementation not available")
54 |         name = _impl_name(impl)
55 |         func = impl.aggregate
56 | 
57 |     rnd = np.random.RandomState(seed=seed)
58 | 
59 |     # Each group id occurs twice; repeating by 10 turns these into runs of 10 identical ids
60 |     group_idx = np.repeat(np.arange(group_cnt), 2)
61 |     rnd.shuffle(group_idx)
62 |     group_idx = np.repeat(group_idx, 10)
63 | 
64 |     a = rnd.randn(group_idx.size)
65 |     nana = a.copy()
66 |     nana[::3] = np.nan
67 |     nana[: (len(nana) // 2)] = np.nan
68 |     somea = a.copy()
69 |     somea[somea < 0.3] = 0
70 |     somea[::31] = np.nan
71 |     return AttrDict(locals())
72 | 
73 | 
74 | def _deselect_purepy(aggregate_cmp, *args, **kwargs):
75 |     # purepy implementation does not handle ndim arrays
76 |     # This is a won't-fix and should be deselected instead of skipped
77 |     return aggregate_cmp.endswith("py")
78 | 
79 | 
80 | def _deselect_not_implemented(aggregate_cmp, func, fill_value, *args, **kwargs):
81 |     impl_name = (
82 |         "purepy" if aggregate_cmp.endswith("py") else aggregate_cmp.split("/", 1)[0]
83 |     )
84 |     funcname = getattr(func, "__name__", func)
85 |     return not _is_implemented(impl_name, funcname)
86 | 
87 | 
88 | def func_arbitrary(iterator):
89 |     tmp = 0
90 |     for x in iterator:
91 |         tmp += x * x
92 |     return tmp
93 | 
94 | 
95 | def func_preserve_order(iterator):
96 |     tmp = 0
97 |     for i, x in enumerate(iterator, 1):
98 |         tmp += x**i
99 |     return tmp
100 | 
101 | 
102 | @pytest.mark.filterwarnings("ignore::FutureWarning")  # handled pandas deprecation
103 | @pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")
104 | @pytest.mark.deselect_if(func=_deselect_not_implemented)
105 | @pytest.mark.parametrize("fill_value", [0, 1, np.nan])
106 | @pytest.mark.parametrize("func", func_list, ids=lambda x: getattr(x, "__name__", x))
107 | def test_cmp(aggregate_cmp, func, fill_value, decimal=10):
108 |     is_nanfunc = "nan" in getattr(func, "__name__", func)
109 |     a = aggregate_cmp.nana if is_nanfunc else aggregate_cmp.a
110 |     try:
111 |         ref = aggregate_cmp.func_ref(
112 |             aggregate_cmp.group_idx, a, func=func, fill_value=fill_value
113 |         )
114 |     except ValueError:
115 |         with pytest.raises(ValueError):
116 |             aggregate_cmp.func(
117 |                 aggregate_cmp.group_idx, a, func=func, fill_value=fill_value
118 |             )
119 |     else:
120 |         try:
121 |             res = aggregate_cmp.func(
122 |                 aggregate_cmp.group_idx, a, func=func, fill_value=fill_value
123 |             )
124 |         except ValueError:
125 |             if np.isnan(fill_value) and aggregate_cmp.test_pair.endswith("py"):
126 |                 pytest.skip(
127 |                     "pure python version uses lists and does not raise ValueErrors when inserting nan into integers"
128 |                 )
129 |             elif aggregate_cmp.test_pair.startswith("pandas"):
130 |                 pytest.skip("pandas now raises ValueError on all-nan arrays")
131 | 
132 |             else:
133 |                 raise
134 |         if isinstance(ref, np.ndarray):
135 |             assert res.dtype == ref.dtype
136 |         try:
137 |             np.testing.assert_allclose(res, ref, rtol=10**-decimal)
138 |         except AssertionError:
139 |             if "arg" in func and aggregate_cmp.test_pair.startswith("pandas"):
140 |                 pytest.skip(
141 |                     "pandas doesn't fill indices for all-nan groups with fill_value, but with -inf instead"
142 |                 )
143 |             else:
144 |                 raise
145 | 
146 | 
147 | @pytest.mark.deselect_if(func=_deselect_purepy)
148 | @pytest.mark.parametrize(["ndim", "order"], product([2, 3], ["C", "F"]))
149 | def test_cmp_ndim(aggregate_cmp, ndim, order, outsize=100, decimal=14):
150 |     nindices = int(outsize**ndim)
151 |     outshape = tuple([outsize] * ndim)
152 |     group_idx = np.random.randint(0, outsize, size=(ndim, nindices))
153 |     a = np.random.random(group_idx.shape[1])
154 | 
155 |     res = aggregate_cmp.func(group_idx, a, size=outshape, order=order)
156 |     ref = aggregate_cmp.func_ref(group_idx, a, size=outshape, order=order)
157 |     if ndim > 1 and order == "F":
158 |         # 1d arrays always return False here
159 |         assert np.isfortran(res)
160 |     else:
161 |         assert not np.isfortran(res)
162 |     assert res.shape == outshape
163 |     np.testing.assert_array_almost_equal(res, ref, decimal=decimal)
164 | 
--------------------------------------------------------------------------------
/numpy_groupies/tests/test_generic.py:
--------------------------------------------------------------------------------
1 | """Tests that are run against all implemented versions of aggregate."""
2 | 
3 | import itertools
4 | import warnings
5 | 
6 | import numpy as np
7 | import pytest
8 | 
9 | from . import (
10 |     _impl_name,
11 |     _implementations,
12 |     _wrap_notimplemented_skip,
13 |     func_list,
14 |     _is_implemented,
15 | )
16 | 
17 | 
18 | @pytest.fixture(params=_implementations, ids=_impl_name)
19 | def aggregate_all(request):
20 |     impl = request.param
21 |     if impl is None:
22 |         pytest.skip("Implementation not available")
23 |     name = _impl_name(impl)
24 |     return _wrap_notimplemented_skip(impl.aggregate, "aggregate_" + name)
25 | 
26 | 
27 | def _deselect_purepy(aggregate_all, *args, **kwargs):
28 |     # The purepy implementation does not handle nan values and ndim correctly.
29 |     # So it needs to be excluded from several tests.
30 |     return aggregate_all.__name__.endswith("purepy")
31 | 
32 | 
33 | def _deselect_purepy_and_pandas(aggregate_all, *args, **kwargs):
34 |     # The purepy and pandas implementations handle some nan cases differently.
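    # (e.g. pandas unconditionally skips nan values, where the reference
    # implementation keeps them)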
35 |     # So they need to be excluded from several tests.
36 |     return aggregate_all.__name__.endswith(("pandas", "purepy"))
37 | 
38 | 
39 | def _deselect_purepy_and_invalid_axis(aggregate_all, func, size, axis):
40 |     impl_name = aggregate_all.__name__.split("_")[-1]
41 |     if impl_name == "purepy":
42 |         # purepy does not handle the axis parameter
43 |         return True
44 |     if axis >= len(size):
45 |         return True
46 |     if not _is_implemented(impl_name, func):
47 |         return True
48 |     return False
49 | 
50 | 
51 | def _deselect_not_implemented(aggregate_all, func, *args, **kwargs):
52 |     impl_name = aggregate_all.__name__.split("_")[-1]
53 |     return not _is_implemented(impl_name, func)
54 | 
55 | 
56 | def test_preserve_missing(aggregate_all):
57 |     res = aggregate_all(np.array([0, 1, 3, 1, 3]), np.arange(101, 106, dtype=int))
58 |     np.testing.assert_array_equal(res, np.array([101, 206, 0, 208]))
59 |     if not isinstance(res, list):
60 |         assert "int" in res.dtype.name
61 | 
62 | 
63 | @pytest.mark.parametrize("group_idx_type", [int, "uint32", "uint64"])
64 | def test_uint_group_idx(aggregate_all, group_idx_type):
65 |     group_idx = np.array([1, 1, 2, 2, 2, 2, 4, 4], dtype=group_idx_type)
66 |     res = aggregate_all(group_idx, np.ones(group_idx.size), dtype=int)
67 |     np.testing.assert_array_equal(res, np.array([0, 2, 4, 0, 2]))
68 |     if not isinstance(res, list):
69 |         assert "int" in res.dtype.name
70 | 
71 | 
72 | def test_start_with_offset(aggregate_all):
73 |     group_idx = np.array([1, 1, 2, 2, 2, 2, 4, 4])
74 |     res = aggregate_all(group_idx, np.ones(group_idx.size), dtype=int)
75 |     np.testing.assert_array_equal(res, np.array([0, 2, 4, 0, 2]))
76 |     if not isinstance(res, list):
77 |         assert "int" in res.dtype.name
78 | 
79 | 
80 | @pytest.mark.parametrize(
81 |     "floatfunc", [np.std, np.var, np.mean], ids=lambda x: x.__name__
82 | )
83 | def test_float_enforcement(aggregate_all, floatfunc):
84 |     group_idx = np.arange(10).repeat(3)
85 |     a = np.arange(group_idx.size)
86 |     res = aggregate_all(group_idx, a, floatfunc)
87 |     if not isinstance(res, list):
88 |         assert "float" in res.dtype.name
89 |     assert np.all(np.array(res) > 0)
90 | 
91 | 
92 | def test_start_with_offset_prod(aggregate_all):
93 |     group_idx = np.array([2, 2, 4, 4, 4, 7, 7, 7])
94 |     res = aggregate_all(group_idx, group_idx, func=np.prod, dtype=int)
95 |     np.testing.assert_array_equal(res, np.array([0, 0, 4, 0, 64, 0, 0, 343]))
96 | 
97 | 
98 | def test_no_negative_indices(aggregate_all):
99 |     for pos in (0, 10, -1):
100 |         group_idx = np.arange(5).repeat(5)
101 |         group_idx[pos] = -1
102 |         pytest.raises(ValueError, aggregate_all, group_idx, np.arange(len(group_idx)))
103 | 
104 | 
105 | def test_parameter_missing(aggregate_all):
106 |     pytest.raises(TypeError, aggregate_all, np.arange(5))
107 | 
108 | 
109 | def test_shape_mismatch(aggregate_all):
110 |     pytest.raises(ValueError, aggregate_all, np.array((1, 2, 3)), np.array((1, 2)))
111 | 
112 | 
113 | def test_create_lists(aggregate_all):
114 |     res = aggregate_all(
115 |         np.array([0, 1, 3, 1, 3]), np.arange(101, 106, dtype=int), func=list
116 |     )
117 |     np.testing.assert_array_equal(np.array(res[0]), np.array([101]))
118 |     assert res[2] == 0
119 |     np.testing.assert_array_equal(np.array(res[3]), np.array([103, 105]))
120 | 
121 | 
122 | def test_item_counting(aggregate_all):
123 |     group_idx = np.array([0, 1, 2, 3, 3, 3, 3, 4, 5, 5, 5, 6, 5, 4, 3, 8, 8])
124 |     a = np.arange(group_idx.size)
125 |     res = aggregate_all(group_idx, a, func=lambda x: len(x) > 1)
126 |     np.testing.assert_array_equal(res, np.array([0, 0, 0, 1, 1, 1, 0, 0, 1]))
127 | 
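# For orientation, the custom-callable dispatch exercised above boils down to
# calls like the following (an illustrative sketch against the numpy
# implementation, not an additional test):
#
#     import numpy as np
#     from numpy_groupies import aggregate_np
#     aggregate_np(np.array([0, 1, 1]), np.array([5, 6, 7]),
#                  func=lambda x: len(x) > 1)
#     # -> array([0., 1.])  (callables go through the generic float path)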
128 | 129 | @pytest.mark.parametrize( 130 | ["func", "fill_value"], [(np.array, None), (np.sum, -1)], ids=["array", "sum"] 131 | ) 132 | def test_fill_value(aggregate_all, func, fill_value): 133 | group_idx = np.array([0, 2, 2], dtype=int) 134 | res = aggregate_all( 135 | group_idx, 136 | np.arange(len(group_idx), dtype=int), 137 | func=func, 138 | fill_value=fill_value, 139 | ) 140 | assert res[1] == fill_value 141 | 142 | 143 | @pytest.mark.parametrize("order", ["C", "F"]) 144 | def test_array_ordering(aggregate_all, order, size=10): 145 | mat = np.zeros((size, size), order=order, dtype=float) 146 | mat.flat[:] = np.arange(size * size) 147 | assert aggregate_all(np.zeros(size, dtype=int), mat[0, :], order=order)[0] == sum( 148 | range(size) 149 | ) 150 | 151 | 152 | @pytest.mark.deselect_if(func=_deselect_purepy) 153 | @pytest.mark.parametrize("size", [None, (10, 2)]) 154 | def test_ndim_group_idx(aggregate_all, size): 155 | group_idx = np.vstack((np.repeat(np.arange(10), 10), np.repeat([0, 1], 50))) 156 | aggregate_all(group_idx, 1, size=size) 157 | 158 | 159 | @pytest.mark.deselect_if(func=_deselect_purepy) 160 | @pytest.mark.parametrize(["ndim", "order"], itertools.product([1, 2, 3], ["C", "F"])) 161 | def test_ndim_indexing(aggregate_all, ndim, order, outsize=10): 162 | nindices = int(outsize**ndim) 163 | outshape = tuple([outsize] * ndim) 164 | group_idx = np.random.randint(0, outsize, size=(ndim, nindices)) 165 | a = np.random.random(group_idx.shape[1]) 166 | res = aggregate_all(group_idx, a, size=outshape, order=order) 167 | if ndim > 1 and order == "F": 168 | # 1d arrays always return False here 169 | assert np.isfortran(res) 170 | else: 171 | assert not np.isfortran(res) 172 | assert res.shape == outshape 173 | 174 | 175 | def test_len(aggregate_all, group_size=5): 176 | group_idx = np.arange(0, 100, 2, dtype=int).repeat(group_size) 177 | a = np.arange(group_idx.size) 178 | res = aggregate_all(group_idx, a, func="len") 179 | ref = aggregate_all(group_idx, 1, func="sum") 180 | if isinstance(res, np.ndarray): 181 | assert issubclass(res.dtype.type, np.integer) 182 | else: 183 | assert isinstance(res[0], int) 184 | np.testing.assert_array_equal(res, ref) 185 | group_idx = np.arange(0, 100, dtype=int).repeat(group_size) 186 | a = np.arange(group_idx.size) 187 | res = aggregate_all(group_idx, a, func=len) 188 | if isinstance(res, np.ndarray): 189 | assert np.all(res == group_size) 190 | else: 191 | assert all(x == group_size for x in res) 192 | 193 | 194 | def test_nan_len(aggregate_all): 195 | group_idx = np.arange(0, 20, 2, dtype=int).repeat(5) 196 | a = np.random.random(group_idx.size) 197 | a[::4] = np.nan 198 | a[::5] = np.nan 199 | res = aggregate_all(group_idx, a, func="nanlen") 200 | ref = aggregate_all(group_idx[~np.isnan(a)], 1, func="sum") 201 | if isinstance(res, np.ndarray): 202 | assert issubclass(res.dtype.type, np.integer) 203 | else: 204 | assert isinstance(res[0], int) 205 | np.testing.assert_array_equal(res, ref) 206 | 207 | 208 | @pytest.mark.parametrize("first_last", ["first", "last"]) 209 | def test_first_last(aggregate_all, first_last): 210 | group_idx = np.arange(0, 100, 2, dtype=int).repeat(5) 211 | a = np.arange(group_idx.size) 212 | res = aggregate_all(group_idx, a, func=first_last, fill_value=-1) 213 | ref = np.zeros(np.max(group_idx) + 1) 214 | ref.fill(-1) 215 | ref[::2] = np.arange( 216 | 0 if first_last == "first" else 4, group_idx.size, 5, dtype=int 217 | ) 218 | np.testing.assert_array_equal(res, ref) 219 | 220 | 221 | @pytest.mark.parametrize( 222 | 
["first_last", "nanoffset"], itertools.product(["nanfirst", "nanlast"], [0, 2, 4]) 223 | ) 224 | def test_nan_first_last(aggregate_all, first_last, nanoffset): 225 | group_idx = np.arange(0, 100, 2, dtype=int).repeat(5) 226 | a = np.arange(group_idx.size, dtype=float) 227 | 228 | a[nanoffset::5] = np.nan 229 | res = aggregate_all(group_idx, a, func=first_last, fill_value=-1) 230 | ref = np.zeros(np.max(group_idx) + 1) 231 | ref.fill(-1) 232 | 233 | if first_last == "nanfirst": 234 | ref_offset = 1 if nanoffset == 0 else 0 235 | else: 236 | ref_offset = 3 if nanoffset == 4 else 4 237 | ref[::2] = np.arange(ref_offset, group_idx.size, 5, dtype=int) 238 | np.testing.assert_array_equal(res, ref) 239 | 240 | 241 | @pytest.mark.parametrize(["func", "ddof"], itertools.product(["var", "std"], [0, 1, 2])) 242 | def test_ddof(aggregate_all, func, ddof, size=20): 243 | group_idx = np.zeros(20, dtype=int) 244 | a = np.random.random(group_idx.size) 245 | res = aggregate_all(group_idx, a, func, ddof=ddof) 246 | ref_func = {"std": np.std, "var": np.var}.get(func) 247 | ref = ref_func(a, ddof=ddof) 248 | assert abs(res[0] - ref) < 1e-10 249 | 250 | 251 | @pytest.mark.parametrize("func", ["sum", "prod", "mean", "var", "std"]) 252 | def test_scalar_input(aggregate_all, func): 253 | group_idx = np.arange(0, 100, dtype=int).repeat(5) 254 | if func not in ("sum", "prod"): 255 | pytest.raises( 256 | (ValueError, NotImplementedError), aggregate_all, group_idx, 1, func=func 257 | ) 258 | else: 259 | res = aggregate_all(group_idx, 1, func=func) 260 | ref = aggregate_all(group_idx, np.ones_like(group_idx, dtype=int), func=func) 261 | np.testing.assert_array_equal(res, ref) 262 | 263 | 264 | @pytest.mark.parametrize("func", ["sum", "prod", "mean", "var", "std", "all", "any"]) 265 | def test_nan_input(aggregate_all, func, groups=100): 266 | if aggregate_all.__name__.endswith("pandas"): 267 | pytest.skip("pandas always skips nan values") 268 | group_idx = np.arange(0, groups, dtype=int).repeat(5) 269 | a = np.random.random(group_idx.size) 270 | a[::2] = np.nan 271 | 272 | if func in ("all", "any"): 273 | ref = np.ones(groups, dtype=bool) 274 | else: 275 | ref = np.full(groups, np.nan, dtype=float) 276 | res = aggregate_all(group_idx, a, func=func) 277 | np.testing.assert_array_equal(res, ref) 278 | 279 | 280 | def test_nan_input_len(aggregate_all, groups=100, group_size=5): 281 | if aggregate_all.__name__.endswith("pandas"): 282 | pytest.skip("pandas always skips nan values") 283 | group_idx = np.arange(0, groups, dtype=int).repeat(group_size) 284 | a = np.random.random(len(group_idx)) 285 | a[::2] = np.nan 286 | ref = np.full(groups, group_size, dtype=int) 287 | res = aggregate_all(group_idx, a, func=len) 288 | np.testing.assert_array_equal(res, ref) 289 | 290 | 291 | def test_argmin_argmax_nonans(aggregate_all): 292 | group_idx = np.array([0, 0, 0, 0, 3, 3, 3, 3]) 293 | a = np.array([4, 4, 3, 1, 10, 9, 9, 11]) 294 | 295 | res = aggregate_all(group_idx, a, func="argmax", fill_value=-1) 296 | np.testing.assert_array_equal(res, [0, -1, -1, 7]) 297 | 298 | res = aggregate_all(group_idx, a, func="argmin", fill_value=-1) 299 | np.testing.assert_array_equal(res, [3, -1, -1, 5]) 300 | 301 | 302 | @pytest.mark.deselect_if(func=_deselect_purepy) 303 | def test_argmin_argmax_nans(aggregate_all): 304 | if aggregate_all.__name__.endswith("pandas"): 305 | pytest.skip("pandas always ignores nans") 306 | 307 | group_idx = np.array([0, 0, 0, 0, 3, 3, 3, 3]) 308 | a = np.array([4, 4, 3, 1, np.nan, 1, 2, 3]) 309 | 310 | res = 
aggregate_all(group_idx, a, func="argmax", fill_value=-1) 311 | np.testing.assert_array_equal(res, [0, -1, -1, -1]) 312 | 313 | res = aggregate_all(group_idx, a, func="argmin", fill_value=-1) 314 | np.testing.assert_array_equal(res, [3, -1, -1, -1]) 315 | 316 | 317 | @pytest.mark.deselect_if(func=_deselect_purepy) 318 | def test_nanargmin_nanargmax_nans(aggregate_all): 319 | if aggregate_all.__name__.endswith("pandas"): 320 | pytest.skip( 321 | "pandas doesn't fill indices for all-nan groups with fill_value but with -inf instead" 322 | ) 323 | 324 | group_idx = np.array([0, 0, 0, 0, 3, 3, 3, 3]) 325 | a = np.array([4, 4, np.nan, 1, np.nan, np.nan, np.nan, np.nan]) 326 | 327 | res = aggregate_all(group_idx, a, func="nanargmax", fill_value=-1) 328 | np.testing.assert_array_equal(res, [0, -1, -1, -1]) 329 | 330 | res = aggregate_all(group_idx, a, func="nanargmin", fill_value=-1) 331 | np.testing.assert_array_equal(res, [3, -1, -1, -1]) 332 | 333 | 334 | def test_nanargmin_nanargmax_nonans(aggregate_all): 335 | group_idx = np.array([0, 0, 0, 0, 3, 3, 3, 3]) 336 | a = np.array([4, 4, 3, 1, 10, 9, 9, 11]) 337 | 338 | res = aggregate_all(group_idx, a, func="nanargmax", fill_value=-1) 339 | np.testing.assert_array_equal(res, [0, -1, -1, 7]) 340 | 341 | res = aggregate_all(group_idx, a, func="nanargmin", fill_value=-1) 342 | np.testing.assert_array_equal(res, [3, -1, -1, 5]) 343 | 344 | 345 | def test_min_max_inf(aggregate_all): 346 | # https://github.com/ml31415/numpy-groupies/issues/40 347 | res = aggregate_all( 348 | np.array([0, 1, 2, 0, 1, 2]), 349 | np.array([-np.inf, 0, -np.inf, -np.inf, 0, 0]), 350 | func="max", 351 | ) 352 | np.testing.assert_array_equal(res, [-np.inf, 0, 0]) 353 | 354 | res = aggregate_all( 355 | np.array([0, 1, 2, 0, 1, 2]), 356 | np.array([np.inf, 0, np.inf, np.inf, 0, 0]), 357 | func="min", 358 | ) 359 | np.testing.assert_array_equal(res, [np.inf, 0, 0]) 360 | 361 | 362 | def test_argmin_argmax_inf(aggregate_all): 363 | # https://github.com/ml31415/numpy-groupies/issues/40 364 | res = aggregate_all( 365 | np.array([0, 1, 2, 0, 1, 2]), 366 | np.array([-np.inf, 0, -np.inf, -np.inf, 0, 0]), 367 | func="argmax", 368 | fill_value=-1, 369 | ) 370 | np.testing.assert_array_equal(res, [0, 1, 5]) 371 | 372 | res = aggregate_all( 373 | np.array([0, 1, 2, 0, 1, 2]), 374 | np.array([np.inf, 0, np.inf, np.inf, 0, 0]), 375 | func="argmin", 376 | fill_value=-1, 377 | ) 378 | np.testing.assert_array_equal(res, [0, 1, 5]) 379 | 380 | 381 | def test_mean(aggregate_all): 382 | group_idx = np.array([0, 0, 0, 0, 3, 3, 3, 3]) 383 | a = np.arange(len(group_idx)) 384 | 385 | res = aggregate_all(group_idx, a, func="mean") 386 | np.testing.assert_array_equal(res, [1.5, 0, 0, 5.5]) 387 | 388 | 389 | def test_cumsum(aggregate_all): 390 | group_idx = np.array([4, 3, 3, 4, 4, 1, 1, 1, 7, 8, 7, 4, 3, 3, 1, 1]) 391 | a = np.array([3, 4, 1, 3, 9, 9, 6, 7, 7, 0, 8, 2, 1, 8, 9, 8]) 392 | ref = np.array([3, 4, 5, 6, 15, 9, 15, 22, 7, 0, 15, 17, 6, 14, 31, 39]) 393 | 394 | res = aggregate_all(group_idx, a, func="cumsum") 395 | np.testing.assert_array_equal(res, ref) 396 | 397 | 398 | @pytest.mark.deselect_if(func=_deselect_purepy_and_pandas) 399 | def test_nancumsum(aggregate_all): 400 | # https://github.com/ml31415/numpy-groupies/issues/79 401 | group_idx = [0, 0, 0, 1, 1, 0, 0] 402 | a = [2, 2, np.nan, 2, 2, 2, 2] 403 | ref = [2.0, 4.0, 4.0, 2.0, 4.0, 6.0, 8.0] 404 | 405 | res = aggregate_all(group_idx, a, func="nancumsum") 406 | np.testing.assert_array_equal(res, ref) 407 | 408 | 409 | def 
test_cummax(aggregate_all): 410 | group_idx = np.array([4, 3, 3, 4, 4, 1, 1, 1, 7, 8, 7, 4, 3, 3, 1, 1]) 411 | a = np.array([3, 4, 1, 3, 9, 9, 6, 7, 7, 0, 8, 2, 1, 8, 9, 8]) 412 | ref = np.array([3, 4, 4, 3, 9, 9, 9, 9, 7, 0, 8, 9, 4, 8, 9, 9]) 413 | 414 | res = aggregate_all(group_idx, a, func="cummax") 415 | np.testing.assert_array_equal(res, ref) 416 | 417 | 418 | @pytest.mark.parametrize("order", ["normal", "reverse"]) 419 | def test_list_ordering(aggregate_all, order): 420 | group_idx = np.repeat(np.arange(5), 4) 421 | a = np.arange(group_idx.size) 422 | if order == "reverse": 423 | a = a[::-1] 424 | ref = a[:4] 425 | 426 | res = aggregate_all(group_idx, a, func=list) 427 | np.testing.assert_array_equal(np.array(res[0]), ref) 428 | 429 | 430 | @pytest.mark.parametrize("order", ["normal", "reverse"]) 431 | def test_sort(aggregate_all, order): 432 | group_idx = np.array([3, 3, 3, 2, 2, 2, 1, 1, 1]) 433 | a = np.array([3, 2, 1, 3, 4, 5, 5, 10, 1]) 434 | ref_normal = np.array([1, 2, 3, 3, 4, 5, 1, 5, 10]) 435 | ref_reverse = np.array([3, 2, 1, 5, 4, 3, 10, 5, 1]) 436 | reverse = order == "reverse" 437 | ref = ref_reverse if reverse else ref_normal 438 | 439 | res = aggregate_all(group_idx, a, func="sort", reverse=reverse) 440 | np.testing.assert_array_equal(res, ref) 441 | 442 | 443 | @pytest.mark.deselect_if(func=_deselect_purepy_and_invalid_axis) 444 | @pytest.mark.parametrize("axis", (0, 1)) 445 | @pytest.mark.parametrize("size", ((12,), (12, 5))) 446 | @pytest.mark.parametrize("func", func_list) 447 | def test_along_axis(aggregate_all, func, size, axis): 448 | group_idx = np.zeros(size[axis], dtype=int) 449 | a = np.random.randn(*size) 450 | 451 | # add some NaNs to test out nan-skipping 452 | if "nan" in func and "nanarg" not in func: 453 | a[[1, 4, 5], ...] = np.nan 454 | elif "nanarg" in func and a.ndim > 1: 455 | a[[1, 4, 5], 1] = np.nan 456 | if func in ["any", "all"]: 457 | a = a > 0.5 458 | 459 | # construct expected values for all cases 460 | if func == "len": 461 | expected = np.array(size[axis]) 462 | elif func == "nanlen": 463 | expected = np.array((~np.isnan(a)).sum(axis=axis)) 464 | elif func == "anynan": 465 | expected = np.isnan(a).any(axis=axis) 466 | elif func == "allnan": 467 | expected = np.isnan(a).all(axis=axis) 468 | elif func == "sumofsquares": 469 | expected = np.sum(a * a, axis=axis) 470 | elif func == "nansumofsquares": 471 | expected = np.nansum(a * a, axis=axis) 472 | else: 473 | with warnings.catch_warnings(): 474 | # Filter expected warnings: 475 | # - RuntimeWarning: All-NaN slice encountered 476 | # - RuntimeWarning: Mean of empty slice 477 | # - RuntimeWarning: Degrees of freedom <= 0 for slice. 478 | warnings.simplefilter("ignore", RuntimeWarning) 479 | expected = getattr(np, func)(a, axis=axis) 480 | 481 | # The default fill_value is 0, the following makes the output match numpy 482 | fill_value = { 483 | "nanprod": 1, 484 | "nanvar": np.nan, 485 | "nanstd": np.nan, 486 | "nanmax": np.nan, 487 | "nanmin": np.nan, 488 | "nanmean": np.nan, 489 | }.get(func, 0) 490 | 491 | actual = aggregate_all(group_idx, a, axis=axis, func=func, fill_value=fill_value) 492 | assert actual.ndim == a.ndim 493 | 494 | # argmin, argmax don't support keepdims, so we can't use that to construct expected 495 | # instead we squeeze out the extra dims in actual. 
496 | np.testing.assert_allclose(actual.squeeze(), expected) 497 | 498 | 499 | @pytest.mark.deselect_if(func=_deselect_purepy) 500 | def test_not_last_axis_reduction(aggregate_all): 501 | group_idx = np.array([1, 2, 2, 0, 1]) 502 | a = np.array([[1.0, 2.0], [4.0, 4.0], [5.0, 2.0], [np.nan, 3.0], [8.0, 7.0]]) 503 | func = "nanmax" 504 | fill_value = np.nan 505 | axis = 0 506 | actual = aggregate_all(group_idx, a, axis=axis, func=func, fill_value=fill_value) 507 | expected = np.array([[np.nan, 3.0], [8.0, 7.0], [5.0, 4.0]]) 508 | np.testing.assert_allclose(expected, actual) 509 | 510 | 511 | @pytest.mark.deselect_if(func=_deselect_purepy) 512 | def test_custom_callable(aggregate_all): 513 | def custom_callable(x): 514 | return x.sum() 515 | 516 | size = (10,) 517 | axis = -1 518 | 519 | group_idx = np.zeros(size, dtype=int) 520 | a = np.random.randn(*size) 521 | 522 | expected = a.sum(axis=axis, keepdims=True) 523 | actual = aggregate_all(group_idx, a, axis=axis, func=custom_callable, fill_value=0) 524 | assert actual.ndim == a.ndim 525 | 526 | np.testing.assert_allclose(actual, expected) 527 | 528 | 529 | @pytest.mark.deselect_if(func=_deselect_purepy) 530 | def test_argreduction_nD_array_1D_idx(aggregate_all): 531 | # https://github.com/ml31415/numpy-groupies/issues/41 532 | group_idx = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0], dtype=int) 533 | a = np.array([[1] * 12, [1] * 12]) 534 | actual = aggregate_all(group_idx, a, axis=-1, func="argmax") 535 | expected = np.array([[0, 5, 2], [0, 5, 2]]) 536 | np.testing.assert_equal(actual, expected) 537 | 538 | 539 | @pytest.mark.deselect_if(func=_deselect_purepy) 540 | def test_argreduction_negative_fill_value(aggregate_all): 541 | if aggregate_all.__name__.endswith("pandas"): 542 | pytest.skip("pandas always skips nan values") 543 | 544 | group_idx = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0], dtype=int) 545 | a = np.array([[1] * 12, [np.nan] * 12]) 546 | actual = aggregate_all(group_idx, a, axis=-1, fill_value=-1, func="argmax") 547 | expected = np.array([[0, 5, 2], [-1, -1, -1]]) 548 | np.testing.assert_equal(actual, expected) 549 | 550 | 551 | @pytest.mark.deselect_if(func=_deselect_not_implemented) 552 | @pytest.mark.parametrize( 553 | "nan_inds", (None, tuple([[1, 4, 5], Ellipsis]), tuple((1, (0, 1, 2, 3)))) 554 | ) 555 | @pytest.mark.parametrize("ddof", (0, 1)) 556 | @pytest.mark.parametrize("func", ("nanvar", "nanstd")) 557 | def test_var_with_nan_fill_value(aggregate_all, ddof, nan_inds, func): 558 | a = np.ones((12, 5)) 559 | group_idx = np.zeros(a.shape[-1:], dtype=int) 560 | 561 | if nan_inds is not None: 562 | a[nan_inds] = np.nan 563 | 564 | with warnings.catch_warnings(): 565 | # Filter RuntimeWarning: Degrees of freedom <= 0 for slice. 
566 | warnings.simplefilter("ignore", RuntimeWarning) 567 | expected = getattr(np, func)(a, keepdims=True, axis=-1, ddof=ddof) 568 | 569 | actual = aggregate_all( 570 | group_idx, a, axis=-1, fill_value=np.nan, func=func, ddof=ddof 571 | ) 572 | np.testing.assert_equal(actual, expected) 573 | 574 | 575 | def test_cumsum_accuracy(aggregate_all): 576 | array = np.array( 577 | [0.00000000e00, 0.00000000e00, 0.00000000e00, 3.27680000e04, 9.99999975e-06] 578 | ) 579 | group_idx = np.array([0, 0, 0, 0, 1]) 580 | 581 | actual = aggregate_all(group_idx, array, axis=-1, func="cumsum") 582 | expected = array 583 | np.testing.assert_allclose(actual, expected) 584 | -------------------------------------------------------------------------------- /numpy_groupies/tests/test_indices.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from . import _impl_name, aggregate_numba 5 | 6 | _implementations = [aggregate_numba] 7 | _implementations = [i for i in _implementations if i is not None] 8 | 9 | 10 | @pytest.fixture(params=_implementations, ids=_impl_name) 11 | def aggregate_nb_wv(request): 12 | if request.param is None: 13 | pytest.skip("Implementation not available") 14 | return request.param 15 | 16 | 17 | def test_step_indices_length(aggregate_nb_wv): 18 | group_idx = np.array([1, 1, 1, 2, 2, 3, 3, 4, 4, 2, 2], dtype=int) 19 | for _ in range(20): 20 | np.random.shuffle(group_idx) 21 | step_cnt_ref = np.count_nonzero(np.diff(group_idx)) 22 | assert aggregate_nb_wv.step_count(group_idx) == step_cnt_ref + 1 23 | assert len(aggregate_nb_wv.step_indices(group_idx)) == step_cnt_ref + 2 24 | 25 | 26 | def test_step_indices_fields(aggregate_nb_wv): 27 | group_idx = np.array([1, 1, 1, 2, 2, 3, 3, 4, 5, 2, 2], dtype=int) 28 | steps = aggregate_nb_wv.step_indices(group_idx) 29 | np.testing.assert_array_equal(steps, np.array([0, 3, 5, 7, 8, 9, 11])) 30 | -------------------------------------------------------------------------------- /numpy_groupies/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..utils import check_dtype, unpack 4 | 5 | 6 | def test_check_dtype(): 7 | dtype = check_dtype(None, "mean", np.arange(10, dtype=int), 10) 8 | assert np.issubdtype(dtype, np.floating) 9 | 10 | 11 | def test_unpack(): 12 | """Keep this test, in case unpack might get reimplemented again at some point.""" 13 | group_idx = np.arange(10) 14 | np.random.shuffle(group_idx) 15 | group_idx = np.repeat(group_idx, 3) 16 | vals = np.random.randn(np.max(group_idx) + 1) 17 | np.testing.assert_array_equal(unpack(group_idx, vals), vals[group_idx]) 18 | 19 | 20 | def test_unpack_long(): 21 | group_idx = np.repeat(np.arange(10000), 20) 22 | vals = np.random.randn(np.max(group_idx) + 1) 23 | np.testing.assert_array_equal(unpack(group_idx, vals), vals[group_idx]) 24 | -------------------------------------------------------------------------------- /numpy_groupies/utils.py: -------------------------------------------------------------------------------- 1 | """Common functionality for all aggregate implementations.""" 2 | 3 | import platform 4 | import numpy as np 5 | 6 | aggregate_common_doc = """ 7 | See readme file at https://github.com/ml31415/numpy-groupies for a full 8 | description. Below we reproduce the "Full description of inputs" 9 | section from that readme, note that the text below makes references to 10 | other portions of the readme that are not shown here. 
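
A minimal usage sketch (illustrative; the package re-exports ``aggregate``
at the top level):

    >>> import numpy as np
    >>> from numpy_groupies import aggregate
    >>> aggregate(np.array([0, 0, 1, 1]), np.array([1, 2, 3, 4]), func="sum")
    array([3, 7])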
11 | 
12 | group_idx:
13 |     this is an array of non-negative integers, to be used as the "labels"
14 |     with which to group the values in ``a``. Although we have so far
15 |     assumed that ``group_idx`` is one-dimensional, and the same length as
16 |     ``a``, it can in fact be two-dimensional (or some form of nested
17 |     sequences that can be converted to 2D). When ``group_idx`` is 2D, the
18 |     size of the 0th dimension corresponds to the number of dimensions in
19 |     the output, i.e. ``group_idx[i,j]`` gives the index into the ith
20 |     dimension in the output
21 |     for ``a[j]``. Note that ``a`` should still be 1D (or scalar), with
22 |     length matching ``group_idx.shape[1]``.
23 | a:
24 |     this is the array of values to be aggregated. See above for a
25 |     simple demonstration of what this means. ``a`` will normally be a
26 |     one-dimensional array, however it can also be a scalar in some cases.
27 | func: default='sum'
28 |     the function to use for aggregation. See the section above for
29 |     details. Note that the simplest way to specify the function is using a
30 |     string (e.g. ``func='max'``); however, a number of aliases are also
31 |     defined (e.g. you can use ``func=np.max``, or even ``func=max``,
32 |     where ``max`` is the
33 |     builtin function). To check the available aliases see ``utils.py``.
34 | size: default=None
35 |     the shape of the output array. If ``None``, the maximum value in
36 |     ``group_idx`` will set the size of the output. Note that for
37 |     multidimensional output you need to list the size of each dimension
38 |     here, or give ``None``.
39 | fill_value: default=0
40 |     in the example above, group 2 does not have any data, so requires some
41 |     kind of filling value - in this case the default of ``0`` is used. If
42 |     you had set ``fill_value=nan`` or something else, that value would
43 |     appear instead of ``0`` for element 2 in the output. Note that
44 |     there are some subtle interactions between what is permitted for
45 |     ``fill_value`` and the input/output ``dtype`` - exceptions should be
46 |     raised in most cases to alert the programmer if issues arise.
47 | order: default='C'
48 |     this is relevant only for multidimensional output. It controls the
49 |     layout of the output array in memory, can be ``'F'`` for Fortran-style.
50 | dtype: default=None
51 |     the ``dtype`` of the output. By default something sensible is chosen
52 |     based on the input, aggregation function, and ``fill_value``.
53 | ddof: default=0
54 |     passed through into calculations of variance and standard deviation
55 |     (see above).
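
For example (a sketch; group 2 below receives no data and therefore gets the
fill value):

    import numpy as np
    from numpy_groupies import aggregate
    aggregate(np.array([0, 0, 1, 3]), np.array([1.0, 2.0, 4.0, 8.0]),
              fill_value=np.nan)
    # -> [3., 4., nan, 8.]  (element 2 holds the fill_value)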
56 | """ 57 | 58 | funcs_common = "first last len mean var std allnan anynan max min argmax argmin sumofsquares cumsum cumprod cummax cummin".split() 59 | funcs_no_separate_nan = frozenset(["sort", "rsort", "array", "allnan", "anynan"]) 60 | 61 | 62 | _alias_str = { 63 | "or": "any", 64 | "and": "all", 65 | "add": "sum", 66 | "count": "len", 67 | "plus": "sum", 68 | "multiply": "prod", 69 | "product": "prod", 70 | "times": "prod", 71 | "amax": "max", 72 | "maximum": "max", 73 | "amin": "min", 74 | "minimum": "min", 75 | "split": "array", 76 | "splice": "array", 77 | "sorted": "sort", 78 | "asort": "sort", 79 | "asorted": "sort", 80 | "rsorted": "sort", 81 | "dsort": "sort", 82 | "dsorted": "rsort", 83 | } 84 | 85 | _alias_builtin = { 86 | all: "all", 87 | any: "any", 88 | len: "len", 89 | max: "max", 90 | min: "min", 91 | sum: "sum", 92 | sorted: "sort", 93 | slice: "array", 94 | list: "array", 95 | } 96 | 97 | 98 | _alias_numpy = { 99 | np.add: "sum", 100 | np.sum: "sum", 101 | np.any: "any", 102 | np.all: "all", 103 | np.multiply: "prod", 104 | np.prod: "prod", 105 | np.amin: "min", 106 | np.min: "min", 107 | np.minimum: "min", 108 | np.amax: "max", 109 | np.max: "max", 110 | np.maximum: "max", 111 | np.argmax: "argmax", 112 | np.argmin: "argmin", 113 | np.mean: "mean", 114 | np.std: "std", 115 | np.var: "var", 116 | np.array: "array", 117 | np.asarray: "array", 118 | np.sort: "sort", 119 | np.cumsum: "cumsum", 120 | np.cumprod: "cumprod", 121 | np.nansum: "nansum", 122 | np.nanprod: "nanprod", 123 | np.nanmean: "nanmean", 124 | np.nanvar: "nanvar", 125 | np.nanmax: "nanmax", 126 | np.nanmin: "nanmin", 127 | np.nanstd: "nanstd", 128 | np.nanargmax: "nanargmax", 129 | np.nanargmin: "nanargmin", 130 | np.nancumsum: "nancumsum", 131 | } 132 | 133 | 134 | def get_aliasing(*extra): 135 | """ 136 | Assembles a dictionary that maps both strings and functions to a list of supported function names. 137 | 138 | Examples: 139 | alias['add'] = 'sum' 140 | alias[sorted] = 'sort' 141 | 142 | This function should only be called during import. 
143 | """ 144 | alias = dict((k, k) for k in funcs_common) 145 | alias.update(_alias_str) 146 | alias.update((fn, fn) for fn in _alias_builtin.values()) 147 | alias.update(_alias_builtin) 148 | for d in extra: 149 | alias.update(d) 150 | alias.update((k, k) for k in set(alias.values())) 151 | # Treat nan-functions as firstclass member and add them directly 152 | for key in set(alias.values()): 153 | if key not in funcs_no_separate_nan and not key.startswith("nan"): 154 | key = "nan" + key 155 | alias[key] = key 156 | return alias 157 | 158 | 159 | aliasing_py = get_aliasing() 160 | aliasing = get_aliasing(_alias_numpy) 161 | 162 | 163 | def get_func(func, aliasing, implementations): 164 | """Return the key of a found implementation or the func itself""" 165 | try: 166 | func_str = aliasing[func] 167 | except KeyError: 168 | if callable(func): 169 | return func 170 | else: 171 | if func_str in implementations: 172 | return func_str 173 | if func_str.startswith("nan") and func_str[3:] in funcs_no_separate_nan: 174 | raise ValueError(f"{func_str[3:]} does not have a nan-version") 175 | else: 176 | raise NotImplementedError("No such function available") 177 | raise ValueError( 178 | f"func {func} is neither a valid function string nor a callable object" 179 | ) 180 | 181 | 182 | def check_boolean(x): 183 | if x not in (0, 1): 184 | raise ValueError("Value not boolean") 185 | 186 | 187 | _next_int_dtype = dict( 188 | bool=np.int8, 189 | uint8=np.int16, 190 | int8=np.int16, 191 | uint16=np.int32, 192 | int16=np.int32, 193 | uint32=np.int64, 194 | int32=np.int64, 195 | ) 196 | 197 | _next_float_dtype = dict( 198 | float16=np.float32, 199 | float32=np.float64, 200 | float64=np.complex64, 201 | complex64=np.complex128, 202 | ) 203 | 204 | 205 | def minimum_dtype(x, dtype=np.bool_): 206 | """ 207 | Returns the "most basic" dtype which represents `x` properly, which provides at least the same 208 | value range as the specified dtype. 
209 | """ 210 | 211 | def check_type(x, dtype): 212 | try: 213 | with np.errstate(invalid="ignore"): 214 | converted = np.array(x).astype(dtype) 215 | except (ValueError, OverflowError, RuntimeWarning): 216 | return False 217 | # False if some overflow has happened 218 | return converted == x or np.isnan(x) 219 | 220 | def type_loop(x, dtype, dtype_dict, default=None): 221 | while True: 222 | try: 223 | dtype = np.dtype(dtype_dict[dtype.name]) 224 | if check_type(x, dtype): 225 | return np.dtype(dtype) 226 | except KeyError: 227 | if default is not None: 228 | return np.dtype(default) 229 | raise ValueError(f"Can not determine dtype of {x!r}") 230 | 231 | dtype = np.dtype(dtype) 232 | if check_type(x, dtype): 233 | return dtype 234 | 235 | if np.issubdtype(dtype, np.inexact): 236 | return type_loop(x, dtype, _next_float_dtype) 237 | else: 238 | return type_loop(x, dtype, _next_int_dtype, default=np.float32) 239 | 240 | 241 | def minimum_dtype_scalar(x, dtype, a): 242 | if dtype is None: 243 | dtype = np.dtype(type(a)) if isinstance(a, (int, float)) else a.dtype 244 | return minimum_dtype(x, dtype) 245 | 246 | 247 | _forced_types = { 248 | "array": object, 249 | "all": bool, 250 | "any": bool, 251 | "nanall": bool, 252 | "nanany": bool, 253 | "len": np.int64, 254 | "nanlen": np.int64, 255 | "allnan": bool, 256 | "anynan": bool, 257 | "argmax": np.int64, 258 | "argmin": np.int64, 259 | "nanargmin": np.int64, 260 | "nanargmax": np.int64, 261 | } 262 | if platform.architecture()[0] == "32bit": 263 | _forced_types = { 264 | "array": object, 265 | "all": bool, 266 | "any": bool, 267 | "nanall": bool, 268 | "nanany": bool, 269 | "len": np.int32, 270 | "nanlen": np.int32, 271 | "allnan": bool, 272 | "anynan": bool, 273 | "argmax": np.int32, 274 | "argmin": np.int32, 275 | "nanargmin": np.int32, 276 | "nanargmax": np.int32, 277 | } 278 | _forced_float_types = {"mean", "var", "std", "nanmean", "nanvar", "nanstd"} 279 | _forced_same_type = { 280 | "min", 281 | "max", 282 | "first", 283 | "last", 284 | "nanmin", 285 | "nanmax", 286 | "nanfirst", 287 | "nanlast", 288 | } 289 | 290 | 291 | def check_dtype(dtype, func_str, a, n): 292 | if np.isscalar(a) or not a.shape: 293 | if func_str not in ("sum", "prod", "len"): 294 | raise ValueError( 295 | "scalar inputs are supported only for 'sum', 'prod' and 'len'" 296 | ) 297 | a_dtype = np.dtype(type(a)) 298 | else: 299 | a_dtype = a.dtype 300 | 301 | if dtype is not None: 302 | # dtype set by the user 303 | # Careful here: np.bool != np.bool_ ! 
304 |         if np.issubdtype(dtype, np.bool_) and not (
305 |             "all" in func_str or "any" in func_str
306 |         ):
307 |             raise TypeError(
308 |                 f"function {func_str} requires a more complex datatype than bool"
309 |             )
310 |         if not np.issubdtype(dtype, np.integer) and func_str in ("len", "nanlen"):
311 |             raise TypeError(f"function {func_str} requires an integer datatype")
312 |         # TODO: Maybe have some more checks here
313 |         return np.dtype(dtype)
314 |     else:
315 |         try:
316 |             return np.dtype(_forced_types[func_str])
317 |         except KeyError:
318 |             if func_str in _forced_float_types:
319 |                 if np.issubdtype(a_dtype, np.floating):
320 |                     return a_dtype
321 |                 else:
322 |                     return np.dtype(np.float64)
323 |             else:
324 |                 if func_str == "sum":
325 |                     # Try to guess the minimally required int size
326 |                     if np.issubdtype(a_dtype, np.int64):
327 |                         # It's not getting bigger anymore
328 |                         # TODO: strictly speaking it might need float
329 |                         return np.dtype(np.int64)
330 |                     elif np.issubdtype(a_dtype, np.integer):
331 |                         maxval = np.iinfo(a_dtype).max * n
332 |                         return minimum_dtype(maxval, a_dtype)
333 |                     elif np.issubdtype(a_dtype, np.bool_):
334 |                         return minimum_dtype(n, a_dtype)
335 |                     else:
336 |                         # floating, inexact, whatever
337 |                         return a_dtype
338 |                 elif func_str in _forced_same_type:
339 |                     return a_dtype
340 |                 else:
341 |                     if np.issubdtype(a_dtype, np.integer):
342 |                         return np.dtype(np.int64)
343 |                     else:
344 |                         return a_dtype
345 | 
346 | 
347 | def minval(fill_value, dtype):
348 |     dtype = minimum_dtype(fill_value, dtype)
349 |     if issubclass(dtype.type, np.floating):
350 |         return -np.inf
351 |     if issubclass(dtype.type, np.integer):
352 |         return np.iinfo(dtype).min
353 |     return np.finfo(dtype).min
354 | 
355 | 
356 | def maxval(fill_value, dtype):
357 |     dtype = minimum_dtype(fill_value, dtype)
358 |     if issubclass(dtype.type, np.floating):
359 |         return np.inf
360 |     if issubclass(dtype.type, np.integer):
361 |         return np.iinfo(dtype).max
362 |     return np.finfo(dtype).max
363 | 
364 | 
365 | def check_fill_value(fill_value, dtype, func=None):
366 |     if func in ("all", "any", "allnan", "anynan"):
367 |         check_boolean(fill_value)
368 |     else:
369 |         try:
370 |             return dtype.type(fill_value)
371 |         except ValueError:
372 |             raise ValueError(
373 |                 f"fill_value must be convertible into {dtype.type.__name__}"
374 |             )
375 | 
376 | 
377 | def check_group_idx(group_idx, a=None, check_min=True):
378 |     if a is not None and group_idx.size != a.size:
379 |         raise ValueError("The size of group_idx must be the same as a.size")
380 |     if not issubclass(group_idx.dtype.type, np.integer):
381 |         raise TypeError("group_idx must be of integer type")
382 |     if check_min and np.min(group_idx) < 0:
383 |         raise ValueError("group_idx contains negative indices")
384 | 
385 | 
386 | def _ravel_group_idx(group_idx, a, axis, size, order, method="ravel"):
387 |     ndim_a = a.ndim
388 |     # Create the broadcast-ready multidimensional indexing.
389 |     # Note the user could do this themselves, so this is
390 |     # very much just a convenience.
391 |     size_in = int(np.max(group_idx)) + 1 if size is None else size
392 |     group_idx_in = group_idx
393 |     group_idx = []
394 |     size = []
395 |     for ii, s in enumerate(a.shape):
396 |         if method == "ravel":
397 |             ii_idx = group_idx_in if ii == axis else np.arange(s)
398 |             ii_shape = [1] * ndim_a
399 |             ii_shape[ii] = s
400 |             group_idx.append(ii_idx.reshape(ii_shape))
401 |         size.append(size_in if ii == axis else s)
402 |     # Use the indexing, and return. It's a bit simpler than
403 |     # trying to keep all the logic below happy.
404 |     if method == "ravel":
405 |         group_idx = np.ravel_multi_index(group_idx, size, order=order, mode="raise")
406 |     elif method == "offset":
407 |         group_idx = offset_labels(group_idx_in, a.shape, axis, order, size_in)
408 |     return group_idx, size
409 | 
410 | 
411 | def offset_labels(group_idx, inshape, axis, order, size):
412 |     """
413 |     Offset group labels by dimension. This is used when we reduce over a subset of the dimensions of
414 |     ``group_idx``. It assumes that the reduction dimensions have been flattened into the last dimension.
415 |     Copied from
416 |     https://stackoverflow.com/questions/46256279/bin-elements-per-row-vectorized-2d-bincount-for-numpy
417 |     """
418 | 
419 |     newaxes = tuple(ax for ax in range(len(inshape)) if ax != axis)
420 |     group_idx = np.broadcast_to(np.expand_dims(group_idx, newaxes), inshape)
421 |     if axis not in (-1, len(inshape) - 1):
422 |         group_idx = np.moveaxis(group_idx, axis, -1)
423 |     newshape = group_idx.shape[:-1] + (-1,)
424 | 
425 |     group_idx = (
426 |         group_idx
427 |         + np.arange(np.prod(newshape[:-1]), dtype=int).reshape(newshape) * size
428 |     )
429 |     if axis not in (-1, len(inshape) - 1):
430 |         return np.moveaxis(group_idx, -1, axis)
431 |     else:
432 |         return group_idx
433 | 
434 | 
435 | def input_validation(
436 |     group_idx,
437 |     a,
438 |     size=None,
439 |     order="C",
440 |     axis=None,
441 |     ravel_group_idx=True,
442 |     check_bounds=True,
443 |     func=None,
444 | ):
445 |     """
446 |     Do some fairly extensive checking of group_idx and a, trying to give the user as much help as
447 |     possible with what is wrong. Also, convert ndim-indexing to 1d indexing.
448 |     """
449 |     if not isinstance(a, (int, float, complex)) and not is_duck_array(a):
450 |         a = np.asanyarray(a)
451 |     if not is_duck_array(group_idx):
452 |         group_idx = np.asanyarray(group_idx)
453 | 
454 |     if not np.issubdtype(group_idx.dtype, np.integer):
455 |         raise TypeError("group_idx must be of integer type")
456 | 
457 |     # This check works for multidimensional indexing as well
458 |     if check_bounds and np.any(group_idx < 0):
459 |         raise ValueError("negative indices not supported")
460 | 
461 |     ndim_idx = np.ndim(group_idx)
462 |     ndim_a = np.ndim(a)
463 | 
464 |     # Deal with the axis arg: if present, then turn 1d indexing into
465 |     # multi-dimensional indexing along the specified axis.
466 |     if axis is None:
467 |         if ndim_a > 1:
468 |             raise ValueError(
469 |                 "a must be scalar or 1 dimensional, use .ravel to flatten. Alternatively specify axis."
470 |             )
471 |     elif axis >= ndim_a or axis < -ndim_a:
472 |         raise ValueError("axis arg too large for np.ndim(a)")
473 |     else:
474 |         axis = axis if axis >= 0 else ndim_a + axis  # negative indexing
475 |         if ndim_idx > 1:
476 |             # TODO: we could support a sequence of axis values for multiple
477 |             # dimensions of group_idx.
478 |             raise NotImplementedError(
479 |                 "only 1d indexing currently supported with axis arg."
480 |             )
481 |         elif a.shape[axis] != len(group_idx):
482 |             raise ValueError("a.shape[axis] doesn't match length of group_idx.")
483 |         elif size is not None and not np.isscalar(size):
484 |             raise NotImplementedError(
485 |                 "when using axis arg, size must be None or scalar."
486 |             )
487 |     else:
488 |         is_form_3 = group_idx.ndim == 1 and a.ndim > 1 and axis is not None
489 |         orig_shape = a.shape if is_form_3 else group_idx.shape
490 |         if isinstance(func, str) and "arg" in func:
491 |             unravel_shape = orig_shape
492 |         else:
493 |             unravel_shape = None
494 | 
495 |         method = "offset" if axis == ndim_a - 1 else "ravel"
496 |         group_idx, size = _ravel_group_idx(
497 |             group_idx, a, axis, size, order, method=method
498 |         )
499 |         flat_size = np.prod(size)
500 |         ndim_idx = ndim_a
501 |         size = (
502 |             orig_shape
503 |             if is_form_3 and not callable(func) and "cum" in func
504 |             else size
505 |         )
506 |         return (
507 |             group_idx.ravel(),
508 |             a.ravel(),
509 |             flat_size,
510 |             ndim_idx,
511 |             size,
512 |             unravel_shape,
513 |         )
514 | 
515 |     if ndim_idx == 1:
516 |         if size is None:
517 |             size = int(np.max(group_idx)) + 1
518 |         else:
519 |             if not np.isscalar(size):
520 |                 raise ValueError("output size must be scalar or None")
521 |             if check_bounds and np.any(group_idx > size - 1):
522 |                 raise ValueError(f"one or more indices are too large for size {size}")
523 |         flat_size = size
524 |     else:
525 |         if size is None:
526 |             size = np.max(group_idx, axis=1).astype(int) + 1
527 |         elif np.isscalar(size):
528 |             raise ValueError(f"output size must be of length {len(group_idx)}")
529 |         elif len(size) != len(group_idx):
530 |             raise ValueError(
531 |                 f"{len(size)} sizes given, but {len(group_idx)} output dimensions specified in index"
532 |             )
533 |         if ravel_group_idx:
534 |             group_idx = np.ravel_multi_index(group_idx, size, order=order, mode="raise")
535 |         flat_size = np.prod(size)
536 | 
537 |     if not (np.ndim(a) == 0 or len(a) == group_idx.size):
538 |         raise ValueError(
539 |             "group_idx and a must be of the same length, or a can be scalar"
540 |         )
541 | 
542 |     return group_idx, a, flat_size, ndim_idx, size, None
543 | 
544 | 
545 | # General tools
546 | 
547 | 
548 | def unpack(group_idx, ret):
549 |     """
550 |     Take an aggregate packed array and uncompress it to the size of group_idx. This is equivalent to
551 |     ret[group_idx].
552 |     """
553 |     return ret[group_idx]
554 | 
555 | 
556 | def allnan(x):
557 |     return np.all(np.isnan(x))
558 | 
559 | 
560 | def anynan(x):
561 |     return np.any(np.isnan(x))
562 | 
563 | 
564 | def nanfirst(x):
565 |     return x[~np.isnan(x)][0]
566 | 
567 | 
568 | def nanlast(x):
569 |     return x[~np.isnan(x)][-1]
570 | 
571 | 
572 | def multi_arange(n):
573 |     """By example:
574 | 
575 |     #      0  1  2  3  4  5  6  7  8
576 |     n =   [0, 0, 3, 0, 0, 2, 0, 2, 1]
577 |     res = [0, 1, 2, 0, 1, 0, 1, 0]
578 | 
579 |     That is, it is equivalent to something like this:
580 | 
581 |         hstack((arange(n_i) for n_i in n))
582 | 
583 |     This version seems quite a bit faster, at least for some possible inputs, and at any rate it
584 |     encapsulates a task in a function.
585 |     """
586 |     if n.ndim != 1:
587 |         raise ValueError("n is supposed to be 1d array.")
588 | 
589 |     n_mask = n.astype(bool)
590 |     n_cumsum = np.cumsum(n)
591 |     ret = np.ones(n_cumsum[-1] + 1, dtype=int)
592 |     ret[n_cumsum[n_mask]] -= n[n_mask]
593 |     ret[0] -= 1
594 |     return np.cumsum(ret)[:-1]
595 | 
596 | 
597 | def label_contiguous_1d(X):
598 |     """
599 |     WARNING: the API for this function is liable to change!!!
600 | 
601 |     By example:
602 | 
603 |         X =      [F T T F F T F F F T T T]
604 |         result = [0 1 1 0 0 2 0 0 0 3 3 3]
605 | 
606 |     Or:
607 |         X =      [0 3 3 0 0 5 5 5 1 1 0 2]
608 |         result = [0 1 1 0 0 2 2 2 3 3 0 4]
609 | 
610 |     The ``0`` or ``False`` elements of ``X`` are labeled as ``0`` in the output. If ``X`` is a boolean
611 |     array, each contiguous block of ``True`` is given an integer label; if ``X`` is not boolean, then
612 |     each contiguous block of identical values is given an integer label. Integer labels are 1, 2, 3,
613 |     ... (i.e. they start at 1 and increase by 1 for each block, with no skipped numbers).
614 |     """
615 | 
616 |     if X.ndim != 1:
617 |         raise ValueError("this is for 1d masks only.")
618 | 
619 |     is_start = np.empty(len(X), dtype=bool)
620 |     is_start[0] = X[0]  # True if X[0] is True or non-zero
621 | 
622 |     if X.dtype.kind == "b":
623 |         is_start[1:] = ~X[:-1] & X[1:]
624 |         M = X
625 |     else:
626 |         M = X.astype(bool)
627 |         is_start[1:] = X[:-1] != X[1:]
628 |     is_start[~M] = False
629 | 
630 |     L = np.cumsum(is_start)
631 |     L[~M] = 0
632 |     return L
633 | 
634 | 
635 | def relabel_groups_unique(group_idx):
636 |     """
637 |     See also ``relabel_groups_masked``.
638 | 
639 |     group_idx: [0 3 3 3 0 2 5 2 0 1 1 0 3 5 5]
640 |     ret:       [0 3 3 3 0 2 4 2 0 1 1 0 3 4 4]
641 | 
642 |     Description of above: unique groups in the input were ``1, 2, 3, 5``, i.e.
643 |     ``4`` was missing, so group 5 was relabeled to be ``4``.
644 |     Relabeling maintains order, just "compressing" the higher numbers
645 |     to fill gaps.
646 |     """
647 | 
648 |     keep_group = np.zeros(np.max(group_idx) + 1, dtype=bool)
649 |     keep_group[0] = True
650 |     keep_group[group_idx] = True
651 |     return relabel_groups_masked(group_idx, keep_group)
652 | 
653 | 
654 | def relabel_groups_masked(group_idx, keep_group):
655 |     """
656 |     group_idx:  [0 3 3 3 0 2 5 2 0 1 1 0 3 5 5]
657 | 
658 |                  0 1 2 3 4 5
659 |     keep_group: [0 1 0 1 1 1]
660 | 
661 |     ret:        [0 2 2 2 0 0 4 0 0 1 1 0 2 4 4]
662 | 
663 |     Description of above in words: remove group 2, and relabel groups 3, 4, and 5 to be 2, 3 and 4
664 |     respectively, in order to fill the gap. Note that group 4 was never used in the input group_idx,
665 |     but the user-supplied mask said to keep group 4, so group 5 is only moved up by one place to fill
666 |     the gap created by removing group 2.
667 | 
668 |     That is, the mask describes which groups to remove; the remaining groups are relabeled to remove
669 |     the gaps created by the falsy elements in ``keep_group``. Note that ``keep_group[0]`` has no
670 |     particular meaning because it refers to the zero group which cannot be "removed".
671 | 
672 |     ``keep_group`` should be bool and ``group_idx`` int. Values in ``group_idx`` can be in any order.
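
    A doctest-style sketch of exactly the arrays above (added for
    illustration):

        >>> group_idx = np.array([0, 3, 3, 3, 0, 2, 5, 2, 0, 1, 1, 0, 3, 5, 5])
        >>> keep_group = np.array([0, 1, 0, 1, 1, 1], dtype=bool)
        >>> relabel_groups_masked(group_idx, keep_group)
        array([0, 2, 2, 2, 0, 0, 4, 0, 0, 1, 1, 0, 2, 4, 4])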
673 | """ 674 | 675 | keep_group = keep_group.astype(bool, copy=not keep_group[0]) 676 | if not keep_group[0]: # ensuring keep_group[0] is True makes life easier 677 | keep_group[0] = True 678 | 679 | relabel = np.zeros(keep_group.size, dtype=group_idx.dtype) 680 | relabel[keep_group] = np.arange(np.count_nonzero(keep_group)) 681 | return relabel[group_idx] 682 | 683 | 684 | def is_duck_array(value): 685 | """This function was copied from xarray/core/utils.py under the terms of Xarray's Apache-2 license.""" 686 | 687 | if isinstance(value, np.ndarray): 688 | return True 689 | return ( 690 | hasattr(value, "ndim") 691 | and hasattr(value, "shape") 692 | and hasattr(value, "dtype") 693 | and hasattr(value, "__array_function__") 694 | and hasattr(value, "__array_ufunc__") 695 | ) 696 | 697 | 698 | def iscomplexobj(x): 699 | """Copied from np.iscomplexobj so that we place fewer requirements on duck array types.""" 700 | 701 | try: 702 | dtype = x.dtype 703 | type_ = dtype.type 704 | except AttributeError: 705 | type_ = np.asarray(x).dtype.type 706 | return issubclass(type_, np.complexfloating) 707 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "numpy-groupies" 7 | description = "Optimised tools for group-indexing operations: aggregated sum and more." 8 | dynamic = ["version"] 9 | readme = {file = "README.md", content-type = "text/markdown"} 10 | license = {file = "LICENSE.txt"} 11 | authors = [ 12 | {name = "Michael Löffler", email = "ml@occam.com.ua"}, 13 | {name = "Daniel Manson", email = "danielmanson.uk@gmail.com"} 14 | ] 15 | maintainers = [ 16 | {name = "Deepak Cherian", email = "dcherian@ucar.edu"} 17 | ] 18 | classifiers = [ 19 | "Development Status :: 4 - Beta", 20 | "Intended Audience :: Science/Research", 21 | "Intended Audience :: Developers", 22 | "Operating System :: OS Independent", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Scientific/Engineering", 29 | "Topic :: Software Development :: Libraries", 30 | "License :: OSI Approved :: BSD License", 31 | ] 32 | keywords = ["accumarray", "aggregate", "groupby", "grouping", "indexing"] 33 | requires-python = ">=3.9" 34 | dependencies = ["numpy"] 35 | 36 | [project.optional-dependencies] 37 | fast = [ 38 | "numba", 39 | ] 40 | dev = [ 41 | "pytest", 42 | "numba", 43 | "pandas", 44 | ] 45 | 46 | [project.urls] 47 | source = "https://github.com/ml31415/numpy-groupies" 48 | tracker = "https://github.com/ml31415/numpy-groupies/issues" 49 | 50 | [tool.black] 51 | line-length = 120 52 | 53 | [tool.isort] 54 | profile = "black" 55 | honor_noqa = true 56 | 57 | [tool.setuptools.packages.find] 58 | include = ["numpy_groupies*"] 59 | 60 | [tool.setuptools_scm] 61 | write_to = "numpy_groupies/_version.py" 62 | 63 | [tool.ruff.lint.per-file-ignores] 64 | "__init__.py" = ["F401"] 65 | 66 | [tool.codespell] 67 | ignore-words-list = "nd," 68 | ignore-regex = ".*codespell-ignore$" 69 | --------------------------------------------------------------------------------