├── npstreams
│   ├── tests
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── test_data1.npy
│   │   │   ├── test_data2.npy
│   │   │   └── test_data3.npy
│   │   ├── test_array_utils.py
│   │   ├── test_stacking.py
│   │   ├── test_array_stream.py
│   │   ├── test_linalg.py
│   │   ├── test_parallel.py
│   │   ├── test_flow.py
│   │   ├── test_cuda.py
│   │   ├── test_iter_utils.py
│   │   ├── test_reduce.py
│   │   ├── test_numerics.py
│   │   └── test_stats.py
│   ├── __init__.py
│   ├── stacking.py
│   ├── array_utils.py
│   ├── array_stream.py
│   ├── flow.py
│   ├── linalg.py
│   ├── parallel.py
│   ├── iter_utils.py
│   ├── cuda.py
│   ├── benchmarks.py
│   ├── numerics.py
│   ├── reduce.py
│   └── stats.py
├── docs
│   ├── whatsnew.rst
│   ├── references.txt
│   ├── recipes.rst
│   ├── control_flow.rst
│   ├── cuda.rst
│   ├── installation.rst
│   ├── conventions.rst
│   ├── api.rst
│   ├── index.rst
│   ├── conf.py
│   └── making_your_own.rst
├── MANIFEST.in
├── RELEASE-CHECKLIST.rst
├── .readthedocs.yml
├── .gitattributes
├── release-description.py
├── LICENSE.txt
├── CHANGELOG.rst
├── .gitignore
├── pyproject.toml
├── .github
│   └── workflows
│       └── ci.yml
└── README.md
/npstreams/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/whatsnew.rst:
--------------------------------------------------------------------------------
1 | What's new
2 | ==========
3 |
4 | .. include:: ../CHANGELOG.rst
--------------------------------------------------------------------------------
/npstreams/tests/data/test_data1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data1.npy
--------------------------------------------------------------------------------
/npstreams/tests/data/test_data2.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data2.npy
--------------------------------------------------------------------------------
/npstreams/tests/data/test_data3.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data3.npy
--------------------------------------------------------------------------------
/docs/references.txt:
--------------------------------------------------------------------------------
1 | .. _Numpy: http://www.numpy.org
2 | .. _Scipy: https://www.scipy.org
3 | .. _PyCUDA: https://documen.tician.de/pycuda/
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CHANGELOG.rst
2 | include LICENSE.txt
3 | include README.md
4 |
5 | recursive-include npstreams/tests/data *
6 |
7 | recursive-exclude docs *
8 |
9 | global-exclude *.py[cod] __pycache__ *.so *.dylib
--------------------------------------------------------------------------------
/RELEASE-CHECKLIST.rst:
--------------------------------------------------------------------------------
1 | Release checklist
2 | -----------------
3 |
4 | To create a release, simply create a tag that starts with 'v' (e.g. 'v2.0.0')::
5 |
6 | git tag -a "v2.0.0"
7 | git push origin "v2.0.0"
8 |
9 | The package will be automatically tested, released on GitHub and uploaded to PyPI.
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | # Required
5 | version: 2
6 |
7 | sphinx:
8 | configuration: docs/conf.py
9 |
10 | build:
11 | os: ubuntu-22.04
12 | tools:
13 | python: "3.10"
14 |
15 | python:
16 | install:
17 | - method: pip
18 | path: .
19 | extra_requirements:
20 | - development
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/release-description.py:
--------------------------------------------------------------------------------
1 | """
2 | Extract the changes from last release
3 | """
4 |
5 | import sys
6 |
7 | if __name__ == "__main__":
8 | filename = sys.argv[1]
9 |
10 | with open(filename, mode="r") as f:
11 |
12 |         # Look for the first release title (a second-level title, e.g. "Release 1.7.0")
13 |         for line in f:
14 |             if line.startswith("Release"):
15 |                 break
16 |         else:
17 |             # No release title found at all: exit with error
18 |             sys.exit(-1)
19 | 
20 |         print(line, end="")
21 |         for line in f:
22 |             if not line.startswith("Release"):
23 |                 print(line, end="")
24 |             else:
25 |                 # Reached the next release title: exit gracefully
26 |                 sys.exit(0)
27 |     # There was a problem (e.g. only one release section): exit with error
28 |     sys.exit(-1)
29 | 
--------------------------------------------------------------------------------
/npstreams/tests/test_array_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from npstreams import nan_to_num
3 |
4 |
5 | def test_nan_to_num_generic():
6 | """Test that NaNs are replaced with a fill value"""
7 | with np.errstate(divide="ignore", invalid="ignore"):
8 | vals = nan_to_num(np.array([0]) / 0.0, fill_value=14)
9 | assert vals[0] == 14
10 |
11 |
12 | def test_nan_to_num_integer():
13 | """Test that nan_to_num on integers does nothing"""
14 | vals = nan_to_num(1)
15 | assert vals == 1
16 | vals = nan_to_num([1])
17 | assert np.allclose(vals, np.array([1]))
18 |
19 |
20 | def test_nan_to_num_complex_good():
21 | """Test nan_to_num on complex input"""
22 | vals = nan_to_num(1 + 1j)
23 | assert vals == 1 + 1j
24 |
--------------------------------------------------------------------------------
/npstreams/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = "Laurent P. René de Cotret"
3 | __email__ = "laurent.decotret@outlook.com"
4 | __license__ = "BSD"
5 | __version__ = "1.7.0"
6 |
7 | from .benchmarks import benchmark
8 | from .array_stream import array_stream, ArrayStream
9 | from .array_utils import nan_to_num
10 | from .linalg import idot, itensordot, ieinsum, iinner
11 | from .parallel import pmap, pmap_unordered, preduce
12 | from .flow import ipipe, iload, pload
13 | from .iter_utils import (
14 | cyclic,
15 | last,
16 | chunked,
17 | multilinspace,
18 | linspace,
19 | peek,
20 | itercopy,
21 | primed,
22 | length_hint,
23 | )
24 | from .reduce import ireduce_ufunc, preduce_ufunc, reduce_ufunc
25 | from .stacking import stack
26 | from .stats import (
27 | iaverage,
28 | average,
29 | imean,
30 | mean,
31 | istd,
32 | std,
33 | ivar,
34 | var,
35 | isem,
36 | sem,
37 | average_and_var,
38 | ihistogram,
39 | )
40 | from .numerics import isum, sum, iprod, prod, isub, iall, iany, imax, imin
41 |
--------------------------------------------------------------------------------
/npstreams/tests/test_stacking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 |
5 | from npstreams import stack
6 | import pytest
7 |
8 |
9 | def test_stack_against_numpy_stack():
10 | """Test against numpy.stack for axis = -1 and"""
11 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)]
12 |
13 | dense = np.stack(stream, axis=-1)
14 | from_stack = stack(stream, axis=-1)
15 | assert np.allclose(dense, from_stack)
16 |
17 |
18 | def test_stack_on_single_array():
19 | """Test that npstreams.stack works with a single array"""
20 | arr = np.random.random((16, 16))
21 | stacked = stack(arr)
22 | assert np.allclose(arr[..., np.newaxis], stacked)
23 |
24 |
25 | @pytest.mark.parametrize("axis", range(4))
26 | def test_stack_against_numpy_concatenate(axis):
27 | """Test against numpy.concatenate for existing axes"""
28 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)]
29 |
30 | dense = np.concatenate(stream, axis=axis)
31 | from_stack = stack(stream, axis=axis)
32 | assert np.allclose(dense, from_stack)
33 |
--------------------------------------------------------------------------------
/docs/recipes.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _recipes:
4 |
5 | *******
6 | Recipes
7 | *******
8 |
9 | Single-pass mean and error calculation
10 | --------------------------------------
11 |
12 | Here is a snippet of a function that computes a mean
13 | and standard error in the mean (SEM) in a single pass::
14 |
15 | from npstreams import imean, isem, array_stream, itercopy
16 |
17 | # The `array_stream` decorator ensures that the elements of
18 | # the iterable `arrays` will be converted to ndarrays if possible
19 | # This decorator is not required.
20 | @array_stream
21 | def mean_and_error(arrays, axis = -1):
22 | """ Yields (mean, error) pairs from a stream of arrays """
23 | # itercopy creates a copy of the original stream
24 | # The elements are only generated once, and then fed
25 | # to those two copies; much more efficient than
26 | # creating two streams from scratch.
27 | arrays_for_mean, arrays_for_sem = itercopy(arrays)
28 |
29 | means = imean(arrays_for_mean, axis = axis)
30 | errors = isem(arrays_for_sem, axis = axis)
31 |
32 | yield from zip(means, errors)
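33 | 
34 | A usage sketch, assuming a list of random arrays as the stream::
35 | 
36 |     import numpy as np
37 |     from npstreams import last
38 | 
39 |     arrays = [np.random.random((16, 16)) for _ in range(50)]
40 |     final_mean, final_error = last(mean_and_error(arrays))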
--------------------------------------------------------------------------------
/npstreams/stacking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Stacking arrays from a stream
4 | -----------------------------
5 | """
6 | from collections.abc import Sized
7 | from functools import partial
8 |
9 | import numpy as np
10 |
11 | from .array_stream import array_stream
12 |
13 |
14 | @array_stream
15 | def stack(arrays, axis=-1):
16 | """
17 |     Stack all arrays from a stream. This is a generalization of numpy.stack
18 |     and numpy.concatenate.
19 |
20 | Parameters
21 | ----------
22 | arrays : iterable
23 | Stream of NumPy arrays. Arrays must have shapes that broadcast together.
24 | axis : int, optional
25 | Stacking direction. If ``axis = -1``, arrays are stacked along a
26 | new dimension.
27 |
28 | Returns
29 | -------
30 | stacked : ndarray
31 | Cumulative stacked array.
32 | """
33 |     # Shortcut: if axis == -1, this is exactly what ArrayStream.__array__ does
34 | if axis == -1:
35 | return np.array(arrays)
36 |
37 | # TODO: Shortcut if we already know the stream length
38 | # Note : we are guaranteed that `arrays` is a stream of arrays
39 | # at worst a tuple (arr,)
40 | # Use npstreams.length_hint
41 | arrays = iter(arrays)
42 | first = next(arrays)
43 | stack = np.array(first, copy=True)
44 |
45 | for array in arrays:
46 | stack = np.concatenate([stack, array], axis=axis)
47 |
48 | return stack
49 |
--------------------------------------------------------------------------------
/docs/control_flow.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _control_flow:
4 |
5 | ************
6 | Control Flow
7 | ************
8 |
9 | .. currentmodule:: npstreams
10 |
11 | =========================
12 | Streaming array pipelines
13 | =========================
14 |
15 | Before reducing your stream of arrays (e.g. averaging them together), you may want to
16 | transform them. This can be done with the :func:`ipipe` function:
17 |
18 | .. autofunction:: ipipe
19 | :noindex:
20 |
21 | Imagine we have the following pipeline, in which we want to process images in some iterable :data:`arrays`
22 | as follows:
23 |
24 | * Remove negative pixel intensity values;
25 | * Adjust the gamma value of images (from Scikit-image's :mod:`exposure` module);
26 | * Average the result together.
27 |
28 | The following lines will do the trick::
29 |
30 | from functools import partial
31 | from npstreams import ipipe, iaverage, last
32 | from skimage.exposure import adjust_gamma
33 |
34 | def remove_negative(arr):
35 | arr[arr < 0] = 0
36 | return arr
37 |
38 | pipeline = ipipe(adjust_gamma, remove_negative, arrays)
39 | avgs = last(iaverage(pipeline))
40 |
41 | If the pipeline is computationally intensive, we can also pipe arrays in parallel using the
42 | keyword-only ``processes``::
43 |
44 | pipeline = ipipe(adjust_gamma, remove_negative, arrays, processes = 4) # 4 cores will be used
45 | avgs = last(iaverage(pipeline))
46 |
47 | Since :func:`ipipe` uses :func:`pmap` under the hood, we can also use all available cores
48 | by passing ``processes = None``.
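49 | 
50 | For example::
51 | 
52 |     pipeline = ipipe(adjust_gamma, remove_negative, arrays, processes = None)  # all available cores
53 |     avgs = last(iaverage(pipeline))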
--------------------------------------------------------------------------------
/docs/cuda.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _cuda:
4 |
5 | ============
6 | CUDA support
7 | ============
8 |
9 | .. currentmodule:: npstreams
10 |
11 | What is CUDA
12 | ============
13 |
14 | `CUDA <https://developer.nvidia.com/cuda-zone>`_ is a computing platform taking advantage of Nvidia hardware.
15 | It effectively allows for array computations on Graphical Processing Units (GPU).
16 |
17 | :mod:`npstreams` relies on the (optional) `PyCUDA`_ library
18 | to access CUDA functionality.
19 |
20 | Advantages of CUDA
21 | ------------------
22 |
23 | TODO: benchmarks
24 |
25 | CUDA in npstreams
26 | =================
27 |
28 | `PyCUDA`_ is an optional dependency. Therefore, the CUDA-enabled functions are located in a separate
29 | module, the :mod:`npstreams.cuda` submodule.
30 |
31 | Importing from :mod:`npstreams.cuda` submodule
32 | ----------------------------------------------
33 |
34 | Importing anything from the :mod:`npstreams.cuda` submodule will raise an ``ImportError`` in the following cases:
35 |
36 | * `PyCUDA`_ is not installed;
37 | * No GPUs are available;
38 | * CUDA compilation backend is not available, possibly due to incomplete installation.
39 |
40 | With this in mind, it is wise to wrap import statements from :mod:`npstreams.cuda` in a ``try/except`` block.
41 |
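42 | For example, following the same guarded-import pattern used by the package's own test suite::
43 | 
44 |     try:
45 |         from npstreams.cuda import csum
46 |         WITH_CUDA = True
47 |     except ImportError:
48 |         csum = None
49 |         WITH_CUDA = False
50 | 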
42 | CUDA-enabled routines
43 | ---------------------
44 |
45 | A limited set of functions implemented in npstreams also have CUDA-enabled equivalents. For performance reasons,
46 | all CUDA-enabled routines operate along the 'stream' axis, i.e. as if the arrays had been stacked
47 | along a new dimension.
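48 | 
49 | As a sketch (assuming PyCUDA and a compatible GPU are available), summing a stream with
50 | :func:`npstreams.cuda.csum` returns an array with the same shape as the stream elements::
51 | 
52 |     import numpy as np
53 |     from npstreams.cuda import csum
54 | 
55 |     stream = [np.random.random((16, 16)) for _ in range(10)]
56 |     total = csum(stream)  # sum along the 'stream' axis
57 |     assert total.shape == (16, 16)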
--------------------------------------------------------------------------------
/npstreams/tests/test_array_stream.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 | from npstreams.array_stream import array_stream, ArrayStream
5 |
6 |
7 | @array_stream
8 | def iden(arrays):
9 | yield from arrays
10 |
11 |
12 | def test_array_stream_decorator_type():
13 | """Test that all object from an array stream are ndarrays"""
14 |
15 | stream = [0, 1, np.array([1])]
16 | for arr in iden(stream):
17 | assert isinstance(arr, np.ndarray)
18 |
19 |
20 | def test_single_array():
21 | """Test that a 'stream' consisting of a single array is repackaged into an iterable"""
22 | stream = np.array([1, 2, 3])
23 | assert len(list(iden(stream))) == 1
24 |
25 |
26 | def test_array_stream_length_hint_sized_iterable():
27 | """Test the accuracy of __length_hint__ for ArrayStream constructed
28 | from a sized iterable"""
29 | iterable = [1, 2, 3, 4, 5]
30 | a = ArrayStream(iterable)
31 | assert len(iterable) == a.__length_hint__()
32 |
33 |
34 | def test_array_stream_length_hint_not_sized_iterable():
35 | """Test that __length_hint__ returns NotImplemented for ArrayStream constructed
36 | from an unsized iterable"""
37 | iterable = (0 for _ in range(10))
38 | a = ArrayStream(iterable)
39 | assert a.__length_hint__() is NotImplemented
40 |
41 |
42 | def test_array_stream_conversion_to_array():
43 | """Test that numpy.array(Arraystream(...)) returns an array built as a stack of arrays"""
44 | a = ArrayStream([np.random.random((16, 16)) for _ in range(10)])
45 | arr = np.array(a)
46 | assert arr.shape == (16, 16, 10)
47 |
--------------------------------------------------------------------------------
/npstreams/array_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Array utilities
4 | ---------------
5 | """
6 | import numpy as np
7 |
8 |
9 | def nan_to_num(array, fill_value=0.0, copy=True):
10 | """
11 | Replace NaNs with another fill value.
12 |
13 | Parameters
14 | ----------
15 | array : array_like
16 | Input data.
17 | fill_value : float, optional
18 | NaNs will be replaced by ``fill_value``. Default is 0.0, in keeping
19 | with ``numpy.nan_to_num``.
20 | copy : bool, optional
21 | Whether to create a copy of `array` (True) or to replace values
22 | in-place (False). The in-place operation only occurs if
23 | casting to an array does not require a copy.
24 |
25 | Returns
26 | -------
27 | out : ndarray
28 |         Array without NaNs. If ``array`` was not of a floating or complex type,
29 | ``array`` is returned unchanged.
30 |
31 | Notes
32 | -----
33 |     Contrary to ``numpy.nan_to_num``, this function does not handle
34 | infinite values.
35 |
36 | See Also
37 | --------
38 | numpy.nan_to_num : replace NaNs and Infs with zeroes.
39 | """
40 | array = np.array(array, subok=True, copy=copy)
41 | dtype = array.dtype.type
42 |
43 | # Non-inexact types do not have NaNs
44 | if not np.issubdtype(dtype, np.inexact):
45 | return array
46 |
47 | iscomplex = np.issubdtype(dtype, np.complexfloating)
48 | dest = (array.real, array.imag) if iscomplex else (array,)
49 | for d in dest:
50 | np.copyto(d, fill_value, where=np.isnan(d))
51 | return array
52 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017-2020, Laurent P. René de Cotret.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are
6 | met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above
12 | copyright notice, this list of conditions and the following
13 | disclaimer in the documentation and/or other materials provided
14 | with the distribution.
15 |
16 | * Neither the name of the NumPy Developers nor the names of any
17 | contributors may be used to endorse or promote products derived
18 | from this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 |
2 | Release 1.7.0
3 | -------------
4 |
5 | * Explicit support for NumPy 2, in addition to NumPy 1.
6 |
7 | Release 1.6.6
8 | -------------
9 |
10 | * Added the ability to automatically publish to PyPI.
11 |
12 | Release 1.6.5
13 | -------------
14 |
15 | * Support for Python 3.6 and NumPy<1.17 has been dropped.
16 | * Migration of testing infrastructure to pytest.
17 | * Tests are now included in the package itself.
18 | * Fixed some deprecation warnings from NumPy 1.20+.
19 |
20 | Release 1.6.4
21 | -------------
22 |
23 | * Fixed an issue regarding a deprecation of `collections.Sized` (in favour of `collections.abc.Sized`) in Python 3.10+
24 | * Code snippets in documentation are now tested for correctness.
25 | * Tests are now included in source distributions.
26 |
27 | Release 1.6.3
28 | -------------
29 |
30 | * Added support for Python 3.9
31 |
32 | Release 1.6.2
33 | -------------
34 |
35 | * Added the ability to run default benchmarks from the command line with ``python -m npstreams.benchmarks``.
36 | * Added explicit support for Python 3.8.
37 | * Bumped requirement for `numpy >= 1.14`.
38 |
39 | Release 1.6.1
40 | -------------
41 |
42 | * Added a changelog.
43 | * Added the possibility to use weights in ``ihistogram``.
44 | * Added the function ``average_and_var`` to compute the average and variance in a single pass.
45 | * Documentation regarding the ``ddof`` keyword in many statistical functions wrongly stated that the default value was 1. This has been corrected.
46 |
47 | Release 1.6
48 | -----------
49 |
50 | * Fixed some issues with NumPy versions above 1.16.
51 |
52 | Release 1.5.2
53 | -------------
54 |
55 | * Added benchmarking capabilities.
56 | * Added the ``array_stream`` decorator.
57 | * Removed support for Python < 3.6.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Visual studio cache
10 | *.vs/
11 | *.vscode/
12 |
13 | # autogenerated documentation
14 | docs/source/functions/
15 | docs/source/classes/
16 |
17 | # Jupyter notebooks
18 | notebooks/
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 |
35 | *.egg-info/
36 | .installed.cfg
37 | *.egg
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *,cover
58 | .hypothesis/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | # These directories are autogenerated
77 | docs/_build/
78 | docs/functions/
79 | docs/classes/
80 |
81 | # PyBuilder
82 | target/
83 |
84 | # IPython Notebook
85 | .ipynb_checkpoints
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # celery beat schedule file
91 | celerybeat-schedule
92 |
93 | # dotenv
94 | .env
95 |
96 | # virtualenv
97 | venv/
98 | ENV/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # PyCharm
107 | .idea/
--------------------------------------------------------------------------------
/npstreams/tests/test_linalg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from random import randint, random
4 |
5 | import numpy as np
6 |
7 | from npstreams import idot, itensordot, iinner, ieinsum, last
8 | import pytest
9 |
10 |
11 | def test_idot_against_numpy_multidot():
12 | """Test against numpy.linalg.multi_dot in 2D case"""
13 | stream = [np.random.random((8, 8)) for _ in range(7)]
14 |
15 | from_numpy = np.linalg.multi_dot(stream)
16 | from_stream = last(idot(stream))
17 |
18 | assert from_numpy.shape == from_stream.shape
19 | assert np.allclose(from_numpy, from_stream)
20 |
21 |
22 | @pytest.mark.parametrize("axis", (0, 1, 2))
23 | def test_itensordot_against_numpy_tensordot(axis):
24 | """Test against numpy.tensordot in 2D case"""
25 | stream = tuple(np.random.random((8, 8)) for _ in range(2))
26 |
27 | from_numpy = np.tensordot(*stream)
28 | from_stream = last(itensordot(stream))
29 |
30 | assert from_numpy.shape == from_stream.shape
31 | assert np.allclose(from_numpy, from_stream)
32 |
33 |
34 | @pytest.mark.parametrize("axis", (0, 1, 2))
35 | def test_iinner_against_numpy_inner(axis):
36 | """Test against numpy.tensordot in 2D case"""
37 | stream = tuple(np.random.random((8, 8)) for _ in range(2))
38 |
39 | from_numpy = np.inner(*stream)
40 | from_stream = last(iinner(stream))
41 |
42 | assert from_numpy.shape == from_stream.shape
43 | assert np.allclose(from_numpy, from_stream)
44 |
45 |
46 | def test_ieinsum_against_numpy_einsum():
47 | """Test against numpy.einsum"""
48 | a = np.arange(60.0).reshape(3, 4, 5)
49 | b = np.arange(24.0).reshape(4, 3, 2)
50 | stream = [a, b]
51 |
52 | from_numpy = np.einsum("ijk,jil->kl", a, b)
53 | from_stream = last(ieinsum(stream, "ijk,jil->kl"))
54 |
55 | assert from_numpy.shape == from_stream.shape
56 | assert np.allclose(from_numpy, from_stream)
57 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.setuptools.dynamic]
6 | version = {attr = "npstreams.__version__"}
7 |
8 | [project]
9 | name = "npstreams"
10 | dynamic = ["version"]
11 | authors = [
12 | { name="Laurent P. René de Cotret", email="laurent.decotret@outlook.com" },
13 | ]
14 | maintainers = [
15 | { name="Laurent P. René de Cotret", email="laurent.decotret@outlook.com" },
16 | ]
17 | description = "Streaming operations on NumPy arrays"
18 | readme = "README.md"
19 | license = {file = "LICENSE.txt"}
20 | requires-python = ">=3.7, <4"
21 | dependencies = ["numpy >= 1.17, <3"]
22 | keywords=["streaming", "numpy", "math"]
23 | classifiers = [
24 | "Environment :: Console",
25 | "Intended Audience :: Science/Research",
26 | "Topic :: Scientific/Engineering",
27 | "License :: OSI Approved :: BSD License",
28 | "Natural Language :: English",
29 | "Operating System :: OS Independent",
30 | "Programming Language :: Python",
31 | "Programming Language :: Python :: 3",
32 | ]
33 |
34 | [project.optional-dependencies]
35 | development = [
36 | "Sphinx >= 3",
37 | "sphinx_rtd_theme >= 0.4",
38 | "pytest >= 6",
39 | "scipy >= 1",
40 | ]
41 |
42 | [project.urls]
43 | Documentation = "https://npstreams.readthedocs.io/"
44 | Repository = "https://github.com/LaurentRDC/npstreams"
45 | "Bug Tracker" = "https://github.com/LaurentRDC/npstreams/issues"
46 |
47 | [tool.black]
48 | line-length = 120
49 | include = '\.pyi?$'
50 |
51 | [tool.isort]
52 | profile = "black"
53 |
54 | [tool.pytest.ini_options]
55 | minversion = "6.0"
56 | log_cli_level = "INFO"
57 | addopts = ["--doctest-modules"]
58 | testpaths = ["npstreams/tests"]
59 |
60 | # See here for an explanation of how to include package data:
61 | # https://setuptools.pypa.io/en/latest/userguide/datafiles.html#package-data
62 | [tool.setuptools.package-data]
63 | npstreams = ["tests/data/*.npy"]
64 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _installation:
4 |
5 | ************
6 | Installation
7 | ************
8 |
9 | Requirements
10 | ============
11 |
12 | **npstreams** works on Linux, Mac OS X and Windows. It requires Python 3.7+
13 | as well as `numpy`_. `scipy`_ is an optional dependency that is only used in
14 | tests; however, if SciPy cannot be imported, tests will not fail.
15 |
16 | To get access to the :mod:`npstreams.cuda` module, which contains CUDA-enabled routines,
17 | PyCUDA_ must be installed as well.
18 |
19 | Install npstreams
20 | =================
21 |
22 | npstreams is available on PyPI; it can be installed with `pip <https://pip.pypa.io>`_::
23 |
24 | python -m pip install npstreams
25 |
26 | npstreams can also be installed with the conda package manager, from the conda-forge channel::
27 |
28 | conda config --add channels conda-forge
29 | conda install npstreams
30 |
31 | You can install the latest developer version of npstreams by cloning the git
32 | repository::
33 |
34 | git clone https://github.com/LaurentRDC/npstreams.git
35 |
36 | ...then installing the package with::
37 |
38 | cd npstreams
39 | pip install .
40 |
41 |
42 | Testing
43 | =======
44 |
45 | If you want to check that all the tests are running correctly with your Python
46 | configuration, type::
47 |
48 | pip install .[development]
49 | pytest
50 |
51 |
52 | Embedding in applications
53 | =========================
54 |
55 | `npstreams` is designed to be used in conjunction with multiprocessing libraries, such as the standard
56 | `multiprocessing` library. `npstreams` even uses `multiprocessing` directly in certain functions.
57 | 
58 | In order to use the multicore functionality of `npstreams` in applications frozen with `py2exe`, `PyInstaller`, or `cx_Freeze`,
59 | you will need to call the ``multiprocessing.freeze_support()`` function. `You can read more about it
60 | here <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.freeze_support>`_.
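61 | 
62 | A minimal sketch of a frozen application's entry point::
63 | 
64 |     import multiprocessing
65 | 
66 |     if __name__ == "__main__":
67 |         multiprocessing.freeze_support()  # no-op unless running in a frozen Windows executable
68 |         ...  # rest of the application, free to use npstreams' parallel functions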
--------------------------------------------------------------------------------
/npstreams/tests/test_parallel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from npstreams import pmap, pmap_unordered, preduce
3 | from functools import reduce
4 | import numpy as np
5 | from operator import add
6 |
7 |
8 | def identity(obj, *args, **kwargs):
9 | """ignores args and kwargs"""
10 | return obj
11 |
12 |
13 | def test_preduce_preduce_one_process():
14 | """Test that preduce reduces to functools.reduce for a single process"""
15 | integers = list(range(0, 10))
16 | preduce_results = preduce(add, integers, processes=1)
17 | reduce_results = reduce(add, integers)
18 |
19 | assert preduce_results == reduce_results
20 |
21 |
22 | def test_preduce_preduce_multiple_processes():
23 | """Test that preduce reduces to functools.reduce for a single process"""
24 | integers = list(range(0, 10))
25 | preduce_results = preduce(add, integers, processes=2)
26 | reduce_results = reduce(add, integers)
27 |
28 | assert preduce_results == reduce_results
29 |
30 |
31 | def test_preduce_on_numpy_arrays():
32 | """Test sum of numpy arrays as parallel reduce"""
33 | arrays = [np.zeros((32, 32)) for _ in range(10)]
34 | s = preduce(add, arrays, processes=2)
35 |
36 | assert np.allclose(s, arrays[0])
37 |
38 |
39 | def test_preduce_with_kwargs():
40 | """Test preduce with keyword-arguments"""
41 | pass
42 |
43 |
44 | def test_pmap_trivial_map_no_args():
45 | """Test that pmap is working with no positional arguments"""
46 | integers = list(range(0, 10))
47 | result = list(pmap(identity, integers, processes=2))
48 | assert integers == result
49 |
50 |
51 | def test_pmap_trivial_map_kwargs():
52 | """Test that pmap is working with args and kwargs"""
53 | integers = list(range(0, 10))
54 | result = list(pmap(identity, integers, processes=2, kwargs={"test": True}))
55 | assert result == integers
56 |
57 |
58 | def test_pmap_unordered_trivial_map_no_args():
59 | """Test that pmap_unordered is working with no positional arguments"""
60 | integers = list(range(0, 10))
61 | result = list(sorted(pmap_unordered(identity, integers, processes=2)))
62 | assert integers == result
63 |
64 |
65 | def test_pmap_unordered_trivial_map_kwargs():
66 | """Test that pmap_unordered is working with args and kwargs"""
67 | integers = list(range(0, 10))
68 | result = list(
69 | sorted(pmap_unordered(identity, integers, processes=2, kwargs={"test": True}))
70 | )
71 | assert result == integers
72 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Continuous integration
2 |
3 | on:
4 | push:
5 | pull_request:
6 |
7 | jobs:
8 | build:
9 | # To prevent this job from running, have "[skip ci]" or "[ci skip]" in the commit message
10 | if: contains(toJson(github.event.commits), '[ci skip]') == false && contains(toJson(github.event.commits), '[skip ci]') == false
11 |
12 | runs-on: ${{ matrix.os }}
13 | strategy:
14 | fail-fast: false
15 | matrix:
16 | os: [ubuntu-latest, macos-latest, windows-latest]
17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 |
22 | - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }}
23 | uses: actions/setup-python@v5
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 |
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install .[development]
31 |
32 | # Note the use of the -Wa flag to show DeprecationWarnings
33 | # We run the tests on the installed package
34 | - name: Unit tests and doctests
35 | run: |
36 | python -Wa -m pytest
37 |
38 | - name: Build documentation
39 | run:
40 | sphinx-build -M html docs build/docs
41 |
42 |
43 | release:
44 | if: startsWith(github.ref, 'refs/tags/v')
45 | needs: [build]
46 | runs-on: ubuntu-latest
47 | permissions:
48 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
49 | contents: write # To create a release
50 | steps:
51 | - uses: actions/checkout@v4
52 |
53 | - name: Set up Python
54 | uses: actions/setup-python@v5
55 | with:
56 | python-version: "3.10"
57 |
58 | - name: Install dependencies
59 | run: |
60 | pip install build
61 | pip install .[development]
62 |
63 | - name: Create release description
64 | run: |
65 | python release-description.py CHANGELOG.rst > description.md
66 | cat description.md
67 |
68 | - name: Create source distribution
69 | run: |
70 | python -m build
71 |
72 | - name: Create release
73 | uses: softprops/action-gh-release@v2
74 | with:
75 | body_path: description.md
76 | files: |
77 | dist/*
78 |
79 | # Github Actions have been set as a trusted publisher on PyPI's npstreams project,
80 | # hence why no username, password, or token is required.
81 | - name: Upload to PyPI
82 | if: always()
83 | uses: pypa/gh-action-pypi-publish@release/v1
84 |
--------------------------------------------------------------------------------
/docs/conventions.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _conventions:
4 |
5 | ***********
6 | Conventions
7 | ***********
8 |
9 | .. currentmodule:: npstreams
10 |
11 | Stream Conventions
12 | ------------------
13 |
14 | Most (all?) functions in :mod:`npstreams` are designed to work on streams, or
15 | iterables of NumPy arrays. These iterables can be infinite.
16 | The quintessential example is a stream of images progressively read from disk.
17 | These streams of arrays must contain arrays that all have the same shape and data-type,
18 | unless specified otherwise.
19 |
20 | An example of a function that operates on a stream of arrays of different shapes is :func:`ieinsum`.
21 |
22 | A single NumPy array can be passed where a stream is expected; the array will be repackaged
23 | into a stream of a single array.
24 |
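25 | For example, a single array passed to :func:`isum` is treated as a stream of length 1::
26 | 
27 |     import numpy as np
28 |     from npstreams import isum, last
29 | 
30 |     arr = np.random.random((8, 8))
31 |     total = last(isum(arr))  # same result as isum([arr])
32 | 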
25 | Naming Conventions
26 | ------------------
27 |
28 | In order to facilitate documentation, functions in :mod:`npstreams` follow the following conventions:
29 |
30 | * Routines are named after their closest equivalent in :mod:`numpy` and :mod:`scipy`.
31 | * Routines with names starting with 'i' (e.g. :func:`iprod`) are generator functions; they yield running results
32 |   as they are being computed. Usually, these functions have a non-generator equivalent that
33 |   consumes the entire stream (e.g. :func:`iaverage` vs. :func:`average`).
34 | * Routines with names starting with 'c' (e.g. :func:`csum`) are CUDA-enabled (requires :mod:`pycuda`).
35 | * Routines with names starting with 'p' (e.g. :func:`pmap`) can be parallelized. The default
36 | behavior is always to not use multiple cores. For example, the default behavior of :func:`pmap`
37 | is to behave like :func:`map`.
38 |
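39 | For instance, a minimal sketch contrasting a generator function with its stream-consuming equivalent::
40 | 
41 |     import numpy as np
42 |     from npstreams import isum, last
43 | 
44 |     stream = [np.ones((4,)) for _ in range(3)]
45 | 
46 |     running = isum(stream)  # generator of running sums
47 |     total = last(running)   # final result; equivalent to npstreams.sum(stream)
48 | 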
39 | Axis Conventions
40 | ----------------
41 |
42 | NumPy arrays provide operations along axes. Similarly, :mod:`npstreams` also
43 | exposes the :data:`axis` keyword in some (most?) reduction functions like :func:`isum`
44 | and :func:`iprod`.
45 |
46 | The convention for specification of the :data:`axis` parameter is as follows:
47 |
48 | * If ``axis = None``, arrays are flattened before being combined. The result will
49 |   be a scalar or a 0d array.
50 | * The default (``axis = -1``) always corresponds to combining arrays along a
51 |   new axis. For example, summing images together along ``axis = -1`` is equivalent
52 |   to stacking images along a new axis, then summing along this new axis.
53 | * If ``axis`` is an ``int``, then arrays are reduced according to this axis, and then combined.
54 |
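55 | A minimal sketch of all three cases, using :func:`isum` on a stream of arrays of ones::
56 | 
57 |     import numpy as np
58 |     from npstreams import isum, last
59 | 
60 |     stream = [np.ones((2, 3)) for _ in range(5)]
61 | 
62 |     last(isum(stream, axis=-1)).shape  # (2, 3): arrays combined along a new axis
63 |     last(isum(stream, axis=0)).shape   # (3,): each array reduced along axis 0, then combined
64 |     last(isum(stream, axis=None))      # 30.0: arrays flattened before being combined
65 | 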
55 | CUDA-enabled functions
56 | ----------------------
57 | Some functions are implemented using CUDA; see :ref:`cuda` for details on availability and usage.
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _api:
4 |
5 | *************
6 | Reference/API
7 | *************
8 |
9 | .. currentmodule:: npstreams
10 |
11 | Click on any function below to see detailed information.
12 |
13 | Creation of Streams
14 | -------------------
15 |
16 | Decorator for streaming functions which guarantees that the stream elements will be converted to arrays.
17 |
18 | .. autosummary::
19 | :toctree: functions/
20 |
21 | array_stream
22 |
23 | The :func:`array_stream` decorator wraps iterables into an :class:`ArrayStream` iterator. This is not
24 | required to use the functions defined here, but it provides some nice guarantees.
25 |
26 | .. autosummary::
27 | :toctree: classes/
28 |
29 | ArrayStream
30 |
31 | Statistical Functions
32 | ---------------------
33 |
34 | .. autosummary::
35 | :toctree: functions/
36 |
37 | imean
38 | iaverage
39 | istd
40 | ivar
41 | isem
42 | ihistogram
43 |
44 | The following functions consume entire streams. By avoiding costly intermediate steps,
45 | they can perform much faster than their generator versions.
46 |
47 | .. autosummary::
48 | :toctree: functions/
49 |
50 | mean
51 | average
52 | std
53 | var
54 | sem
55 | average_and_var
56 |
57 | Numerics
58 | --------
59 |
60 | .. autosummary::
61 | :toctree: functions/
62 |
63 | isum
64 | iprod
65 | isub
66 |
67 | .. autosummary::
68 | :toctree: functions/
69 |
70 | sum
71 | prod
72 |
73 | Linear Algebra
74 | --------------
75 | .. autosummary::
76 | :toctree: functions/
77 |
78 | idot
79 | iinner
80 | itensordot
81 | ieinsum
82 |
83 | Control Flow
84 | ------------
85 | .. autosummary::
86 | :toctree: functions/
87 |
88 | ipipe
89 | iload
90 | pload
91 |
92 | Comparisons
93 | -----------
94 | .. autosummary::
95 | :toctree: functions/
96 |
97 | iany
98 | iall
99 | imax
100 | imin
101 |
102 | Parallelization
103 | ---------------
104 | .. autosummary::
105 | :toctree: functions/
106 |
107 | pmap
108 | pmap_unordered
109 | preduce
110 |
111 | Stacking
112 | --------
113 | .. autosummary::
114 | :toctree: functions/
115 |
116 | stack
117 |
118 | Iterator Utilities
119 | ------------------
120 | .. autosummary::
121 | :toctree: functions/
122 |
123 | last
124 | cyclic
125 | itercopy
126 | chunked
127 | linspace
128 | multilinspace
129 | peek
130 | primed
131 | length_hint
132 |
133 | Array Utilities
134 | ---------------
135 | .. autosummary::
136 | :toctree: functions/
137 |
138 | nan_to_num
139 |
140 | Benchmarking
141 | ------------
142 | .. autosummary::
143 | :toctree: functions/
144 |
145 | benchmark
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _npstreams:
4 |
5 | **************************************
6 | `npstreams`: streaming NumPy functions
7 | **************************************
8 |
9 | :mod:`npstreams` is an open-source Python package for streaming NumPy array operations.
10 | The goal is to provide tested, (almost) drop-in replacements for NumPy functions (where possible)
11 | that operate on streams of arrays instead of dense arrays.
12 |
13 | :mod:`npstreams` also provides some utilities for parallelization. These parallelization
14 | generators can be combined with the streaming functions to drastically improve performance
15 | in some cases.
16 |
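17 | As a sketch of this combination (the function name ``denoise`` below is illustrative)::
18 | 
19 |     import numpy as np
20 |     from npstreams import pmap, isum, last
21 | 
22 |     def denoise(image):
23 |         """Placeholder for an expensive per-image transformation"""
24 |         return np.clip(image, 0, None)
25 | 
26 |     images = (np.random.random((64, 64)) for _ in range(100))
27 |     total = last(isum(pmap(denoise, images, processes=2)))
28 | 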
29 | The code presented herein has been in use at some point by the
30 | `Siwick research group <http://www.physics.mcgill.ca/siwicklab>`_.
19 |
20 | Example
21 | =======
22 |
23 | Consider the following snippet to combine 50 images
24 | from an iterable :data:`source`::
25 |
26 | import numpy as np
27 |
28 | images = np.empty( shape = (2048, 2048, 50) )
29 | for index, im in enumerate(source):
30 | images[:,:,index] = im
31 |
32 | avg = np.average(images, axis = 2)
33 |
34 | If the :data:`source` iterable provided 10000 images, the above routine would
35 | not work on most machines. Moreover, what if we want to transform the images
36 | one by one before averaging them? What about looking at the average while it
37 | is being computed? Let's look at an example::
38 |
39 | import numpy as np
40 | from npstreams import iaverage
41 |     from skimage.io import imread
42 |
43 | stream = map(imread, list_of_filenames)
44 | averaged = iaverage(stream)
45 |
46 | At this point, the generators :func:`map` and :func:`iaverage` are 'wired'
47 | but will not compute anything until requested. We can watch the average evolve::
48 |
49 | import matplotlib.pyplot as plt
50 |     for avg in averaged:
51 | plt.imshow(avg); plt.show()
52 |
53 | We can also use :func:`last` to get at the final average::
54 |
55 | from npstreams import last
56 |
57 | total = last(averaged) # average of the entire stream. See also npstreams.average
58 |
59 | Benchmark
60 | =========
61 |
62 | npstreams provides a function for benchmarking common use cases.
63 |
64 | To run the benchmark with default parameters, from the interpreter::
65 |
66 | from npstreams import benchmark
67 | benchmark()
68 |
69 | From a command-line terminal::
70 |
71 | python -m npstreams.benchmarks
72 |
73 | The results will be printed to the screen.
74 |
75 | Links
76 | =====
77 |
78 | * `Source code <https://github.com/LaurentRDC/npstreams>`_
79 | * `Issues <https://github.com/LaurentRDC/npstreams/issues>`_
80 | * `Docs <https://npstreams.readthedocs.io/>`_
81 |
82 | .. _npstreams_docs:
83 |
84 | General Documentation
85 | =====================
86 |
87 | .. toctree::
88 | :maxdepth: 3
89 |
90 | installation
91 | whatsnew
92 | conventions
93 | api
94 | cuda
95 | control_flow
96 | making_your_own
97 | recipes
98 |
99 | Authors
100 | =======
101 |
102 | * Laurent P. René de Cotret
--------------------------------------------------------------------------------
/npstreams/array_stream.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from collections.abc import Iterator
4 | from functools import wraps
5 |
6 | import numpy as np
7 | from numpy import asanyarray
8 |
9 | from .iter_utils import length_hint, peek
10 |
11 |
12 | class ArrayStream(Iterator):
13 | """
14 | Iterator of arrays. Elements from the stream are converted to
15 | NumPy arrays. If ``stream`` is a single array, it will be
16 | repackaged as a length 1 iterable.
17 |
18 | Arrays in the stream will be cast to the same data-type as the first
19 | array in the stream. The stream data-type is located in the `dtype` attribute.
20 |
21 | .. versionadded:: 1.5.2
22 | """
23 |
24 | def __init__(self, stream):
25 | if isinstance(stream, np.ndarray):
26 | stream = (stream,)
27 |
28 | self._sequence_length = length_hint(stream, default=NotImplemented)
29 |
30 | # Once length_hint has been determined, we can peek into the stream
31 | first, stream = peek(stream)
32 | self._iterator = iter(stream)
33 |
34 | first = asanyarray(first)
35 | self.dtype = first.dtype
36 |
37 | def __repr__(self):
38 | """Verbose string representation"""
39 | representation = f"< {self.__class__.__name__} object"
40 | representation += f" of data-type {self.dtype}"
41 |
42 | if not (self._sequence_length is NotImplemented):
43 | representation += f" and a sequence length of {self._sequence_length}"
44 | else:
45 | representation += " of unknown length"
46 |
47 | return representation + " >"
48 |
49 | def __array__(self, *_, **__):
50 | """Returns a dense array created from this stream."""
51 |         # As of numpy version 1.14, arrays are expanded into a list before concatenation
52 | # Therefore, it's ok to build that list first
53 | arraylist = list(self)
54 | return np.stack(arraylist, axis=-1)
55 |
56 | def __length_hint__(self):
57 | """
58 | In certain cases, an ArrayStream can have a definite size.
59 | See https://www.python.org/dev/peps/pep-0424/
60 | """
61 | return self._sequence_length
62 |
63 | def __next__(self):
64 | n = self._iterator.__next__()
65 | return asanyarray(n, dtype=self.dtype)
66 |
67 |
68 | def array_stream(func):
69 | """
70 | Decorates streaming functions to make sure that the stream
71 | is a stream of ndarrays. Objects that are not arrays are transformed
72 | into arrays. If the stream is in fact a single ndarray, this ndarray
73 | is repackaged into a sequence of length 1.
74 |
75 | The first argument of the decorated function is assumed to be an iterable of
76 | arrays, or an iterable of objects that can be casted to arrays.
77 |
78 | Note that using this decorator also ensures that the stream is only wrapped once
79 | by the conversion function.
80 | """
81 |
82 | @wraps(func)
83 | def decorated(arrays, *args, **kwargs):
84 | if isinstance(arrays, ArrayStream):
85 | return func(arrays, *args, **kwargs)
86 | return func(ArrayStream(arrays), *args, **kwargs)
87 |
88 | return decorated
89 |
--------------------------------------------------------------------------------
/npstreams/tests/test_flow.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | from pathlib import Path
5 | from npstreams import array_stream, ipipe, last, iload, pload, isum
6 |
7 |
8 | @array_stream
9 | def iden(arrays):
10 | yield from arrays
11 |
12 |
13 | def test_ipipe_order():
14 | """Test that ipipe(f, g, h, arrays) -> f(g(h(arr))) for arr in arrays"""
15 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)]
16 | squared = [np.cbrt(np.square(arr)) for arr in stream]
17 | pipeline = ipipe(np.cbrt, np.square, stream)
18 |
19 | assert all(np.allclose(s, p) for s, p in zip(pipeline, squared))
20 |
21 |
22 | def test_ipipe_multiprocessing():
23 | """Test that ipipe(f, g, h, arrays) -> f(g(h(arr))) for arr in arrays"""
24 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)]
25 | squared = [np.cbrt(np.square(arr)) for arr in stream]
26 | pipeline = ipipe(np.cbrt, np.square, stream, processes=2)
27 |
28 | assert all(np.allclose(s, p) for s, p in zip(pipeline, squared))
29 |
30 |
31 | def test_iload_glob():
32 | """Test that iload works on glob-like patterns"""
33 | stream = iload(Path(__file__).parent / "data" / "test_data*.npy", load_func=np.load)
34 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
35 | assert np.allclose(s, np.zeros_like(s))
36 |
37 |
38 | def test_iload_file_list():
39 | """Test that iload works on iterable of filenames"""
40 | files = [
41 | Path(__file__).parent / "data" / "test_data1.npy",
42 | Path(__file__).parent / "data" / "test_data2.npy",
43 | Path(__file__).parent / "data" / "test_data3.npy",
44 | ]
45 | stream = iload(files, load_func=np.load)
46 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
47 | assert np.allclose(s, np.zeros_like(s))
48 |
49 |
50 | def test_pload_glob():
51 | """Test that pload works on glob-like patterns"""
52 | stream = pload(Path(__file__).parent / "data" / "test_data*.npy", load_func=np.load)
53 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
54 | assert np.allclose(s, np.zeros_like(s))
55 |
56 | stream = pload(
57 | Path(__file__).parent / "data" / "test_data*.npy",
58 | load_func=np.load,
59 | processes=2,
60 | )
61 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
62 | assert np.allclose(s, np.zeros_like(s))
63 |
64 |
65 | def test_pload_file_list():
66 | """Test that pload works on iterable of filenames"""
67 | files = [
68 | Path(__file__).parent / "data" / "test_data1.npy",
69 | Path(__file__).parent / "data" / "test_data2.npy",
70 | Path(__file__).parent / "data" / "test_data3.npy",
71 | ]
72 | stream = pload(files, load_func=np.load)
73 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
74 | assert np.allclose(s, np.zeros_like(s))
75 |
76 | files = [
77 | Path(__file__).parent / "data" / "test_data1.npy",
78 | Path(__file__).parent / "data" / "test_data2.npy",
79 | Path(__file__).parent / "data" / "test_data3.npy",
80 | ]
81 | stream = pload(files, load_func=np.load, processes=2)
82 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose
83 | assert np.allclose(s, np.zeros_like(s))
84 |
--------------------------------------------------------------------------------
/npstreams/tests/test_cuda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from itertools import repeat
4 | import numpy as np
5 | import pytest
6 |
7 | try:
8 | from npstreams.cuda import csum, cprod, caverage, cmean
9 |
10 | WITH_CUDA = True
11 | except ImportError:
12 | WITH_CUDA = False
13 |
14 |
15 | skip_if_no_cuda = pytest.mark.skipif(
16 | not WITH_CUDA, reason="PyCUDA is not installed/available"
17 | )
18 |
19 |
20 | @skip_if_no_cuda
21 | def test_csum_zero_sum():
22 | stream = repeat(np.zeros((16, 16), dtype=float), times=5)
23 | s = csum(stream)
24 | assert np.allclose(s, np.zeros((16, 16)))
25 |
26 |
27 | @skip_if_no_cuda
28 | def test_csum_dtype():
29 | stream = repeat(np.zeros((16, 16), dtype=float), times=5)
30 | s = csum(stream, dtype=np.int16)
31 | assert np.allclose(s, np.zeros((16, 16)))
32 | assert s.dtype == np.int16
33 |
34 |
35 | @skip_if_no_cuda
36 | def test_csum_ignore_nans():
37 | """Test a sum of zeros with NaNs sprinkled"""
38 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
39 | source.append(np.full((16,), fill_value=np.nan))
40 | summed = csum(source, ignore_nan=True)
41 | assert np.allclose(summed, np.zeros_like(summed))
42 |
43 |
44 | @skip_if_no_cuda
45 | def test_cprod_ones_prod():
46 | stream = repeat(np.ones((16, 16), dtype=float), times=5)
47 | s = cprod(stream)
48 | assert np.allclose(s, np.ones((16, 16)))
49 |
50 |
51 | @skip_if_no_cuda
52 | def test_cprod_ignore_nans():
53 | """Test that NaNs are ignored."""
54 | source = [np.ones((16,), dtype=float) for _ in range(10)]
55 | source.append(np.full_like(source[0], np.nan))
56 | product = cprod(source, ignore_nan=True)
57 | assert np.allclose(product, np.ones_like(product))
58 |
59 |
60 | @skip_if_no_cuda
61 | def test_cprod_dtype():
62 | """Test that dtype argument is working"""
63 | source = [np.ones((16,), dtype=float) for _ in range(10)]
64 | product = cprod(source, dtype=int)
65 | assert np.allclose(product, np.ones_like(product))
66 | assert product.dtype == int
67 |
68 |
69 | @skip_if_no_cuda
70 | def test_cavg_no_weights():
71 | stream = [np.random.random(size=(16, 16)) for _ in range(5)]
72 | from_caverage = caverage(stream)
73 | from_numpy = np.average(np.dstack(stream), axis=2)
74 | assert np.allclose(from_caverage, from_numpy)
75 |
76 |
77 | @skip_if_no_cuda
78 | def test_cavg_weighted_average():
79 | """Test results of weighted average against numpy.average"""
80 | stream = [np.random.random(size=(16, 16)) for _ in range(5)]
81 |
82 | weights = [np.random.random(size=stream[0].shape) for _ in stream]
83 | from_caverage = caverage(stream, weights=weights)
84 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights))
85 | assert np.allclose(from_caverage, from_numpy)
86 |
87 |
88 | @skip_if_no_cuda
89 | def test_cmean_of_ones():
90 | stream = repeat(np.ones((16, 16), dtype=float), times=5)
91 | s = cmean(stream)
92 | assert np.allclose(s, np.ones((16, 16)))
93 |
94 |
95 | @skip_if_no_cuda
96 | def test_cmean_random():
97 | """Test cmean against numpy.mean on random data"""
98 | stream = [np.random.random(size=(16, 16)) for _ in range(5)]
99 | from_cmean = cmean(stream)
100 | from_numpy = np.mean(np.dstack(stream), axis=2)
101 | assert np.allclose(from_cmean, from_numpy)
102 |
--------------------------------------------------------------------------------
/npstreams/tests/test_iter_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from itertools import repeat
4 | from npstreams import last, chunked, linspace, multilinspace, cyclic, length_hint
5 | import pytest
6 |
7 |
8 | def test_last_trivial():
9 | """Test last() on iterable of identical values"""
10 | i = repeat(1, 10)
11 | assert last(i) == 1
12 |
13 |
14 | def test_last_on_empty_iterable():
15 | """Test that last() raises RuntimeError for empty iterable"""
16 | with pytest.raises(RuntimeError):
17 | last(list())
18 |
19 |
20 | def test_cyclic_numbers():
21 | """ """
22 | permutations = set(cyclic((1, 2, 3)))
23 | assert (1, 2, 3) in permutations
24 | assert (2, 3, 1) in permutations
25 | assert (3, 1, 2) in permutations
26 | assert len(permutations) == 3
27 |
28 |
29 | def test_linspace_endpoint():
30 | """Test that the endpoint is included by linspace() when appropriate"""
31 | space = linspace(0, 1, num=10, endpoint=True)
32 | assert last(space) == 1
33 |
34 | space = linspace(0, 1, num=10, endpoint=False)
35 | assert round(abs(last(space) - 0.9), 7) == 0
36 |
37 |
38 | def test_linspace_length():
39 | """Test that linspace() returns an iterable of the correct length"""
40 | space = list(linspace(0, 1, num=13, endpoint=True))
41 | assert len(space) == 13
42 |
43 | space = list(linspace(0, 1, num=13, endpoint=False))
44 | assert len(space) == 13
45 |
46 |
47 | def test_multilinspace_endpoint():
48 | """Test that the endpoint is included by linspace() when appropriate"""
49 | space = multilinspace((0, 0), (1, 1), num=10, endpoint=True)
50 | assert last(space) == (1, 1)
51 |
52 | space = multilinspace((0, 0), (1, 1), num=10, endpoint=False)
53 | # Unfortunately there is no assertSequenceAlmostEqual
54 | assert last(space) == (0.8999999999999999, 0.8999999999999999)
55 |
56 |
57 | def test_multilinspace_length():
58 | """Test that linspace() returns an iterable of the correct length"""
59 | space = list(multilinspace((0, 0), (1, 1), num=13, endpoint=True))
60 | assert len(space) == 13
61 |
62 | space = list(multilinspace((0, 0), (1, 1), num=13, endpoint=False))
63 | assert len(space) == 13
64 |
65 |
66 | def test_chunked_larger_chunksize():
67 | """Test chunked() with a chunksize larger that the iterable it"""
68 | i = repeat(1, 10)
69 | chunks = chunked(i, chunksize=15)
70 | assert len(list(chunks)) == 1 # One single chunk is returned
71 |
72 |
73 | def test_chunked_on_infinite_generator():
74 | """Test chunked() on an infinite iterable"""
75 | i = repeat(1)
76 | chunks = chunked(i, chunksize=15)
77 | for _ in range(10):
78 | assert len(next(chunks)) == 15
79 |
80 |
81 | def test_chunked_chunked_nonint_chunksize():
82 | """Test that chunked raises a TypeError immediately if `chunksize` is not an integer"""
83 | with pytest.raises(TypeError):
84 | i = repeat(1)
85 | chunks = chunked(i, chunksize=15.0)
86 |
87 |
88 | def test_length_hint_on_sized():
89 | """Test length_hint on a sized iterable"""
90 | l = [1, 2, 3, 4, 5]
91 | assert length_hint(l) == len(l)
92 |
93 |
94 | def test_length_hint_on_unsized():
95 | """Test length_hint on an unsized iterable returns the default"""
96 | l = (0 for _ in range(10))
97 | assert length_hint(l, default=0) == 0
98 |
99 |
100 | def test_length_hint_on_method_if_implemented():
101 | """Test length_hint returns the same as __length_hint__ if implemented"""
102 |
103 | class WithHint:
104 | """Some dummy class with a length hint"""
105 |
106 | def __length_hint__(self):
107 | return 1
108 |
109 | assert length_hint(WithHint(), default=0) == 1
110 |
--------------------------------------------------------------------------------
/npstreams/flow.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Flow controls
4 | -------------
5 | """
6 | from functools import partial
7 | from glob import iglob
8 | from pathlib import Path
9 |
10 | from .array_stream import ArrayStream
11 | from .parallel import pmap, pmap_unordered
12 |
13 |
14 | def iload(files, load_func, **kwargs):
15 | """
16 | Create a stream of arrays from files, which are loaded lazily.
17 |
18 | In cases where the consumer function is much faster than data loading,
19 | consider using :func:`pload` instead.
20 |
21 | Parameters
22 | ----------
23 |     files : iterable of str or str
24 |         Either an iterable of filenames or a glob-like pattern str.
25 |     load_func : callable
26 |         Function taking a filename as its first argument.
27 | kwargs
28 | Keyword arguments are passed to ``load_func``.
29 |
30 | Yields
31 | ------
32 | arr: `~numpy.ndarray`
33 | Loaded data.
34 |
35 | See Also
36 | --------
37 | pload : load files from parallel processes.
38 |
39 | Examples
40 | --------
41 | To load images using scikit-image ::
42 |
43 | from skimage.io import imread
44 | ims = iload('images_*.tif', imread)
45 |
46 | Keyword arguments are passed to the ``load_func``; for example,
47 | to specify the scikit-image plugin ``'tifffile'``::
48 |
49 | ims = iload('images_*.tif', imread, plugin = 'tifffile')
50 |
51 | In case the list of images is already known::
52 |
53 | ims = iload(['im1.tif', 'im2.tif', 'im3.tif'], imread)
54 | """
55 | # TODO: better handling of Paths
56 | if isinstance(files, Path):
57 | files = str(files)
58 |
59 | if isinstance(files, str):
60 | files = iglob(files)
61 | files = iter(files)
62 |
63 | yield from map(partial(load_func, **kwargs), files)
64 |
65 |
66 | def pload(files, load_func, processes=1, **kwargs):
67 | """
68 | Create a stream of arrays from files, which are loaded lazily
69 | from multiple processes.
70 |
71 | This function should be preferred to :func:`iload` in cases where
72 | the consumer function is much faster than the data can be loaded.
73 |
74 | Parameters
75 | ----------
76 |     files : iterable of str or str
77 |         Either an iterable of filenames or a glob-like pattern str.
78 |     load_func : callable
79 |         Function taking a filename as its first argument.
80 | processes : int or None, optional
81 | Number of processes to use. If `None`, maximal number of processes
82 | is used. Default is one.
83 | kwargs
84 | Keyword arguments are passed to ``load_func``.
85 |
86 | Yields
87 | ------
88 | arr: `~numpy.ndarray`
89 | Loaded data.
90 |
91 | See Also
92 | --------
93 | iload : load files lazily
94 | """
95 | if processes == 1:
96 | yield from iload(files, load_func, **kwargs)
97 | return
98 |
99 | # TODO: better handling of Paths
100 | if isinstance(files, Path):
101 | files = str(files)
102 |
103 | if isinstance(files, str):
104 | files = iglob(files)
105 | files = iter(files)
106 |
107 | yield from pmap_unordered(partial(load_func, **kwargs), files, processes=processes)
108 |
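# Example usage (an illustrative sketch; the glob pattern and file format are
# hypothetical):
#
#     import numpy as np
#     from npstreams import pload
#
#     for arr in pload("frames_*.npy", load_func=np.load, processes=4):
#         ...  # consume each array as soon as a worker has loaded it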
109 |
110 | # pmap does not support local functions
111 | def _pipe(funcs, array):
112 | for func in funcs:
113 | array = func(array)
114 | return array
115 |
116 |
117 | def ipipe(*args, **kwargs):
118 | """
119 | Pipe arrays through a sequence of functions. For example:
120 |
121 |     ``ipipe(f, g, h, stream)`` is equivalent to ::
122 |
123 | for arr in stream:
124 | yield f(g(h(arr)))
125 |
126 | Parameters
127 | ----------
128 |     *funcs : callable
129 |         Callables that support NumPy arrays as their first argument. These
130 |         should *NOT* be generator functions.
131 |     arrays : iterable
132 |         Stream of arrays to be processed.
133 | processes : int or None, optional, keyword-only
134 | Number of processes to use. If `None`, maximal number of processes
135 | is used. Default is one.
136 | ntotal : int or None, optional, keyword-only
137 | If the length of `arrays` is known, but passing `arrays` as a list
138 | would take too much memory, the total number of arrays `ntotal` can be specified. This
139 | allows for `pmap` to chunk better in case of ``processes > 1``.
140 |
141 | Yields
142 | ------
143 | piped : ndarray
144 | """
145 | arrays = ArrayStream(args[-1])
146 | functions = tuple(reversed(args[:-1]))
147 | yield from pmap(partial(_pipe, functions), arrays, **kwargs)
148 |
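# Example usage (an illustrative sketch): functions are applied right-to-left,
# so each array is squared first, then passed through the square root.
#
#     import numpy as np
#     from npstreams import ipipe
#
#     stream = (np.random.random((8, 8)) for _ in range(10))
#     piped = ipipe(np.sqrt, np.square, stream)  # yields sqrt(square(arr)) for each arr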
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # This file is execfile()d with the current directory set to its
5 | # containing dir.
6 | #
7 | # Note that not all possible configuration values are present in this
8 | # autogenerated file.
9 | #
10 | # All configuration values have a default; values that are commented out
11 | # serve to show the default.
12 |
13 | # If extensions (or modules to document with autodoc) are in another directory,
14 | # add these directories to sys.path here. If the directory is relative to the
15 | # documentation root, use os.path.abspath to make it absolute, like shown here.
16 | #
17 | import os
18 | import sys
19 |
20 | currentpath = os.path.dirname(__file__)
21 | sys.path.append(os.path.join(currentpath, ".."))
22 |
23 | import npstreams
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.5'
30 | from datetime import datetime
31 | import alabaster
32 |
33 | year = datetime.now().year
34 |
35 | # Add any Sphinx extension module names here, as strings. They can be
36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
37 | # ones.
38 | extensions = [
39 | "alabaster",
40 | "sphinx.ext.todo",
41 | "sphinx.ext.intersphinx",
42 | "sphinx.ext.autosummary",
43 | "sphinx.ext.autodoc",
44 | "sphinx.ext.napoleon",
45 | "sphinx.ext.mathjax",
46 | "sphinx.ext.doctest",
47 | ]
48 |
49 | intersphinx_mapping = {"numpy": ("http://docs.scipy.org/doc/numpy/", None)}
50 |
51 | napoleon_google_docstring = False
52 | autosummary_generate = True
53 |
54 | # The suffix(es) of source filenames.
55 | # You can specify multiple suffix as a list of string:
56 | #
57 | # source_suffix = ['.rst', '.md']
58 | source_suffix = ".rst"
59 |
60 | # The master toctree document.
61 | master_doc = "index"
62 |
63 | # Releases changelog extension
64 | releases_release_uri = "https://github.com/LaurentRDC/npstreams/tree/%s"
65 | releases_issue_uri = "https://github.com/LaurentRDC/npstreams/issues/%s"
66 |
67 | # General information about the project.
68 | project = "npstreams"
69 | copyright = "%d Laurent P. René de Cotret" % year
70 | author = "Laurent P. René de Cotret"
71 |
72 | # The version info for the project you're documenting, acts as replacement for
73 | # |version| and |release|, also used in various other places throughout the
74 | # built documents.
75 | #
76 | # The short X.Y version.
77 | version = npstreams.__version__
78 | # The full version, including alpha/beta/rc tags.
79 | release = version
80 |
81 | # The language for content autogenerated by Sphinx. Refer to documentation
82 | # for a list of supported languages.
83 | #
84 | # This is also used if you do content translation via gettext catalogs.
85 | # Usually you set "language" from the command line for these cases.
86 | language = None
87 |
88 | # List of patterns, relative to source directory, that match files and
89 | # directories to ignore when looking for source files.
90 | # This patterns also effect to html_static_path and html_extra_path
91 | exclude_patterns = []
92 | exclude_trees = ["_build"]
93 |
94 | # The name of the Pygments (syntax highlighting) style to use.
95 | pygments_style = "sphinx"
96 |
97 | # If true, `todo` and `todoList` produce output, else they produce nothing.
98 | todo_include_todos = True
99 |
100 |
101 | # -- Options for HTML output ----------------------------------------------
102 |
103 | # The theme to use for HTML and HTML Help pages. See the documentation for
104 | # a list of builtin themes.
105 | #
106 | html_theme = "sphinx_rtd_theme"
107 | html_theme_path = ["_themes"]
108 | html_sidebars = {
109 | "**": [
110 | "about.html",
111 | "navigation.html",
112 | "searchbox.html",
113 | "localtoc.html",
114 | "sourcelink.html",
115 | ]
116 | }
117 | # html_show_sourcelink = True
118 |
119 | # Cross-reference the Python standard library too (update, don't overwrite the mapping above).
120 | intersphinx_mapping["python"] = ("https://docs.python.org", None)
121 |
122 | # Autodoc settings
123 | autodoc_default_flags = ["members", "special-members"]
124 | autoclass_content = "both"
125 |
126 |
127 | def autodoc_skip_member(app, what, name, obj, skip, options):
128 | exclusions = {"__weakref__", "__doc__", "__module__", "__dict__"}
129 | exclude = name in exclusions
130 | return skip or exclude
131 |
132 |
133 | def setup(app):
134 | app.connect("autodoc-skip-member", autodoc_skip_member)
135 |
136 |
137 | doctest_global_setup = """
138 | import npstreams as ns
139 | """
140 |
141 |
142 | # Add any paths that contain custom static files (such as style sheets) here,
143 | # relative to this directory. They are copied after the builtin static files,
144 | # so a file named "default.css" will overwrite the builtin "default.css".
145 | html_static_path = []
146 |
147 | # Suppress the warning about a non-local URI for status shields.
148 | suppress_warnings = ["image.nonlocal_uri"]
149 |
150 | # Enable releases 'unstable prehistory' mode.
151 | releases_unstable_prehistory = True
152 |
--------------------------------------------------------------------------------
/npstreams/linalg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Linear Algebra Functions
4 | ------------------------
5 | """
6 | from functools import partial
7 |
8 | import numpy as np
9 |
10 | from .array_stream import array_stream
11 |
12 |
13 | @array_stream
14 | def _ireduce_linalg(arrays, func, **kwargs):
15 | """
16 |     Yield the cumulative reduction of a linear algebra function
17 | """
18 | arrays = iter(arrays)
19 | first = next(arrays)
20 | second = next(arrays)
21 |
22 | func = partial(func, **kwargs)
23 |
24 | accumulator = func(first, second)
25 | yield accumulator
26 |
27 | for array in arrays:
28 |         # Re-bind instead of writing into `out=`: np.inner and np.tensordot
29 |         # do not accept an `out` argument, and np.dot does not support an
30 |         # output array that aliases one of its inputs.
31 |         accumulator = func(accumulator, array)
29 | yield accumulator
30 |
31 |
32 | def idot(arrays):
33 | """
34 | Yields the cumulative array inner product (dot product) of arrays.
35 |
36 | Parameters
37 | ----------
38 | arrays : iterable
39 | Arrays to be reduced.
40 |
41 | Yields
42 | ------
43 | online_dot : ndarray
44 |
45 | See Also
46 | --------
47 | numpy.linalg.multi_dot : Compute the dot product of two or more arrays in a single function call,
48 | while automatically selecting the fastest evaluation order.
49 | """
50 | yield from _ireduce_linalg(arrays=arrays, func=np.dot)
51 |
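# Example usage (an illustrative sketch): cumulative product of a stream of
# random matrices. `last` collapses the stream to its final value.
#
#     import numpy as np
#     from npstreams import idot, last
#
#     matrices = (np.random.random((3, 3)) for _ in range(10))
#     product = last(idot(matrices))  # equal to multiplying all ten matrices in order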
52 |
53 | def itensordot(arrays, axes=2):
54 | """
55 |     Yields the cumulative tensor dot product of a stream of arrays.
56 |
57 | Parameters
58 | ----------
59 | arrays : iterable
60 | Arrays to be reduced.
61 | axes : int or (2,) array_like
62 | * integer_like: If an int N, sum over the last N axes of a
63 | and the first N axes of b in order. The sizes of the corresponding axes must match.
64 | * (2,) array_like: Or, a list of axes to be summed over, first sequence applying to a,
65 | second to b. Both elements array_like must be of the same length.
66 |
67 | Yields
68 | ------
69 | online_tensordot : ndarray
70 |
71 | See Also
72 | --------
73 | numpy.tensordot : Compute the tensordot on two tensors.
74 | """
75 | yield from _ireduce_linalg(arrays=arrays, func=np.tensordot, axes=axes)
76 |
77 |
78 | def iinner(arrays):
79 | """
80 | Cumulative inner product of all arrays in a stream.
81 |
82 | Parameters
83 | ----------
84 | arrays : iterable
85 | Arrays to be reduced.
86 |
87 | Yields
88 | ------
89 | online_inner : ndarray or scalar
90 | """
91 | yield from _ireduce_linalg(arrays=arrays, func=np.inner)
92 |
93 |
94 | def ieinsum(arrays, subscripts, **kwargs):
95 | """
96 | Evaluates the Einstein summation convention on the operands.
97 |
98 | Using the Einstein summation convention, many common multi-dimensional
99 | array operations can be represented in a simple fashion.
100 |
101 | Parameters
102 | ----------
103 | arrays : iterable
104 | Arrays to be reduced.
105 | subscripts : str
106 | Specifies the subscripts for summation.
107 | dtype : numpy.dtype or None, optional
108 | The type of the yielded array and of the accumulator in which the elements
109 | are combined. The dtype of a is used by default unless a has an integer dtype
110 | of less precision than the default platform integer. In that case, if a is
111 | signed then the platform integer is used while if a is unsigned then an
112 | unsigned integer of the same precision as the platform integer is used.
113 | order : {'C', 'F', 'A', 'K'}, optional
114 | Controls the memory layout of the output. 'C' means it should
115 | be C contiguous. 'F' means it should be Fortran contiguous,
116 | 'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise.
117 | 'K' means it should be as close to the layout as the inputs as
118 | is possible, including arbitrarily permuted axes.
119 | Default is 'K'.
120 | casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
121 | Controls what kind of data casting may occur. Setting this to
122 | 'unsafe' is not recommended, as it can adversely affect accumulations.
123 |
124 | * 'no' means the data types should not be cast at all.
125 | * 'equiv' means only byte-order changes are allowed.
126 | * 'safe' means only casts which can preserve values are allowed.
127 | * 'same_kind' means only safe casts or casts within a kind,
128 | like float64 to float32, are allowed.
129 | * 'unsafe' means any data conversions may be done.
130 |
131 | Default is 'safe'.
132 | optimize : {False, True, 'greedy', 'optimal'}, optional
133 | Controls if intermediate optimization should occur. No optimization
134 | will occur if False and True will default to the 'greedy' algorithm.
135 | Also accepts an explicit contraction list from the ``np.einsum_path``
136 | function. See ``np.einsum_path`` for more details. Default is False.
137 |
138 | Yields
139 | ------
140 | online_einsum : ndarray
141 | Cumulative Einstein summation
142 | """
143 | yield from _ireduce_linalg(
144 | arrays=arrays, func=partial(np.einsum, subscripts), **kwargs
145 | )
146 |
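# Example usage (an illustrative sketch): a cumulative matrix product expressed
# through the Einstein summation convention, equivalent to `idot` above.
#
#     import numpy as np
#     from npstreams import ieinsum, last
#
#     stream = (np.random.random((4, 4)) for _ in range(5))
#     product = last(ieinsum(stream, "ij,jk->ik"))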
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # npstreams
2 |
3 | [](http://npstreams.readthedocs.io) [](https://pypi.python.org/pypi/npstreams) [](https://anaconda.org/conda-forge/npstreams) [](https://doi.org/10.1186/s40679-018-0060-y)
4 |
5 | npstreams is an open-source Python package for streaming NumPy array
6 | operations. The goal is to provide tested routines that operate on
7 | streams (or generators) of arrays instead of dense arrays.
8 |
9 | Streaming reduction operations (sums, averages, etc.) can be implemented
10 | in constant memory, which in turn allows for easy parallelization.
11 |
12 | This approach has been a huge boon when working with lots of images; the
13 | images are read one-by-one from disk and combined/processed in a
14 | streaming fashion.
15 |
16 | This package is developed in conjunction with other software projects in
17 | the [Siwick research group](http://www.physics.mcgill.ca/siwicklab/).
18 |
19 | ## Motivating Example
20 |
21 | Consider the following snippet to combine 50 images from an iterable
22 | `source`:
23 |
24 | ```python
25 | import numpy as np
26 |
27 | images = np.empty( shape = (2048, 2048, 50) )
28 | for index, im in enumerate(source):
29 | images[:,:,index] = im
30 |
31 | avg = np.average(images, axis = 2)
32 | ```
33 |
34 | If the `source` iterable provided 1000 images, the above routine would
35 | not work on most machines. Moreover, what if we want to transform the
36 | images one by one before averaging them? What about looking at the
37 | average while it is being computed? Let's look at an example:
38 |
39 | ```python
40 | import numpy as np
41 | from npstreams import iaverage
42 | from skimage.io import imread
43 |
44 | stream = map(imread, list_of_filenames)
45 | averaged = iaverage(stream)
46 | ```
47 |
48 | At this point, the generators `map` and `iaverage` are 'wired' but
49 | will not compute anything until a value is requested. We can watch the
50 | average evolve:
51 |
52 | ```python
53 | import matplotlib.pyplot as plt
54 | for avg in averaged:
55 | plt.imshow(avg); plt.show()
56 | ```
57 |
58 | We can also use `last` to get at the final average:
59 |
60 | ```python
61 | from npstreams import last
62 |
63 | total = last(averaged) # average of the entire stream
64 | ```
65 |
66 | ## Streaming Functions
67 |
68 | npstreams comes with some streaming functions built-in. Some examples:
69 |
70 | - Numerics: `isum`, `iprod`, `isub`, etc.
71 | - Statistics: `iaverage` (weighted mean), `ivar` (single-pass
72 |   variance), etc.
73 |
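As a minimal sketch of how these compose (the array shapes and stream length
here are arbitrary):

```python
import numpy as np
from npstreams import isum, last

stream = (np.ones((4, 4)) for _ in range(10))  # any iterable of arrays works
total = last(isum(stream))  # equal to summing the dense stack all at once
```
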
74 | More importantly, npstreams gives you all the tools required to build
75 | your own streaming function. All routines are documented in the [API
76 | Reference on readthedocs.io](http://npstreams.readthedocs.io).
77 |
78 | ## Benchmarking
79 |
80 | npstreams provides a function for benchmarking common use cases.
81 |
82 | To run the benchmark with default parameters, from the interpreter:
83 |
84 | ```python
85 | from npstreams import benchmark
86 | benchmark()
87 | ```
88 |
89 | From a command-line terminal:
90 |
91 | ```bash
92 | python -c 'import npstreams; npstreams.benchmark()'
93 | ```
94 |
95 | The results will be printed to the screen.
96 |
97 | ## Future Work
98 |
99 | Some of the features I want to implement in this package in the near
100 | future:
101 |
102 | - Optimize the CUDA-enabled routines
103 | - More functions: additional streaming functions borrowed from NumPy
104 |   and SciPy.
105 |
106 | ## API Reference
107 |
108 | The [API Reference on readthedocs.io](http://npstreams.readthedocs.io)
109 | provides API-level documentation, as well as tutorials.
110 |
111 | ## Installation
112 |
113 | The only requirement is NumPy. To have access to CUDA-enabled routines,
114 | PyCUDA must also be installed. npstreams is available on PyPI; it can be
115 | installed with [pip](https://pip.pypa.io):
116 |
117 | ```bash
118 | python -m pip install npstreams
119 | ```
120 |
121 | npstreams can also be installed with the conda package manager, from the
122 | conda-forge channel:
123 |
124 | ```bash
125 | conda config --add channels conda-forge
126 | conda install npstreams
127 | ```
128 |
129 | To install the latest development version from
130 | [Github](https://github.com/LaurentRDC/npstreams):
131 |
132 | ```bash
133 | python -m pip install git+https://github.com/LaurentRDC/npstreams.git
134 | ```
135 |
136 | Tests can be run using the `pytest` package.
137 |
138 | ## Citations
139 |
140 | If you find this software useful, please consider citing the following
141 | publication:
142 |
143 | > L. P. René de Cotret, M. R. Otto, M. J. Stern. and B. J. Siwick, *An open-source software ecosystem for the interactive exploration of ultrafast electron scattering data*, Advanced Structural and Chemical Imaging 4:11 (2018) [DOI: 10.1186/s40679-018-0060-y.](https://ascimaging.springeropen.com/articles/10.1186/s40679-018-0060-y)
144 |
145 |
146 | ## Support / Report Issues
147 |
148 | All support requests and issue reports should be [filed on Github as an
149 | issue](https://github.com/LaurentRDC/npstreams/issues).
150 |
151 | ## License
152 |
153 | npstreams is made available under the BSD License, same as NumPy. For
154 | more details, see
155 | [LICENSE.txt](https://github.com/LaurentRDC/npstreams/blob/master/LICENSE.txt).
156 |
--------------------------------------------------------------------------------
/docs/making_your_own.rst:
--------------------------------------------------------------------------------
1 | .. include:: references.txt
2 |
3 | .. _making_your_own:
4 |
5 | ********************************************
6 | Making your own Streaming Reduction Function
7 | ********************************************
8 |
9 | .. currentmodule:: npstreams
10 |
11 | ============================================
12 | The :func:`ireduce_ufunc` generator function
13 | ============================================
14 |
15 | You can assemble your own streaming reduction function from a **binary** NumPy ufunc
16 | using the following generator function:
17 |
18 | .. autofunction:: ireduce_ufunc
19 |
20 | The non-generator version is also available:
21 |
22 | .. autofunction:: reduce_ufunc
23 |
24 | Note that while all NumPy ufuncs have a :meth:`reduce` method, not all of them are useful.
25 | This is why :func:`ireduce_ufunc` and :func:`reduce_ufunc` will only work with **binary** ufuncs,
26 | most of which are listed below. For performance reasons, we further restrict the use of
27 | :func:`ireduce_ufunc` and :func:`reduce_ufunc` to ufuncs that have the same input types
28 | as output types. Therefore, for example, :func:`numpy.greater` cannot be made to work with
29 | :func:`ireduce_ufunc` and :func:`reduce_ufunc`.
30 |
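As a quick illustration of the distinction::

    import numpy as np

    np.add(1.0, 2.0)      # float inputs -> float output: compatible
    np.greater(1.0, 2.0)  # float inputs -> boolean output: not compatible
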
31 | NaNs handling
32 | -------------
33 |
34 | NumPy ufuncs can have an identity value, that is, a value such that ``ufunc(x1, identity)`` is always ``x1``. For such ufuncs,
35 | :func:`ireduce_ufunc` and :func:`reduce_ufunc` can replace NaNs in the stream with the ufunc's identity value, if ``ignore_nan = True``.
36 | Note that not all ufuncs have an identity value; for example, how would you define the identity value of ``numpy.maximum``? There is no answer.
37 |
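The identity of a ufunc can be checked directly; for example::

    >>> import numpy as np
    >>> np.add.identity
    0
    >>> np.maximum.identity is None   # no identity value
    True
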
38 | .. _numpy_binary_ufuncs:
39 |
40 | ===================
41 | NumPy Binary Ufuncs
42 | ===================
43 |
44 | :func:`ireduce_ufunc` is tested to work on the following binary ufuncs, which are available in `NumPy`_.
45 |
46 |
47 | Arithmetics
48 | -----------
49 |
50 | .. autosummary::
51 | :nosignatures:
52 |
53 | numpy.add
54 | numpy.subtract
55 | numpy.multiply
56 | numpy.divide
57 | numpy.logaddexp
58 | numpy.logaddexp2
59 | numpy.true_divide
60 | numpy.floor_divide
61 | numpy.power
62 | numpy.remainder
63 | numpy.mod
64 | numpy.fmod
65 |
66 | Trigonometric functions
67 | -----------------------
68 |
69 | .. autosummary::
70 | :nosignatures:
71 |
72 | numpy.arctan2
73 | numpy.hypot
74 |
75 | Bit-twiddling functions
76 | -----------------------
77 |
78 | .. autosummary::
79 | :nosignatures:
80 |
81 | numpy.bitwise_and
82 | numpy.bitwise_or
83 | numpy.bitwise_xor
84 | numpy.left_shift
85 | numpy.right_shift
86 |
87 | Comparison functions
88 | --------------------
89 |
90 | .. autosummary::
91 | :nosignatures:
92 |
93 | numpy.maximum
94 | numpy.fmax
95 | numpy.minimum
96 | numpy.fmin
97 |
98 | Floating functions
99 | ------------------
100 |
101 | .. autosummary::
102 | :nosignatures:
103 |
104 | numpy.copysign
105 | numpy.nextafter
106 | numpy.ldexp
107 |
108 | ==========================
109 | Example: Streaming Maximum
110 | ==========================
111 |
112 | Let's create a streaming maximum function. First, we have to choose
113 | how to handle NaNs; since ``numpy.maximum`` does not have an identity value, we must find
114 | another way. We can proceed as follows:
115 |
116 | * If we want to propagate NaNs, we should use :func:`numpy.maximum`
117 | * If we want to ignore NaNs, we should use :func:`numpy.fmax`
118 |
119 | Both of those functions are binary ufuncs, so we can use :func:`ireduce_ufunc`. Note that any function based
120 | on :func:`ireduce_ufunc` or :func:`reduce_ufunc` will automatically work on streams of numbers thanks to the
121 | :func:`array_stream` decorator.
122 |
123 | Putting it all together::
124 |
125 | from npstreams import ireduce_ufunc
126 | from numpy import maximum, fmax
127 |
128 | def imax(arrays, axis = -1, ignore_nan = False, **kwargs):
129 | """
130 | Streaming cumulative maximum along an axis.
131 |
132 | Parameters
133 | ----------
134 | arrays : iterable
135 | Stream of arrays to be compared.
136 | axis : int or None, optional
137 | Axis along which to compute the maximum. If None,
138 | arrays are flattened before reduction.
139 | ignore_nan : bool, optional
140 | If True, NaNs are ignored. Default is False.
141 |
142 | Yields
143 | ------
144 | online_max : ndarray
145 | """
146 | ufunc = fmax if ignore_nan else maximum
147 | yield from ireduce_ufunc(arrays, ufunc, axis = axis, **kwargs)
148 |
149 | This will provide us with a streaming function, meaning that we can look at the progress
150 | as it is being computed. We can also create a function that returns the max of the stream
151 | like :meth:`numpy.ndarray.max()` using the :func:`reduce_ufunc` function::
152 |
153 | from npstreams import reduce_ufunc
154 |
155 |     def smax(arrays, axis = -1, ignore_nan = False, **kwargs): # s for stream
156 | """
157 | Maximum of a stream along an axis.
158 |
159 | Parameters
160 | ----------
161 | arrays : iterable
162 | Stream of arrays to be compared.
163 | axis : int or None, optional
164 | Axis along which to compute the maximum. If None,
165 | arrays are flattened before reduction.
166 | ignore_nan : bool, optional
167 | If True, NaNs are ignored. Default is False.
168 |
169 |         Returns
170 |         -------
171 |         max : ndarray
172 |         """
173 |         ufunc = fmax if ignore_nan else maximum
174 |         return reduce_ufunc(arrays, ufunc, axis = axis, **kwargs)
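
Usage then mirrors the equivalent NumPy call (a sketch; shapes are arbitrary)::

    import numpy as np

    arrays = (np.random.random((8, 8)) for _ in range(10))
    stream_max = smax(arrays, axis=None)  # largest value across the entire stream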
--------------------------------------------------------------------------------
/npstreams/parallel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Parallelization utilities
4 | -------------------------
5 | """
6 | from collections.abc import Sized
7 | from functools import partial, reduce
8 | from multiprocessing import Pool
9 |
10 | from .iter_utils import chunked
11 |
12 |
13 | def preduce(func, iterable, args=None, kwargs=None, processes=1, ntotal=None):
14 | """
15 | Parallel application of the reduce function, with keyword arguments.
16 |
17 | Parameters
18 | ----------
19 | func : callable
20 | Function to be applied to every element of `iterable`.
21 | iterable : iterable
22 | Iterable of items to be reduced. Generators are consumed.
23 | args : tuple or None, optional
24 | Positional arguments of `function`.
25 | kwargs : dictionary or None, optional
26 | Keyword arguments of `function`.
27 | processes : int or None, optional
28 | Number of processes to use. If `None`, maximal number of processes
29 | is used. Default is one.
30 | ntotal : int or None, optional
31 | If the length of `iterable` is known, but passing `iterable` as a list
32 | would take too much memory, the total length `ntotal` can be specified. This
33 | allows for `preduce` to chunk better.
34 |
35 | Returns
36 | -------
37 | reduced : object
38 |
39 | Notes
40 | -----
41 | If `processes` is 1, `preduce` is equivalent to functools.reduce with the
42 | added benefit of using `args` and `kwargs`, but `initializer` is not supported.
43 | """
44 | if kwargs is None:
45 | kwargs = dict()
46 |
47 | if args is None:
48 | args = tuple()
49 |
50 | func = partial(func, *args, **kwargs)
51 |
52 | if processes == 1:
53 | return reduce(func, iterable)
54 |
55 | with Pool(processes) as pool:
56 | chunksize = 1
57 | if isinstance(iterable, Sized):
58 | chunksize = max(1, int(len(iterable) / pool._processes))
59 | elif ntotal is not None:
60 | chunksize = max(1, int(ntotal / pool._processes))
61 |
62 | # Some reductions are order-sensitive
63 | res = pool.imap(partial(reduce, func), tuple(chunked(iterable, chunksize)))
64 | return reduce(func, res)
65 |
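# Example usage (an illustrative sketch): a parallel sum of arrays. Each of
# the two processes reduces a chunk of the input; the partial results are then
# reduced in the main process.
#
#     import numpy as np
#     from npstreams import preduce
#
#     arrays = [np.ones((4, 4)) for _ in range(100)]
#     total = preduce(np.add, arrays, processes=2)  # same as functools.reduce(np.add, arrays)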
66 |
67 | def pmap(func, iterable, args=None, kwargs=None, processes=1, ntotal=None):
68 | """
69 | Parallel application of a function with keyword arguments.
70 |
71 | Parameters
72 | ----------
73 | func : callable
74 | Function to be applied to every element of `iterable`.
75 | iterable : iterable
76 | Iterable of items to be mapped.
77 | args : tuple or None, optional
78 | Positional arguments of `function`.
79 | kwargs : dictionary or None, optional
80 | Keyword arguments of `function`.
81 | processes : int or None, optional
82 | Number of processes to use. If `None`, maximal number of processes
83 | is used. Default is one.
84 | ntotal : int or None, optional
85 | If the length of `iterable` is known, but passing `iterable` as a list
86 | would take too much memory, the total length `ntotal` can be specified. This
87 | allows for `pmap` to chunk better.
88 |
89 | Yields
90 | ------
91 | Mapped values.
92 |
93 | See Also
94 | --------
95 | pmap_unordered : parallel map that does not preserve order
96 |
97 | Notes
98 | -----
99 |     If `processes` is 1, `pmap` reduces to `map`, with the added benefit
100 |     of using `args` and `kwargs`.
101 | """
102 | if kwargs is None:
103 | kwargs = dict()
104 |
105 | if args is None:
106 | args = tuple()
107 |
108 | func = partial(func, *args, **kwargs)
109 |
110 | if processes == 1:
111 | yield from map(func, iterable)
112 | return
113 |
114 | with Pool(processes) as pool:
115 | chunksize = 1
116 | if isinstance(iterable, Sized):
117 | chunksize = max(1, int(len(iterable) / pool._processes))
118 | elif ntotal is not None:
119 | chunksize = max(1, int(ntotal / pool._processes))
120 |
121 | yield from pool.imap(func=func, iterable=iterable, chunksize=chunksize)
122 |
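# Example usage (an illustrative sketch): `kwargs` are forwarded to every call,
# and results are yielded in the same order as the inputs.
#
#     from npstreams import pmap
#
#     rounded = list(pmap(round, [1.234, 5.678, 9.1011], kwargs={"ndigits": 1}, processes=2))
#     # rounded == [1.2, 5.7, 9.1]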
123 |
124 | def pmap_unordered(func, iterable, args=None, kwargs=None, processes=1, ntotal=None):
125 | """
126 |     Parallel application of a function with keyword arguments, in no particular order.
127 |     This can reduce memory usage because results do not need to be buffered in order to preserve the input ordering.
128 |
129 | Parameters
130 | ----------
131 | func : callable
132 | Function to be applied to every element of `iterable`.
133 | iterable : iterable
134 | Iterable of items to be mapped.
135 | args : tuple or None, optional
136 | Positional arguments of `function`.
137 | kwargs : dictionary or None, optional
138 | Keyword arguments of `function`.
139 | processes : int or None, optional
140 | Number of processes to use. If `None`, maximal number of processes
141 | is used. Default is one.
142 | ntotal : int or None, optional
143 | If the length of `iterable` is known, but passing `iterable` as a list
144 | would take too much memory, the total length `ntotal` can be specified. This
145 | allows for `pmap` to chunk better.
146 |
147 | Yields
148 | ------
149 | Mapped values.
150 |
151 | See Also
152 | --------
153 | pmap : parallel map that preserves order
154 |
155 | Notes
156 | -----
157 |     If `processes` is 1, `pmap_unordered` reduces to `map`, with the added benefit
158 |     of using `args` and `kwargs`.
159 | """
160 | if kwargs is None:
161 | kwargs = dict()
162 |
163 | if args is None:
164 | args = tuple()
165 |
166 | func = partial(func, *args, **kwargs)
167 |
168 | if processes == 1:
169 | yield from map(func, iterable)
170 | return
171 |
172 | with Pool(processes) as pool:
173 | chunksize = 1
174 | if isinstance(iterable, Sized):
175 | chunksize = max(1, int(len(iterable) / pool._processes))
176 | elif ntotal is not None:
177 | chunksize = max(1, int(ntotal / pool._processes))
178 |
179 | yield from pool.imap_unordered(
180 | func=func, iterable=iterable, chunksize=chunksize
181 | )
182 |
--------------------------------------------------------------------------------
/npstreams/tests/test_reduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 |
5 | from npstreams import ireduce_ufunc, preduce_ufunc, last, nan_to_num, reduce_ufunc
6 | import pytest
7 |
8 | # Only testing binary ufuncs that support floats
9 | # i.e. leaving bitwise_* and logical_* behind
10 | # Also, numpy.ldexp takes in ints and floats separately, so
11 | # leave it behind
12 | UFUNCS = (
13 | np.add,
14 | np.subtract,
15 | np.multiply,
16 | np.divide,
17 | np.logaddexp,
18 | np.logaddexp2,
19 | np.true_divide,
20 | np.floor_divide,
21 | np.power,
22 | np.remainder,
23 | np.mod,
24 | np.fmod,
25 | np.arctan2,
26 | np.hypot,
27 | np.maximum,
28 | np.fmax,
29 | np.minimum,
30 | np.fmin,
31 | np.copysign,
32 | np.nextafter,
33 | )
34 |
35 | UFUNCS_WITH_IDENTITY = list(filter(lambda u: u.identity is not None, UFUNCS))
36 |
37 |
38 | def test_ireduce_ufunc_no_side_effects():
39 | """Test that no arrays in the stream are modified"""
40 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
41 | stack = np.stack(source, axis=-1)
42 | for arr in source:
43 | arr.setflags(write=False)
44 | out = last(ireduce_ufunc(source, np.add))
45 |
46 |
47 | def test_ireduce_ufunc_single_array():
48 | """Test ireduce_ufunc on a single array, not a sequence"""
49 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
50 | stack = np.stack(source, axis=-1)
51 | source = np.ones((16, 16), dtype=int)
52 | out = last(ireduce_ufunc(source, np.add, axis=-1))
53 | assert np.allclose(source, out)
54 |
55 |
56 | def test_ireduce_ufunc_out_parameter():
57 | """Test that the kwargs ``out`` is correctly passed to reduction function"""
58 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
59 | stack = np.stack(source, axis=-1)
60 | not_out = last(ireduce_ufunc(source, np.add, axis=-1))
61 | out = np.empty_like(source[0])
62 | last(ireduce_ufunc(source, ufunc=np.add, out=out))
63 |
64 | assert np.allclose(not_out, out)
65 |
66 | not_out = last(ireduce_ufunc(source, np.add, axis=2))
67 | out = np.empty_like(source[0])
68 | from_out = last(ireduce_ufunc(source, ufunc=np.add, out=out, axis=2))
69 |
70 | assert np.allclose(not_out, from_out)
71 |
72 |
73 | def test_ireduce_ufunc_ignore_nan_no_identity():
74 | """Test ireduce_ufunc on an ufunc with no identity raises
75 | an error for ignore_nan = True"""
76 | source = [np.ones((16, 16), dtype=int) for _ in range(5)]
77 | with pytest.raises(ValueError):
78 | ireduce_ufunc(source, np.maximum, axis=-1, ignore_nan=True)
79 |
80 |
81 | def test_ireduce_ufunc_non_ufunc():
82 | """Test that ireduce_ufunc raises TypeError when a non-ufunc is passed"""
83 | with pytest.raises(TypeError):
84 | ireduce_ufunc(range(10), ufunc=lambda x: x)
85 |
86 |
87 | def test_ireduce_ufunc_non_binary_ufunc():
88 | """Test that ireduce_ufunc raises ValueError if non-binary ufunc is used"""
89 | with pytest.raises(ValueError):
90 | ireduce_ufunc(range(10), ufunc=np.absolute)
91 |
92 |
93 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None))
94 | def test_ireduce_ufunc_output_shape(axis):
95 | """Test output shape"""
96 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
97 | stack = np.stack(source, axis=-1)
98 |
99 | from_numpy = np.add.reduce(stack, axis=axis)
100 | out = last(ireduce_ufunc(source, np.add, axis=axis))
101 | assert from_numpy.shape == out.shape
102 | assert np.allclose(out, from_numpy)
103 |
104 |
105 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None))
106 | def test_ireduce_ufunc_length(axis):
107 | """Test that the number of elements yielded by ireduce_ufunc is correct"""
108 |
109 | source = (np.zeros((16, 5, 8)) for _ in range(10))
110 | out = list(ireduce_ufunc(source, np.add, axis=axis))
111 | assert 10 == len(out)
112 |
113 |
114 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None))
115 | def test_ireduce_ufunc_ignore_nan(axis):
116 | """Test that ignore_nan is working"""
117 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
118 | stack = np.stack(source, axis=-1)
119 |
120 | out = last(ireduce_ufunc(source, np.add, axis=axis, ignore_nan=True))
121 | assert not np.any(np.isnan(out))
122 |
123 |
124 | def test_preduce_ufunc_trivial():
125 | """Test preduce_ufunc for a sum of zeroes over two processes"""
126 | stream = [np.zeros((8, 8)) for _ in range(10)]
127 | s = preduce_ufunc(stream, ufunc=np.add, processes=2, ntotal=10)
128 | assert np.allclose(s, np.zeros_like(s))
129 |
130 |
131 | def test_preduce_ufunc_correctness():
132 | """Test preduce_ufunc is equivalent to reduce_ufunc for random sums"""
133 | stream = [np.random.random((8, 8)) for _ in range(20)]
134 |     s = preduce_ufunc(stream, ufunc=np.add, processes=3, ntotal=20)
135 | assert np.allclose(s, reduce_ufunc(stream, np.add))
136 |
137 |
138 | # Dynamic generation of tests on binary ufuncs
139 | @pytest.mark.parametrize("ufunc", UFUNCS)
140 | @pytest.mark.parametrize("axis", (0, 1, 2, -1))
141 | def test_binary_ufunc(ufunc, axis):
142 | """Generate a test to ensure that ireduce_ufunc(..., ufunc, ...)
143 | works as intendent."""
144 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
145 | stack = np.stack(source, axis=-1)
146 |
147 | def sufunc(arrays, axis=-1): # s for stream
148 | return last(ireduce_ufunc(arrays, ufunc, axis=axis))
149 |
150 | from_numpy = ufunc.reduce(stack, axis=axis)
151 | from_sufunc = sufunc(source, axis=axis)
152 | assert from_sufunc.shape == from_numpy.shape
153 | assert np.allclose(from_numpy, from_sufunc)
154 |
155 |
156 | @pytest.mark.parametrize("ufunc", UFUNCS_WITH_IDENTITY)
157 | def test_binary_ufunc_ignore_nan(ufunc):
158 | """Generate a test to ensure that ireduce_ufunc(..., ufunc, ...)
159 |     works as intended with NaNs in the stream."""
160 |
161 | source = [np.random.random((16, 5, 8)) for _ in range(10)]
162 | source[0][0, 0, 0] = np.nan
163 | stack = nan_to_num(np.stack(source, axis=-1), fill_value=ufunc.identity)
164 |
165 |     def sufunc(arrays, ignore_nan=False):  # s for stream
166 |         return last(ireduce_ufunc(arrays, ufunc, axis=1, ignore_nan=ignore_nan))
167 |
168 | from_numpy = ufunc.reduce(stack, axis=1)
169 |     from_sufunc = sufunc(source, ignore_nan=True)
170 | assert from_numpy.shape == from_sufunc.shape
171 | assert np.allclose(from_numpy, from_sufunc)
172 |
--------------------------------------------------------------------------------
/npstreams/iter_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Iterator/Generator utilities
4 | ----------------------------
5 | """
6 | from collections import deque
7 | from functools import wraps
8 | from itertools import chain, islice, tee
9 |
10 |
11 | def primed(gen):
12 | """
13 | Decorator that primes a generator function, i.e. runs the function
14 | until the first ``yield`` statement. Useful in cases where there
15 | are preliminary checks when creating the generator.
16 | """
17 |
18 | @wraps(gen)
19 | def primed_gen(*args, **kwargs):
20 | generator = gen(*args, **kwargs)
21 | next(generator)
22 | return generator
23 |
24 | return primed_gen
25 |
26 |
27 | @primed
28 | def chunked(iterable, chunksize):
29 | """
30 | Generator yielding multiple iterables of length 'chunksize'.
31 |
32 | Parameters
33 | ----------
34 | iterable : iterable
35 | Iterable to be chunked.
36 | chunksize : int
37 | Chunk size.
38 |
39 | Yields
40 | ------
41 | chunk : iterable
42 |         Iterable of size `chunksize`. If the length of `iterable` is not
43 |         divisible by `chunksize`, the last chunk will be smaller.
44 |
45 | Raises
46 | ------
47 | TypeError : if `chunksize` is not an integer.
48 | """
49 | if not isinstance(chunksize, int):
50 | raise TypeError(
51 | f"Expected `chunksize` to be an integer, but received {chunksize}"
52 | )
53 |
54 |     yield  # sentinel consumed by the @primed decorator, so the checks above run eagerly
55 |
56 | iterable = iter(iterable)
57 |
58 | next_chunk = tuple(islice(iterable, chunksize))
59 | while next_chunk:
60 | yield next_chunk
61 | next_chunk = tuple(islice(iterable, chunksize))
62 |
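# Example usage (an illustrative sketch):
#
#     from npstreams import chunked
#
#     list(chunked(range(10), chunksize=4))
#     # [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9)]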
63 |
64 | def peek(iterable):
65 | """
66 | Peek ahead in an iterable.
67 |
68 | Parameters
69 | ----------
70 | iterable : iterable
71 |
72 | Returns
73 | -------
74 | first : object
75 | First element of ``iterable``
76 | stream : iterable
77 | Iterable containing ``first`` and all other elements from ``iterable``
78 | """
79 | iterable = iter(iterable)
80 | ahead = next(iterable)
81 | return ahead, chain([ahead], iterable)
82 |
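# Example usage (an illustrative sketch): inspect the first element of a
# generator without losing it.
#
#     from npstreams import peek
#
#     stream = (n for n in range(3))
#     first, stream = peek(stream)
#     assert first == 0
#     assert list(stream) == [0, 1, 2]  # the peeked element is still there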
83 |
84 | def itercopy(iterable, copies=2):
85 | """
86 | Split iterable into 'copies'. Once this is done, the original iterable *should
87 | not* be used again.
88 |
89 | Parameters
90 | ----------
91 | iterable : iterable
92 | Iterable to be split. Once it is split, the original iterable
93 | should not be used again.
94 | copies : int, optional
95 | Number of copies. Also determines the number of returned iterables.
96 |
97 | Returns
98 | -------
99 | iter1, iter2, ... : iterable
100 | Copies of ``iterable``.
101 |
102 | Examples
103 | --------
104 | By rebinding the name of the original iterable, we make sure that it
105 | will never be used again.
106 |
107 | >>> from npstreams import itercopy
108 | >>> evens = (2*n for n in range(1000))
109 | >>> evens, evens_copy = itercopy(evens, copies = 2)
110 |
111 | See Also
112 | --------
113 | itertools.tee : equivalent function
114 | """
115 | # itercopy is included because documentation of itertools.tee isn't obvious
116 | # to everyone
117 | return tee(iterable, copies)
118 |
119 |
120 | def linspace(start, stop, num, endpoint=True):
121 | """
122 | Generate linear space. This is sometimes more appropriate than
123 | using `range`.
124 |
125 | Parameters
126 | ----------
127 | start : float
128 | The starting value of the sequence.
129 | stop : float
130 | The end value of the sequence.
131 | num : int
132 | Number of samples to generate.
133 | endpoint : bool, optional
134 | If True (default), the endpoint is included in the linear space.
135 |
136 | Yields
137 | ------
138 | val : float
139 |
140 | See also
141 | --------
142 | numpy.linspace : generate linear space as a dense array.
143 | """
144 | # If endpoint are to be counted in,
145 | # step does not count the last yield
146 | if endpoint:
147 | num -= 1
148 |
149 | step = (stop - start) / num
150 |
151 | val = start
152 | for _ in range(num):
153 | yield val
154 | val += step
155 |
156 | if endpoint:
157 | yield stop
158 |
159 |
160 | def multilinspace(start, stop, num, endpoint=True):
161 | """
162 |     Generate a multilinear space: element-wise linear spaces between the values of two iterables.
163 |
164 | Parameters
165 | ----------
166 | start : iterable of floats
167 | The starting value. This iterable will be consumed.
168 | stop : iterable of floats
169 | The end value. This iterable will be consumed.
170 | num : int
171 | Number of samples to generate.
172 | endpoint : bool, optional
173 | If True (default), the endpoint is included in the linear space.
174 |
175 | Yields
176 | ------
177 | val : tuple
178 | Tuple of the same length as start and stop
179 |
180 | Examples
181 | --------
182 | >>> from npstreams import multilinspace
183 | >>> multispace = multilinspace(start = (0, 0), stop = (1, 1), num = 4, endpoint = False)
184 | >>> print(list(multispace))
185 | [(0, 0), (0.25, 0.25), (0.5, 0.5), (0.75, 0.75)]
186 |
187 | See also
188 | --------
189 | linspace : generate a linear space between two numbers
190 | """
191 | start, stop = tuple(start), tuple(stop)
192 | if len(start) != len(stop):
193 | raise ValueError("start and stop must have the same length")
194 |
195 | spaces = tuple(
196 | linspace(a, b, num=num, endpoint=endpoint) for a, b in zip(start, stop)
197 | )
198 | yield from zip(*spaces)
199 |
200 |
201 | def last(stream):
202 | """
203 |     Retrieve the last item from a stream/iterator, consuming the
204 |     stream in the process. If the stream is empty, a RuntimeError is raised.
205 | """
206 | # Wonderful idea from itertools recipes
207 | # https://docs.python.org/3.9/library/itertools.html#itertools-recipes
208 | try:
209 | return deque(stream, maxlen=1)[0]
210 | except IndexError:
211 | raise RuntimeError("Empty stream")
212 |
213 |
214 | def cyclic(iterable):
215 | """
216 | Yields cyclic permutations of an iterable.
217 |
218 | Examples
219 | --------
220 | >>> from npstreams import cyclic
221 | >>> list(cyclic((1,2,3)))
222 | [(1, 2, 3), (3, 1, 2), (2, 3, 1)]
223 | """
224 | iterable = tuple(iterable)
225 | n = len(iterable)
226 | yield from (tuple(iterable[i - j] for i in range(n)) for j in range(n))
227 |
228 |
229 | def length_hint(obj, default=0):
230 | """
231 | Return an estimate of the number of items in ``obj``.
232 |
233 | This is useful for presizing containers when building from an
234 | iterable.
235 |
236 | If the object supports len(), the result will be
237 | exact. Otherwise, it may over- or under-estimate by an
238 | arbitrary amount. The result will be an integer >= 0.
239 |
240 | Notes
241 | -----
242 | Source : https://www.python.org/dev/peps/pep-0424/
243 |
244 | Examples
245 | --------
246 | >>> from npstreams import length_hint
247 | >>> length_hint([1,2,3,4,5]) # Should be exact
248 | 5
249 | >>> length_hint(None, default = 15) # Does not implement __length_hint__
250 | 15
251 | """
252 | try:
253 | return len(obj)
254 | except TypeError:
255 | try:
256 | get_hint = type(obj).__length_hint__
257 | except AttributeError:
258 | return default
259 | try:
260 | hint = get_hint(obj)
261 | except TypeError:
262 | return default
263 | if hint is NotImplemented:
264 | return default
265 | if not isinstance(hint, int):
266 | raise TypeError("Length hint must be an integer, not %r" % type(hint))
267 | if hint < 0:
268 | raise ValueError("__length_hint__() should return >= 0")
269 | return hint
270 |
--------------------------------------------------------------------------------
/npstreams/cuda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | CUDA-accelerated streaming operations
4 | -------------------------------------
5 | """
6 | from functools import partial
7 | from itertools import repeat
8 | from operator import iadd, imul
9 | from subprocess import run, PIPE
10 |
11 | import numpy as np
12 |
13 | from . import array_stream, itercopy, nan_to_num, peek
14 |
15 | # Determine whether:
16 | #   1. pycuda is installed;
17 | #   2. pycuda can compile with nvcc;
18 | #   3. a GPU is available.
19 |
20 | try:
21 | import pycuda.gpuarray as gpuarray
22 | import pycuda.autoinit
23 | except ImportError:
24 | raise ImportError("PyCUDA is not installed. CUDA capabilities are not available.")
25 | else:
26 | import pycuda.driver as driver
27 | from pycuda.compiler import SourceModule
28 |
29 |     # Check if the nvcc compiler is installed at all; running through the shell
30 |     # means a missing `nvcc` gives a nonzero return code instead of raising FileNotFoundError
31 |     nvcc_installed = run("nvcc -h", shell=True, stdout=PIPE, stderr=PIPE).returncode == 0
31 | if not nvcc_installed:
32 | raise ImportError("CUDA compiler `nvcc` not installed.")
33 |
34 | # Check that nvcc is at least set up properly
35 | # For example, if nvcc is installed but C++ compiler is not in path
36 | try:
37 | SourceModule("")
38 | except driver.CompileError:
39 | raise ImportError("CUDA compiler `nvcc` is not properly set up.")
40 |
41 | if driver.Device.count() == 0:
42 | raise ImportError("No GPU is available.")
43 |
44 |
45 | @array_stream
46 | def cuda_inplace_reduce(arrays, operator, dtype=None, ignore_nan=False, identity=0):
47 | """
48 | Inplace reduce on GPU arrays.
49 |
50 | Parameters
51 | ----------
52 | arrays : iterable
53 | Arrays to be reduced.
54 | operator : callable
55 | Callable of two arguments. This operator should operate in-place, storing the results into
56 | the buffer of the first argument, e.g. operator.iadd
57 | dtype : numpy.dtype, optional
58 | Arrays of the stream are cast to this dtype before reduction.
59 | ignore_nan : bool, optional
60 | If True, NaNs are replaced with ``identity``. Default is propagation of NaNs.
61 | identity : float, optional
62 | If ``ignore_nan = True``, NaNs are replaced with this value.
63 |
64 | Returns
65 | -------
66 | out : ndarray
67 | """
68 | # No need to cast all arrays if ``dtype`` is the same
69 | # type as the stream
70 | first, arrays = peek(arrays)
71 | if (dtype is not None) and (first.dtype != dtype):
72 | arrays = map(lambda arr: arr.astype(dtype), arrays)
73 |
74 | if ignore_nan:
75 | arrays = map(partial(nan_to_num, fill_value=identity), arrays)
76 |
77 | acc_gpu = gpuarray.to_gpu(next(arrays)) # Accumulator
78 | arr_gpu = gpuarray.empty_like(acc_gpu) # GPU memory location for each array
79 | for arr in arrays:
80 | arr_gpu.set(arr)
81 | operator(acc_gpu, arr_gpu)
82 |
83 | return acc_gpu.get()
84 |
85 |
86 | def csum(arrays, dtype=None, ignore_nan=False):
87 | """
88 | CUDA-enabled sum of stream of arrays. Arrays are summed along
89 | the streaming axis for performance reasons.
90 |
91 | Parameters
92 | ----------
93 | arrays : iterable
94 |         Arrays to be summed. If `dtype` is provided, arrays are cast to it before summation.
95 | ignore_nan : bool, optional
96 | If True, NaNs are ignored. Default is propagation of NaNs.
97 |
98 | Returns
99 | -------
100 | cuda_sum : ndarray
101 |
102 | See Also
103 | --------
104 | isum : streaming sum of array elements, possibly along different axes
105 | """
106 | return cuda_inplace_reduce(
107 | arrays, operator=iadd, dtype=dtype, ignore_nan=ignore_nan, identity=0
108 | )
109 |
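# Example usage (an illustrative sketch; requires a working CUDA setup):
#
#     import numpy as np
#     from npstreams.cuda import csum
#
#     stream = (np.random.random((64, 64)) for _ in range(100))
#     total = csum(stream)  # summed along the streaming axis on the GPU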
110 |
111 | def cprod(arrays, dtype=None, ignore_nan=False):
112 | """
113 | CUDA-enabled product of a stream of arrays. Arrays are multiplied
114 | along the streaming axis for performance reasons.
115 |
116 | Parameters
117 | ----------
118 | arrays : iterable
119 | Arrays to be multiplied.
120 | dtype : numpy.dtype, optional
121 | The type of the yielded array and of the accumulator in which the elements
122 | are summed. The dtype of a is used by default unless a has an integer dtype
123 | of less precision than the default platform integer. In that case, if a is
124 | signed then the platform integer is used while if a is unsigned then an
125 | unsigned integer of the same precision as the platform integer is used.
126 | ignore_nan : bool, optional
127 | If True, NaNs are ignored. Default is propagation of NaNs.
128 |
129 |     Returns
130 |     -------
131 |     cuda_prod : ndarray
132 | """
133 | return cuda_inplace_reduce(
134 | arrays, operator=imul, dtype=dtype, ignore_nan=ignore_nan, identity=1
135 | )
136 |
137 |
138 | @array_stream
139 | def cmean(arrays, ignore_nan=False):
140 | """
141 | CUDA-enabled mean of stream of arrays (i.e. unweighted average). Arrays are averaged
142 | along the streaming axis for performance reasons.
143 |
144 | Parameters
145 | ----------
146 | arrays : iterable of ndarrays
147 |         Arrays to be averaged. This iterable can also be a generator.
148 | ignore_nan : bool, optional
149 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
150 |
151 | Returns
152 | -------
153 | cuda_mean : ndarray
154 |
155 | See also
156 | --------
157 | caverage : CUDA-enabled weighted average
158 | imean : streaming mean of arrays, possibly along different axes
159 | """
160 | first, arrays = peek(arrays)
161 |
162 | # Need to know which array has NaNs, and modify the weights stream accordingly
163 | if ignore_nan:
164 | arrays, arrays2 = itercopy(arrays)
165 |         # Zero weight wherever an array element is NaN, unit weight elsewhere;
166 |         # `caverage` will broadcast and cast these boolean weights appropriately.
167 |         weights = map(lambda arr: np.logical_not(np.isnan(arr)), arrays2)
168 | arrays = map(np.nan_to_num, arrays)
169 | return caverage(arrays, weights, ignore_nan=False)
170 |
171 | accumulator = gpuarray.to_gpu(next(arrays))
172 | array_gpu = gpuarray.empty_like(accumulator)
173 | num_arrays = 1
174 | for arr in arrays:
175 | num_arrays += 1
176 | array_gpu.set(arr)
177 | accumulator += array_gpu
178 |
179 | return accumulator.get() / num_arrays
180 |
181 |
182 | @array_stream
183 | def caverage(arrays, weights=None, ignore_nan=False):
184 | """
185 | CUDA-enabled average of stream of arrays, possibly weighted. Arrays are averaged
186 | along the streaming axis for performance reasons.
187 |
188 | Parameters
189 | ----------
190 | arrays : iterable of ndarrays
191 |         Arrays to be averaged. This iterable can also be a generator.
192 | weights : iterable of ndarray, iterable of floats, or None, optional
193 |         Iterable of weights associated with the values in each item of `arrays`.
194 |         Each value in an element of `arrays` contributes to the average
195 |         according to its associated weight. The weights can either be floats
196 |         or arrays of the same shape as any element of `arrays`. If weights=None,
197 |         then all data in each element of `arrays` are assumed to have a weight equal to one.
198 | ignore_nan : bool, optional
199 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
200 |
201 | Returns
202 | -------
203 | cuda_avg : ndarray
204 |
205 | See also
206 | --------
207 | iaverage : streaming weighted average, possibly along different axes
208 | """
209 | if weights is None:
210 | return cmean(arrays, ignore_nan)
211 |
212 | first, arrays = peek(arrays)
213 |
214 |     # We make sure that weights is always a stream of arrays; this
215 |     # simplifies the handling of NaNs.
216 |     # Note that `weights` cannot be None at this point: that case
217 |     # returned early through `cmean` above.
218 | weights = map(partial(np.broadcast_to, shape=first.shape), weights)
219 | weights = map(
220 | lambda arr: arr.astype(first.dtype), weights
221 | ) # Won't work without this
222 |
223 | # Need to know which array has NaNs, and modify the weights stream accordingly
224 | if ignore_nan:
225 | arrays, arrays2 = itercopy(arrays)
226 | weights = map(
227 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights
228 | )
229 | arrays = map(np.nan_to_num, arrays)
230 |
231 | first = next(arrays)
232 | fst_wgt = next(weights)
233 |
234 | arr_gpu = gpuarray.to_gpu(first * fst_wgt)
235 | wgt_gpu = gpuarray.to_gpu(fst_wgt)
236 | for arr, wgt in zip(arrays, weights):
237 | arr_gpu += gpuarray.to_gpu(arr) * gpuarray.to_gpu(wgt)
238 | wgt_gpu += gpuarray.to_gpu(wgt)
239 |
240 | arr_gpu /= wgt_gpu
241 | return arr_gpu.get()
242 |
--------------------------------------------------------------------------------
/npstreams/tests/test_numerics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from random import randint, random
4 |
5 | import numpy as np
6 |
7 | from npstreams import isum, iprod, last, isub, iany, iall, prod
8 | from npstreams import sum as nssum # avoiding name clashes
9 | import pytest
10 |
11 |
12 | def test_isum_trivial():
13 | """Test a sum of zeros"""
14 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
15 | summed = last(isum(source))
16 | assert np.allclose(summed, np.zeros_like(summed))
17 |
18 |
19 | def test_isum_ignore_nans():
20 | """Test a sum of zeros with NaNs sprinkled"""
21 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
22 | source.append(np.full((16,), fill_value=np.nan))
23 | summed = last(isum(source, ignore_nan=True))
24 | assert np.allclose(summed, np.zeros_like(summed))
25 |
26 |
27 | def test_isum_length():
28 | """Test that the number of yielded elements is the same as source"""
29 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
30 | summed = list(isum(source, axis=0))
31 | assert 10 == len(summed)
32 |
33 |
34 | def test_isum_dtype():
35 | """Test a sum of floating zeros with an int accumulator"""
36 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
37 | summed = last(isum(source, dtype=int))
38 | assert np.allclose(summed, np.zeros_like(summed))
39 | assert summed.dtype == int
40 |
41 |
42 | def test_isum_axis():
43 | """Test that isum(axis = 0) yields 0d arrays"""
44 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
45 |
46 | summed = last(isum(source, axis=0))
47 | assert np.allclose(summed, np.zeros_like(summed))
48 |
49 | summed = last(isum(source, axis=None))
50 | assert np.allclose(summed, 0)
51 |
52 |
53 | def test_isum_return_shape():
54 | """Test that the shape of output is as expected"""
55 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
56 |
57 | summed = last(isum(source, axis=0))
58 | assert summed.shape == (1, 10)
59 |
60 |
61 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
62 | def test_isum_against_numpy(axis):
63 | """Test that isum() returns the same as numpy.sum() for various axis inputs"""
64 |
65 | stream = [np.random.random((16, 16)) for _ in range(10)]
66 | stack = np.dstack(stream)
67 |
68 | from_numpy = np.sum(stack, axis=axis)
69 | from_isum = last(isum(stream, axis=axis))
70 | assert np.allclose(from_isum, from_numpy)
71 |
72 |
73 | def test_sum_trivial():
74 | """Test a sum of zeros"""
75 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
76 | summed = nssum(source)
77 | assert np.allclose(summed, np.zeros_like(summed))
78 |
79 |
80 | def test_sum_ignore_nans():
81 | """Test a sum of zeros with NaNs sprinkled"""
82 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
83 | source.append(np.full((16,), fill_value=np.nan))
84 | summed = nssum(source, ignore_nan=True)
85 | assert np.allclose(summed, np.zeros_like(summed))
86 |
87 |
88 | def test_sum_dtype():
89 | """Test a sum of floating zeros with an int accumulator"""
90 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
91 | summed = nssum(source, dtype=int)
92 | assert np.allclose(summed, np.zeros_like(summed))
93 | assert summed.dtype == int
94 |
95 |
96 | def test_sum_axis():
97 | """Test that isum(axis = 0) yields 0d arrays"""
98 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
99 |
100 | summed = nssum(source, axis=0)
101 | assert np.allclose(summed, np.zeros_like(summed))
102 |
103 | summed = nssum(source, axis=None)
104 | assert np.allclose(summed, 0)
105 |
106 |
107 | def test_sum_return_shape():
108 | """Test that the shape of output is as expected"""
109 | source = [np.zeros((16,), dtype=float) for _ in range(10)]
110 |
111 | summed = nssum(source, axis=0)
112 | assert summed.shape == (1, 10)
113 |
114 |
115 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
116 | def test_sum_against_numpy(axis):
117 | """Test that isum() returns the same as numpy.sum() for various axis inputs"""
118 |
119 | stream = [np.random.random((16, 16)) for _ in range(10)]
120 | stack = np.dstack(stream)
121 |
122 | from_numpy = np.sum(stack, axis=axis)
123 | from_sum = nssum(stream, axis=axis)
124 | assert np.allclose(from_sum, from_numpy)
125 |
126 |
127 | def test_iprod_trivial():
128 | """Test a product of ones"""
129 | source = [np.ones((16,), dtype=float) for _ in range(10)]
130 | product = last(iprod(source))
131 | assert np.allclose(product, np.ones_like(product))
132 |
133 |
134 | def test_iprod_ignore_nans():
135 | """Test that NaNs are ignored."""
136 | source = [np.ones((16,), dtype=float) for _ in range(10)]
137 | source.append(np.full_like(source[0], np.nan))
138 | product = last(iprod(source, ignore_nan=True))
139 | assert np.allclose(product, np.ones_like(product))
140 |
141 |
142 | def test_iprod_dtype():
143 | """Test that dtype argument is working"""
144 | source = [np.ones((16,), dtype=float) for _ in range(10)]
145 | product = last(iprod(source, dtype=int))
146 | assert np.allclose(product, np.ones_like(product))
147 | assert product.dtype == int
148 |
149 |
150 | def test_iprod_axis():
151 | """Test that iprod(axis = 0) yields 0d arrays"""
152 | source = [np.ones((16,), dtype=float) for _ in range(10)]
153 |
154 | summed = last(iprod(source, axis=0))
155 | assert np.all(summed == 1)
156 |
157 | summed = last(iprod(source, axis=None))
158 | assert np.allclose(summed, np.ones_like(summed))
159 |
160 |
161 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
162 | def test_iprod_against_numpy(axis):
163 | """Test that iprod() returns the same as numpy.prod() for various axis inputs"""
164 |
165 | stream = [np.random.random((16, 16)) for _ in range(10)]
166 | stack = np.dstack(stream)
167 |
168 | from_numpy = np.prod(stack, axis=axis)
169 | from_stream = last(iprod(stream, axis=axis))
170 | assert np.allclose(from_stream, from_numpy)
171 |
172 |
173 | def test_prod_trivial():
174 | """Test a product of ones"""
175 | source = [np.ones((16,), dtype=float) for _ in range(10)]
176 | product = prod(source)
177 | assert np.allclose(product, np.ones_like(product))
178 |
179 |
180 | def test_prod_ignore_nans():
181 | """Test that NaNs are ignored."""
182 | source = [np.ones((16,), dtype=float) for _ in range(10)]
183 | source.append(np.full_like(source[0], np.nan))
184 | product = prod(source, ignore_nan=True)
185 | assert np.allclose(product, np.ones_like(product))
186 |
187 |
188 | def test_prod_dtype():
189 | """Test that dtype argument is working"""
190 | source = [np.ones((16,), dtype=float) for _ in range(10)]
191 | product = prod(source, dtype=int)
192 | assert np.allclose(product, np.ones_like(product))
193 | assert product.dtype == int
194 |
195 |
196 | def test_prod_axis():
197 | """Test that iprod(axis = 0) yields 0d arrays"""
198 | source = [np.ones((16,), dtype=float) for _ in range(10)]
199 |
200 | summed = prod(source, axis=0)
201 | assert np.all(summed == 1)
202 |
203 | summed = prod(source, axis=None)
204 | assert np.allclose(summed, np.ones_like(summed))
205 |
206 |
207 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
208 | def test_prod_against_numpy(axis):
209 | """Test that iprod() returns the same as numpy.prod() for various axis inputs"""
210 |
211 | stream = [np.random.random((16, 16)) for _ in range(10)]
212 | stack = np.dstack(stream)
213 |
214 | from_numpy = np.prod(stack, axis=axis)
215 | from_stream = prod(stream, axis=axis)
216 | assert np.allclose(from_stream, from_numpy)
217 |
218 |
219 | @pytest.mark.parametrize("axis", (0, 1, 2))
220 | def test_isub_against_numpy(axis):
221 | """Test against numpy.subtract.reduce"""
222 | stream = [np.random.random((8, 16, 2)) for _ in range(11)]
223 | stack = np.stack(stream, axis=-1)
224 |
225 | from_numpy = np.subtract.reduce(stack, axis=axis)
226 | from_stream = last(isub(stream, axis=axis))
227 | assert np.allclose(from_numpy, from_stream)
228 |
229 |
230 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
231 | def test_iall_against_numpy(axis):
232 | """Test iall against numpy.all"""
233 | stream = [np.zeros((8, 16, 2)) for _ in range(11)]
234 | stream[3][3, 0, 1] = 1 # so that np.all(axis = None) evaluates to False
235 | stack = np.stack(stream, axis=-1)
236 |
237 | from_numpy = np.all(stack, axis=axis)
238 | from_stream = last(iall(stream, axis=axis))
239 | assert np.allclose(from_numpy, from_stream)
240 |
241 |
242 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
243 | def test_iany_against_numpy(axis):
244 | """Test iany against numpy.any"""
245 | stream = [np.zeros((8, 16, 2)) for _ in range(11)]
246 | stream[3][3, 0, 1] = 1 # so that np.any(axis = None) evaluates to True
247 | stack = np.stack(stream, axis=-1)
248 |
249 | from_numpy = np.any(stack, axis=axis)
250 | from_stream = last(iany(stream, axis=axis))
251 | assert np.allclose(from_numpy, from_stream)
252 |
--------------------------------------------------------------------------------
/npstreams/benchmarks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Reliably benchmarking npstreams performance.
4 | """
5 | import inspect
6 | import sys
7 | import timeit
8 | from collections import namedtuple
9 | from contextlib import redirect_stdout
10 | from functools import partial
11 | from shutil import get_terminal_size
12 |
13 | import numpy as np
14 |
15 | from . import __version__
16 | from .reduce import _check_binary_ufunc
17 |
18 | UFUNC_SETUP = """
19 | from npstreams import reduce_ufunc, stack
20 | import numpy as np
21 | from numpy import {ufunc.__name__}
22 |
23 | np.random.seed(42056)
24 |
25 | def stream():
26 | return (np.random.random({shape}) for _ in range(10))
27 | """
28 |
29 | FUNC_SETUP = """
30 | from npstreams import stack
31 | import numpy as np
32 | from numpy import {func.__name__} as np_{func.__name__}
33 | from npstreams import {func.__name__} as ns_{func.__name__}
34 |
35 | np.random.seed(42056)
36 |
37 | def stream():
38 | return (np.random.random({shape}) for _ in range(10))
39 | """
40 |
41 | BenchmarkResults = namedtuple(
42 | "BenchmarkResults", field_names=["numpy_time", "npstreams_time", "shape"]
43 | )
44 |
45 |
46 | def autotimeit(statement, setup="pass", repeat=3):
47 | """
48 | Time a statement, automatically determining the number of times to
49 | run the statement so that the total execution time is not too short.
50 |
51 | .. versionadded:: 1.5.2
52 |
53 | Parameters
54 | ----------
55 | statement: string
56 | Statement to time. The statement will be executed after the `setup` statement.
57 | setup : string, optional
58 | Setup statement executed before timing starts.
59 | repeat : int, optional
60 | Number of repeated timing to execute.
61 |
62 | Returns
63 | -------
64 | time : float
65 | Minimal time per execution of `statement` [seconds].
66 | """
67 | timer = timeit.Timer(stmt=statement, setup=setup)
68 | number, _ = timer.autorange()
69 | return min(timer.repeat(repeat=repeat, number=number)) / number
70 |
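As a usage sketch (assuming npstreams is installed and importable), timing a dense NumPy sum looks like this; the setup string runs once before timing starts:

    from npstreams.benchmarks import autotimeit

    t = autotimeit(
        "np.sum(x)", setup="import numpy as np; x = np.random.random((64, 64))"
    )
    print(f"{t * 1e6:.1f} microseconds per call")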
71 |
72 | def benchmark(
73 | funcs=[np.average, np.mean, np.std, np.sum, np.prod],
74 | ufuncs=[np.add, np.multiply, np.power, np.true_divide, np.mod],
75 | shapes=[(4, 4), (8, 8), (16, 16), (64, 64)],
76 | file=None,
77 | ):
78 | """
79 | Benchmark npstreams against numpy and print the results.
80 |
81 | There are two categories of benchmarks. The first category compares NumPy functions against
82 | npstreams versions of the same functions. The second category compares NumPy universal functions
83 | against dynamically-generated npstreams versions of those same universal functions.
84 |
85 | All benchmarks compare a reduction operation on a stream of arrays of varying sizes. The sequence length is fixed.
86 |
87 | .. versionadded:: 1.5.2
88 |
89 | Parameters
90 | ----------
91 | funcs : iterable of NumPy functions, optional
92 | NumPy functions to compare. An equivalent must exist in npstreams, e.g. `np.average` and `npstreams.average`.
93 | Functions without equivalents will be skipped.
94 | ufuncs : iterable of NumPy ufunc, optional
95 | Invalid ufuncs (e.g. non-binary ufuncs) will be skipped.
96 | shapes : iterable of tuples, optional
97 | Shapes of arrays to test. Streams of random numbers will be generated with arrays of those shapes.
98 | The sequence lengths are fixed.
99 | file : file-like or None, optional
100 | File to which the benchmark results will be written. If None, sys.stdout will be used.
101 | """
102 | # Preliminaries
103 | console_width = min(get_terminal_size().columns, 80)
104 | func_test_name = "numpy.{f.__name__} vs npstreams.{f.__name__}".format
105 | ufunc_test_name = (
106 | "numpy.{f.__name__} vs npstreams.reduce_ufunc(numpy.{f.__name__}, ...)".format
107 | )
108 |
109 | # Determine column justification from the longest shape string
110 | sh_just = max(map(lambda s: len(str(s)), shapes)) + 10
111 |
112 | # To make it easy to either write the results to a file or print to stdout,
113 | # we redirect stdout.
114 | if file is None:
115 | file = sys.stdout
116 |
117 | with redirect_stdout(file):
118 | # Start benchmarks --------------------------------------------------------
119 | print(
120 | "".ljust(console_width, "*"),
121 | "npstreams performance benchmark".upper().center(console_width),
122 | "",
123 | " npstreams".ljust(15) + f" {__version__}",
124 | " NumPy".ljust(15) + f" {np.__version__}",
125 | "",
126 | " Speedup is NumPy time divided by npstreams time (Higher is better)",
127 | "".ljust(console_width, "*"),
128 | sep="\n",
129 | )
130 |
131 | # Determine valid ufuncs and funcs first ----------------------------------
132 | valid_ufuncs = comparable_ufuncs(ufuncs, file)
133 | valid_funcs = comparable_funcs(funcs, file)
134 |
135 | # Benchmarking functions --------------------------------------------------
136 | for func in sorted(valid_funcs, key=lambda fn: fn.__name__):
137 | print(func_test_name(f=func).center(console_width), "\n")
138 |
139 | for (np_time, ns_time, shape) in benchmark_func(func, shapes):
140 | speedup = np_time / ns_time
141 | print(
142 | " ",
143 | f"shape = {shape}".ljust(sh_just),
144 | f"speedup = {speedup:.4f}x",
145 | )
146 |
147 | print("".ljust(console_width, "-"))
148 |
149 | # Benchmarking universal functions ----------------------------------------
150 | for ufunc in sorted(valid_ufuncs, key=lambda fn: fn.__name__):
151 | print(ufunc_test_name(f=ufunc).center(console_width), "\n")
152 |
153 | for (np_time, ns_time, shape) in benchmark_ufunc(ufunc, shapes):
154 | speedup = np_time / ns_time
155 | print(
156 | " ",
157 | f"shape = {shape}".ljust(sh_just),
158 | f"speedup = {speedup:.4f}x",
159 | )
160 |
161 | print("".ljust(console_width, "-"))
162 |
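For example, a short sketch (the output filename is hypothetical) that restricts the comparison to one function and one ufunc, and writes the report to a file instead of stdout:

    import numpy as np
    from npstreams.benchmarks import benchmark

    with open("bench.txt", "w") as f:
        benchmark(funcs=[np.sum], ufuncs=[np.add], shapes=[(16, 16)], file=f)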
163 |
164 | def benchmark_ufunc(ufunc, shapes):
165 | """
166 | Compare the running time between a NumPy ufunc and the npstreams equivalent.
167 |
168 | Parameters
169 | ----------
170 | ufunc : NumPy ufunc
171 |
172 | shapes : iterable of tuples, optional
173 | Shapes of arrays to test. Streams of random numbers will be generated with arrays of those shapes.
174 | The sequence lengths are fixed.
175 |
176 | Yields
177 | ------
178 | results : BenchmarkResults
179 | """
180 | for shape in shapes:
181 |
182 | numpy_statement = f"{ufunc.__name__}.reduce(stack(stream()), axis = -1)"
183 | npstreams_statement = f"reduce_ufunc(stream(), {ufunc.__name__}, axis = -1)"
184 |
185 | with np.errstate(invalid="ignore"):
186 | np_time = autotimeit(
187 | numpy_statement, UFUNC_SETUP.format(ufunc=ufunc, shape=shape)
188 | )
189 | ns_time = autotimeit(
190 | npstreams_statement, UFUNC_SETUP.format(ufunc=ufunc, shape=shape)
191 | )
192 |
193 | yield BenchmarkResults(np_time, ns_time, shape)
194 |
195 |
196 | def benchmark_func(func, shapes):
197 | """
198 | Compare the running time between a NumPy func and the npstreams equivalent.
199 |
200 | Parameters
201 | ----------
202 | func : NumPy func
203 |
204 | shapes : iterable of tuples, optional
205 | Shapes of arrays to test. Streams of random numbers will be generated with arrays of those shapes.
206 | The sequence lengths are fixed.
207 |
208 | Yields
209 | ------
210 | results : BenchmarkResults
211 | """
212 | for shape in shapes:
213 |
214 | numpy_statement = f"np_{func.__name__}(stack(stream()), axis = -1)"
215 | npstreams_statement = f"ns_{func.__name__}(stream(), axis = -1)"
216 |
217 | with np.errstate(invalid="ignore"):
218 | np_time = autotimeit(
219 | numpy_statement, FUNC_SETUP.format(func=func, shape=shape)
220 | )
221 | ns_time = autotimeit(
222 | npstreams_statement, FUNC_SETUP.format(func=func, shape=shape)
223 | )
224 |
225 | yield BenchmarkResults(np_time, ns_time, shape)
226 |
227 |
228 | def comparable_ufuncs(ufuncs, file):
229 | """
230 | Yields ufuncs that can be compared between numpy and npstreams.
231 |
232 | Parameters
233 | ----------
234 | ufuncs : iterable of NumPy ufunc
235 | NumPy ufuncs to check. Ufuncs that cannot be compared will be skipped.
236 |
237 | Yields
238 | ------
239 | ufunc : callable
240 | NumPy ufuncs that can be compared with npstreams.
241 | """
242 | for ufunc in ufuncs:
243 | if not isinstance(ufunc, np.ufunc):
244 | print(
245 | f"Skipping function {ufunc.__name__} as it is not a NumPy Universal Function"
246 | )
247 | continue
248 |
249 | try:
250 | _check_binary_ufunc(ufunc)
251 | except ValueError:
252 | print(
253 | f"Skipping function {ufunc.__name__} as it is not a valid binary ufunc"
254 | )
255 | else:
256 | yield ufunc
257 |
258 |
259 | def comparable_funcs(funcs, file):
260 | """
261 | Yields NumPy functions that have npstreams equivalents.
262 |
263 | Parameters
264 | ----------
265 | funcs : iterable of NumPy functions
266 | NumPy functions to check. Functions without npstreams equivalents will be skipped.
267 |
268 | Yields
269 | ------
270 | func : callable
271 | NumPy functions that have npstreams equivalents.
272 | """
273 | import npstreams
274 |
275 | npstreams_functions = set(
276 | name for name, value in inspect.getmembers(npstreams, inspect.isfunction)
277 | )
278 | for func in funcs:
279 | if func.__name__ not in npstreams_functions:
280 | print(
281 | f"Skipping function {func.__name__} as there is no npstreams equivalent"
282 | )
283 | else:
284 | yield func
285 |
286 |
287 | if __name__ == "__main__":
288 | benchmark()
289 |
--------------------------------------------------------------------------------
/npstreams/numerics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Numerics Functions
4 | ------------------
5 | """
6 | import numpy as np
7 |
8 | from .reduce import ireduce_ufunc, reduce_ufunc
9 |
10 |
11 | def isum(arrays, axis=-1, dtype=None, ignore_nan=False):
12 | """
13 | Streaming sum of array elements.
14 |
15 | Parameters
16 | ----------
17 | arrays : iterable
18 | Arrays to be summed.
19 | axis : int or None, optional
20 | Reduction axis. Default is to sum the arrays in the stream as if
21 | they had been stacked along a new axis, then sum along this new axis.
22 | If None, arrays are flattened before summing. If `axis` is an int larger than
23 | the number of dimensions in the arrays of the stream, arrays are summed
24 | along the new axis.
25 | dtype : numpy.dtype, optional
26 | The type of the yielded array and of the accumulator in which the elements
27 | are summed. The dtype of the input arrays is used by default unless they have
28 | an integer dtype of less precision than the default platform integer. In that
29 | case, if the input is signed then the platform integer is used, while if it is
30 | unsigned then an unsigned integer of the same precision as the platform integer is used.
31 | ignore_nan : bool, optional
32 | If True, NaNs are ignored. Default is propagation of NaNs.
33 |
34 | Yields
35 | ------
36 | online_sum : ndarray
37 | """
38 | yield from ireduce_ufunc(
39 | arrays, ufunc=np.add, axis=axis, ignore_nan=ignore_nan, dtype=dtype
40 | )
41 |
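A minimal sketch of what this generator yields, assuming a stream of equally-shaped arrays:

    import numpy as np
    from npstreams import isum

    stream = [np.ones((3,)) for _ in range(4)]
    for running in isum(stream):
        print(running)  # [1. 1. 1.], then [2. 2. 2.], up to [4. 4. 4.]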
42 |
43 | def sum(arrays, axis=-1, dtype=None, ignore_nan=False):
44 | """
45 | Sum of arrays in a stream.
46 |
47 | Parameters
48 | ----------
49 | arrays : iterable
50 | Arrays to be summed.
51 | axis : int or None, optional
52 | Reduction axis. Default is to sum the arrays in the stream as if
53 | they had been stacked along a new axis, then sum along this new axis.
54 | If None, arrays are flattened before summing. If `axis` is an int larger than
55 | the number of dimensions in the arrays of the stream, arrays are summed
56 | along the new axis.
57 | dtype : numpy.dtype, optional
58 | The type of the yielded array and of the accumulator in which the elements
59 | are summed. The dtype of the input arrays is used by default unless they have
60 | an integer dtype of less precision than the default platform integer. In that
61 | case, if the input is signed then the platform integer is used, while if it is
62 | unsigned then an unsigned integer of the same precision as the platform integer is used.
63 | ignore_nan : bool, optional
64 | If True, NaNs are ignored. Default is propagation of NaNs.
65 |
66 | Returns
67 | -------
68 | sum : ndarray
69 | """
70 | return reduce_ufunc(
71 | arrays, ufunc=np.add, axis=axis, dtype=dtype, ignore_nan=ignore_nan
72 | )
73 |
74 |
75 | def iprod(arrays, axis=-1, dtype=None, ignore_nan=False):
76 | """
77 | Streaming product of array elements.
78 |
79 | Parameters
80 | ----------
81 | arrays : iterable
82 | Arrays to be multiplied.
83 | axis : int or None, optional
84 | Reduction axis. Default is to multiply the arrays in the stream as if
85 | they had been stacked along a new axis, then multiply along this new axis.
86 | If None, arrays are flattened before multiplication. If `axis` is an int larger than
87 | the number of dimensions in the arrays of the stream, arrays are multiplied
88 | along the new axis.
89 | dtype : numpy.dtype, optional
90 | The type of the yielded array and of the accumulator in which the elements
91 | are multiplied. The dtype of the input arrays is used by default unless they have
92 | an integer dtype of less precision than the default platform integer. In that
93 | case, if the input is signed then the platform integer is used, while if it is
94 | unsigned then an unsigned integer of the same precision as the platform integer is used.
95 | ignore_nan : bool, optional
96 | If True, NaNs are ignored. Default is propagation of NaNs.
97 |
98 | Yields
99 | ------
100 | online_prod : ndarray
101 | """
102 | yield from ireduce_ufunc(
103 | arrays, ufunc=np.multiply, axis=axis, dtype=dtype, ignore_nan=ignore_nan
104 | )
105 |
106 |
107 | def prod(arrays, axis=-1, dtype=None, ignore_nan=False):
108 | """
109 | Product of arrays in a stream.
110 |
111 | Parameters
112 | ----------
113 | arrays : iterable
114 | Arrays to be multiplied.
115 | axis : int or None, optional
116 | Reduction axis. Default is to multiply the arrays in the stream as if
117 | they had been stacked along a new axis, then multiply along this new axis.
118 | If None, arrays are flattened before multiplication. If `axis` is an int larger than
119 | the number of dimensions in the arrays of the stream, arrays are multiplied
120 | along the new axis.
121 | dtype : numpy.dtype, optional
122 | The type of the yielded array and of the accumulator in which the elements
123 | are multiplied. The dtype of the input arrays is used by default unless they have
124 | an integer dtype of less precision than the default platform integer. In that
125 | case, if the input is signed then the platform integer is used, while if it is
126 | unsigned then an unsigned integer of the same precision as the platform integer is used.
127 | ignore_nan : bool, optional
128 | If True, NaNs are ignored. Default is propagation of NaNs.
129 |
130 | Returns
131 | -------
132 | product : ndarray
133 | """
134 | return reduce_ufunc(
135 | arrays, ufunc=np.multiply, axis=axis, dtype=dtype, ignore_nan=ignore_nan
136 | )
137 |
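A quick sketch of the elementwise product across a stream:

    import numpy as np
    from npstreams import prod

    stream = (np.full((2,), 2.0) for _ in range(3))
    print(prod(stream))  # 2 * 2 * 2 elementwise: [8. 8.]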
138 |
139 | def isub(arrays, axis=-1, dtype=None):
140 | """
141 | Subtract elements in a reduction fashion. Equivalent to ``numpy.subtract.reduce`` on a dense array.
142 |
143 | Parameters
144 | ----------
145 | arrays : iterable
146 | Arrays to be multiplied.
147 | axis : int, optional
148 | Reduction axis. Since subtraction is not reorderable (unlike a sum, for example),
149 | `axis` must be specified as an int; full reduction (``axis = None``) will raise an exception.
150 | Default is to subtract the arrays in the stream as if they had been stacked along a new axis,
151 | then subtract along this new axis. If `axis` is an int larger than
152 | the number of dimensions in the arrays of the stream,
153 | arrays are subtracted along the new axis.
154 | dtype : numpy.dtype, optional
155 | The type of the yielded array and of the accumulator in which the elements
156 | are combined. The dtype of the input arrays is used by default unless they have
157 | an integer dtype of less precision than the default platform integer. In that
158 | case, if the input is signed then the platform integer is used, while if it is
159 | unsigned then an unsigned integer of the same precision as the platform integer is used.
160 |
161 | Yields
162 | ------
163 | online_sub : ndarray
164 |
165 | Raises
166 | ------
167 | ValueError
168 | If `axis` is None. Since subtraction is not reorderable (unlike a sum, for example),
169 | `axis` must be specified as an int.
170 | """
171 | if axis is None:
172 | raise ValueError(
173 | "Subtraction is not a reorderable operation, and \
174 | therefore a specific axis must be given."
175 | )
176 | yield from ireduce_ufunc(arrays, ufunc=np.subtract, axis=axis, dtype=dtype)
177 |
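Because the default ``axis = -1`` reduces across the stream in order, the result matches left-to-right subtraction; a small sketch:

    import numpy as np
    from npstreams import isub, last

    stream = [np.full((2,), v, dtype=float) for v in (10, 1, 2)]
    print(last(isub(stream)))  # 10 - 1 - 2 elementwise: [7. 7.]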
178 |
179 | def iall(arrays, axis=-1):
180 | """
181 | Test whether all array elements along a given axis evaluate to True
182 |
183 | Parameters
184 | ----------
185 | arrays : iterable
186 | Arrays to be reduced.
187 | axis : int or None, optional
188 | Axis along which a logical AND reduction is performed. The default
189 | is to perform a logical AND along the 'stream axis', as if all arrays in ``arrays``
190 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened
191 | before reduction.
192 |
193 | Yields
194 | ------
195 | all : ndarray, dtype bool
196 | """
197 | # TODO: use ``where`` keyword to only check places that are already ``True``
198 | yield from ireduce_ufunc(arrays, ufunc=np.logical_and, axis=axis)
199 |
200 |
201 | def iany(arrays, axis=-1):
202 | """
203 | Test whether any array elements along a given axis evaluate to True.
204 |
205 | Parameters
206 | ----------
207 | arrays : iterable
208 | Arrays to be reduced.
209 | axis : int or None, optional
210 | Axis along which a logical OR reduction is performed. The default
211 | is to perform a logical OR along the 'stream axis', as if all arrays in ``arrays``
212 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened
213 | before reduction.
214 |
215 | Yields
216 | ------
217 | any : ndarray, dtype bool
218 | """
219 | # TODO: use ``where`` keyword to only check places that are not already ``True``
220 | yield from ireduce_ufunc(arrays, ufunc=np.logical_or, axis=axis)
221 |
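A short sketch of both reductions on a stream of boolean arrays of equal shape:

    import numpy as np
    from npstreams import iall, iany, last

    stream = [np.array([True, False]), np.array([True, True])]
    print(last(iall(stream)))  # [ True False]
    print(last(iany(stream)))  # [ True  True]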
222 |
223 | def imax(arrays, axis=-1, ignore_nan=False):
224 | """
225 | Maximum of a stream of arrays along an axis.
226 |
227 | Parameters
228 | ----------
229 | arrays : iterable
230 | Arrays to be reduced.
231 | axis : int or None, optional
232 | Axis along which the maximum is found. The default
233 | is to find the maximum along the 'stream axis', as if all arrays in ``arrays``
234 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened
235 | before reduction.
236 | ignore_nan : bool, optional
237 | If True, NaNs are ignored. Default is propagation of NaNs.
238 |
239 | Yields
240 | ------
241 | online_max : ndarray
242 | Cumulative maximum.
243 | """
244 | ufunc = np.fmax if ignore_nan else np.maximum
245 | yield from ireduce_ufunc(arrays, ufunc, axis)
246 |
247 |
248 | def imin(arrays, axis=-1, ignore_nan=False):
249 | """
250 | Minimum of a stream of arrays along an axis.
251 |
252 | Parameters
253 | ----------
254 | arrays : iterable
255 | Arrays to be reduced.
256 | axis : int or None, optional
257 | Axis along which the minimum is found. The default
258 | is to find the minimum along the 'stream axis', as if all arrays in ``arrays``
259 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened
260 | before reduction.
261 | ignore_nan : bool, optional
262 | If True, NaNs are ignored. Default is propagation of NaNs.
263 |
264 | Yields
265 | ------
266 | online_min : ndarray
267 | Cumulative minimum.
268 | """
269 | ufunc = np.fmin if ignore_nan else np.minimum
270 | yield from ireduce_ufunc(arrays, ufunc, axis)
271 |
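A minimal sketch of the running elementwise extrema, with the axis given explicitly:

    import numpy as np
    from npstreams import last
    from npstreams.numerics import imax, imin

    stream = [np.array([1.0, 5.0]), np.array([3.0, 2.0])]
    print(last(imax(stream, axis=-1)))  # [3. 5.]
    print(last(imin(stream, axis=-1)))  # [1. 2.]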
--------------------------------------------------------------------------------
/npstreams/reduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | General stream reduction
4 | ------------------------
5 | """
6 | from functools import lru_cache, partial
7 | from itertools import islice, repeat
8 | from multiprocessing import Pool
9 |
10 | import numpy as np
11 |
12 | from .array_stream import array_stream
13 | from .array_utils import nan_to_num
14 | from .iter_utils import chunked, last, peek, primed
15 | from .parallel import preduce
16 |
17 | identity = lambda i: i
18 |
19 |
20 | @lru_cache(maxsize=128)
21 | def _check_binary_ufunc(ufunc):
22 | """
23 | Check that ufunc is suitable for ``ireduce_ufunc``.
24 |
25 | Specifically, a binary ``numpy.ufunc`` is required. Functions
26 | that return booleans are also unsuitable because they cannot be accumulated.
27 |
28 | This function does not return anything.
29 |
30 | Parameters
31 | ----------
32 | ufunc : callable
33 | Function to check.
34 |
35 | Raises
36 | ------
37 | TypeError : if ``ufunc`` is not a ``numpy.ufunc``
38 | ValueError : if ``ufunc`` is not binary.
39 | """
40 | if not isinstance(ufunc, np.ufunc):
41 | raise TypeError(f"{ufunc.__name__} is not a NumPy Ufunc")
42 | if ufunc.nin != 2:
43 | raise ValueError(
44 | f"Only binary ufuncs are supported, and {ufunc.__name__} is not one of them"
45 | )
46 |
47 |
48 | @primed
49 | @array_stream
50 | def ireduce_ufunc(arrays, ufunc, axis=-1, dtype=None, ignore_nan=False, **kwargs):
51 | """
52 | Streaming reduction generator function from a binary NumPy ufunc. Generator
53 | version of `reduce_ufunc`.
54 |
55 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover,
56 | for performance reasons, ufunc must have the same return types as input types.
57 | This precludes the use of ``numpy.greater``, for example.
58 |
59 | Note that performance is much better for the default ``axis = -1``. In such a case,
60 | reduction operations can occur in-place. This also allows operation in constant memory.
61 |
62 | Parameters
63 | ----------
64 | arrays : iterable
65 | Arrays to be reduced.
66 | ufunc : numpy.ufunc
67 | Binary universal function.
68 | axis : int or None, optional
69 | Reduction axis. Default is to reduce the arrays in the stream as if
70 | they had been stacked along a new axis, then reduce along this new axis.
71 | If None, arrays are flattened before reduction. If `axis` is an int larger than
72 | the number of dimensions in the arrays of the stream, arrays are reduced
73 | along the new axis. Note that not all NumPy ufuncs support
74 | ``axis = None``, e.g. ``numpy.subtract``.
75 | dtype : numpy.dtype or None, optional
76 | Overrides the dtype of the calculation and output arrays.
77 | ignore_nan : bool, optional
78 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs
79 | are replaced with this identity. An error is raised if ``ufunc`` has no identity
80 | (e.g. ``numpy.maximum.identity`` is ``None``).
81 | kwargs
82 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments
83 | (e.g. ``keepdims``) are not valid for all streaming functions. Also, contrary to NumPy
84 | v. 1.10+, ``casting = 'unsafe'`` is the default in npstreams.
85 |
86 | Yields
87 | ------
88 | reduced : ndarray or scalar
89 |
90 | Raises
91 | ------
92 | TypeError : if ``ufunc`` is not NumPy ufunc.
93 | ValueError : if ``ignore_nan`` is True but ``ufunc`` has no identity
94 | ValueError : if ``ufunc`` is not a binary ufunc
95 | ValueError : if ``ufunc`` does not have the same input type as output type
96 | """
97 | kwargs.update({"dtype": dtype, "axis": axis})
98 |
99 | _check_binary_ufunc(ufunc)
100 |
101 | if ignore_nan:
102 | if ufunc.identity is None:
103 | raise ValueError(
104 | f"Cannot ignore NaNs because {ufunc.__name__} has no identity value"
105 | )
106 | # TODO: use the ``where`` keyword in ufuncs instead
107 | arrays = map(partial(nan_to_num, fill_value=ufunc.identity, copy=False), arrays)
108 |
109 | # Since ireduce_ufunc is primed, execution pauses here on creation.
110 | # Priming runs all of the argument checking above eagerly, before
111 | # any computation happens.
112 | yield
113 |
114 | if kwargs["axis"] == -1:
115 | yield from _ireduce_ufunc_new_axis(arrays, ufunc, **kwargs)
116 | return
117 |
118 | if kwargs["axis"] is None:
119 | yield from _ireduce_ufunc_all_axes(arrays, ufunc, **kwargs)
120 | return
121 |
122 | first, arrays = peek(arrays)
123 |
124 | if kwargs["axis"] >= first.ndim:
125 | kwargs["axis"] = -1
126 | yield from ireduce_ufunc(arrays, ufunc, **kwargs)
127 | return
128 |
129 | yield from _ireduce_ufunc_existing_axis(arrays, ufunc, **kwargs)
130 |
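A minimal sketch of the constant-memory default (``axis = -1``), which yields the running reduction after each consumed array:

    import numpy as np
    from npstreams import last
    from npstreams.reduce import ireduce_ufunc

    stream = (np.ones((4, 4)) for _ in range(5))
    total = last(ireduce_ufunc(stream, np.add, axis=-1))  # running elementwise sum
    assert np.allclose(total, 5 * np.ones((4, 4)))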
131 |
132 | def reduce_ufunc(arrays, ufunc, axis=-1, dtype=None, ignore_nan=False, **kwargs):
133 | """
134 | Reduce a stream using a binary NumPy ufunc. Function version of ``ireduce_ufunc``.
135 |
136 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover,
137 | for performance reasons, ufunc must have the same return types as input types.
138 | This precludes the use of ``numpy.greater``, for example.
139 |
140 | Note that performance is much better for the default ``axis = -1``. In such a case,
141 | reduction operations can occur in-place. This also allows operation in constant memory.
142 |
143 | Parameters
144 | ----------
145 | arrays : iterable
146 | Arrays to be reduced.
147 | ufunc : numpy.ufunc
148 | Binary universal function.
149 | axis : int or None, optional
150 | Reduction axis. Default is to reduce the arrays in the stream as if
151 | they had been stacked along a new axis, then reduce along this new axis.
152 | If None, arrays are flattened before reduction. If `axis` is an int larger than
153 | the number of dimensions in the arrays of the stream, arrays are reduced
154 | along the new axis. Note that not all NumPy ufuncs support
155 | ``axis = None``, e.g. ``numpy.subtract``.
156 | dtype : numpy.dtype or None, optional
157 | Overrides the dtype of the calculation and output arrays.
158 | ignore_nan : bool, optional
159 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs
160 | are replaced with this identity. An error is raised if ``ufunc`` has no identity (e.g. ``numpy.maximum.identity`` is ``None``).
161 | kwargs
162 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments
163 | (e.g. ``keepdims``) are not valid for all streaming functions. Note that
164 | contrary to NumPy v. 1.10+, ``casting = 'unsafe'`` is the default in npstreams.
165 |
166 | Returns
167 | -------
168 | reduced : ndarray or scalar
169 |
170 | Raises
171 | ------
172 | TypeError : if ``ufunc`` is not NumPy ufunc.
173 | ValueError : if ``ignore_nan`` is True but ``ufunc`` has no identity
174 | ValueError: if ``ufunc`` is not a binary ufunc
175 | ValueError: if ``ufunc`` does not have the same input type as output type
176 | """
177 | return last(
178 | ireduce_ufunc(
179 | arrays, ufunc, axis=axis, dtype=dtype, ignore_nan=ignore_nan, **kwargs
180 | )
181 | )
182 |
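A sketch of the ``ignore_nan`` behaviour: NaNs are replaced by the ufunc identity (0 for ``numpy.add``) before reduction:

    import numpy as np
    from npstreams.reduce import reduce_ufunc

    stream = [np.array([1.0, np.nan]), np.array([2.0, 3.0])]
    print(reduce_ufunc(stream, np.add, ignore_nan=True))  # [3. 3.]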
183 |
184 | @array_stream
185 | def preduce_ufunc(
186 | arrays,
187 | ufunc,
188 | axis=-1,
189 | dtype=None,
190 | ignore_nan=False,
191 | processes=1,
192 | ntotal=None,
193 | **kwargs,
194 | ):
195 | """
196 | Parallel reduction of array streams.
197 |
198 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover,
199 | for performance reasons, ufunc must have the same return types as input types.
200 | This precludes the use of ``numpy.greater``, for example.
201 |
202 | Parameters
203 | ----------
204 | arrays : iterable
205 | Arrays to be reduced.
206 | ufunc : numpy.ufunc
207 | Binary universal function.
208 | axis : int or None, optional
209 | Reduction axis. Default is to reduce the arrays in the stream as if
210 | they had been stacked along a new axis, then reduce along this new axis.
211 | If None, arrays are flattened before reduction. If `axis` is an int larger than
212 | the number of dimensions in the arrays of the stream, arrays are reduced
213 | along the new axis. Note that not all NumPy ufuncs support
214 | ``axis = None``, e.g. ``numpy.subtract``.
215 | dtype : numpy.dtype or None, optional
216 | Overrides the dtype of the calculation and output arrays.
217 | ignore_nan : bool, optional
218 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs
219 | are replaced with this identity. An error is raised if ``ufunc`` has no identity (e.g. ``numpy.maximum.identity`` is ``None``).
220 | processes : int or None, optional
221 | Number of processes to use. If `None`, maximal number of processes
222 | is used. Default is 1.
223 | kwargs
224 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments
225 | (e.g. ``keepdims``) are not valid for all streaming functions. Also, contrary to NumPy
226 | v. 1.10+, ``casting = 'unsafe'`` is the default in npstreams.
227 | """
228 | if processes == 1:
229 | return reduce_ufunc(arrays, ufunc, axis, dtype, ignore_nan, **kwargs)
230 |
231 | kwargs.update(
232 | {"ufunc": ufunc, "ignore_nan": ignore_nan, "dtype": dtype, "axis": axis}
233 | )
234 | reduce = partial(reduce_ufunc, **kwargs)
235 | # return preduce(reduce, arrays, processes = processes, ntotal = ntotal)
236 |
237 | with Pool(processes) as pool:
238 | chunksize = 1
239 | if ntotal is not None:
240 | chunksize = max(1, int(ntotal / pool._processes))
241 | res = pool.imap(reduce, chunked(arrays, chunksize))
242 | return reduce(res)
243 |
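A hedged usage sketch; the ``__main__`` guard matters on platforms that spawn worker processes:

    import numpy as np
    from npstreams.reduce import preduce_ufunc

    if __name__ == "__main__":
        stream = [np.random.random((8, 8)) for _ in range(100)]
        total = preduce_ufunc(stream, np.add, processes=2, ntotal=100)
        print(total.shape)  # (8, 8)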
244 |
245 | def _ireduce_ufunc_new_axis(arrays, ufunc, **kwargs):
246 | """
247 | Reduction operation for arrays, in the direction of a new axis (i.e. stacking).
248 |
249 | Parameters
250 | ----------
251 | arrays : iterable
252 | Arrays to be reduced.
253 | ufunc : numpy.ufunc
254 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...)
255 | kwargs
256 | Keyword arguments are passed to ``ufunc``.
257 |
258 | Yields
259 | ------
260 | reduced : ndarray
261 | """
262 | arrays = iter(arrays)
263 | first = next(arrays)
264 |
265 | kwargs.pop("axis")
266 |
267 | dtype = kwargs.get("dtype", None)
268 | if dtype is None:
269 | dtype = first.dtype
270 | else:
271 | kwargs["casting"] = "unsafe"
272 |
273 | # If the out parameter was already given
274 | # we create the accumulator from it
275 | # Otherwise, it is a copy of the first array
276 | accumulator = kwargs.pop("out", None)
277 | if accumulator is not None:
278 | accumulator[:] = first
279 | else:
280 | accumulator = np.array(first, copy=True).astype(dtype)
281 | yield accumulator
282 |
283 | for array in arrays:
284 | ufunc(accumulator, array, out=accumulator, **kwargs)
285 | yield accumulator
286 |
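Stripped of the generator machinery, the constant-memory pattern above boils down to repeated in-place application of the ufunc:

    import numpy as np

    acc = np.zeros((4, 4))
    for arr in (np.random.random((4, 4)) for _ in range(10)):
        np.add(acc, arr, out=acc)  # reduce in place; memory use stays constant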
287 |
288 | def _ireduce_ufunc_existing_axis(arrays, ufunc, **kwargs):
289 | """
290 | Reduction operation for arrays, in the direction of an existing axis.
291 |
292 | Parameters
293 | ----------
294 | arrays : iterable
295 | Arrays to be reduced.
296 | ufunc : numpy.ufunc
297 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...)
298 | kwargs
299 | Keyword arguments are passed to ``ufunc``. The ``out`` parameter is ignored.
300 |
301 | Yields
302 | ------
303 | reduced : ndarray
304 | """
305 | arrays = iter(arrays)
306 | first = next(arrays)
307 |
308 | if kwargs["axis"] not in range(first.ndim):
309 | axis = kwargs["axis"]
310 | raise ValueError(f"Axis {axis} not supported on arrays of shape {first.shape}.")
311 |
312 | # Remove parameters that will not be used.
313 | kwargs.pop("out", None)
314 |
315 | dtype = kwargs.get("dtype")
316 | if dtype is None:
317 | dtype = first.dtype
318 |
319 | axis_reduce = partial(ufunc.reduce, **kwargs)
320 |
321 | accumulator = np.atleast_1d(axis_reduce(first))
322 | yield accumulator
323 |
324 | # On the first pass of the following loop, the accumulator is missing a dimension;
325 | # therefore, the stacking function cannot be 'concatenate'
326 | second = next(arrays)
327 | accumulator = np.stack([accumulator, np.atleast_1d(axis_reduce(second))], axis=-1)
328 | yield accumulator
329 |
330 | # On the second pass, the new dimension exists, and thus we switch to
331 | # using concatenate.
332 | for array in arrays:
333 | reduced = np.expand_dims(
334 | np.atleast_1d(axis_reduce(array)), axis=accumulator.ndim - 1
335 | )
336 | accumulator = np.concatenate([accumulator, reduced], axis=accumulator.ndim - 1)
337 | yield accumulator
338 |
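The shape evolution is easiest to see on a concrete stream: per-array reductions accumulate along a new trailing axis, one column per consumed array:

    import numpy as np
    from npstreams import isum

    shapes = [s.shape for s in isum([np.ones(4)] * 3, axis=0)]
    print(shapes)  # [(1,), (1, 2), (1, 3)]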
339 |
340 | def _ireduce_ufunc_all_axes(arrays, ufunc, **kwargs):
341 | """
342 | Reduction operation for arrays, over all axes.
343 |
344 | Parameters
345 | ----------
346 | arrays : iterable
347 | Arrays to be reduced.
348 | ufunc : numpy.ufunc
349 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...)
350 | kwargs
351 | Keyword arguments are passed to ``ufunc``. The ``out`` parameter is ignored.
352 |
353 | Yields
354 | ------
355 | reduced : scalar
356 | """
357 | arrays = iter(arrays)
358 | first = next(arrays)
359 |
360 | kwargs.pop("out", None)
361 |
362 | kwargs["axis"] = None
363 | axis_reduce = partial(ufunc.reduce, **kwargs)
364 |
365 | accumulator = axis_reduce(first)
366 | yield accumulator
367 |
368 | for array in arrays:
369 | accumulator = axis_reduce([accumulator, axis_reduce(array)])
370 | yield accumulator
371 |
--------------------------------------------------------------------------------
/npstreams/tests/test_stats.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from itertools import repeat
4 | from random import randint, random, seed
5 | from warnings import catch_warnings, simplefilter
6 | import pytest
7 |
8 | import numpy as np
9 |
10 | try:
11 | from scipy.stats import sem as scipy_sem
12 |
13 | WITH_SCIPY = True
14 | except ImportError:
15 | WITH_SCIPY = False
16 |
17 | from npstreams import (
18 | iaverage,
19 | imean,
20 | isem,
21 | istd,
22 | ivar,
23 | last,
24 | ihistogram,
25 | mean,
26 | average,
27 | sem,
28 | std,
29 | var,
30 | )
31 |
32 | seed(23)
33 |
34 |
35 | def test_average_trivial():
36 | """Test average() on a stream of zeroes"""
37 | stream = repeat(np.zeros((64, 64), dtype=float), times=5)
38 | av = average(stream)
39 | assert np.allclose(av, np.zeros_like(av))
40 |
41 |
42 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
43 | def test_average_vs_numpy(axis):
44 | """Test average vs. numpy.average"""
45 | stream = [np.random.random(size=(64, 64)) for _ in range(5)]
46 | stack = np.dstack(stream)
47 |
48 | from_stream = average(stream, axis=axis)
49 | from_numpy = np.average(stack, axis=axis)
50 | assert np.allclose(from_numpy, from_stream)
51 |
52 |
53 | def test_average_weighted_average():
54 | """Test results of weighted average against numpy.average"""
55 | stream = [np.random.random(size=(16, 16)) for _ in range(5)]
56 |
57 | weights = [random() for _ in stream]
58 | from_average = average(stream, weights=weights)
59 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.array(weights))
60 | assert np.allclose(from_average, from_numpy)
61 |
62 | weights = [np.random.random(size=stream[0].shape) for _ in stream]
63 | from_average = average(stream, weights=weights)
64 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights))
65 | assert np.allclose(from_average, from_numpy)
66 |
67 |
68 | def test_average_ignore_nan():
69 | """Test that NaNs are handled correctly"""
70 | stream = [np.random.random(size=(16, 12)) for _ in range(5)]
71 | for s in stream:
72 | s[randint(0, 15), randint(0, 11)] = np.nan
73 |
74 | with catch_warnings():
75 | simplefilter("ignore")
76 | from_average = average(stream, ignore_nan=True)
77 | from_numpy = np.nanmean(np.dstack(stream), axis=2)
78 | assert np.allclose(from_average, from_numpy)
79 |
80 |
81 | def test_iaverage_trivial():
82 | """Test iaverage on stream of zeroes"""
83 | stream = repeat(np.zeros((64, 64), dtype=float), times=5)
84 | for av in iaverage(stream):
85 | assert np.allclose(av, np.zeros_like(av))
86 |
87 |
88 | def test_iaverage_weighted_average():
89 | """Test results of weighted iverage against numpy.average"""
90 | stream = [np.random.random(size=(16, 16)) for _ in range(5)]
91 |
92 | weights = [random() for _ in stream]
93 | from_iaverage = last(iaverage(stream, weights=weights))
94 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.array(weights))
95 | assert np.allclose(from_iaverage, from_numpy)
96 |
97 | weights = [np.random.random(size=stream[0].shape) for _ in stream]
98 | from_iaverage = last(iaverage(stream, weights=weights))
99 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights))
100 | assert np.allclose(from_iaverage, from_numpy)
101 |
102 |
103 | def test_iaverage_ignore_nan():
104 | """Test that NaNs are handled correctly"""
105 | stream = [np.random.random(size=(16, 12)) for _ in range(5)]
106 | for s in stream:
107 | s[randint(0, 15), randint(0, 11)] = np.nan
108 |
109 | with catch_warnings():
110 | simplefilter("ignore")
111 | from_iaverage = last(iaverage(stream, ignore_nan=True))
112 | from_numpy = np.nanmean(np.dstack(stream), axis=2)
113 | assert np.allclose(from_iaverage, from_numpy)
114 |
115 |
116 | def test_iaverage_length():
117 | """Test that the number of yielded elements is the same as source"""
118 | source = (np.zeros((16,)) for _ in range(5))
119 | avg = list(iaverage(source, axis=0))
120 | assert len(avg) == 5
121 |
122 |
123 | @pytest.mark.parametrize("dtype", (np.uint8, bool, np.int16, np.float16))
124 | def test_iaverage_output_dtype(dtype):
125 | """Test that that yielded arrays are always floats"""
126 | source = (np.zeros((16,), dtype=dtype) for _ in range(5))
127 | avg = last(iaverage(source))
128 | assert avg.dtype == float
129 |
130 |
131 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
132 | def test_iaverage_output_shape(axis):
133 | """Test output shape"""
134 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
135 | stack = np.stack(source, axis=-1)
136 |
137 | from_numpy = np.average(stack, axis=axis)
138 | out = last(iaverage(source, axis=axis))
139 | assert from_numpy.shape == out.shape
140 | assert np.allclose(out, from_numpy)
141 |
142 |
143 | def test_mean_trivial():
144 | """Test mean() on a stream of zeroes"""
145 | stream = repeat(np.zeros((64, 64), dtype=float), times=5)
146 | av = mean(stream)
147 | assert np.allclose(av, np.zeros_like(av))
148 |
149 |
150 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
151 | def test_mean_vs_numpy(axis):
152 | """Test mean vs. numpy.mean"""
153 | stream = [np.random.random(size=(64, 64)) for _ in range(5)]
154 | stack = np.dstack(stream)
155 |
156 | from_stream = mean(stream, axis=axis)
157 | from_numpy = np.mean(stack, axis=axis)
158 | assert np.allclose(from_numpy, from_stream)
159 |
160 |
161 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
162 | def test_mean_against_numpy_nanmean(axis):
163 | """Test results against numpy.mean"""
164 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
165 | for arr in source:
166 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
167 | stack = np.stack(source, axis=-1)
168 |
169 | from_numpy = np.nanmean(stack, axis=axis)
170 | out = mean(source, axis=axis, ignore_nan=True)
171 | assert from_numpy.shape == out.shape
172 | assert np.allclose(out, from_numpy)
173 |
174 |
175 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
176 | def test_imean_against_numpy_mean(axis):
177 | """Test results against numpy.mean"""
178 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
179 | stack = np.stack(source, axis=-1)
180 |
181 | from_numpy = np.mean(stack, axis=axis)
182 | out = last(imean(source, axis=axis))
183 | assert from_numpy.shape == out.shape
184 | assert np.allclose(out, from_numpy)
185 |
186 |
187 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
188 | def test_imean_against_numpy_nanmean(axis):
189 | """Test results against numpy.mean"""
190 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
191 | for arr in source:
192 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
193 | stack = np.stack(source, axis=-1)
194 |
195 | from_numpy = np.nanmean(stack, axis=axis)
196 | out = last(imean(source, axis=axis, ignore_nan=True))
197 | assert from_numpy.shape == out.shape
198 | assert np.allclose(out, from_numpy)
199 |
200 |
201 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
202 | def test_var_vs_numpy(axis):
203 | """Test that the axis parameter is handled correctly"""
204 | stream = [np.random.random((16, 7, 3)) for _ in range(5)]
205 | stack = np.stack(stream, axis=-1)
206 |
207 | from_numpy = np.var(stack, axis=axis)
208 | from_var = var(stream, axis=axis)
209 | assert from_numpy.shape == from_var.shape
210 | assert np.allclose(from_var, from_numpy)
211 |
212 |
213 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
214 | @pytest.mark.parametrize("ddof", range(4))
215 | def test_var_ddof(axis, ddof):
216 | """Test that the ddof parameter is equivalent to numpy's"""
217 | stream = [np.random.random((16, 7, 3)) for _ in range(10)]
218 | stack = np.stack(stream, axis=-1)
219 |
220 | with catch_warnings():
221 | simplefilter("ignore")
222 |
223 | from_numpy = np.var(stack, axis=axis, ddof=ddof)
224 | from_var = var(stream, axis=axis, ddof=ddof)
225 | assert from_numpy.shape == from_var.shape
226 | assert np.allclose(from_var, from_numpy)
227 |
228 |
229 | def test_ivar_first():
230 | """Test that the first yielded value of ivar is an array fo zeros"""
231 | stream = repeat(np.random.random(size=(64, 64)), times=5)
232 | first = next(ivar(stream))
233 |
234 | assert np.allclose(first, np.zeros_like(first))
235 |
236 |
237 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
238 | def test_ivar_output_shape(axis):
239 | """Test that the axis parameter is handled correctly"""
240 | stream = [np.random.random((16, 7, 3)) for _ in range(5)]
241 | stack = np.stack(stream, axis=-1)
242 |
243 | from_numpy = np.var(stack, axis=axis)
244 | from_ivar = last(ivar(stream, axis=axis))
245 | assert from_numpy.shape == from_ivar.shape
246 | assert np.allclose(from_ivar, from_numpy)
247 |
248 |
249 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
250 | @pytest.mark.parametrize("ddof", range(4))
251 | def test_ivar_ddof(axis, ddof):
252 | """Test that the ddof parameter is equivalent to numpy's"""
253 | stream = [np.random.random((16, 7, 3)) for _ in range(10)]
254 | stack = np.stack(stream, axis=-1)
255 |
256 | with catch_warnings():
257 | simplefilter("ignore")
258 |
259 | from_numpy = np.var(stack, axis=axis, ddof=ddof)
260 | from_ivar = last(ivar(stream, axis=axis, ddof=ddof))
261 | assert from_numpy.shape == from_ivar.shape
262 | assert np.allclose(from_ivar, from_numpy)
263 |
264 |
265 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
266 | @pytest.mark.parametrize("ddof", range(4))
267 | def test_std_against_numpy_std(axis, ddof):
268 | stream = [np.random.random((16, 7, 3)) for _ in range(10)]
269 | stack = np.stack(stream, axis=-1)
270 |
271 | with catch_warnings():
272 | simplefilter("ignore")
273 |
274 | from_numpy = np.std(stack, axis=axis, ddof=ddof)
275 | from_std = std(stream, axis=axis, ddof=ddof)
276 | assert from_numpy.shape == from_std.shape
277 | assert np.allclose(from_std, from_numpy)
278 |
279 |
280 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
281 | @pytest.mark.parametrize("ddof", range(4))
282 | def test_std_against_numpy_nanstd(axis, ddof):
283 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
284 | for arr in source:
285 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
286 | stack = np.stack(source, axis=-1)
287 |
288 | from_numpy = np.nanstd(stack, axis=axis, ddof=ddof)
289 | from_std = std(source, axis=axis, ddof=ddof, ignore_nan=True)
290 | assert from_numpy.shape == from_std.shape
291 | assert np.allclose(from_std, from_numpy)
292 |
293 |
294 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
295 | @pytest.mark.parametrize("ddof", range(4))
296 | def test_istd_against_numpy_std(axis, ddof):
297 | stream = [np.random.random((16, 7, 3)) for _ in range(10)]
298 | stack = np.stack(stream, axis=-1)
299 |
300 | with catch_warnings():
301 | simplefilter("ignore")
302 |
303 | from_numpy = np.std(stack, axis=axis, ddof=ddof)
304 | from_istd = last(istd(stream, axis=axis, ddof=ddof))
305 | assert from_numpy.shape == from_istd.shape
306 | assert np.allclose(from_istd, from_numpy)
307 |
308 |
309 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
310 | @pytest.mark.parametrize("ddof", range(4))
311 | def test_istd_against_numpy_nanstd(axis, ddof):
312 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
313 | for arr in source:
314 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
315 | stack = np.stack(source, axis=-1)
316 |
317 | from_numpy = np.nanstd(stack, axis=axis, ddof=ddof)
318 | from_istd = last(istd(source, axis=axis, ddof=ddof, ignore_nan=True))
319 | assert from_numpy.shape == from_istd.shape
320 | assert np.allclose(from_istd, from_numpy)
321 |
322 |
323 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable")
324 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
325 | @pytest.mark.parametrize("ddof", range(4))
326 | def test_sem_against_scipy_no_nans(axis, ddof):
327 | """Test that isem outputs the same as scipy.stats.sem"""
328 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
329 | stack = np.stack(source, axis=-1)
330 |
331 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof)
332 | from_isem = sem(source, axis=axis, ddof=ddof)
333 | assert from_scipy.shape == from_isem.shape
334 | assert np.allclose(from_isem, from_scipy)
335 |
336 |
337 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable")
338 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
339 | @pytest.mark.parametrize("ddof", range(4))
340 | def test_sem_against_scipy_with_nans(axis, ddof):
341 | """Test that isem outputs the same as scipy.stats.sem when NaNs are ignored."""
342 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
343 | for arr in source:
344 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
345 | stack = np.stack(source, axis=-1)
346 |
347 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof, nan_policy="omit")
348 | from_isem = sem(source, axis=axis, ddof=ddof, ignore_nan=True)
349 | assert from_scipy.shape == from_isem.shape
350 | assert np.allclose(from_isem, from_scipy)
351 |
352 |
353 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable")
354 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
355 | @pytest.mark.parametrize("ddof", range(4))
356 | def test_isem_against_scipy_no_nans(axis, ddof):
357 | """Test that isem outputs the same as scipy.stats.sem"""
358 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
359 | stack = np.stack(source, axis=-1)
360 |
361 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof)
362 | from_isem = last(isem(source, axis=axis, ddof=ddof))
363 | assert from_scipy.shape == from_isem.shape
364 | assert np.allclose(from_isem, from_scipy)
365 |
366 |
367 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable")
368 | @pytest.mark.parametrize("axis", (0, 1, 2, None))
369 | @pytest.mark.parametrize("ddof", range(4))
370 | def test_isem_against_scipy_with_nans(axis, ddof):
371 | """Test that isem outputs the same as scipy.stats.sem when NaNs are ignored."""
372 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
373 | for arr in source:
374 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan
375 | stack = np.stack(source, axis=-1)
376 |
377 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof, nan_policy="omit")
378 | from_isem = last(isem(source, axis=axis, ddof=ddof, ignore_nan=True))
379 | assert from_scipy.shape == from_isem.shape
380 | assert np.allclose(from_isem, from_scipy)
381 |
382 |
383 | def test_ihistogram_against_numpy_no_weights():
384 | """Test ihistogram against numpy.histogram with no weights"""
385 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
386 | stack = np.stack(source, axis=-1)
387 |
388 | bins = np.linspace(0, 1, num=10)
389 | from_numpy = np.histogram(stack, bins=bins)[0]
390 | from_ihistogram = last(ihistogram(source, bins=bins))
391 |
392 | # Since histogram output is int, cannot use allclose
393 | assert np.all(np.equal(from_numpy, from_ihistogram))
394 |
395 |
396 | def test_ihistogram_trivial_weights():
397 | """Test ihistogram with weights being all 1s vs. weights=None"""
398 | source = [np.random.random((16, 12, 5)) for _ in range(10)]
399 | weights = [np.array([1]) for _ in source]
400 |
401 | bins = np.linspace(0, 1, num=10)
402 | none_weights = last(ihistogram(source, bins=bins, weights=None))
403 | trivial_weights = last(ihistogram(source, bins=bins, weights=weights))
404 |
405 | assert np.all(np.equal(none_weights, trivial_weights))
406 |
--------------------------------------------------------------------------------
/npstreams/stats.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Statistical functions
4 | ---------------------
5 | """
6 | from functools import partial
7 | from itertools import count, repeat, starmap
8 | from operator import truediv
9 | from warnings import catch_warnings, simplefilter
10 |
11 | import numpy as np
12 |
13 | from .array_stream import array_stream
14 | from .array_utils import nan_to_num
15 | from .iter_utils import itercopy, last, peek
16 | from .numerics import isum
17 |
18 |
19 | @array_stream
20 | def _iaverage(arrays, axis=-1, weights=None, ignore_nan=False):
21 | """
22 | Primitive version of weighted averaging that yields the running sum and running weights sum,
23 | but avoids the costly division at every step.
24 | """
25 | # Special case: with no weights, no NaN handling, and the default axis,
26 | # the running average is just a running sum paired with a running count.
27 | # This fast path is common enough to be worth special-casing.
28 | if (weights is None) and (not ignore_nan) and (axis == -1):
29 | yield from zip(isum(arrays, axis=axis, dtype=float, ignore_nan=False), count(1))
30 | return
31 |
32 | first, arrays = peek(arrays)
33 |
34 | # We make sure that weights is always an array
35 | # This simplifies the handling of NaNs.
36 | if weights is None:
37 | weights = repeat(1)
38 | weights = map(partial(np.broadcast_to, shape=first.shape), weights)
39 |
40 | # Need to know which array has NaNs, and modify the weights stream accordingly
41 | if ignore_nan:
42 | arrays, arrays2 = itercopy(arrays)
43 | weights = map(
44 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights
45 | )
46 |
47 | weights1, weights2 = itercopy(weights)
48 |
49 | sum_of_weights = isum(weights1, axis=axis, dtype=float)
50 | weighted_arrays = map(lambda arr, wgt: arr * wgt, arrays, weights2)
51 | weighted_sum = isum(weighted_arrays, axis=axis, ignore_nan=ignore_nan, dtype=float)
52 |
53 | yield from zip(weighted_sum, sum_of_weights)
54 |
55 |
56 | @array_stream
57 | def average(arrays, axis=-1, weights=None, ignore_nan=False):
58 | """
59 | Average (weighted) of a stream of arrays. This function consumes the
60 | entire stream.
61 |
62 | Parameters
63 | ----------
64 | arrays : iterable of ndarrays
65 |         Arrays to be averaged. This iterable can also be a generator.
66 |     axis : int, optional
67 |         Reduction axis. Default is to average the arrays in the stream as if
68 |         they had been stacked along a new axis, then average along this new axis.
69 |         If None, arrays are flattened before averaging. If `axis` is an int larger than
70 | the number of dimensions in the arrays of the stream, arrays are averaged
71 | along the new axis.
72 | weights : iterable of ndarray, iterable of floats, or None, optional
73 | Iterable of weights associated with the values in each item of `arrays`.
74 | Each value in an element of `arrays` contributes to the average
75 | according to its associated weight. The weights array can either be a float
76 | or an array of the same shape as any element of `arrays`. If ``weights=None``,
77 | then all data in each element of `arrays` are assumed to have a weight equal to one.
78 | ignore_nan : bool, optional
79 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
80 |
81 | Returns
82 | -------
83 |     avg : `~numpy.ndarray`, dtype float
84 | Weighted average.
85 |
86 | See Also
87 | --------
88 | iaverage : streaming (weighted) average.
89 | numpy.average : (weighted) average of dense arrays
90 | mean : non-weighted average of a stream.
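    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch; the data below are toy values chosen
    |     so that the arithmetic is exact:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import average
    |     >>> stream = [np.array([0.0, 1.0]), np.array([2.0, 3.0])]  # toy data
    |     >>> average(stream)
    |     array([1., 2.])
    |     >>> average(stream, weights=[1, 3])  # one scalar weight per array
    |     array([1.5, 2.5])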
91 | """
92 | total_sum, total_weight = last(_iaverage(arrays, axis, weights, ignore_nan))
93 | with catch_warnings():
94 | simplefilter("ignore", category=RuntimeWarning)
95 | return np.true_divide(total_sum, total_weight)
96 |
97 |
98 | @array_stream
99 | def iaverage(arrays, axis=-1, weights=None, ignore_nan=False):
100 | """
101 | Streaming (weighted) average of arrays.
102 |
103 | Parameters
104 | ----------
105 | arrays : iterable of ndarrays
106 |         Arrays to be averaged. This iterable can also be a generator.
107 |     axis : int, optional
108 |         Reduction axis. Default is to average the arrays in the stream as if
109 |         they had been stacked along a new axis, then average along this new axis.
110 |         If None, arrays are flattened before averaging. If `axis` is an int larger than
111 | the number of dimensions in the arrays of the stream, arrays are averaged
112 | along the new axis.
113 | weights : iterable of ndarray, iterable of floats, or None, optional
114 | Iterable of weights associated with the values in each item of `arrays`.
115 | Each value in an element of `arrays` contributes to the average
116 | according to its associated weight. The weights array can either be a float
117 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
118 | then all data in each element of `arrays` are assumed to have a weight equal to one.
119 | ignore_nan : bool, optional
120 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
121 |
122 | Yields
123 | ------
124 |     avg : `~numpy.ndarray`, dtype float
125 | Weighted average.
126 |
127 | See Also
128 | --------
129 | imean : streaming array mean (non-weighted average).
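    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch; each yielded array is the average of
    |     the stream consumed so far:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import iaverage
    |     >>> stream = [np.array([0.0, 2.0]), np.array([2.0, 4.0])]  # toy data
    |     >>> for avg in iaverage(stream):
    |     ...     print(avg)
    |     [0. 2.]
    |     [1. 3.]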
130 | """
131 | # Primitive stream is composed of tuples (running_sum, running_weights)
132 | primitive = _iaverage(arrays, axis, weights, ignore_nan)
133 | yield from map(lambda element: truediv(*element), primitive)
134 |
135 |
136 | @array_stream
137 | def mean(arrays, axis=-1, ignore_nan=False):
138 | """
139 | Mean of a stream of arrays. This function consumes the
140 | entire stream.
141 |
142 | Parameters
143 | ----------
144 | arrays : iterable of ndarrays
145 |         Arrays to be averaged. This iterable can also be a generator.
146 |     axis : int, optional
147 |         Reduction axis. Default is to average the arrays in the stream as if
148 |         they had been stacked along a new axis, then average along this new axis.
149 |         If None, arrays are flattened before averaging. If `axis` is an int larger than
150 | the number of dimensions in the arrays of the stream, arrays are averaged
151 | along the new axis.
152 | ignore_nan : bool, optional
153 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
154 |
155 | Returns
156 | -------
157 |     mean : `~numpy.ndarray`, dtype float
158 | Total mean array.
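    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch with toy values; NaNs can be skipped
    |     with ``ignore_nan=True``:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import mean
    |     >>> mean([np.array([1.0, 1.0]), np.array([3.0, 5.0])])
    |     array([2., 3.])
    |     >>> mean([np.array([1.0, np.nan]), np.array([3.0, 5.0])], ignore_nan=True)
    |     array([2., 5.])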
159 | """
160 | total_sum, total_count = last(
161 | _iaverage(arrays, axis, weights=None, ignore_nan=ignore_nan)
162 | )
163 | return total_sum / total_count
164 |
165 |
166 | @array_stream
167 | def imean(arrays, axis=-1, ignore_nan=False):
168 | """
169 |     Streaming mean of arrays. Equivalent to `iaverage(arrays, weights=None)`.
170 |
171 | Parameters
172 | ----------
173 | arrays : iterable of ndarrays
174 |         Arrays to be averaged. This iterable can also be a generator.
175 |     axis : int, optional
176 |         Reduction axis. Default is to average the arrays in the stream as if
177 |         they had been stacked along a new axis, then average along this new axis.
178 |         If None, arrays are flattened before averaging. If `axis` is an int larger than
179 | the number of dimensions in the arrays of the stream, arrays are averaged
180 | along the new axis.
181 | ignore_nan : bool, optional
182 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
183 |
184 | Yields
185 | ------
186 |     mean : `~numpy.ndarray`, dtype float
187 | Online mean array.
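    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch; ``tolist`` is used only to make the
    |     doctest output unambiguous:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import imean
    |     >>> stream = [np.array([0.0, 4.0]), np.array([2.0, 8.0])]  # toy data
    |     >>> [m.tolist() for m in imean(stream)]
    |     [[0.0, 4.0], [1.0, 6.0]]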
188 | """
189 | # Primitive stream is composed of tuples (running_sum, running_count)
190 | primitive = _iaverage(arrays, axis, weights=None, ignore_nan=ignore_nan)
191 | yield from map(lambda element: truediv(*element), primitive)
192 |
193 |
194 | @array_stream
195 | def _ivar(arrays, axis=-1, weights=None, ignore_nan=False):
196 | """
197 | Primitive version of weighted variance that yields the running average, running average of squares and running weights sum,
198 | but avoids the costly division and squaring at every step.
199 | """
200 | first, arrays = peek(arrays)
201 |
202 | # We make sure that weights is always an array
203 | # This simplifies the handling of NaNs.
204 | if weights is None:
205 | weights = repeat(1)
206 | weights = map(partial(np.broadcast_to, shape=first.shape), weights)
207 |
208 | # Need to know which array has NaNs, and modify the weights stream accordingly
209 | if ignore_nan:
210 | arrays, arrays2 = itercopy(arrays)
211 | weights = map(
212 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights
213 | )
214 |
215 | arrays, arrays2 = itercopy(arrays)
216 | weights, weights2, weights3 = itercopy(weights, 3)
217 |
218 | avgs = iaverage(arrays, axis=axis, weights=weights, ignore_nan=ignore_nan)
219 | avg_of_squares = iaverage(
220 | map(np.square, arrays2), axis=axis, weights=weights2, ignore_nan=ignore_nan
221 | )
222 | sum_of_weights = isum(weights3, axis=axis, ignore_nan=ignore_nan)
223 |
224 | yield from zip(avgs, avg_of_squares, sum_of_weights)
225 |
226 |
227 | @array_stream
228 | def average_and_var(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
229 | """
230 |     Calculate the simultaneous average and variance of a stream of arrays. This is done in
231 |     a single iteration for maximum performance.
232 |
233 | .. versionadded:: 1.6.1
234 |
235 | Parameters
236 | ----------
237 | arrays : iterable of ndarrays
238 |         Arrays to be combined. This iterable can also be a generator.
239 |     axis : int, optional
240 |         Reduction axis. Default is to combine the arrays in the stream as if
241 |         they had been stacked along a new axis, then compute the variance along this new axis.
242 |         If None, arrays are flattened. If `axis` is an int larger than
243 | the number of dimensions in the arrays of the stream, variance is computed
244 | along the new axis.
245 | ddof : int, optional
246 | Means Delta Degrees of Freedom. The divisor used in calculations
247 | is ``N - ddof``, where ``N`` represents the number of elements.
248 | weights : iterable of ndarray, iterable of floats, or None, optional
249 | Iterable of weights associated with the values in each item of `arrays`.
250 | Each value in an element of `arrays` contributes to the variance
251 | according to its associated weight. The weights array can either be a float
252 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
253 | then all data in each element of `arrays` are assumed to have a weight equal to one.
254 | ignore_nan : bool, optional
255 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
256 |
257 | Returns
258 | -------
259 | average : `~numpy.ndarray`
260 | Average, possibly weighted.
261 |     var : `~numpy.ndarray`
262 | Variance, possibly weighted.
263 |
264 | Notes
265 | -----
266 |     Since the calculation of the variance requires knowledge of the average, this function
267 |     runs in the same time as `var` alone; `var` is in fact a thin wrapper around this function.
268 |
269 | References
270 | ----------
271 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method.
272 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979)
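    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch with toy values:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import average_and_var
    |     >>> avg, v = average_and_var([np.array([0.0]), np.array([2.0])])
    |     >>> avg, v
    |     (array([1.]), array([1.]))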
273 | """
274 | # Since the variance calculation requires knowing the average,
275 | # `average_and_var` runs in the exact same time as `var`
276 | avg, sq_avg, swgt = last(
277 | _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan)
278 | )
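    |     # Weighted variance via E[x^2] - E[x]^2, rescaled by the usual
    |     # ddof correction factor N / (N - ddof), where N is the total weight.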
279 | variance = (sq_avg - avg**2) * (swgt / (swgt - ddof))
280 | return avg, variance
281 |
282 |
283 | @array_stream
284 | def var(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
285 | """
286 | Total variance of a stream of arrays. Weights are also supported. This function
287 | consumes the input stream.
288 |
289 | Parameters
290 | ----------
291 | arrays : iterable of ndarrays
292 |         Arrays to be combined. This iterable can also be a generator.
293 |     axis : int, optional
294 |         Reduction axis. Default is to combine the arrays in the stream as if
295 |         they had been stacked along a new axis, then compute the variance along this new axis.
296 |         If None, arrays are flattened. If `axis` is an int larger than
297 | the number of dimensions in the arrays of the stream, variance is computed
298 | along the new axis.
299 | ddof : int, optional
300 | Means Delta Degrees of Freedom. The divisor used in calculations
301 | is ``N - ddof``, where ``N`` represents the number of elements.
302 | weights : iterable of ndarray, iterable of floats, or None, optional
303 | Iterable of weights associated with the values in each item of `arrays`.
304 | Each value in an element of `arrays` contributes to the variance
305 | according to its associated weight. The weights array can either be a float
306 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
307 | then all data in each element of `arrays` are assumed to have a weight equal to one.
308 | ignore_nan : bool, optional
309 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
310 |
311 | Returns
312 | -------
313 |     var : `~numpy.ndarray`
314 | Variance.
315 |
316 | See Also
317 | --------
318 | ivar : streaming variance
319 | numpy.var : variance calculation for dense arrays. Weights are not supported.
320 |
321 | References
322 | ----------
323 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method.
324 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979)
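    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch with toy values; note the effect of
    |     the ``ddof`` normalization:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import var
    |     >>> stream = [np.array([0.0, 0.0]), np.array([2.0, 4.0])]  # toy data
    |     >>> var(stream)
    |     array([1., 4.])
    |     >>> var(stream, ddof=1)
    |     array([2., 8.])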
325 | """
326 | _, variance = average_and_var(
327 | arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan
328 | )
329 | return variance
330 |
331 |
332 | @array_stream
333 | def ivar(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
334 | """
335 | Streaming variance of arrays. Weights are also supported.
336 |
337 | Parameters
338 | ----------
339 | arrays : iterable of ndarrays
340 |         Arrays to be combined. This iterable can also be a generator.
341 |     axis : int, optional
342 |         Reduction axis. Default is to combine the arrays in the stream as if
343 |         they had been stacked along a new axis, then compute the variance along this new axis.
344 |         If None, arrays are flattened. If `axis` is an int larger than
345 | the number of dimensions in the arrays of the stream, variance is computed
346 | along the new axis.
347 | ddof : int, optional
348 | Means Delta Degrees of Freedom. The divisor used in calculations
349 | is ``N - ddof``, where ``N`` represents the number of elements.
350 | weights : iterable of ndarray, iterable of floats, or None, optional
351 | Iterable of weights associated with the values in each item of `arrays`.
352 | Each value in an element of `arrays` contributes to the variance
353 | according to its associated weight. The weights array can either be a float
354 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
355 | then all data in each element of `arrays` are assumed to have a weight equal to one.
356 | ignore_nan : bool, optional
357 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
358 |
359 | Yields
360 | ------
361 |     var : `~numpy.ndarray`
362 | Variance.
363 |
364 | See Also
365 | --------
366 | numpy.var : variance calculation for dense arrays. Weights are not supported.
367 |
368 | References
369 | ----------
370 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method.
371 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979)
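    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch; ``item`` is used only to make the
    |     doctest output unambiguous:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import ivar
    |     >>> [v.item() for v in ivar([np.array([0.0]), np.array([2.0])])]
    |     [0.0, 1.0]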
372 | """
373 | primitive = _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan)
374 | for avg, sq_avg, swgt in primitive:
375 | yield (sq_avg - avg**2) * (swgt / (swgt - ddof))
376 |
377 |
378 | @array_stream
379 | def std(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
380 | """
381 | Total standard deviation of arrays. Weights are also supported. This function
382 | consumes the input stream.
383 |
384 | Parameters
385 | ----------
386 | arrays : iterable of ndarrays
387 |         Arrays to be combined. This iterable can also be a generator.
388 |     axis : int, optional
389 |         Reduction axis. Default is to combine the arrays in the stream as if
390 |         they had been stacked along a new axis, then compute the standard deviation along this new axis.
391 |         If None, arrays are flattened. If `axis` is an int larger than
392 | the number of dimensions in the arrays of the stream, standard deviation is computed
393 | along the new axis.
394 | ddof : int, optional
395 | Means Delta Degrees of Freedom. The divisor used in calculations
396 | is ``N - ddof``, where ``N`` represents the number of elements.
397 | weights : iterable of ndarray, iterable of floats, or None, optional
398 | Iterable of weights associated with the values in each item of `arrays`.
399 | Each value in an element of `arrays` contributes to the standard deviation
400 | according to its associated weight. The weights array can either be a float
401 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
402 | then all data in each element of `arrays` are assumed to have a weight equal to one.
403 | ignore_nan : bool, optional
404 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
405 |
406 | Returns
407 | -------
408 |     std : `~numpy.ndarray`
409 |         Standard deviation.
410 |
411 | See Also
412 | --------
413 | istd : streaming standard deviation.
414 | numpy.std : standard deviation calculation of dense arrays. Weights are not supported.
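    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch with toy values:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import std
    |     >>> std([np.array([0.0, 1.0]), np.array([2.0, 5.0])])
    |     array([1., 2.])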
415 | """
416 | return np.sqrt(
417 | var(arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan)
418 | )
419 |
420 |
421 | @array_stream
422 | def istd(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
423 | """
424 | Streaming standard deviation of arrays. Weights are also supported.
425 |     This is equivalent to calling `numpy.std(axis=2)` on a stack of images.
426 |
427 | Parameters
428 | ----------
429 | arrays : iterable of ndarrays
430 |         Arrays to be combined. This iterable can also be a generator.
431 |     axis : int, optional
432 |         Reduction axis. Default is to combine the arrays in the stream as if
433 |         they had been stacked along a new axis, then compute the standard deviation along this new axis.
434 |         If None, arrays are flattened. If `axis` is an int larger than
435 | the number of dimensions in the arrays of the stream, standard deviation is computed
436 | along the new axis.
437 | ddof : int, optional
438 | Means Delta Degrees of Freedom. The divisor used in calculations
439 | is ``N - ddof``, where ``N`` represents the number of elements.
440 | weights : iterable of ndarray, iterable of floats, or None, optional
441 | Iterable of weights associated with the values in each item of `arrays`.
442 | Each value in an element of `arrays` contributes to the standard deviation
443 | according to its associated weight. The weights array can either be a float
444 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
445 | then all data in each element of `arrays` are assumed to have a weight equal to one.
446 | ignore_nan : bool, optional
447 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
448 |
449 | Yields
450 | ------
451 |     std : `~numpy.ndarray`
452 |         Standard deviation.
453 |
454 | See Also
455 | --------
456 | std : total standard deviation.
457 | numpy.std : standard deviation calculation of dense arrays. Weights are not supported.
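    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch of streaming use:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import istd
    |     >>> [s.item() for s in istd([np.array([0.0]), np.array([2.0])])]
    |     [0.0, 1.0]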
458 | """
459 | yield from map(
460 | np.sqrt,
461 | ivar(
462 | arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan
463 | ),
464 | )
465 |
466 |
467 | @array_stream
468 | def sem(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False):
469 | """
470 | Standard error in the mean (SEM) of a stream of arrays. This function consumes
471 | the entire stream.
472 |
473 | Parameters
474 | ----------
475 | arrays : iterable of ndarrays
476 |         Arrays to be combined. This iterable can also be a generator.
477 |     axis : int, optional
478 |         Reduction axis. Default is to combine the arrays in the stream as if
479 |         they had been stacked along a new axis, then compute the standard error along this new axis.
480 |         If None, arrays are flattened. If `axis` is an int larger than
481 | the number of dimensions in the arrays of the stream, standard error is computed
482 | along the new axis.
483 | ddof : int, optional
484 | Means Delta Degrees of Freedom. The divisor used in calculations
485 | is ``N - ddof``, where ``N`` represents the number of elements.
486 | weights : iterable of ndarray, iterable of floats, or None, optional
487 | Iterable of weights associated with the values in each item of `arrays`.
488 | Each value in an element of `arrays` contributes to the standard error
489 | according to its associated weight. The weights array can either be a float
490 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
491 | then all data in each element of `arrays` are assumed to have a weight equal to one.
492 | ignore_nan : bool, optional
493 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
494 |
495 | Returns
496 | -------
497 |     sem : `~numpy.ndarray`, dtype float
498 | Standard error in the mean.
499 |
500 | See Also
501 | --------
502 | scipy.stats.sem : standard error in the mean of dense arrays.
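    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch with toy values:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import sem
    |     >>> sem([np.array([0.0]), np.array([2.0])], ddof=1).item()
    |     1.0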
503 | """
504 | avg, sq_avg, swgt = last(
505 | _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan)
506 | )
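    |     # SEM = sqrt(variance / N), with the ddof correction folded in:
    |     # sqrt((E[x^2] - E[x]^2) / (N - ddof)), where N is the total weight.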
507 | return np.sqrt((sq_avg - avg**2) * (1 / (swgt - ddof)))
508 |
509 |
510 | @array_stream
511 | def isem(arrays, axis=-1, ddof=1, weights=None, ignore_nan=False):
512 | """
513 | Streaming standard error in the mean (SEM) of arrays. This is equivalent to
514 |     calling `scipy.stats.sem(axis=2)` on a stack of images.
515 |
516 | Parameters
517 | ----------
518 | arrays : iterable of ndarrays
519 |         Arrays to be combined. This iterable can also be a generator.
520 |     axis : int, optional
521 |         Reduction axis. Default is to combine the arrays in the stream as if
522 |         they had been stacked along a new axis, then compute the standard error along this new axis.
523 |         If None, arrays are flattened. If `axis` is an int larger than
524 | the number of dimensions in the arrays of the stream, standard error is computed
525 | along the new axis.
526 | ddof : int, optional
527 | Means Delta Degrees of Freedom. The divisor used in calculations
528 | is ``N - ddof``, where ``N`` represents the number of elements.
529 | weights : iterable of ndarray, iterable of floats, or None, optional
530 | Iterable of weights associated with the values in each item of `arrays`.
531 | Each value in an element of `arrays` contributes to the standard error
532 | according to its associated weight. The weights array can either be a float
533 |         or an array of the same shape as any element of `arrays`. If ``weights=None``,
534 | then all data in each element of `arrays` are assumed to have a weight equal to one.
535 | ignore_nan : bool, optional
536 | If True, NaNs are set to zero weight. Default is propagation of NaNs.
537 |
538 | Yields
539 | ------
540 |     sem : `~numpy.ndarray`, dtype float
541 | Standard error in the mean.
542 |
543 | See Also
544 | --------
545 | scipy.stats.sem : standard error in the mean of dense arrays.
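    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch. This assumes that ``last`` is importable
    |     from the top-level package, as in the test suite; until more than ``ddof``
    |     arrays have been consumed, the yielded values are NaN, so only the final
    |     value is kept here:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import isem, last
    |     >>> last(isem([np.array([0.0]), np.array([2.0])])).item()  # final SEM
    |     1.0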
546 | """
547 | primitive = _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan)
548 | for avg, sq_avg, swgt in primitive:
549 | yield np.sqrt((sq_avg - avg**2) * (1 / (swgt - ddof)))
550 |
551 |
552 | @array_stream
553 | def ihistogram(arrays, bins, weights=None):
554 | """
555 | Streaming histogram calculation.
556 |
557 | Parameters
558 | ----------
559 | arrays : iterable of ndarrays
560 |         Arrays to be combined. This iterable can also be a generator. Arrays in this stream
561 | can be of any shape; the histogram is computed over the flattened array.
562 | bins : iterable
563 | Bin edges, including the rightmost edge, allowing for non-uniform bin widths.
564 | To determine the appropriate bins automatically, see ``numpy.histogram_bin_edges``.
565 | weights : iterable of ndarray, iterable of floats, or None, optional
566 | Iterable of weights associated with the values in each item of `arrays`.
567 |         Each value only contributes its associated weight towards the
568 | bin count (instead of 1). The weights array can either be a float
569 | or an array of the same shape as any element of `arrays`. If ``weights=None``,
570 | then all data in each element of `arrays` are assumed to have a weight equal to one.
571 |
572 | .. versionadded:: 1.6.1
573 |
574 | Yields
575 | ------
576 | hist : `~numpy.ndarray`
577 | Streamed histogram.
578 |
579 | See Also
580 | --------
581 | numpy.histogram : 1D histogram of dense arrays.
582 | numpy.histogram_bin_edges : automatic selection of bins
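    |
    |     Examples
    |     --------
    |     A minimal, illustrative sketch. This assumes that ``last`` is importable
    |     from the top-level package, as in the test suite:
    |
    |     >>> import numpy as np
    |     >>> from npstreams import ihistogram, last
    |     >>> stream = [np.array([0.1, 0.4]), np.array([0.6, 0.9])]  # toy data
    |     >>> last(ihistogram(stream, bins=[0.0, 0.5, 1.0]))
    |     array([2, 2])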
583 | """
584 | bins = np.asarray(bins)
585 | first, arrays = peek(arrays)
586 |
587 | if weights is None:
588 | weights = repeat(None)
589 | else:
590 | weights = map(partial(np.broadcast_to, shape=first.shape), weights)
591 |
592 | # np.histogram also returns the bin edges, which we ignore
593 | hist_func = lambda arr, wgt: np.histogram(arr, bins=bins, weights=wgt)[0]
594 | yield from isum(starmap(hist_func, zip(arrays, weights)))
595 |
--------------------------------------------------------------------------------