├── npstreams ├── tests │ ├── __init__.py │ ├── data │ │ ├── test_data1.npy │ │ ├── test_data2.npy │ │ └── test_data3.npy │ ├── test_array_utils.py │ ├── test_stacking.py │ ├── test_array_stream.py │ ├── test_linalg.py │ ├── test_parallel.py │ ├── test_flow.py │ ├── test_cuda.py │ ├── test_iter_utils.py │ ├── test_reduce.py │ ├── test_numerics.py │ └── test_stats.py ├── __init__.py ├── stacking.py ├── array_utils.py ├── array_stream.py ├── flow.py ├── linalg.py ├── parallel.py ├── iter_utils.py ├── cuda.py ├── benchmarks.py ├── numerics.py ├── reduce.py └── stats.py ├── docs ├── whatsnew.rst ├── references.txt ├── recipes.rst ├── control_flow.rst ├── cuda.rst ├── installation.rst ├── conventions.rst ├── api.rst ├── index.rst ├── conf.py └── making_your_own.rst ├── MANIFEST.in ├── RELEASE-CHECKLIST.rst ├── .readthedocs.yml ├── .gitattributes ├── release-description.py ├── LICENSE.txt ├── CHANGELOG.rst ├── .gitignore ├── pyproject.toml ├── .github └── workflows │ └── ci.yml └── README.md /npstreams/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/whatsnew.rst: -------------------------------------------------------------------------------- 1 | What's new 2 | ========== 3 | 4 | .. include:: ../CHANGELOG.rst -------------------------------------------------------------------------------- /npstreams/tests/data/test_data1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data1.npy -------------------------------------------------------------------------------- /npstreams/tests/data/test_data2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data2.npy -------------------------------------------------------------------------------- /npstreams/tests/data/test_data3.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LaurentRDC/npstreams/HEAD/npstreams/tests/data/test_data3.npy -------------------------------------------------------------------------------- /docs/references.txt: -------------------------------------------------------------------------------- 1 | .. _Numpy: http://www.numpy.org 2 | .. _Scipy: https://www.scipy.org 3 | .. _PyCUDA: https://documen.tician.de/pycuda/ -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG.rst 2 | include LICENSE.txt 3 | include README.md 4 | 5 | recursive-include npstreams/tests/data * 6 | 7 | recursive-exclude docs * 8 | 9 | global-exclude *.py[cod] __pycache__ *.so *.dylib -------------------------------------------------------------------------------- /RELEASE-CHECKLIST.rst: -------------------------------------------------------------------------------- 1 | Release checklist 2 | ----------------- 3 | 4 | To create a release, simply create a tag that starts with 'v' (e.g. 'v2.0.0'):: 5 | 6 | git tag -a "v2.0.0" 7 | git push origin "v2.0.0" 8 | 9 | The package will be automatically tested, released on GitHub and uploaded to PyPI. 
-------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | sphinx: 8 | configuration: docs/conf.py 9 | 10 | build: 11 | os: ubuntu-22.04 12 | tools: 13 | python: "3.10" 14 | 15 | python: 16 | install: 17 | - method: pip 18 | path: . 19 | extra_requirements: 20 | - development -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /release-description.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extract the changes from the last release 3 | """ 4 | 5 | import sys 6 | 7 | if __name__ == "__main__": 8 | filename = sys.argv[1] 9 | 10 | with open(filename, mode="r") as f: 11 | 12 | # Look for the first second-level title 13 | for line in f: 14 | if line.startswith("Release"): 15 | break 16 | 17 | print(line, end="") 18 | for line in f: 19 | if not line.startswith("Release"): 20 | print(line, end="") 21 | else: 22 | # Exit gracefully 23 | sys.exit(0) 24 | # There was a problem: Exit with error 25 | sys.exit(-1) 26 | -------------------------------------------------------------------------------- /npstreams/tests/test_array_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from npstreams import nan_to_num 3 | 4 | 5 | def test_nan_to_num_generic(): 6 | """Test that NaNs are replaced with a fill value""" 7 | with np.errstate(divide="ignore", invalid="ignore"): 8 | vals = nan_to_num(np.array([0]) / 0.0, fill_value=14) 9 | assert vals[0] == 14 10 | 11 | 12 | def test_nan_to_num_integer(): 13 | """Test that nan_to_num on integers does nothing""" 14 | vals = nan_to_num(1) 15 | assert vals == 1 16 | vals = nan_to_num([1]) 17 | assert np.allclose(vals, np.array([1])) 18 | 19 | 20 | def test_nan_to_num_complex_good(): 21 | """Test nan_to_num on complex input""" 22 | vals = nan_to_num(1 + 1j) 23 | assert vals == 1 + 1j 24 | -------------------------------------------------------------------------------- /npstreams/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Laurent P. 
René de Cotret" 3 | __email__ = "laurent.decotret@outlook.com" 4 | __license__ = "BSD" 5 | __version__ = "1.7.0" 6 | 7 | from .benchmarks import benchmark 8 | from .array_stream import array_stream, ArrayStream 9 | from .array_utils import nan_to_num 10 | from .linalg import idot, itensordot, ieinsum, iinner 11 | from .parallel import pmap, pmap_unordered, preduce 12 | from .flow import ipipe, iload, pload 13 | from .iter_utils import ( 14 | cyclic, 15 | last, 16 | chunked, 17 | multilinspace, 18 | linspace, 19 | peek, 20 | itercopy, 21 | primed, 22 | length_hint, 23 | ) 24 | from .reduce import ireduce_ufunc, preduce_ufunc, reduce_ufunc 25 | from .stacking import stack 26 | from .stats import ( 27 | iaverage, 28 | average, 29 | imean, 30 | mean, 31 | istd, 32 | std, 33 | ivar, 34 | var, 35 | isem, 36 | sem, 37 | average_and_var, 38 | ihistogram, 39 | ) 40 | from .numerics import isum, sum, iprod, prod, isub, iall, iany, imax, imin 41 | -------------------------------------------------------------------------------- /npstreams/tests/test_stacking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | from npstreams import stack 6 | import pytest 7 | 8 | 9 | def test_stack_against_numpy_stack(): 10 | """Test against numpy.stack for axis = -1""" 11 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)] 12 | 13 | dense = np.stack(stream, axis=-1) 14 | from_stack = stack(stream, axis=-1) 15 | assert np.allclose(dense, from_stack) 16 | 17 | 18 | def test_stack_on_single_array(): 19 | """Test that npstreams.stack works with a single array""" 20 | arr = np.random.random((16, 16)) 21 | stacked = stack(arr) 22 | assert np.allclose(arr[..., np.newaxis], stacked) 23 | 24 | 25 | @pytest.mark.parametrize("axis", range(4)) 26 | def test_stack_against_numpy_concatenate(axis): 27 | """Test against numpy.concatenate for existing axes""" 28 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)] 29 | 30 | dense = np.concatenate(stream, axis=axis) 31 | from_stack = stack(stream, axis=axis) 32 | assert np.allclose(dense, from_stack) 33 | -------------------------------------------------------------------------------- /docs/recipes.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _recipes: 4 | 5 | ******* 6 | Recipes 7 | ******* 8 | 9 | Single-pass mean and error calculation 10 | -------------------------------------- 11 | 12 | Here is a snippet for a function that computes a mean 13 | and standard error in the mean (SEM) in a single pass:: 14 | 15 | from npstreams import imean, isem, array_stream, itercopy 16 | 17 | # The `array_stream` decorator ensures that the elements of 18 | # the iterable `arrays` will be converted to ndarrays if possible 19 | # This decorator is not required. 20 | @array_stream 21 | def mean_and_error(arrays, axis = -1): 22 | """ Yields (mean, error) pairs from a stream of arrays """ 23 | # itercopy creates a copy of the original stream 24 | # The elements are only generated once, and then fed 25 | # to those two copies; much more efficient than 26 | # creating two streams from scratch. 
27 | arrays_for_mean, arrays_for_sem = itercopy(arrays) 28 | 29 | means = imean(arrays_for_mean, axis = axis) 30 | errors = isem(arrays_for_sem, axis = axis) 31 | 32 | yield from zip(means, errors) -------------------------------------------------------------------------------- /npstreams/stacking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Stacking arrays from a stream 4 | ----------------------------- 5 | """ 6 | from collections.abc import Sized 7 | from functools import partial 8 | 9 | import numpy as np 10 | 11 | from .array_stream import array_stream 12 | 13 | 14 | @array_stream 15 | def stack(arrays, axis=-1): 16 | """ 17 | Stack of all arrays from a stream. Generalization of numpy.stack 18 | and numpy.concatenate. 19 | 20 | Parameters 21 | ---------- 22 | arrays : iterable 23 | Stream of NumPy arrays. Arrays must have shapes that broadcast together. 24 | axis : int, optional 25 | Stacking direction. If ``axis = -1``, arrays are stacked along a 26 | new dimension. 27 | 28 | Returns 29 | ------- 30 | stacked : ndarray 31 | Cumulative stacked array. 32 | """ 33 | # Shortcut : if axis == -1, this is exactly what ArrayStream.__array__ does 34 | if axis == -1: 35 | return np.array(arrays) 36 | 37 | # TODO: Shortcut if we already know the stream length 38 | # Note : we are guaranteed that `arrays` is a stream of arrays 39 | # at worst a tuple (arr,) 40 | # Use npstreams.length_hint 41 | arrays = iter(arrays) 42 | first = next(arrays) 43 | stack = np.array(first, copy=True) 44 | 45 | for array in arrays: 46 | stack = np.concatenate([stack, array], axis=axis) 47 | 48 | return stack 49 | -------------------------------------------------------------------------------- /docs/control_flow.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _control_flow: 4 | 5 | ************ 6 | Control Flow 7 | ************ 8 | 9 | .. currentmodule:: npstreams 10 | 11 | ========================= 12 | Streaming array pipelines 13 | ========================= 14 | 15 | Before reducing your stream of arrays (e.g. averaging them together), you may want to 16 | transform them. This can be done with the :func:`ipipe` function: 17 | 18 | .. autofunction:: ipipe 19 | :noindex: 20 | 21 | Imagine we have the following pipeline, in which we want to process images in some iterable :data:`arrays` 22 | as follows: 23 | 24 | * Remove negative pixel intensity values; 25 | * Adjust the gamma value of images (from Scikit-image's :mod:`exposure` module); 26 | * Average the result together. 27 | 28 | The following lines will do the trick:: 29 | 30 | from functools import partial 31 | from npstreams import ipipe, iaverage, last 32 | from skimage.exposure import adjust_gamma 33 | 34 | def remove_negative(arr): 35 | arr[arr < 0] = 0 36 | return arr 37 | 38 | pipeline = ipipe(adjust_gamma, remove_negative, arrays) 39 | avgs = last(iaverage(pipeline)) 40 | 41 | If the pipeline is computationally intensive, we can also pipe arrays in parallel using the 42 | keyword-only ``processes``:: 43 | 44 | pipeline = ipipe(adjust_gamma, remove_negative, arrays, processes = 4) # 4 cores will be used 45 | avgs = last(iaverage(pipeline)) 46 | 47 | Since :func:`ipipe` uses :func:`pmap` under the hood, we can also use all available cores 48 | by passing ``processes = None``. 
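49 | 50 | Pipelines can also start directly from files on disk via :func:`iload`. The following is a minimal sketch (the glob pattern and image-reading function are illustrative), reusing ``remove_negative`` from above:: 51 | 52 | from npstreams import iload, ipipe, iaverage, last 53 | from skimage.io import imread 54 | from skimage.exposure import adjust_gamma 55 | 56 | arrays = iload('images_*.tif', load_func = imread) # lazy stream of images 57 | pipeline = ipipe(adjust_gamma, remove_negative, arrays, processes = None) 58 | avg = last(iaverage(pipeline)) 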
-------------------------------------------------------------------------------- /docs/cuda.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _cuda: 4 | 5 | ============ 6 | CUDA support 7 | ============ 8 | 9 | .. currentmodule:: npstreams 10 | 11 | What is CUDA 12 | ============ 13 | 14 | `CUDA `_ is a computing platform taking advantage of Nvidia hardware. 15 | It effectively allows for array computations on Graphics Processing Units (GPUs). 16 | 17 | :mod:`npstreams` relies on the (optional) `PyCUDA`_ library 18 | to access CUDA functionality. 19 | 20 | Advantages of CUDA 21 | ------------------ 22 | 23 | TODO: benchmarks 24 | 25 | CUDA in npstreams 26 | ================= 27 | 28 | `PyCUDA`_ is an optional dependency. Therefore, the CUDA-enabled functions are located in a separate 29 | module, the :mod:`npstreams.cuda` submodule. 30 | 31 | Importing from :mod:`npstreams.cuda` submodule 32 | ---------------------------------------------- 33 | 34 | Importing anything from the :mod:`npstreams.cuda` submodule will raise an ``ImportError`` in the following cases: 35 | 36 | * `PyCUDA`_ is not installed; 37 | * No GPUs are available; 38 | * CUDA compilation backend is not available, possibly due to incomplete installation. 39 | 40 | With this in mind, it is wise to wrap import statements from :mod:`npstreams.cuda` in a ``try/except`` block. 41 | 42 | CUDA-enabled routines 43 | --------------------- 44 | 45 | A limited set of functions implemented in npstreams also have CUDA-enabled equivalents. For performance reasons, 46 | all CUDA-enabled routines operate along the 'stream' axis, i.e. as if the arrays had been stacked 47 | along a new dimension. -------------------------------------------------------------------------------- /npstreams/tests/test_array_stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from npstreams.array_stream import array_stream, ArrayStream 5 | 6 | 7 | @array_stream 8 | def iden(arrays): 9 | yield from arrays 10 | 11 | 12 | def test_array_stream_decorator_type(): 13 | """Test that all objects from an array stream are ndarrays""" 14 | 15 | stream = [0, 1, np.array([1])] 16 | for arr in iden(stream): 17 | assert isinstance(arr, np.ndarray) 18 | 19 | 20 | def test_single_array(): 21 | """Test that a 'stream' consisting of a single array is repackaged into an iterable""" 22 | stream = np.array([1, 2, 3]) 23 | assert len(list(iden(stream))) == 1 24 | 25 | 26 | def test_array_stream_length_hint_sized_iterable(): 27 | """Test the accuracy of __length_hint__ for ArrayStream constructed 28 | from a sized iterable""" 29 | iterable = [1, 2, 3, 4, 5] 30 | a = ArrayStream(iterable) 31 | assert len(iterable) == a.__length_hint__() 32 | 33 | 34 | def test_array_stream_length_hint_not_sized_iterable(): 35 | """Test that __length_hint__ returns NotImplemented for ArrayStream constructed 36 | from an unsized iterable""" 37 | iterable = (0 for _ in range(10)) 38 | a = ArrayStream(iterable) 39 | assert a.__length_hint__() is NotImplemented 40 | 41 | 42 | def test_array_stream_conversion_to_array(): 43 | """Test that numpy.array(ArrayStream(...)) returns an array built as a stack of arrays""" 44 | a = ArrayStream([np.random.random((16, 16)) for _ in range(10)]) 45 | arr = np.array(a) 46 | assert arr.shape == (16, 16, 10) 47 | -------------------------------------------------------------------------------- 
/npstreams/array_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Array utilities 4 | --------------- 5 | """ 6 | import numpy as np 7 | 8 | 9 | def nan_to_num(array, fill_value=0.0, copy=True): 10 | """ 11 | Replace NaNs with another fill value. 12 | 13 | Parameters 14 | ---------- 15 | array : array_like 16 | Input data. 17 | fill_value : float, optional 18 | NaNs will be replaced by ``fill_value``. Default is 0.0, in keeping 19 | with ``numpy.nan_to_num``. 20 | copy : bool, optional 21 | Whether to create a copy of `array` (True) or to replace values 22 | in-place (False). The in-place operation only occurs if 23 | casting to an array does not require a copy. 24 | 25 | Returns 26 | ------- 27 | out : ndarray 28 | Array without NaNs. If ``array`` was not of floating or complex type, 29 | ``array`` is returned unchanged. 30 | 31 | Notes 32 | ----- 33 | Contrary to ``numpy.nan_to_num``, this function does not handle 34 | infinite values. 35 | 36 | See Also 37 | -------- 38 | numpy.nan_to_num : replace NaNs and Infs with zeroes. 39 | """ 40 | array = np.array(array, subok=True, copy=copy) 41 | dtype = array.dtype.type 42 | 43 | # Non-inexact types do not have NaNs 44 | if not np.issubdtype(dtype, np.inexact): 45 | return array 46 | 47 | iscomplex = np.issubdtype(dtype, np.complexfloating) 48 | dest = (array.real, array.imag) if iscomplex else (array,) 49 | for d in dest: 50 | np.copyto(d, fill_value, where=np.isnan(d)) 51 | return array 52 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2020, Laurent P. René de Cotret. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | 2 | Release 1.7.0 3 | ------------- 4 | 5 | * Explicit support for NumPy 2, in addition to NumPy 1. 6 | 7 | Release 1.6.6 8 | ------------- 9 | 10 | * Added the ability to automatically publish to PyPI. 11 | 12 | Release 1.6.5 13 | ------------- 14 | 15 | * `Support for Python 3.6 and NumPy<1.17 has been dropped `_ 16 | * Migration of testing infrastructure to pytest. 17 | * Tests are now included in the package itself. 18 | * Fixed some deprecation warnings from NumPy 1.20+. 19 | 20 | Release 1.6.4 21 | ------------- 22 | 23 | * Fixed an issue regarding a deprecation of `collections.Sized` (in favour of `collections.abc.Sized`) in Python 3.10+ 24 | * Code snippets in documentation are now tested for correctness. 25 | * Tests are now included in source distributions. 26 | 27 | Release 1.6.3 28 | ------------- 29 | 30 | * Added support for Python 3.9 31 | 32 | Release 1.6.2 33 | ------------- 34 | 35 | * Added the ability to run default benchmarks from the command line with ``python -m npstreams.benchmarks``. 36 | * Added explicit support for Python 3.8. 37 | * Bumped requirement for `numpy >= 1.14`. 38 | 39 | Release 1.6.1 40 | ------------- 41 | 42 | * Added a changelog. 43 | * Added the possibility to use weights in ``ihistogram``. 44 | * Added the function ``average_and_var`` to compute the average and variance in a single pass. 45 | * Documentation regarding the ``ddof`` keyword in many statistical functions wrongly stated that the default value was 1. This has been corrected. 46 | 47 | Release 1.6 48 | ----------- 49 | 50 | * Fixed some issues with NumPy versions above 1.16. 51 | 52 | Release 1.5.2 53 | ------------- 54 | 55 | * Added benchmarking capabilities. 56 | * Added the ``array_stream`` decorator. 57 | * Removed support for Python < 3.6. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Visual studio cache 10 | *.vs/ 11 | *.vscode/ 12 | 13 | # autogenerated documentation 14 | docs/source/functions/ 15 | docs/source/classes/ 16 | 17 | # Jupyter notebooks 18 | notebooks/ 19 | 20 | # Distribution / packaging 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *,cover 58 | .hypothesis/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | # These directories are autogenerated 77 | docs/_build/ 78 | docs/functions/ 79 | docs/classes/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # IPython Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | venv/ 98 | ENV/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # PyCharm 107 | .idea/ -------------------------------------------------------------------------------- /npstreams/tests/test_linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from random import randint, random 4 | 5 | import numpy as np 6 | 7 | from npstreams import idot, itensordot, iinner, ieinsum, last 8 | import pytest 9 | 10 | 11 | def test_idot_against_numpy_multidot(): 12 | """Test against numpy.linalg.multi_dot in 2D case""" 13 | stream = [np.random.random((8, 8)) for _ in range(7)] 14 | 15 | from_numpy = np.linalg.multi_dot(stream) 16 | from_stream = last(idot(stream)) 17 | 18 | assert from_numpy.shape == from_stream.shape 19 | assert np.allclose(from_numpy, from_stream) 20 | 21 | 22 | @pytest.mark.parametrize("axis", (0, 1, 2)) 23 | def test_itensordot_against_numpy_tensordot(axis): 24 | """Test against numpy.tensordot in 2D case""" 25 | stream = tuple(np.random.random((8, 8)) for _ in range(2)) 26 | 27 | from_numpy = np.tensordot(*stream) 28 | from_stream = last(itensordot(stream)) 29 | 30 | assert from_numpy.shape == from_stream.shape 31 | assert np.allclose(from_numpy, from_stream) 32 | 33 | 34 | @pytest.mark.parametrize("axis", (0, 1, 2)) 35 | def test_iinner_against_numpy_inner(axis): 36 | """Test against numpy.inner in 2D case""" 37 | stream = tuple(np.random.random((8, 8)) for _ in range(2)) 38 | 39 | from_numpy = np.inner(*stream) 40 | from_stream = last(iinner(stream)) 41 | 42 | assert from_numpy.shape == from_stream.shape 43 | assert np.allclose(from_numpy, from_stream) 44 | 45 | 46 | def test_ieinsum_against_numpy_einsum(): 47 | """Test against numpy.einsum""" 48 | a = np.arange(60.0).reshape(3, 4, 5) 49 | b = np.arange(24.0).reshape(4, 3, 2) 50 | stream = [a, b] 51 | 52 | from_numpy = np.einsum("ijk,jil->kl", a, b) 53 | from_stream = last(ieinsum(stream, "ijk,jil->kl")) 54 | 55 | assert from_numpy.shape == from_stream.shape 56 | assert np.allclose(from_numpy, from_stream) 57 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["build", "setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools.dynamic] 6 | version = {attr = "npstreams.__version__"} 7 | 8 | [project] 9 | name = "npstreams" 10 | dynamic = ["version"] 11 | authors = [ 12 | { name="Laurent P. 
René de Cotret", email="laurent.decotret@outlook.com" }, 13 | ] 14 | maintainers = [ 15 | { name="Laurent P. René de Cotret", email="laurent.decotret@outlook.com" }, 16 | ] 17 | description = "Streaming operations on NumPy arrays" 18 | readme = "README.md" 19 | license = {file = "LICENSE.txt"} 20 | requires-python = ">=3.7, <4" 21 | dependencies = ["numpy >= 1.17, <3"] 22 | keywords=["streaming", "numpy", "math"] 23 | classifiers = [ 24 | "Environment :: Console", 25 | "Intended Audience :: Science/Research", 26 | "Topic :: Scientific/Engineering", 27 | "License :: OSI Approved :: BSD License", 28 | "Natural Language :: English", 29 | "Operating System :: OS Independent", 30 | "Programming Language :: Python", 31 | "Programming Language :: Python :: 3", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | development = [ 36 | "Sphinx >= 3", 37 | "sphinx_rtd_theme >= 0.4", 38 | "pytest >= 6", 39 | "scipy >= 1", 40 | ] 41 | 42 | [project.urls] 43 | Documentation = "https://npstreams.readthedocs.io/" 44 | Repository = "https://github.com/LaurentRDC/npstreams" 45 | "Bug Tracker" = "https://github.com/LaurentRDC/npstreams/issues" 46 | 47 | [tool.black] 48 | line-length = 120 49 | include = '\.pyi?$' 50 | 51 | [tool.isort] 52 | profile = "black" 53 | 54 | [tool.pytest.ini_options] 55 | minversion = "6.0" 56 | log_cli_level = "INFO" 57 | addopts = ["--doctest-modules"] 58 | testpaths = ["npstreams/tests"] 59 | 60 | # See here for an explanation of how to include package data: 61 | # https://setuptools.pypa.io/en/latest/userguide/datafiles.html#package-data 62 | [tool.setuptools.package-data] 63 | npstreams = ["tests/data/*.npy"] 64 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _installation: 4 | 5 | ************ 6 | Installation 7 | ************ 8 | 9 | Requirements 10 | ============ 11 | 12 | **npstreams** works on Linux, Mac OS X and Windows. It requires Python 3.7+ 13 | as well as `numpy`_. `scipy`_ is an optional dependency that is only used in 14 | tests; however, if SciPy cannot be imported, tests will not fail. 15 | 16 | To get access to the :mod:`npstreams.cuda` module, which contains CUDA-enabled routines, 17 | PyCUDA_ must be installed as well. 18 | 19 | Install npstreams 20 | ================= 21 | 22 | npstreams is available on PyPI; it can be installed with `pip `_:: 23 | 24 | python -m pip install npstreams 25 | 26 | npstreams can also be installed with the conda package manager, from the conda-forge channel:: 27 | 28 | conda config --add channels conda-forge 29 | conda install npstreams 30 | 31 | You can install the latest developer version of npstreams by cloning the git 32 | repository:: 33 | 34 | git clone https://github.com/LaurentRDC/npstreams.git 35 | 36 | ...then installing the package with:: 37 | 38 | cd npstreams 39 | pip install . 40 | 41 | 42 | Testing 43 | ======= 44 | 45 | If you want to check that all the tests are running correctly with your Python 46 | configuration, type:: 47 | 48 | pip install .[development] 49 | pytest 50 | 51 | 52 | Embedding in applications 53 | ========================= 54 | 55 | `npstreams` is designed to be used in conjunction with multiprocessing libraries, such as the standard 56 | `multiprocessing` library. `npstreams` even uses `multiprocessing` directly in certain functions. 
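57 | 58 | As a minimal sketch (the ``transform`` function and array sizes below are illustrative), a script that distributes work over two processes with :func:`pmap` could look like this; the ``freeze_support()`` call only matters for frozen applications, as explained below:: 59 | 60 | import numpy as np 61 | from multiprocessing import freeze_support 62 | from npstreams import pmap 63 | 64 | def transform(arr): 65 | return np.square(arr) 66 | 67 | if __name__ == "__main__": 68 | freeze_support() # no-op outside of frozen executables 69 | arrays = [np.random.random((64, 64)) for _ in range(16)] 70 | for result in pmap(transform, arrays, processes = 2): 71 | print(result.mean()) 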
72 | 73 | In order to use the multicore functionality of `npstreams` in applications frozen with `py2exe`, `PyInstaller`, or `cx_Freeze`, 74 | you will need to call the ``multiprocessing.freeze_support()`` function. `You can read more 75 | about it here. <https://docs.python.org/3.7/library/multiprocessing.html#multiprocessing.freeze_support>`_ -------------------------------------------------------------------------------- /npstreams/tests/test_parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from npstreams import pmap, pmap_unordered, preduce 3 | from functools import reduce 4 | import numpy as np 5 | from operator import add 6 | 7 | 8 | def identity(obj, *args, **kwargs): 9 | """ignores args and kwargs""" 10 | return obj 11 | 12 | 13 | def test_preduce_preduce_one_process(): 14 | """Test that preduce reduces to functools.reduce for a single process""" 15 | integers = list(range(0, 10)) 16 | preduce_results = preduce(add, integers, processes=1) 17 | reduce_results = reduce(add, integers) 18 | 19 | assert preduce_results == reduce_results 20 | 21 | 22 | def test_preduce_preduce_multiple_processes(): 23 | """Test that preduce agrees with functools.reduce for multiple processes""" 24 | integers = list(range(0, 10)) 25 | preduce_results = preduce(add, integers, processes=2) 26 | reduce_results = reduce(add, integers) 27 | 28 | assert preduce_results == reduce_results 29 | 30 | 31 | def test_preduce_on_numpy_arrays(): 32 | """Test sum of numpy arrays as parallel reduce""" 33 | arrays = [np.zeros((32, 32)) for _ in range(10)] 34 | s = preduce(add, arrays, processes=2) 35 | 36 | assert np.allclose(s, arrays[0]) 37 | 38 | 39 | def test_preduce_with_kwargs(): 40 | """Test preduce with keyword-arguments""" 41 | pass 42 | 43 | 44 | def test_pmap_trivial_map_no_args(): 45 | """Test that pmap is working with no positional arguments""" 46 | integers = list(range(0, 10)) 47 | result = list(pmap(identity, integers, processes=2)) 48 | assert integers == result 49 | 50 | 51 | def test_pmap_trivial_map_kwargs(): 52 | """Test that pmap is working with args and kwargs""" 53 | integers = list(range(0, 10)) 54 | result = list(pmap(identity, integers, processes=2, kwargs={"test": True})) 55 | assert result == integers 56 | 57 | 58 | def test_pmap_unordered_trivial_map_no_args(): 59 | """Test that pmap_unordered is working with no positional arguments""" 60 | integers = list(range(0, 10)) 61 | result = list(sorted(pmap_unordered(identity, integers, processes=2))) 62 | assert integers == result 63 | 64 | 65 | def test_pmap_unordered_trivial_map_kwargs(): 66 | """Test that pmap_unordered is working with args and kwargs""" 67 | integers = list(range(0, 10)) 68 | result = list( 69 | sorted(pmap_unordered(identity, integers, processes=2, kwargs={"test": True})) 70 | ) 71 | assert result == integers 72 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous integration 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | # To prevent this job from running, have "[skip ci]" or "[ci skip]" in the commit message 10 | if: contains(toJson(github.event.commits), '[ci skip]') == false && contains(toJson(github.event.commits), '[skip ci]') == false 11 | 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 18 | 19 | steps: 20 | - uses: 
actions/checkout@v4 21 | 22 | - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install .[development] 31 | 32 | # Note the use of the -Wa flag to show DeprecationWarnings 33 | # We run the tests on the installed package 34 | - name: Unit tests and doctests 35 | run: | 36 | python -Wa -m pytest 37 | 38 | - name: Build documentation 39 | run: 40 | sphinx-build -M html docs build/docs 41 | 42 | 43 | release: 44 | if: startsWith(github.ref, 'refs/tags/v') 45 | needs: [build] 46 | runs-on: ubuntu-latest 47 | permissions: 48 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 49 | contents: write # To create a release 50 | steps: 51 | - uses: actions/checkout@v4 52 | 53 | - name: Set up Python 54 | uses: actions/setup-python@v5 55 | with: 56 | python-version: "3.10" 57 | 58 | - name: Install dependencies 59 | run: | 60 | pip install build 61 | pip install .[development] 62 | 63 | - name: Create release description 64 | run: | 65 | python release-description.py CHANGELOG.rst > description.md 66 | cat description.md 67 | 68 | - name: Create source distribution 69 | run: | 70 | python -m build 71 | 72 | - name: Create release 73 | uses: softprops/action-gh-release@v2 74 | with: 75 | body_path: description.md 76 | files: | 77 | dist/* 78 | 79 | # Github Actions have been set as a trusted publisher on PyPI's npstreams project, 80 | # hence why no username, password, or token is required. 81 | - name: Upload to PyPI 82 | if: always() 83 | uses: pypa/gh-action-pypi-publish@release/v1 84 | -------------------------------------------------------------------------------- /docs/conventions.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _conventions: 4 | 5 | *********** 6 | Conventions 7 | *********** 8 | 9 | .. currentmodule:: npstreams 10 | 11 | Stream Conventions 12 | ------------------ 13 | 14 | Most (all?) functions in :mod:`npstreams` are designed to work on streams, or 15 | iterables of NumPy arrays. These iterables can be infinite. 16 | The quintessential example is a stream of images progressively read from disk. 17 | These streams of arrays must contain arrays that all have the same shape and data-type, 18 | unless specified otherwise. 19 | 20 | An example of a function that operates on a stream of arrays of different shapes is :func:`ieinsum`. 21 | 22 | A single NumPy array can be passed where a stream is expected; the array will be repackaged 23 | into a stream of a single array. 24 | 25 | Naming Conventions 26 | ------------------ 27 | 28 | In order to facilitate documentation, functions in :mod:`npstreams` follow these conventions: 29 | 30 | * Routines are named after their closest equivalent in :mod:`numpy` and :mod:`scipy`. 31 | * Routines with names starting with 'i' (e.g. :func:`iprod`) are generator functions; they yield running results 32 | as they are being computed. Usually, these functions have a non-generator equivalent that 33 | consumes the entire stream (e.g. :func:`iaverage` vs. :func:`average`). 34 | * Routines with names starting with 'c' (e.g. :func:`csum`) are CUDA-enabled (requires :mod:`pycuda`). 35 | * Routines with names starting with 'p' (e.g. :func:`pmap`) can be parallelized. The default 36 | behavior is always to not use multiple cores. 
For example, the default behavior of :func:`pmap` 37 | is to behave like :func:`map`. 38 | 39 | Axis Conventions 40 | ---------------- 41 | 42 | NumPy arrays provide operations along axes. Similarly, :mod:`npstreams` also 43 | exposes the :data:`axis` keyword in some (most?) reduction functions like :func:`isum` 44 | and :func:`iprod`. 45 | 46 | The convention for specification of the :data:`axis` parameter is as follows: 47 | 48 | * If ``axis = None``, arrays are flattened before being combined. The result will 49 | be a scalar or a 0d array. 50 | * The default (``axis = -1``) always corresponds to combining arrays along a 51 | new axis. For example, summing images together along ``axis = -1`` is equivalent 52 | to stacking images along a new axis, then summing along this new axis. 53 | * If ``axis`` is an ``int``, then arrays are reduced according to this axis, and then combined. 54 | 55 | CUDA-enabled functions 56 | ---------------------- 57 | Some functions are implemented using CUDA; see :ref:`cuda` for details. -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _api: 4 | 5 | ************* 6 | Reference/API 7 | ************* 8 | 9 | .. currentmodule:: npstreams 10 | 11 | Click on any function below to see detailed information. 12 | 13 | Creation of Streams 14 | ------------------- 15 | 16 | Decorator for streaming functions which guarantees that the stream elements will be converted to arrays. 17 | 18 | .. autosummary:: 19 | :toctree: functions/ 20 | 21 | array_stream 22 | 23 | The :func:`array_stream` decorator wraps iterables into an :class:`ArrayStream` iterator. This is not 24 | required to use the functions defined here, but it provides some nice guarantees. 25 | 26 | .. autosummary:: 27 | :toctree: classes/ 28 | 29 | ArrayStream 30 | 31 | Statistical Functions 32 | --------------------- 33 | 34 | .. autosummary:: 35 | :toctree: functions/ 36 | 37 | imean 38 | iaverage 39 | istd 40 | ivar 41 | isem 42 | ihistogram 43 | 44 | The following functions consume entire streams. By avoiding costly intermediate steps, 45 | they can perform much faster than their generator versions. 46 | 47 | .. autosummary:: 48 | :toctree: functions/ 49 | 50 | mean 51 | average 52 | std 53 | var 54 | sem 55 | average_and_var 56 | 57 | Numerics 58 | -------- 59 | 60 | .. autosummary:: 61 | :toctree: functions/ 62 | 63 | isum 64 | iprod 65 | isub 66 | 67 | .. autosummary:: 68 | :toctree: functions/ 69 | 70 | sum 71 | prod 72 | 73 | Linear Algebra 74 | -------------- 75 | .. autosummary:: 76 | :toctree: functions/ 77 | 78 | idot 79 | iinner 80 | itensordot 81 | ieinsum 82 | 83 | Control Flow 84 | ------------ 85 | .. autosummary:: 86 | :toctree: functions/ 87 | 88 | ipipe 89 | iload 90 | pload 91 | 92 | Comparisons 93 | ----------- 94 | .. autosummary:: 95 | :toctree: functions/ 96 | 97 | iany 98 | iall 99 | imax 100 | imin 101 | 102 | Parallelization 103 | --------------- 104 | .. autosummary:: 105 | :toctree: functions/ 106 | 107 | pmap 108 | pmap_unordered 109 | preduce 110 | 111 | Stacking 112 | -------- 113 | .. autosummary:: 114 | :toctree: functions/ 115 | 116 | stack 117 | 118 | Iterator Utilities 119 | ------------------ 120 | .. autosummary:: 121 | :toctree: functions/ 122 | 123 | last 124 | cyclic 125 | itercopy 126 | chunked 127 | linspace 128 | multilinspace 129 | peek 130 | primed 131 | length_hint 132 | 133 | Array Utilities 134 | --------------- 135 | .. 
autosummary:: 136 | :toctree: functions/ 137 | 138 | nan_to_num 139 | 140 | Benchmarking 141 | ------------ 142 | .. autosummary:: 143 | :toctree: functions/ 144 | 145 | benchmark -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _npstreams: 4 | 5 | ************************************** 6 | `npstreams`: streaming NumPy functions 7 | ************************************** 8 | 9 | :mod:`npstreams` is an open-source Python package for streaming NumPy array operations. 10 | The goal is to provide tested, (almost) drop-in replacements for NumPy functions (where possible) 11 | that operate on streams of arrays instead of dense arrays. 12 | 13 | :mod:`npstreams` also provides some utilities for parallelization. These parallelization 14 | generators can be combined with the streaming functions to drastically improve performance 15 | in some cases. 16 | 17 | The code presented herein has been in use at some point by the 18 | `Siwick research group `_. 19 | 20 | Example 21 | ======= 22 | 23 | Consider the following snippet to combine 50 images 24 | from an iterable :data:`source`:: 25 | 26 | import numpy as np 27 | 28 | images = np.empty( shape = (2048, 2048, 50) ) 29 | for index, im in enumerate(source): 30 | images[:,:,index] = im 31 | 32 | avg = np.average(images, axis = 2) 33 | 34 | If the :data:`source` iterable provided 10000 images, the above routine would 35 | not work on most machines. Moreover, what if we want to transform the images 36 | one by one before averaging them? What about looking at the average while it 37 | is being computed? Let's look at an example:: 38 | 39 | import numpy as np 40 | from npstreams import iaverage 41 | from scipy.misc import imread 42 | 43 | stream = map(imread, list_of_filenames) 44 | averaged = iaverage(stream) 45 | 46 | At this point, the generators :func:`map` and :func:`iaverage` are 'wired' 47 | but will not compute anything until results are requested. We can watch the average evolve:: 48 | 49 | import matplotlib.pyplot as plt 50 | for avg in averaged: 51 | plt.imshow(avg); plt.show() 52 | 53 | We can also use :func:`last` to get at the final average:: 54 | 55 | from npstreams import last 56 | 57 | total = last(averaged) # average of the entire stream. See also npstreams.average 58 | 59 | Benchmark 60 | ========= 61 | 62 | npstreams provides a function for benchmarking common use cases. 63 | 64 | To run the benchmark with default parameters, from the interpreter:: 65 | 66 | from npstreams import benchmark 67 | benchmark() 68 | 69 | From a command-line terminal:: 70 | 71 | python -m npstreams.benchmarks 72 | 73 | The results will be printed to the screen. 74 | 75 | Links 76 | ===== 77 | 78 | * `Source code `_ 79 | * `Issues `_ 80 | * `Docs `_ 81 | 82 | .. _npstreams_docs: 83 | 84 | General Documentation 85 | ===================== 86 | 87 | .. toctree:: 88 | :maxdepth: 3 89 | 90 | installation 91 | whatsnew 92 | conventions 93 | api 94 | cuda 95 | control_flow 96 | making_your_own 97 | recipes 98 | 99 | Authors 100 | ======= 101 | 102 | * Laurent P. 
René de Cotret -------------------------------------------------------------------------------- /npstreams/array_stream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from collections.abc import Iterator 4 | from functools import wraps 5 | 6 | import numpy as np 7 | from numpy import asanyarray 8 | 9 | from .iter_utils import length_hint, peek 10 | 11 | 12 | class ArrayStream(Iterator): 13 | """ 14 | Iterator of arrays. Elements from the stream are converted to 15 | NumPy arrays. If ``stream`` is a single array, it will be 16 | repackaged as a length 1 iterable. 17 | 18 | Arrays in the stream will be cast to the same data-type as the first 19 | array in the stream. The stream data-type is located in the `dtype` attribute. 20 | 21 | .. versionadded:: 1.5.2 22 | """ 23 | 24 | def __init__(self, stream): 25 | if isinstance(stream, np.ndarray): 26 | stream = (stream,) 27 | 28 | self._sequence_length = length_hint(stream, default=NotImplemented) 29 | 30 | # Once length_hint has been determined, we can peek into the stream 31 | first, stream = peek(stream) 32 | self._iterator = iter(stream) 33 | 34 | first = asanyarray(first) 35 | self.dtype = first.dtype 36 | 37 | def __repr__(self): 38 | """Verbose string representation""" 39 | representation = f"< {self.__class__.__name__} object" 40 | representation += f" of data-type {self.dtype}" 41 | 42 | if not (self._sequence_length is NotImplemented): 43 | representation += f" and a sequence length of {self._sequence_length}" 44 | else: 45 | representation += " of unknown length" 46 | 47 | return representation + " >" 48 | 49 | def __array__(self, *_, **__): 50 | """Returns a dense array created from this stream.""" 51 | # As of numpy version 1.14, arrays are expanded into a list before concatenation 52 | # Therefore, it's ok to build that list first 53 | arraylist = list(self) 54 | return np.stack(arraylist, axis=-1) 55 | 56 | def __length_hint__(self): 57 | """ 58 | In certain cases, an ArrayStream can have a definite size. 59 | See https://www.python.org/dev/peps/pep-0424/ 60 | """ 61 | return self._sequence_length 62 | 63 | def __next__(self): 64 | n = self._iterator.__next__() 65 | return asanyarray(n, dtype=self.dtype) 66 | 67 | 68 | def array_stream(func): 69 | """ 70 | Decorates streaming functions to make sure that the stream 71 | is a stream of ndarrays. Objects that are not arrays are transformed 72 | into arrays. If the stream is in fact a single ndarray, this ndarray 73 | is repackaged into a sequence of length 1. 74 | 75 | The first argument of the decorated function is assumed to be an iterable of 76 | arrays, or an iterable of objects that can be cast to arrays. 77 | 78 | Note that using this decorator also ensures that the stream is only wrapped once 79 | by the conversion function. 
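80 | 81 | Below is a minimal, illustrative example (the decorated function is a stand-in): 82 | 83 | >>> @array_stream 84 | ... def take_one(arrays): 85 | ...     return next(arrays) 86 | >>> take_one([[1, 2], [3, 4]]) 87 | array([1, 2]) 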
88 | """ 89 | 90 | @wraps(func) 91 | def decorated(arrays, *args, **kwargs): 92 | if isinstance(arrays, ArrayStream): 93 | return func(arrays, *args, **kwargs) 94 | return func(ArrayStream(arrays), *args, **kwargs) 95 | 96 | return decorated 97 | -------------------------------------------------------------------------------- /npstreams/tests/test_flow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from pathlib import Path 5 | from npstreams import array_stream, ipipe, last, iload, pload, isum 6 | 7 | 8 | @array_stream 9 | def iden(arrays): 10 | yield from arrays 11 | 12 | 13 | def test_ipipe_order(): 14 | """Test that ipipe(f, g, h, arrays) -> f(g(h(arr))) for arr in arrays""" 15 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)] 16 | squared = [np.cbrt(np.square(arr)) for arr in stream] 17 | pipeline = ipipe(np.cbrt, np.square, stream) 18 | 19 | assert all(np.allclose(s, p) for s, p in zip(pipeline, squared)) 20 | 21 | 22 | def test_ipipe_multiprocessing(): 23 | """Test that ipipe(f, g, h, arrays) -> f(g(h(arr))) for arr in arrays""" 24 | stream = [np.random.random((15, 7, 2, 1)) for _ in range(10)] 25 | squared = [np.cbrt(np.square(arr)) for arr in stream] 26 | pipeline = ipipe(np.cbrt, np.square, stream, processes=2) 27 | 28 | assert all(np.allclose(s, p) for s, p in zip(pipeline, squared)) 29 | 30 | 31 | def test_iload_glob(): 32 | """Test that iload works on glob-like patterns""" 33 | stream = iload(Path(__file__).parent / "data" / "test_data*.npy", load_func=np.load) 34 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 35 | assert np.allclose(s, np.zeros_like(s)) 36 | 37 | 38 | def test_iload_file_list(): 39 | """Test that iload works on iterable of filenames""" 40 | files = [ 41 | Path(__file__).parent / "data" / "test_data1.npy", 42 | Path(__file__).parent / "data" / "test_data2.npy", 43 | Path(__file__).parent / "data" / "test_data3.npy", 44 | ] 45 | stream = iload(files, load_func=np.load) 46 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 47 | assert np.allclose(s, np.zeros_like(s)) 48 | 49 | 50 | def test_pload_glob(): 51 | """Test that pload works on glob-like patterns""" 52 | stream = pload(Path(__file__).parent / "data" / "test_data*.npy", load_func=np.load) 53 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 54 | assert np.allclose(s, np.zeros_like(s)) 55 | 56 | stream = pload( 57 | Path(__file__).parent / "data" / "test_data*.npy", 58 | load_func=np.load, 59 | processes=2, 60 | ) 61 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 62 | assert np.allclose(s, np.zeros_like(s)) 63 | 64 | 65 | def test_pload_file_list(): 66 | """Test that pload works on iterable of filenames""" 67 | files = [ 68 | Path(__file__).parent / "data" / "test_data1.npy", 69 | Path(__file__).parent / "data" / "test_data2.npy", 70 | Path(__file__).parent / "data" / "test_data3.npy", 71 | ] 72 | stream = pload(files, load_func=np.load) 73 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 74 | assert np.allclose(s, np.zeros_like(s)) 75 | 76 | files = [ 77 | Path(__file__).parent / "data" / "test_data1.npy", 78 | Path(__file__).parent / "data" / "test_data2.npy", 79 | Path(__file__).parent / "data" / "test_data3.npy", 80 | ] 81 | stream = pload(files, load_func=np.load, processes=2) 82 | s = last(isum(stream)).astype(float) # Cast to float for np.allclose 83 | assert np.allclose(s, 
np.zeros_like(s)) 84 | -------------------------------------------------------------------------------- /npstreams/tests/test_cuda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import repeat 4 | import numpy as np 5 | import pytest 6 | 7 | try: 8 | from npstreams.cuda import csum, cprod, caverage, cmean 9 | 10 | WITH_CUDA = True 11 | except ImportError: 12 | WITH_CUDA = False 13 | 14 | 15 | skip_if_no_cuda = pytest.mark.skipif( 16 | not WITH_CUDA, reason="PyCUDA is not installed/available" 17 | ) 18 | 19 | 20 | @skip_if_no_cuda 21 | def test_csum_zero_sum(): 22 | stream = repeat(np.zeros((16, 16), dtype=float), times=5) 23 | s = csum(stream) 24 | assert np.allclose(s, np.zeros((16, 16))) 25 | 26 | 27 | @skip_if_no_cuda 28 | def test_csum_dtype(): 29 | stream = repeat(np.zeros((16, 16), dtype=float), times=5) 30 | s = csum(stream, dtype=np.int16) 31 | assert np.allclose(s, np.zeros((16, 16))) 32 | assert s.dtype == np.int16 33 | 34 | 35 | @skip_if_no_cuda 36 | def test_csum_ignore_nans(): 37 | """Test a sum of zeros with NaNs sprinkled""" 38 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 39 | source.append(np.full((16,), fill_value=np.nan)) 40 | summed = csum(source, ignore_nan=True) 41 | assert np.allclose(summed, np.zeros_like(summed)) 42 | 43 | 44 | @skip_if_no_cuda 45 | def test_cprod_ones_prod(): 46 | stream = repeat(np.ones((16, 16), dtype=float), times=5) 47 | s = cprod(stream) 48 | assert np.allclose(s, np.ones((16, 16))) 49 | 50 | 51 | @skip_if_no_cuda 52 | def test_cprod_ignore_nans(): 53 | """Test that NaNs are ignored.""" 54 | source = [np.ones((16,), dtype=float) for _ in range(10)] 55 | source.append(np.full_like(source[0], np.nan)) 56 | product = cprod(source, ignore_nan=True) 57 | assert np.allclose(product, np.ones_like(product)) 58 | 59 | 60 | @skip_if_no_cuda 61 | def test_cprod_dtype(): 62 | """Test that dtype argument is working""" 63 | source = [np.ones((16,), dtype=float) for _ in range(10)] 64 | product = cprod(source, dtype=int) 65 | assert np.allclose(product, np.ones_like(product)) 66 | assert product.dtype == int 67 | 68 | 69 | @skip_if_no_cuda 70 | def test_cavg_no_weights(): 71 | stream = [np.random.random(size=(16, 16)) for _ in range(5)] 72 | from_caverage = caverage(stream) 73 | from_numpy = np.average(np.dstack(stream), axis=2) 74 | assert np.allclose(from_caverage, from_numpy) 75 | 76 | 77 | @skip_if_no_cuda 78 | def test_cavg_weighted_average(): 79 | """Test results of weighted average against numpy.average""" 80 | stream = [np.random.random(size=(16, 16)) for _ in range(5)] 81 | 82 | weights = [np.random.random(size=stream[0].shape) for _ in stream] 83 | from_caverage = caverage(stream, weights=weights) 84 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights)) 85 | assert np.allclose(from_caverage, from_numpy) 86 | 87 | 88 | @skip_if_no_cuda 89 | def test_cmean_of_ones(): 90 | stream = repeat(np.ones((16, 16), dtype=float), times=5) 91 | s = cmean(stream) 92 | assert np.allclose(s, np.ones((16, 16))) 93 | 94 | 95 | @skip_if_no_cuda 96 | def test_cmean_random(): 97 | """Test cmean against numpy.mean on random data""" 98 | stream = [np.random.random(size=(16, 16)) for _ in range(5)] 99 | from_cmean = cmean(stream) 100 | from_numpy = np.mean(np.dstack(stream), axis=2) 101 | assert np.allclose(from_cmean, from_numpy) 102 | -------------------------------------------------------------------------------- 
/npstreams/tests/test_iter_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import repeat 4 | from npstreams import last, chunked, linspace, multilinspace, cyclic, length_hint 5 | import pytest 6 | 7 | 8 | def test_last_trivial(): 9 | """Test last() on iterable of identical values""" 10 | i = repeat(1, 10) 11 | assert last(i) == 1 12 | 13 | 14 | def test_last_on_empty_iterable(): 15 | """Test that last() raises RuntimeError for empty iterable""" 16 | with pytest.raises(RuntimeError): 17 | last(list()) 18 | 19 | 20 | def test_cyclic_numbers(): 21 | """Test that cyclic() yields all cyclic permutations of a tuple""" 22 | permutations = set(cyclic((1, 2, 3))) 23 | assert (1, 2, 3) in permutations 24 | assert (2, 3, 1) in permutations 25 | assert (3, 1, 2) in permutations 26 | assert len(permutations) == 3 27 | 28 | 29 | def test_linspace_endpoint(): 30 | """Test that the endpoint is included by linspace() when appropriate""" 31 | space = linspace(0, 1, num=10, endpoint=True) 32 | assert last(space) == 1 33 | 34 | space = linspace(0, 1, num=10, endpoint=False) 35 | assert round(abs(last(space) - 0.9), 7) == 0 36 | 37 | 38 | def test_linspace_length(): 39 | """Test that linspace() returns an iterable of the correct length""" 40 | space = list(linspace(0, 1, num=13, endpoint=True)) 41 | assert len(space) == 13 42 | 43 | space = list(linspace(0, 1, num=13, endpoint=False)) 44 | assert len(space) == 13 45 | 46 | 47 | def test_multilinspace_endpoint(): 48 | """Test that the endpoint is included by multilinspace() when appropriate""" 49 | space = multilinspace((0, 0), (1, 1), num=10, endpoint=True) 50 | assert last(space) == (1, 1) 51 | 52 | space = multilinspace((0, 0), (1, 1), num=10, endpoint=False) 53 | # Unfortunately there is no assertSequenceAlmostEqual 54 | assert last(space) == (0.8999999999999999, 0.8999999999999999) 55 | 56 | 57 | def test_multilinspace_length(): 58 | """Test that multilinspace() returns an iterable of the correct length""" 59 | space = list(multilinspace((0, 0), (1, 1), num=13, endpoint=True)) 60 | assert len(space) == 13 61 | 62 | space = list(multilinspace((0, 0), (1, 1), num=13, endpoint=False)) 63 | assert len(space) == 13 64 | 65 | 66 | def test_chunked_larger_chunksize(): 67 | """Test chunked() with a chunksize larger than the iterable itself""" 68 | i = repeat(1, 10) 69 | chunks = chunked(i, chunksize=15) 70 | assert len(list(chunks)) == 1 # One single chunk is returned 71 | 72 | 73 | def test_chunked_on_infinite_generator(): 74 | """Test chunked() on an infinite iterable""" 75 | i = repeat(1) 76 | chunks = chunked(i, chunksize=15) 77 | for _ in range(10): 78 | assert len(next(chunks)) == 15 79 | 80 | 81 | def test_chunked_chunked_nonint_chunksize(): 82 | """Test that chunked raises a TypeError immediately if `chunksize` is not an integer""" 83 | with pytest.raises(TypeError): 84 | i = repeat(1) 85 | chunks = chunked(i, chunksize=15.0) 86 | 87 | 88 | def test_length_hint_on_sized(): 89 | """Test length_hint on a sized iterable""" 90 | l = [1, 2, 3, 4, 5] 91 | assert length_hint(l) == len(l) 92 | 93 | 94 | def test_length_hint_on_unsized(): 95 | """Test length_hint on an unsized iterable returns the default""" 96 | l = (0 for _ in range(10)) 97 | assert length_hint(l, default=0) == 0 98 | 99 | 100 | def test_length_hint_on_method_if_implemented(): 101 | """Test length_hint returns the same as __length_hint__ if implemented""" 102 | 103 | class WithHint: 104 | """Some dummy class with a length hint""" 105 | 106 | def __length_hint__(self): 
return 1 108 | 109 | assert length_hint(WithHint(), default=0) == 1 110 | -------------------------------------------------------------------------------- /npstreams/flow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Flow controls 4 | ------------- 5 | """ 6 | from functools import partial 7 | from glob import iglob 8 | from pathlib import Path 9 | 10 | from .array_stream import ArrayStream 11 | from .parallel import pmap, pmap_unordered 12 | 13 | 14 | def iload(files, load_func, **kwargs): 15 | """ 16 | Create a stream of arrays from files, which are loaded lazily. 17 | 18 | In cases where the consumer function is much faster than data loading, 19 | consider using :func:`pload` instead. 20 | 21 | Parameters 22 | ---------- 23 | files : iterable of str or str 24 | Either an iterable of filenames or a glob-like pattern str. 25 | load_func : callable 26 | Function taking a filename as its first argument. 27 | kwargs 28 | Keyword arguments are passed to ``load_func``. 29 | 30 | Yields 31 | ------ 32 | arr: `~numpy.ndarray` 33 | Loaded data. 34 | 35 | See Also 36 | -------- 37 | pload : load files from parallel processes. 38 | 39 | Examples 40 | -------- 41 | To load images using scikit-image :: 42 | 43 | from skimage.io import imread 44 | ims = iload('images_*.tif', imread) 45 | 46 | Keyword arguments are passed to the ``load_func``; for example, 47 | to specify the scikit-image plugin ``'tifffile'``:: 48 | 49 | ims = iload('images_*.tif', imread, plugin = 'tifffile') 50 | 51 | In case the list of images is already known:: 52 | 53 | ims = iload(['im1.tif', 'im2.tif', 'im3.tif'], imread) 54 | """ 55 | # TODO: better handling of Paths 56 | if isinstance(files, Path): 57 | files = str(files) 58 | 59 | if isinstance(files, str): 60 | files = iglob(files) 61 | files = iter(files) 62 | 63 | yield from map(partial(load_func, **kwargs), files) 64 | 65 | 66 | def pload(files, load_func, processes=1, **kwargs): 67 | """ 68 | Create a stream of arrays from files, which are loaded lazily 69 | from multiple processes. 70 | 71 | This function should be preferred to :func:`iload` in cases where 72 | the consumer function is much faster than the data can be loaded. 73 | 74 | Parameters 75 | ---------- 76 | files : iterable of str or str 77 | Either an iterable of filenames or a glob-like pattern str. 78 | load_func : callable 79 | Function taking a filename as its first argument. 80 | processes : int or None, optional 81 | Number of processes to use. If `None`, maximal number of processes 82 | is used. Default is one. 83 | kwargs 84 | Keyword arguments are passed to ``load_func``. 85 | 86 | Yields 87 | ------ 88 | arr: `~numpy.ndarray` 89 | Loaded data. 90 | 91 | See Also 92 | -------- 93 | iload : load files lazily 94 | """ 95 | if processes == 1: 96 | yield from iload(files, load_func, **kwargs) 97 | return 98 | 99 | # TODO: better handling of Paths 100 | if isinstance(files, Path): 101 | files = str(files) 102 | 103 | if isinstance(files, str): 104 | files = iglob(files) 105 | files = iter(files) 106 | 107 | yield from pmap_unordered(partial(load_func, **kwargs), files, processes=processes) 108 | 109 | 110 | # pmap does not support local functions 111 | def _pipe(funcs, array): 112 | for func in funcs: 113 | array = func(array) 114 | return array 115 | 116 | 117 | def ipipe(*args, **kwargs): 118 | """ 119 | Pipe arrays through a sequence of functions.
For example: 120 | 121 | ``pipe(f, g, h, stream)`` is equivalent to :: 122 | 123 | for arr in stream: 124 | yield f(g(h(arr))) 125 | 126 | Parameters 127 | ---------- 128 | *funcs : callable 129 | Callable that support Numpy arrays in their first argument. These 130 | should *NOT* be generator functions. 131 | arrays : iterable 132 | Stream of arrays to be passed. 133 | processes : int or None, optional, keyword-only 134 | Number of processes to use. If `None`, maximal number of processes 135 | is used. Default is one. 136 | ntotal : int or None, optional, keyword-only 137 | If the length of `arrays` is known, but passing `arrays` as a list 138 | would take too much memory, the total number of arrays `ntotal` can be specified. This 139 | allows for `pmap` to chunk better in case of ``processes > 1``. 140 | 141 | Yields 142 | ------ 143 | piped : ndarray 144 | """ 145 | arrays = ArrayStream(args[-1]) 146 | functions = tuple(reversed(args[:-1])) 147 | yield from pmap(partial(_pipe, functions), arrays, **kwargs) 148 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This file is execfile()d with the current directory set to its 5 | # containing dir. 6 | # 7 | # Note that not all possible configuration values are present in this 8 | # autogenerated file. 9 | # 10 | # All configuration values have a default; values that are commented out 11 | # serve to show the default. 12 | 13 | # If extensions (or modules to document with autodoc) are in another directory, 14 | # add these directories to sys.path here. If the directory is relative to the 15 | # documentation root, use os.path.abspath to make it absolute, like shown here. 16 | # 17 | import os 18 | import sys 19 | 20 | currentpath = os.path.dirname(__file__) 21 | sys.path.append(os.path.join(currentpath, "..")) 22 | 23 | import npstreams 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.5' 30 | from datetime import datetime 31 | import alabaster 32 | 33 | year = datetime.now().year 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = [ 39 | "alabaster", 40 | "sphinx.ext.todo", 41 | "sphinx.ext.intersphinx", 42 | "sphinx.ext.autosummary", 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.napoleon", 45 | "sphinx.ext.mathjax", 46 | "sphinx.ext.doctest", 47 | ] 48 | 49 | intersphinx_mapping = {"numpy": ("http://docs.scipy.org/doc/numpy/", None)} 50 | 51 | napoleon_google_docstring = False 52 | autosummary_generate = True 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # 57 | # source_suffix = ['.rst', '.md'] 58 | source_suffix = ".rst" 59 | 60 | # The master toctree document. 61 | master_doc = "index" 62 | 63 | # Releases changelog extension 64 | releases_release_uri = "https://github.com/LaurentRDC/npstreams/tree/%s" 65 | releases_issue_uri = "https://github.com/LaurentRDC/npstreams/issues/%s" 66 | 67 | # General information about the project. 68 | project = "npstreams" 69 | copyright = "%d Laurent P. René de Cotret" % year 70 | author = "Laurent P. 
René de Cotret" 71 | 72 | # The version info for the project you're documenting, acts as replacement for 73 | # |version| and |release|, also used in various other places throughout the 74 | # built documents. 75 | # 76 | # The short X.Y version. 77 | version = npstreams.__version__ 78 | # The full version, including alpha/beta/rc tags. 79 | release = version 80 | 81 | # The language for content autogenerated by Sphinx. Refer to documentation 82 | # for a list of supported languages. 83 | # 84 | # This is also used if you do content translation via gettext catalogs. 85 | # Usually you set "language" from the command line for these cases. 86 | language = None 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 90 | # This patterns also effect to html_static_path and html_extra_path 91 | exclude_patterns = [] 92 | exclude_trees = ["_build"] 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = "sphinx" 96 | 97 | # If true, `todo` and `todoList` produce output, else they produce nothing. 98 | todo_include_todos = True 99 | 100 | 101 | # -- Options for HTML output ---------------------------------------------- 102 | 103 | # The theme to use for HTML and HTML Help pages. See the documentation for 104 | # a list of builtin themes. 105 | # 106 | html_theme = "sphinx_rtd_theme" 107 | html_theme_path = ["_themes"] 108 | html_sidebars = { 109 | "**": [ 110 | "about.html", 111 | "navigation.html", 112 | "searchbox.html", 113 | "localtoc.html", 114 | "sourcelink.html", 115 | ] 116 | } 117 | # html_show_sourcelink = True 118 | 119 | # Everything intersphinx's to Python. 120 | intersphinx_mapping = {"python": ("https://docs.python.org", None)} 121 | 122 | # Autodoc settings 123 | autodoc_default_flags = ["members", "special-members"] 124 | autoclass_content = "both" 125 | 126 | 127 | def autodoc_skip_member(app, what, name, obj, skip, options): 128 | exclusions = {"__weakref__", "__doc__", "__module__", "__dict__"} 129 | exclude = name in exclusions 130 | return skip or exclude 131 | 132 | 133 | def setup(app): 134 | app.connect("autodoc-skip-member", autodoc_skip_member) 135 | 136 | 137 | doctest_global_setup = """ 138 | import npstreams as ns 139 | """ 140 | 141 | 142 | # Add any paths that contain custom static files (such as style sheets) here, 143 | # relative to this directory. They are copied after the builtin static files, 144 | # so a file named "default.css" will overwrite the builtin "default.css". 145 | html_static_path = [] 146 | 147 | # Suppress the warning about a non-local URI for status shields. 148 | suppress_warnings = ["image.nonlocal_uri"] 149 | 150 | # Enable releases 'unstable prehistory' mode. 
151 | releases_unstable_prehistory = True 152 | -------------------------------------------------------------------------------- /npstreams/linalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Linear Algebra Functions 4 | ------------------------ 5 | """ 6 | from functools import partial 7 | 8 | import numpy as np 9 | 10 | from .array_stream import array_stream 11 | 12 | 13 | @array_stream 14 | def _ireduce_linalg(arrays, func, **kwargs): 15 | """ 16 | Yield the cumulative reduction of a linear algebra function 17 | """ 18 | arrays = iter(arrays) 19 | first = next(arrays) 20 | second = next(arrays) 21 | 22 | func = partial(func, **kwargs) 23 | 24 | accumulator = func(first, second) 25 | yield accumulator 26 | 27 | for array in arrays: 28 | func(accumulator, array, out=accumulator) 29 | yield accumulator 30 | 31 | 32 | def idot(arrays): 33 | """ 34 | Yields the cumulative array inner product (dot product) of arrays. 35 | 36 | Parameters 37 | ---------- 38 | arrays : iterable 39 | Arrays to be reduced. 40 | 41 | Yields 42 | ------ 43 | online_dot : ndarray 44 | 45 | See Also 46 | -------- 47 | numpy.linalg.multi_dot : Compute the dot product of two or more arrays in a single function call, 48 | while automatically selecting the fastest evaluation order. 49 | """ 50 | yield from _ireduce_linalg(arrays=arrays, func=np.dot) 51 | 52 | 53 | def itensordot(arrays, axes=2): 54 | """ 55 | Yields the cumulative tensor contraction (tensor dot product) of arrays. 56 | 57 | Parameters 58 | ---------- 59 | arrays : iterable 60 | Arrays to be reduced. 61 | axes : int or (2,) array_like 62 | * integer_like: If an int N, sum over the last N axes of a 63 | and the first N axes of b in order. The sizes of the corresponding axes must match. 64 | * (2,) array_like: Or, a list of axes to be summed over, first sequence applying to a, 65 | second to b. Both elements array_like must be of the same length. 66 | 67 | Yields 68 | ------ 69 | online_tensordot : ndarray 70 | 71 | See Also 72 | -------- 73 | numpy.tensordot : Compute the tensordot on two tensors. 74 | """ 75 | yield from _ireduce_linalg(arrays=arrays, func=np.tensordot, axes=axes) 76 | 77 | 78 | def iinner(arrays): 79 | """ 80 | Cumulative inner product of all arrays in a stream. 81 | 82 | Parameters 83 | ---------- 84 | arrays : iterable 85 | Arrays to be reduced. 86 | 87 | Yields 88 | ------ 89 | online_inner : ndarray or scalar 90 | """ 91 | yield from _ireduce_linalg(arrays=arrays, func=np.inner) 92 | 93 | 94 | def ieinsum(arrays, subscripts, **kwargs): 95 | """ 96 | Evaluates the Einstein summation convention on the operands. 97 | 98 | Using the Einstein summation convention, many common multi-dimensional 99 | array operations can be represented in a simple fashion. 100 | 101 | Parameters 102 | ---------- 103 | arrays : iterable 104 | Arrays to be reduced. 105 | subscripts : str 106 | Specifies the subscripts for summation. 107 | dtype : numpy.dtype or None, optional 108 | The type of the yielded array and of the accumulator in which the elements 109 | are combined. The dtype of a is used by default unless a has an integer dtype 110 | of less precision than the default platform integer. In that case, if a is 111 | signed then the platform integer is used while if a is unsigned then an 112 | unsigned integer of the same precision as the platform integer is used. 113 | order : {'C', 'F', 'A', 'K'}, optional 114 | Controls the memory layout of the output.
'C' means it should 115 | be C contiguous. 'F' means it should be Fortran contiguous, 116 | 'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise. 117 | 'K' means it should be as close to the layout of the inputs as 118 | is possible, including arbitrarily permuted axes. 119 | Default is 'K'. 120 | casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional 121 | Controls what kind of data casting may occur. Setting this to 122 | 'unsafe' is not recommended, as it can adversely affect accumulations. 123 | 124 | * 'no' means the data types should not be cast at all. 125 | * 'equiv' means only byte-order changes are allowed. 126 | * 'safe' means only casts which can preserve values are allowed. 127 | * 'same_kind' means only safe casts or casts within a kind, 128 | like float64 to float32, are allowed. 129 | * 'unsafe' means any data conversions may be done. 130 | 131 | Default is 'safe'. 132 | optimize : {False, True, 'greedy', 'optimal'}, optional 133 | Controls if intermediate optimization should occur. No optimization 134 | will occur if False, and True will default to the 'greedy' algorithm. 135 | Also accepts an explicit contraction list from the ``np.einsum_path`` 136 | function. See ``np.einsum_path`` for more details. Default is False. 137 | 138 | Yields 139 | ------ 140 | online_einsum : ndarray 141 | Cumulative Einstein summation 142 | """ 143 | yield from _ireduce_linalg( 144 | arrays=arrays, func=partial(np.einsum, subscripts), **kwargs 145 | ) 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # npstreams 2 | 3 | [![Documentation Build Status](https://readthedocs.org/projects/npstreams/badge/?version=master)](http://npstreams.readthedocs.io) [![PyPI Version](https://img.shields.io/pypi/v/npstreams.svg)](https://pypi.python.org/pypi/npstreams) [![Conda-forge Version](https://img.shields.io/conda/vn/conda-forge/npstreams.svg)](https://anaconda.org/conda-forge/npstreams) [![DOI badge](https://img.shields.io/badge/DOI-10.1186%2Fs40679--018--0060--y-blue)](https://doi.org/10.1186/s40679-018-0060-y) 4 | 5 | npstreams is an open-source Python package for streaming NumPy array 6 | operations. The goal is to provide tested routines that operate on 7 | streams (or generators) of arrays instead of dense arrays. 8 | 9 | Streaming reduction operations (sums, averages, etc.) can be implemented 10 | in constant memory, which in turn allows for easy parallelization. 11 | 12 | This approach has been a huge boon when working with lots of images; the 13 | images are read one-by-one from disk and combined/processed in a 14 | streaming fashion. 15 | 16 | This package is developed in conjunction with other software projects in 17 | the [Siwick research group](http://www.physics.mcgill.ca/siwicklab/). 18 | 19 | ## Motivating Example 20 | 21 | Consider the following snippet to combine 50 images from an iterable 22 | `source`: 23 | 24 | ```python 25 | import numpy as np 26 | 27 | images = np.empty( shape = (2048, 2048, 50) ) 28 | for index, im in enumerate(source): 29 | images[:,:,index] = im 30 | 31 | avg = np.average(images, axis = 2) 32 | ``` 33 | 34 | If the `source` iterable provided 1000 images, the above routine would 35 | not work on most machines. Moreover, what if we want to transform the 36 | images one by one before averaging them? What about looking at the 37 | average while it is being computed?
Let's look at an example: 38 | 39 | ```python 40 | import numpy as np 41 | from npstreams import iaverage 42 | from scipy.misc import imread 43 | 44 | stream = map(imread, list_of_filenames) 45 | averaged = iaverage(stream) 46 | ``` 47 | 48 | At this point, the generators `map` and `iaverage` are 'wired' but 49 | will not compute anything until it is requested. We can watch the 50 | average evolve: 51 | 52 | ```python 53 | import matplotlib.pyplot as plt 54 | for avg in averaged: 55 | plt.imshow(avg); plt.show() 56 | ``` 57 | 58 | We can also use `last` to get at the final average: 59 | 60 | ```python 61 | from npstreams import last 62 | 63 | total = last(averaged) # average of the entire stream 64 | ``` 65 | 66 | ## Streaming Functions 67 | 68 | npstreams comes with some streaming functions built-in. Some examples: 69 | 70 | - Numerics : `isum`, `iprod`, `isub`, etc. 71 | - Statistics : `iaverage` (weighted mean), `ivar` (single-pass 72 | variance), etc. 73 | 74 | More importantly, npstreams gives you all the tools required to build 75 | your own streaming function. All routines are documented in the [API 76 | Reference on readthedocs.io](http://npstreams.readthedocs.io). 77 | 78 | ## Benchmarking 79 | 80 | npstreams provides a function for benchmarking common use cases. 81 | 82 | To run the benchmark with default parameters, from the interpreter: 83 | 84 | ```python 85 | from npstreams import benchmark 86 | benchmark() 87 | ``` 88 | 89 | From a command-line terminal: 90 | 91 | ```bash 92 | python -c 'import npstreams; npstreams.benchmark()' 93 | ``` 94 | 95 | The results will be printed to the screen. 96 | 97 | ## Future Work 98 | 99 | Some of the features I want to implement in this package in the near 100 | future: 101 | 102 | - Optimize the CUDA-enabled routines 103 | - More functions : more streaming functions borrowed from NumPy and 104 | SciPy. 105 | 106 | ## API Reference 107 | 108 | The [API Reference on readthedocs.io](http://npstreams.readthedocs.io) 109 | provides API-level documentation, as well as tutorials. 110 | 111 | ## Installation 112 | 113 | The only requirement is NumPy. To have access to CUDA-enabled routines, 114 | PyCUDA must also be installed. npstreams is available on PyPI; it can be 115 | installed with [pip](https://pip.pypa.io): 116 | 117 | ```bash 118 | python -m pip install npstreams 119 | ``` 120 | 121 | npstreams can also be installed with the conda package manager, from the 122 | conda-forge channel: 123 | 124 | ```bash 125 | conda config --add channels conda-forge 126 | conda install npstreams 127 | ``` 128 | 129 | To install the latest development version from 130 | [Github](https://github.com/LaurentRDC/npstreams): 131 | 132 | ```bash 133 | python -m pip install git+git://github.com/LaurentRDC/npstreams.git 134 | ``` 135 | 136 | Tests can be run using the `pytest` package. 137 | 138 | ## Citations 139 | 140 | If you find this software useful, please consider citing the following 141 | publication: 142 | 143 | > L. P. René de Cotret, M. R. Otto, M. J. Stern, and B. J. Siwick, *An open-source software ecosystem for the interactive exploration of ultrafast electron scattering data*, Advanced Structural and Chemical Imaging 4:11 (2018) [DOI: 10.1186/s40679-018-0060-y.](https://ascimaging.springeropen.com/articles/10.1186/s40679-018-0060-y) 144 | 145 | 146 | ## Support / Report Issues 147 | 148 | All support requests and issue reports should be [filed on Github as an 149 | issue](https://github.com/LaurentRDC/npstreams/issues).
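The built-in streaming functions compose with ordinary Python iterator tools; as a small sketch complementing the `iaverage` example above (the file names are hypothetical), transforming images lazily before reducing them in constant memory:

```python
# Hypothetical sketch: square each array before summing, in constant memory.
import numpy as np
from npstreams import isum, last

filenames = ["im1.npy", "im2.npy", "im3.npy"]  # placeholder file names
stream = map(np.load, filenames)               # arrays are loaded lazily
squared = map(np.square, stream)               # transformed one at a time
total = last(isum(squared, ignore_nan=True))   # final cumulative sum
```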
150 | 151 | ## License 152 | 153 | npstreams is made available under the BSD License, same as NumPy. For 154 | more details, see 155 | [LICENSE.txt](https://github.com/LaurentRDC/npstreams/blob/master/LICENSE.txt). 156 | -------------------------------------------------------------------------------- /docs/making_your_own.rst: -------------------------------------------------------------------------------- 1 | .. include:: references.txt 2 | 3 | .. _making_your_own: 4 | 5 | ******************************************** 6 | Making your own Streaming Reduction Function 7 | ******************************************** 8 | 9 | .. currentmodule:: npstreams 10 | 11 | ============================================ 12 | The :func:`ireduce_ufunc` generator function 13 | ============================================ 14 | 15 | You can assemble your own streaming reduction function from a **binary** NumPy ufunc 16 | using the following generator function: 17 | 18 | .. autofunction:: ireduce_ufunc 19 | 20 | The non-generator version is also available: 21 | 22 | .. autofunction:: reduce_ufunc 23 | 24 | Note that while all NumPy ufuncs have a :meth:`reduce` method, not all of them are useful. 25 | This is why :func:`ireduce_ufunc` and :func:`reduce_ufunc` will only work with **binary** ufuncs, 26 | most of which are listed below. For performance reasons, we further restrict the use of 27 | :func:`ireduce_ufunc` and :func:`reduce_ufunc` to ufuncs that have the same input types 28 | as output types. Therefore, for example, :func:`numpy.greater` cannot be made to work with 29 | :func:`ireduce_ufunc` and :func:`reduce_ufunc`. 30 | 31 | NaNs handling 32 | ------------- 33 | 34 | NumPy ufuncs can have an identity value, that is, a value such that ``ufunc(x1, identity)`` is always ``x1``. For such ufuncs, 35 | :func:`ireduce_ufunc` and :func:`reduce_ufunc` can replace NaNs in the stream with the ufunc's identity value, if ``ignore_nan = True``. 36 | Note that not all ufuncs have an identity value; for example, how would you define the identity value of ``numpy.maximum``? There is no answer. 37 | 38 | .. _numpy_binary_ufuncs: 39 | 40 | =================== 41 | NumPy Binary Ufuncs 42 | =================== 43 | 44 | :func:`ireduce_ufunc` is tested to work on the following binary ufuncs, which are available in `NumPy`_. 45 | 46 | 47 | Arithmetics 48 | ----------- 49 | 50 | .. autosummary:: 51 | :nosignatures: 52 | 53 | numpy.add 54 | numpy.subtract 55 | numpy.multiply 56 | numpy.divide 57 | numpy.logaddexp 58 | numpy.logaddexp2 59 | numpy.true_divide 60 | numpy.floor_divide 61 | numpy.power 62 | numpy.remainder 63 | numpy.mod 64 | numpy.fmod 65 | 66 | Trigonometric functions 67 | ----------------------- 68 | 69 | .. autosummary:: 70 | :nosignatures: 71 | 72 | numpy.arctan2 73 | numpy.hypot 74 | 75 | Bit-twiddling functions 76 | ----------------------- 77 | 78 | .. autosummary:: 79 | :nosignatures: 80 | 81 | numpy.bitwise_and 82 | numpy.bitwise_or 83 | numpy.bitwise_xor 84 | numpy.left_shift 85 | numpy.right_shift 86 | 87 | Comparison functions 88 | -------------------- 89 | 90 | .. autosummary:: 91 | :nosignatures: 92 | 93 | numpy.maximum 94 | numpy.fmax 95 | numpy.minimum 96 | numpy.fmin 97 | 98 | Floating functions 99 | ------------------ 100 | 101 | .. 
autosummary:: 102 | :nosignatures: 103 | 104 | numpy.copysign 105 | numpy.nextafter 106 | numpy.ldexp 107 | 108 | ========================== 109 | Example: Streaming Maximum 110 | ========================== 111 | 112 | Let's create a streaming maximum function for a stream. First, we have to choose 113 | how to handle NaNs; since ``numpy.maximum`` does not have an identity value, we must find 114 | another way. We can proceed as follows: 115 | 116 | * If we want to propagate NaNs, we should use :func:`numpy.maximum` 117 | * If we want to ignore NaNs, we should use :func:`numpy.fmax` 118 | 119 | Both of those functions are binary ufuncs, so we can use :func:`ireduce_ufunc`. Note that any function based 120 | on :func:`ireduce_ufunc` or :func:`reduce_ufunc` will automatically work on streams of numbers thanks to the 121 | :func:`array_stream` decorator. 122 | 123 | Putting it all together:: 124 | 125 | from npstreams import ireduce_ufunc 126 | from numpy import maximum, fmax 127 | 128 | def imax(arrays, axis = -1, ignore_nan = False, **kwargs): 129 | """ 130 | Streaming cumulative maximum along an axis. 131 | 132 | Parameters 133 | ---------- 134 | arrays : iterable 135 | Stream of arrays to be compared. 136 | axis : int or None, optional 137 | Axis along which to compute the maximum. If None, 138 | arrays are flattened before reduction. 139 | ignore_nan : bool, optional 140 | If True, NaNs are ignored. Default is False. 141 | 142 | Yields 143 | ------ 144 | online_max : ndarray 145 | """ 146 | ufunc = fmax if ignore_nan else maximum 147 | yield from ireduce_ufunc(arrays, ufunc, axis = axis, **kwargs) 148 | 149 | This will provide us with a streaming function, meaning that we can look at the progress 150 | as it is being computed. We can also create a function that returns the max of the stream 151 | like :meth:`numpy.ndarray.max()` using the :func:`reduce_ufunc` function:: 152 | 153 | from npstreams import reduce_ufunc 154 | 155 | def smax(*args, **kwargs): # s for stream 156 | """ 157 | Maximum of a stream along an axis. 158 | 159 | Parameters 160 | ---------- 161 | arrays : iterable 162 | Stream of arrays to be compared. 163 | axis : int or None, optional 164 | Axis along which to compute the maximum. If None, 165 | arrays are flattened before reduction. 166 | ignore_nan : bool, optional 167 | If True, NaNs are ignored. Default is False. 168 | 169 | Yields 170 | ------ 171 | max : ndarray 172 | """ 173 | return reduce_ufunc(*args, **kwargs) -------------------------------------------------------------------------------- /npstreams/parallel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Parallelization utilities 4 | ------------------------- 5 | """ 6 | from collections.abc import Sized 7 | from functools import partial, reduce 8 | from multiprocessing import Pool 9 | 10 | from .iter_utils import chunked 11 | 12 | 13 | def preduce(func, iterable, args=None, kwargs=None, processes=1, ntotal=None): 14 | """ 15 | Parallel application of the reduce function, with keyword arguments. 16 | 17 | Parameters 18 | ---------- 19 | func : callable 20 | Function to be applied to every element of `iterable`. 21 | iterable : iterable 22 | Iterable of items to be reduced. Generators are consumed. 23 | args : tuple or None, optional 24 | Positional arguments of `function`. 25 | kwargs : dictionary or None, optional 26 | Keyword arguments of `function`. 27 | processes : int or None, optional 28 | Number of processes to use. 
If `None`, maximal number of processes 29 | is used. Default is one. 30 | ntotal : int or None, optional 31 | If the length of `iterable` is known, but passing `iterable` as a list 32 | would take too much memory, the total length `ntotal` can be specified. This 33 | allows for `preduce` to chunk better. 34 | 35 | Returns 36 | ------- 37 | reduced : object 38 | 39 | Notes 40 | ----- 41 | If `processes` is 1, `preduce` is equivalent to functools.reduce with the 42 | added benefit of using `args` and `kwargs`, but `initializer` is not supported. 43 | """ 44 | if kwargs is None: 45 | kwargs = dict() 46 | 47 | if args is None: 48 | args = tuple() 49 | 50 | func = partial(func, *args, **kwargs) 51 | 52 | if processes == 1: 53 | return reduce(func, iterable) 54 | 55 | with Pool(processes) as pool: 56 | chunksize = 1 57 | if isinstance(iterable, Sized): 58 | chunksize = max(1, int(len(iterable) / pool._processes)) 59 | elif ntotal is not None: 60 | chunksize = max(1, int(ntotal / pool._processes)) 61 | 62 | # Some reductions are order-sensitive 63 | res = pool.imap(partial(reduce, func), tuple(chunked(iterable, chunksize))) 64 | return reduce(func, res) 65 | 66 | 67 | def pmap(func, iterable, args=None, kwargs=None, processes=1, ntotal=None): 68 | """ 69 | Parallel application of a function with keyword arguments. 70 | 71 | Parameters 72 | ---------- 73 | func : callable 74 | Function to be applied to every element of `iterable`. 75 | iterable : iterable 76 | Iterable of items to be mapped. 77 | args : tuple or None, optional 78 | Positional arguments of `function`. 79 | kwargs : dictionary or None, optional 80 | Keyword arguments of `function`. 81 | processes : int or None, optional 82 | Number of processes to use. If `None`, maximal number of processes 83 | is used. Default is one. 84 | ntotal : int or None, optional 85 | If the length of `iterable` is known, but passing `iterable` as a list 86 | would take too much memory, the total length `ntotal` can be specified. This 87 | allows for `pmap` to chunk better. 88 | 89 | Yields 90 | ------ 91 | Mapped values. 92 | 93 | See Also 94 | -------- 95 | pmap_unordered : parallel map that does not preserve order 96 | 97 | Notes 98 | ----- 99 | If `processes` is 1, `pmap` reduces to `map`, with the added benefit 100 | of using `kwargs`. 101 | """ 102 | if kwargs is None: 103 | kwargs = dict() 104 | 105 | if args is None: 106 | args = tuple() 107 | 108 | func = partial(func, *args, **kwargs) 109 | 110 | if processes == 1: 111 | yield from map(func, iterable) 112 | return 113 | 114 | with Pool(processes) as pool: 115 | chunksize = 1 116 | if isinstance(iterable, Sized): 117 | chunksize = max(1, int(len(iterable) / pool._processes)) 118 | elif ntotal is not None: 119 | chunksize = max(1, int(ntotal / pool._processes)) 120 | 121 | yield from pool.imap(func=func, iterable=iterable, chunksize=chunksize) 122 | 123 | 124 | def pmap_unordered(func, iterable, args=None, kwargs=None, processes=1, ntotal=None): 125 | """ 126 | Parallel application of a function with keyword arguments in no particular order. 127 | This can reduce memory usage because results do not need to be buffered to preserve their order. 128 | 129 | Parameters 130 | ---------- 131 | func : callable 132 | Function to be applied to every element of `iterable`. 133 | iterable : iterable 134 | Iterable of items to be mapped. 135 | args : tuple or None, optional 136 | Positional arguments of `function`. 137 | kwargs : dictionary or None, optional 138 | Keyword arguments of `function`.
139 | processes : int or None, optional 140 | Number of processes to use. If `None`, maximal number of processes 141 | is used. Default is one. 142 | ntotal : int or None, optional 143 | If the length of `iterable` is known, but passing `iterable` as a list 144 | would take too much memory, the total length `ntotal` can be specified. This 145 | allows for `pmap` to chunk better. 146 | 147 | Yields 148 | ------ 149 | Mapped values. 150 | 151 | See Also 152 | -------- 153 | pmap : parallel map that preserves order 154 | 155 | Notes 156 | ----- 157 | If `processes` is 1, `pmap_unordered` reduces to `map`, with the added benefit of 158 | of using `kwargs` 159 | """ 160 | if kwargs is None: 161 | kwargs = dict() 162 | 163 | if args is None: 164 | args = tuple() 165 | 166 | func = partial(func, *args, **kwargs) 167 | 168 | if processes == 1: 169 | yield from map(func, iterable) 170 | return 171 | 172 | with Pool(processes) as pool: 173 | chunksize = 1 174 | if isinstance(iterable, Sized): 175 | chunksize = max(1, int(len(iterable) / pool._processes)) 176 | elif ntotal is not None: 177 | chunksize = max(1, int(ntotal / pool._processes)) 178 | 179 | yield from pool.imap_unordered( 180 | func=func, iterable=iterable, chunksize=chunksize 181 | ) 182 | -------------------------------------------------------------------------------- /npstreams/tests/test_reduce.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | from npstreams import ireduce_ufunc, preduce_ufunc, last, nan_to_num, reduce_ufunc 6 | import pytest 7 | 8 | # Only testing binary ufuncs that support floats 9 | # i.e. leaving bitwise_* and logical_* behind 10 | # Also, numpy.ldexp takes in ints and floats separately, so 11 | # leave it behind 12 | UFUNCS = ( 13 | np.add, 14 | np.subtract, 15 | np.multiply, 16 | np.divide, 17 | np.logaddexp, 18 | np.logaddexp2, 19 | np.true_divide, 20 | np.floor_divide, 21 | np.power, 22 | np.remainder, 23 | np.mod, 24 | np.fmod, 25 | np.arctan2, 26 | np.hypot, 27 | np.maximum, 28 | np.fmax, 29 | np.minimum, 30 | np.fmin, 31 | np.copysign, 32 | np.nextafter, 33 | ) 34 | 35 | UFUNCS_WITH_IDENTITY = list(filter(lambda u: u.identity is not None, UFUNCS)) 36 | 37 | 38 | def test_ireduce_ufunc_no_side_effects(): 39 | """Test that no arrays in the stream are modified""" 40 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 41 | stack = np.stack(source, axis=-1) 42 | for arr in source: 43 | arr.setflags(write=False) 44 | out = last(ireduce_ufunc(source, np.add)) 45 | 46 | 47 | def test_ireduce_ufunc_single_array(): 48 | """Test ireduce_ufunc on a single array, not a sequence""" 49 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 50 | stack = np.stack(source, axis=-1) 51 | source = np.ones((16, 16), dtype=int) 52 | out = last(ireduce_ufunc(source, np.add, axis=-1)) 53 | assert np.allclose(source, out) 54 | 55 | 56 | def test_ireduce_ufunc_out_parameter(): 57 | """Test that the kwargs ``out`` is correctly passed to reduction function""" 58 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 59 | stack = np.stack(source, axis=-1) 60 | not_out = last(ireduce_ufunc(source, np.add, axis=-1)) 61 | out = np.empty_like(source[0]) 62 | last(ireduce_ufunc(source, ufunc=np.add, out=out)) 63 | 64 | assert np.allclose(not_out, out) 65 | 66 | not_out = last(ireduce_ufunc(source, np.add, axis=2)) 67 | out = np.empty_like(source[0]) 68 | from_out = last(ireduce_ufunc(source, ufunc=np.add, out=out, axis=2)) 
69 | 70 | assert np.allclose(not_out, from_out) 71 | 72 | 73 | def test_ireduce_ufunc_ignore_nan_no_identity(): 74 | """Test ireduce_ufunc on an ufunc with no identity raises 75 | an error for ignore_nan = True""" 76 | source = [np.ones((16, 16), dtype=int) for _ in range(5)] 77 | with pytest.raises(ValueError): 78 | ireduce_ufunc(source, np.maximum, axis=-1, ignore_nan=True) 79 | 80 | 81 | def test_ireduce_ufunc_non_ufunc(): 82 | """Test that ireduce_ufunc raises TypeError when a non-ufunc is passed""" 83 | with pytest.raises(TypeError): 84 | ireduce_ufunc(range(10), ufunc=lambda x: x) 85 | 86 | 87 | def test_ireduce_ufunc_non_binary_ufunc(): 88 | """Test that ireduce_ufunc raises ValueError if non-binary ufunc is used""" 89 | with pytest.raises(ValueError): 90 | ireduce_ufunc(range(10), ufunc=np.absolute) 91 | 92 | 93 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None)) 94 | def test_ireduce_ufunc_output_shape(axis): 95 | """Test output shape""" 96 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 97 | stack = np.stack(source, axis=-1) 98 | 99 | from_numpy = np.add.reduce(stack, axis=axis) 100 | out = last(ireduce_ufunc(source, np.add, axis=axis)) 101 | assert from_numpy.shape == out.shape 102 | assert np.allclose(out, from_numpy) 103 | 104 | 105 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None)) 106 | def test_ireduce_ufunc_length(axis): 107 | """Test that the number of elements yielded by ireduce_ufunc is correct""" 108 | 109 | source = (np.zeros((16, 5, 8)) for _ in range(10)) 110 | out = list(ireduce_ufunc(source, np.add, axis=axis)) 111 | assert 10 == len(out) 112 | 113 | 114 | @pytest.mark.parametrize("axis", (0, 1, 2, 3, None)) 115 | def test_ireduce_ufunc_ignore_nan(axis): 116 | """Test that ignore_nan is working""" 117 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 118 | stack = np.stack(source, axis=-1) 119 | 120 | out = last(ireduce_ufunc(source, np.add, axis=axis, ignore_nan=True)) 121 | assert not np.any(np.isnan(out)) 122 | 123 | 124 | def test_preduce_ufunc_trivial(): 125 | """Test preduce_ufunc for a sum of zeroes over two processes""" 126 | stream = [np.zeros((8, 8)) for _ in range(10)] 127 | s = preduce_ufunc(stream, ufunc=np.add, processes=2, ntotal=10) 128 | assert np.allclose(s, np.zeros_like(s)) 129 | 130 | 131 | def test_preduce_ufunc_correctess(): 132 | """Test preduce_ufunc is equivalent to reduce_ufunc for random sums""" 133 | stream = [np.random.random((8, 8)) for _ in range(20)] 134 | s = preduce_ufunc(stream, ufunc=np.add, processes=3, ntotal=10) 135 | assert np.allclose(s, reduce_ufunc(stream, np.add)) 136 | 137 | 138 | # Dynamics generation of tests on binary ufuncs 139 | @pytest.mark.parametrize("ufunc", UFUNCS) 140 | @pytest.mark.parametrize("axis", (0, 1, 2, -1)) 141 | def test_binary_ufunc(ufunc, axis): 142 | """Generate a test to ensure that ireduce_ufunc(..., ufunc, ...) 143 | works as intendent.""" 144 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 145 | stack = np.stack(source, axis=-1) 146 | 147 | def sufunc(arrays, axis=-1): # s for stream 148 | return last(ireduce_ufunc(arrays, ufunc, axis=axis)) 149 | 150 | from_numpy = ufunc.reduce(stack, axis=axis) 151 | from_sufunc = sufunc(source, axis=axis) 152 | assert from_sufunc.shape == from_numpy.shape 153 | assert np.allclose(from_numpy, from_sufunc) 154 | 155 | 156 | @pytest.mark.parametrize("ufunc", UFUNCS_WITH_IDENTITY) 157 | def test_binary_ufunc_ignore_nan(ufunc): 158 | """Generate a test to ensure that ireduce_ufunc(..., ufunc, ...) 
159 | works as intendent with NaNs in stream.""" 160 | 161 | source = [np.random.random((16, 5, 8)) for _ in range(10)] 162 | source[0][0, 0, 0] = np.nan 163 | stack = nan_to_num(np.stack(source, axis=-1), fill_value=ufunc.identity) 164 | 165 | def sufunc(arrays, ignore_nan=False): # s for stream 166 | return last(ireduce_ufunc(arrays, ufunc, axis=1, ignore_nan=True)) 167 | 168 | from_numpy = ufunc.reduce(stack, axis=1) 169 | from_sufunc = sufunc(source) 170 | assert from_numpy.shape == from_sufunc.shape 171 | assert np.allclose(from_numpy, from_sufunc) 172 | -------------------------------------------------------------------------------- /npstreams/iter_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Iterator/Generator utilities 4 | ---------------------------- 5 | """ 6 | from collections import deque 7 | from functools import wraps 8 | from itertools import chain, islice, tee 9 | 10 | 11 | def primed(gen): 12 | """ 13 | Decorator that primes a generator function, i.e. runs the function 14 | until the first ``yield`` statement. Useful in cases where there 15 | are preliminary checks when creating the generator. 16 | """ 17 | 18 | @wraps(gen) 19 | def primed_gen(*args, **kwargs): 20 | generator = gen(*args, **kwargs) 21 | next(generator) 22 | return generator 23 | 24 | return primed_gen 25 | 26 | 27 | @primed 28 | def chunked(iterable, chunksize): 29 | """ 30 | Generator yielding multiple iterables of length 'chunksize'. 31 | 32 | Parameters 33 | ---------- 34 | iterable : iterable 35 | Iterable to be chunked. 36 | chunksize : int 37 | Chunk size. 38 | 39 | Yields 40 | ------ 41 | chunk : iterable 42 | Iterable of size `chunksize`. In special case of iterable not being 43 | divisible by `chunksize`, the last `chunk` will be smaller. 44 | 45 | Raises 46 | ------ 47 | TypeError : if `chunksize` is not an integer. 48 | """ 49 | if not isinstance(chunksize, int): 50 | raise TypeError( 51 | f"Expected `chunksize` to be an integer, but received {chunksize}" 52 | ) 53 | 54 | yield 55 | 56 | iterable = iter(iterable) 57 | 58 | next_chunk = tuple(islice(iterable, chunksize)) 59 | while next_chunk: 60 | yield next_chunk 61 | next_chunk = tuple(islice(iterable, chunksize)) 62 | 63 | 64 | def peek(iterable): 65 | """ 66 | Peek ahead in an iterable. 67 | 68 | Parameters 69 | ---------- 70 | iterable : iterable 71 | 72 | Returns 73 | ------- 74 | first : object 75 | First element of ``iterable`` 76 | stream : iterable 77 | Iterable containing ``first`` and all other elements from ``iterable`` 78 | """ 79 | iterable = iter(iterable) 80 | ahead = next(iterable) 81 | return ahead, chain([ahead], iterable) 82 | 83 | 84 | def itercopy(iterable, copies=2): 85 | """ 86 | Split iterable into 'copies'. Once this is done, the original iterable *should 87 | not* be used again. 88 | 89 | Parameters 90 | ---------- 91 | iterable : iterable 92 | Iterable to be split. Once it is split, the original iterable 93 | should not be used again. 94 | copies : int, optional 95 | Number of copies. Also determines the number of returned iterables. 96 | 97 | Returns 98 | ------- 99 | iter1, iter2, ... : iterable 100 | Copies of ``iterable``. 101 | 102 | Examples 103 | -------- 104 | By rebinding the name of the original iterable, we make sure that it 105 | will never be used again. 
106 | 107 | >>> from npstreams import itercopy 108 | >>> evens = (2*n for n in range(1000)) 109 | >>> evens, evens_copy = itercopy(evens, copies = 2) 110 | 111 | See Also 112 | -------- 113 | itertools.tee : equivalent function 114 | """ 115 | # itercopy is included because documentation of itertools.tee isn't obvious 116 | # to everyone 117 | return tee(iterable, copies) 118 | 119 | 120 | def linspace(start, stop, num, endpoint=True): 121 | """ 122 | Generate linear space. This is sometimes more appropriate than 123 | using `range`. 124 | 125 | Parameters 126 | ---------- 127 | start : float 128 | The starting value of the sequence. 129 | stop : float 130 | The end value of the sequence. 131 | num : int 132 | Number of samples to generate. 133 | endpoint : bool, optional 134 | If True (default), the endpoint is included in the linear space. 135 | 136 | Yields 137 | ------ 138 | val : float 139 | 140 | See also 141 | -------- 142 | numpy.linspace : generate linear space as a dense array. 143 | """ 144 | # If endpoint are to be counted in, 145 | # step does not count the last yield 146 | if endpoint: 147 | num -= 1 148 | 149 | step = (stop - start) / num 150 | 151 | val = start 152 | for _ in range(num): 153 | yield val 154 | val += step 155 | 156 | if endpoint: 157 | yield stop 158 | 159 | 160 | def multilinspace(start, stop, num, endpoint=True): 161 | """ 162 | Generate multilinear space, for joining the values in two iterables. 163 | 164 | Parameters 165 | ---------- 166 | start : iterable of floats 167 | The starting value. This iterable will be consumed. 168 | stop : iterable of floats 169 | The end value. This iterable will be consumed. 170 | num : int 171 | Number of samples to generate. 172 | endpoint : bool, optional 173 | If True (default), the endpoint is included in the linear space. 174 | 175 | Yields 176 | ------ 177 | val : tuple 178 | Tuple of the same length as start and stop 179 | 180 | Examples 181 | -------- 182 | >>> from npstreams import multilinspace 183 | >>> multispace = multilinspace(start = (0, 0), stop = (1, 1), num = 4, endpoint = False) 184 | >>> print(list(multispace)) 185 | [(0, 0), (0.25, 0.25), (0.5, 0.5), (0.75, 0.75)] 186 | 187 | See also 188 | -------- 189 | linspace : generate a linear space between two numbers 190 | """ 191 | start, stop = tuple(start), tuple(stop) 192 | if len(start) != len(stop): 193 | raise ValueError("start and stop must have the same length") 194 | 195 | spaces = tuple( 196 | linspace(a, b, num=num, endpoint=endpoint) for a, b in zip(start, stop) 197 | ) 198 | yield from zip(*spaces) 199 | 200 | 201 | def last(stream): 202 | """ 203 | Retrieve the last item from a stream/iterator, consuming 204 | iterables in the process. If empty stream, a RuntimeError is raised. 205 | """ 206 | # Wonderful idea from itertools recipes 207 | # https://docs.python.org/3.9/library/itertools.html#itertools-recipes 208 | try: 209 | return deque(stream, maxlen=1)[0] 210 | except IndexError: 211 | raise RuntimeError("Empty stream") 212 | 213 | 214 | def cyclic(iterable): 215 | """ 216 | Yields cyclic permutations of an iterable. 217 | 218 | Examples 219 | -------- 220 | >>> from npstreams import cyclic 221 | >>> list(cyclic((1,2,3))) 222 | [(1, 2, 3), (3, 1, 2), (2, 3, 1)] 223 | """ 224 | iterable = tuple(iterable) 225 | n = len(iterable) 226 | yield from (tuple(iterable[i - j] for i in range(n)) for j in range(n)) 227 | 228 | 229 | def length_hint(obj, default=0): 230 | """ 231 | Return an estimate of the number of items in ``obj``. 
232 | 233 | This is useful for presizing containers when building from an 234 | iterable. 235 | 236 | If the object supports len(), the result will be 237 | exact. Otherwise, it may over- or under-estimate by an 238 | arbitrary amount. The result will be an integer >= 0. 239 | 240 | Notes 241 | ----- 242 | Source : https://www.python.org/dev/peps/pep-0424/ 243 | 244 | Examples 245 | -------- 246 | >>> from npstreams import length_hint 247 | >>> length_hint([1,2,3,4,5]) # Should be exact 248 | 5 249 | >>> length_hint(None, default = 15) # Does not implement __length_hint__ 250 | 15 251 | """ 252 | try: 253 | return len(obj) 254 | except TypeError: 255 | try: 256 | get_hint = type(obj).__length_hint__ 257 | except AttributeError: 258 | return default 259 | try: 260 | hint = get_hint(obj) 261 | except TypeError: 262 | return default 263 | if hint is NotImplemented: 264 | return default 265 | if not isinstance(hint, int): 266 | raise TypeError("Length hint must be an integer, not %r" % type(hint)) 267 | if hint < 0: 268 | raise ValueError("__length_hint__() should return >= 0") 269 | return hint 270 | -------------------------------------------------------------------------------- /npstreams/cuda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | CUDA-accelerated streaming operations 4 | ------------------------------------- 5 | """ 6 | from functools import partial 7 | from itertools import repeat 8 | from operator import iadd, imul 9 | from subprocess import run, PIPE 10 | 11 | import numpy as np 12 | 13 | from . import array_stream, itercopy, nan_to_num, peek 14 | 15 | # Determine if 16 | # 1. pycuda is installed; 17 | # 2. pycuda can compile with nvcc 18 | # 3. a GPU is available 19 | 20 | try: 21 | import pycuda.gpuarray as gpuarray 22 | import pycuda.autoinit 23 | except ImportError: 24 | raise ImportError("PyCUDA is not installed. CUDA capabilities are not available.") 25 | else: 26 | import pycuda.driver as driver 27 | from pycuda.compiler import SourceModule 28 | 29 | # Check if nvcc compiler is installed at all 30 | nvcc_installed = run(["nvcc", "-h"], stdout=PIPE).returncode == 0 31 | if not nvcc_installed: 32 | raise ImportError("CUDA compiler `nvcc` not installed.") 33 | 34 | # Check that nvcc is at least set up properly 35 | # For example, if nvcc is installed but C++ compiler is not in path 36 | try: 37 | SourceModule("") 38 | except driver.CompileError: 39 | raise ImportError("CUDA compiler `nvcc` is not properly set up.") 40 | 41 | if driver.Device.count() == 0: 42 | raise ImportError("No GPU is available.") 43 | 44 | 45 | @array_stream 46 | def cuda_inplace_reduce(arrays, operator, dtype=None, ignore_nan=False, identity=0): 47 | """ 48 | Inplace reduce on GPU arrays. 49 | 50 | Parameters 51 | ---------- 52 | arrays : iterable 53 | Arrays to be reduced. 54 | operator : callable 55 | Callable of two arguments. This operator should operate in-place, storing the results into 56 | the buffer of the first argument, e.g. operator.iadd 57 | dtype : numpy.dtype, optional 58 | Arrays of the stream are cast to this dtype before reduction. 59 | ignore_nan : bool, optional 60 | If True, NaNs are replaced with ``identity``. Default is propagation of NaNs. 61 | identity : float, optional 62 | If ``ignore_nan = True``, NaNs are replaced with this value. 
63 | 64 | Returns 65 | ------- 66 | out : ndarray 67 | """ 68 | # No need to cast all arrays if ``dtype`` is the same 69 | # type as the stream 70 | first, arrays = peek(arrays) 71 | if (dtype is not None) and (first.dtype != dtype): 72 | arrays = map(lambda arr: arr.astype(dtype), arrays) 73 | 74 | if ignore_nan: 75 | arrays = map(partial(nan_to_num, fill_value=identity), arrays) 76 | 77 | acc_gpu = gpuarray.to_gpu(next(arrays)) # Accumulator 78 | arr_gpu = gpuarray.empty_like(acc_gpu) # GPU memory location for each array 79 | for arr in arrays: 80 | arr_gpu.set(arr) 81 | operator(acc_gpu, arr_gpu) 82 | 83 | return acc_gpu.get() 84 | 85 | 86 | def csum(arrays, dtype=None, ignore_nan=False): 87 | """ 88 | CUDA-enabled sum of stream of arrays. Arrays are summed along 89 | the streaming axis for performance reasons. 90 | 91 | Parameters 92 | ---------- 93 | arrays : iterable 94 | Arrays to be summed. 95 | ignore_nan : bool, optional 96 | If True, NaNs are ignored. Default is propagation of NaNs. 97 | 98 | Returns 99 | ------- 100 | cuda_sum : ndarray 101 | 102 | See Also 103 | -------- 104 | isum : streaming sum of array elements, possibly along different axes 105 | """ 106 | return cuda_inplace_reduce( 107 | arrays, operator=iadd, dtype=dtype, ignore_nan=ignore_nan, identity=0 108 | ) 109 | 110 | 111 | def cprod(arrays, dtype=None, ignore_nan=False): 112 | """ 113 | CUDA-enabled product of a stream of arrays. Arrays are multiplied 114 | along the streaming axis for performance reasons. 115 | 116 | Parameters 117 | ---------- 118 | arrays : iterable 119 | Arrays to be multiplied. 120 | dtype : numpy.dtype, optional 121 | The type of the yielded array and of the accumulator in which the elements 122 | are summed. The dtype of a is used by default unless a has an integer dtype 123 | of less precision than the default platform integer. In that case, if a is 124 | signed then the platform integer is used while if a is unsigned then an 125 | unsigned integer of the same precision as the platform integer is used. 126 | ignore_nan : bool, optional 127 | If True, NaNs are ignored. Default is propagation of NaNs. 128 | 129 | Yields 130 | ------ 131 | online_prod : ndarray 132 | """ 133 | return cuda_inplace_reduce( 134 | arrays, operator=imul, dtype=dtype, ignore_nan=ignore_nan, identity=1 135 | ) 136 | 137 | 138 | @array_stream 139 | def cmean(arrays, ignore_nan=False): 140 | """ 141 | CUDA-enabled mean of stream of arrays (i.e. unweighted average). Arrays are averaged 142 | along the streaming axis for performance reasons. 143 | 144 | Parameters 145 | ---------- 146 | arrays : iterable of ndarrays 147 | Arrays to be averaged. This iterable can also a generator. 148 | ignore_nan : bool, optional 149 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 
150 | 151 | Returns 152 | ------- 153 | cuda_mean : ndarray 154 | 155 | See also 156 | -------- 157 | caverage : CUDA-enabled weighted average 158 | imean : streaming mean of arrays, possibly along different axes 159 | """ 160 | first, arrays = peek(arrays) 161 | 162 | # Need to know which arrays have NaNs, and build the weights stream accordingly 163 | if ignore_nan: 164 | arrays, arrays2 = itercopy(arrays) 165 | weights = map( 166 | lambda arr: np.logical_not(np.isnan(arr)).astype(first.dtype), arrays2 167 | ) 168 | arrays = map(np.nan_to_num, arrays) 169 | return caverage(arrays, weights, ignore_nan=False) 170 | 171 | accumulator = gpuarray.to_gpu(next(arrays)) 172 | array_gpu = gpuarray.empty_like(accumulator) 173 | num_arrays = 1 174 | for arr in arrays: 175 | num_arrays += 1 176 | array_gpu.set(arr) 177 | accumulator += array_gpu 178 | 179 | return accumulator.get() / num_arrays 180 | 181 | 182 | @array_stream 183 | def caverage(arrays, weights=None, ignore_nan=False): 184 | """ 185 | CUDA-enabled average of stream of arrays, possibly weighted. Arrays are averaged 186 | along the streaming axis for performance reasons. 187 | 188 | Parameters 189 | ---------- 190 | arrays : iterable of ndarrays 191 | Arrays to be averaged. This iterable can also be a generator. 192 | weights : iterable of ndarray, iterable of floats, or None, optional 193 | Iterable of weights associated with the values in each item of `arrays`. 194 | Each value in an element of `arrays` contributes to the average 195 | according to its associated weight. The weights array can either be a float 196 | or an array of the same shape as any element of `arrays`. If weights=None, 197 | then all data in each element of `arrays` are assumed to have a weight equal to one. 198 | ignore_nan : bool, optional 199 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 200 | 201 | Returns 202 | ------- 203 | cuda_avg : ndarray 204 | 205 | See also 206 | -------- 207 | iaverage : streaming weighted average, possibly along different axes 208 | """ 209 | if weights is None: 210 | return cmean(arrays, ignore_nan) 211 | 212 | first, arrays = peek(arrays) 213 | 214 | # We make sure that weights is always an array 215 | # This simplifies the handling of NaNs.
216 | if weights is None: 217 | weights = repeat(1) 218 | weights = map(partial(np.broadcast_to, shape=first.shape), weights) 219 | weights = map( 220 | lambda arr: arr.astype(first.dtype), weights 221 | ) # Won't work without this 222 | 223 | # Need to know which array has NaNs, and modify the weights stream accordingly 224 | if ignore_nan: 225 | arrays, arrays2 = itercopy(arrays) 226 | weights = map( 227 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights 228 | ) 229 | arrays = map(np.nan_to_num, arrays) 230 | 231 | first = next(arrays) 232 | fst_wgt = next(weights) 233 | 234 | arr_gpu = gpuarray.to_gpu(first * fst_wgt) 235 | wgt_gpu = gpuarray.to_gpu(fst_wgt) 236 | for arr, wgt in zip(arrays, weights): 237 | arr_gpu += gpuarray.to_gpu(arr) * gpuarray.to_gpu(wgt) 238 | wgt_gpu += gpuarray.to_gpu(wgt) 239 | 240 | arr_gpu /= wgt_gpu 241 | return arr_gpu.get() 242 | -------------------------------------------------------------------------------- /npstreams/tests/test_numerics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from random import randint, random 4 | 5 | import numpy as np 6 | 7 | from npstreams import isum, iprod, last, isub, iany, iall, prod 8 | from npstreams import sum as nssum # avoiding name clashes 9 | import pytest 10 | 11 | 12 | def test_isum_trivial(): 13 | """Test a sum of zeros""" 14 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 15 | summed = last(isum(source)) 16 | assert np.allclose(summed, np.zeros_like(summed)) 17 | 18 | 19 | def test_isum_ignore_nans(): 20 | """Test a sum of zeros with NaNs sprinkled""" 21 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 22 | source.append(np.full((16,), fill_value=np.nan)) 23 | summed = last(isum(source, ignore_nan=True)) 24 | assert np.allclose(summed, np.zeros_like(summed)) 25 | 26 | 27 | def test_isum_length(): 28 | """Test that the number of yielded elements is the same as source""" 29 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 30 | summed = list(isum(source, axis=0)) 31 | assert 10 == len(summed) 32 | 33 | 34 | def test_isum_dtype(): 35 | """Test a sum of floating zeros with an int accumulator""" 36 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 37 | summed = last(isum(source, dtype=int)) 38 | assert np.allclose(summed, np.zeros_like(summed)) 39 | assert summed.dtype == int 40 | 41 | 42 | def test_isum_axis(): 43 | """Test that isum(axis = 0) yields 0d arrays""" 44 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 45 | 46 | summed = last(isum(source, axis=0)) 47 | assert np.allclose(summed, np.zeros_like(summed)) 48 | 49 | summed = last(isum(source, axis=None)) 50 | assert np.allclose(summed, 0) 51 | 52 | 53 | def test_isum_return_shape(): 54 | """Test that the shape of output is as expected""" 55 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 56 | 57 | summed = last(isum(source, axis=0)) 58 | assert summed.shape == (1, 10) 59 | 60 | 61 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 62 | def test_isum_against_numpy(axis): 63 | """Test that isum() returns the same as numpy.sum() for various axis inputs""" 64 | 65 | stream = [np.random.random((16, 16)) for _ in range(10)] 66 | stack = np.dstack(stream) 67 | 68 | from_numpy = np.sum(stack, axis=axis) 69 | from_isum = last(isum(stream, axis=axis)) 70 | assert np.allclose(from_isum, from_numpy) 71 | 72 | 73 | def test_sum_trivial(): 74 | """Test a sum of zeros""" 75 | source = [np.zeros((16,), 
dtype=float) for _ in range(10)] 76 | summed = nssum(source) 77 | assert np.allclose(summed, np.zeros_like(summed)) 78 | 79 | 80 | def test_sum_ignore_nans(): 81 | """Test a sum of zeros with NaNs sprinkled""" 82 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 83 | source.append(np.full((16,), fill_value=np.nan)) 84 | summed = nssum(source, ignore_nan=True) 85 | assert np.allclose(summed, np.zeros_like(summed)) 86 | 87 | 88 | def test_sum_dtype(): 89 | """Test a sum of floating zeros with an int accumulator""" 90 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 91 | summed = nssum(source, dtype=int) 92 | assert np.allclose(summed, np.zeros_like(summed)) 93 | assert summed.dtype == int 94 | 95 | 96 | def test_sum_axis(): 97 | """Test that isum(axis = 0) yields 0d arrays""" 98 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 99 | 100 | summed = nssum(source, axis=0) 101 | assert np.allclose(summed, np.zeros_like(summed)) 102 | 103 | summed = nssum(source, axis=None) 104 | assert np.allclose(summed, 0) 105 | 106 | 107 | def test_sum_return_shape(): 108 | """Test that the shape of output is as expected""" 109 | source = [np.zeros((16,), dtype=float) for _ in range(10)] 110 | 111 | summed = nssum(source, axis=0) 112 | assert summed.shape == (1, 10) 113 | 114 | 115 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 116 | def test_sum_against_numpy(axis): 117 | """Test that isum() returns the same as numpy.sum() for various axis inputs""" 118 | 119 | stream = [np.random.random((16, 16)) for _ in range(10)] 120 | stack = np.dstack(stream) 121 | 122 | from_numpy = np.sum(stack, axis=axis) 123 | from_sum = nssum(stream, axis=axis) 124 | assert np.allclose(from_sum, from_numpy) 125 | 126 | 127 | def test_iprod_trivial(): 128 | """Test a product of ones""" 129 | source = [np.ones((16,), dtype=float) for _ in range(10)] 130 | product = last(iprod(source)) 131 | assert np.allclose(product, np.ones_like(product)) 132 | 133 | 134 | def test_iprod_ignore_nans(): 135 | """Test that NaNs are ignored.""" 136 | source = [np.ones((16,), dtype=float) for _ in range(10)] 137 | source.append(np.full_like(source[0], np.nan)) 138 | product = last(iprod(source, ignore_nan=True)) 139 | assert np.allclose(product, np.ones_like(product)) 140 | 141 | 142 | def test_iprod_dtype(): 143 | """Test that dtype argument is working""" 144 | source = [np.ones((16,), dtype=float) for _ in range(10)] 145 | product = last(iprod(source, dtype=int)) 146 | assert np.allclose(product, np.ones_like(product)) 147 | assert product.dtype == int 148 | 149 | 150 | def test_iprod_axis(): 151 | """Test that iprod(axis = 0) yields 0d arrays""" 152 | source = [np.ones((16,), dtype=float) for _ in range(10)] 153 | 154 | summed = last(iprod(source, axis=0)) 155 | assert np.all(summed == 1) 156 | 157 | summed = last(iprod(source, axis=None)) 158 | assert np.allclose(summed, np.ones_like(summed)) 159 | 160 | 161 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 162 | def test_iprod_against_numpy(axis): 163 | """Test that iprod() returns the same as numpy.prod() for various axis inputs""" 164 | 165 | stream = [np.random.random((16, 16)) for _ in range(10)] 166 | stack = np.dstack(stream) 167 | 168 | from_numpy = np.prod(stack, axis=axis) 169 | from_stream = last(iprod(stream, axis=axis)) 170 | assert np.allclose(from_stream, from_numpy) 171 | 172 | 173 | def test_prod_trivial(): 174 | """Test a product of ones""" 175 | source = [np.ones((16,), dtype=float) for _ in range(10)] 176 | product = prod(source) 177 | 
assert np.allclose(product, np.ones_like(product)) 178 | 179 | 180 | def test_prod_ignore_nans(): 181 | """Test that NaNs are ignored.""" 182 | source = [np.ones((16,), dtype=float) for _ in range(10)] 183 | source.append(np.full_like(source[0], np.nan)) 184 | product = prod(source, ignore_nan=True) 185 | assert np.allclose(product, np.ones_like(product)) 186 | 187 | 188 | def test_prod_dtype(): 189 | """Test that dtype argument is working""" 190 | source = [np.ones((16,), dtype=float) for _ in range(10)] 191 | product = prod(source, dtype=int) 192 | assert np.allclose(product, np.ones_like(product)) 193 | assert product.dtype == int 194 | 195 | 196 | def test_prod_axis(): 197 | """Test that the axis parameter of prod is handled correctly""" 198 | source = [np.ones((16,), dtype=float) for _ in range(10)] 199 | 200 | summed = prod(source, axis=0) 201 | assert np.all(summed == 1) 202 | 203 | summed = prod(source, axis=None) 204 | assert np.allclose(summed, np.ones_like(summed)) 205 | 206 | 207 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 208 | def test_prod_against_numpy(axis): 209 | """Test that prod() returns the same as numpy.prod() for various axis inputs""" 210 | 211 | stream = [np.random.random((16, 16)) for _ in range(10)] 212 | stack = np.dstack(stream) 213 | 214 | from_numpy = np.prod(stack, axis=axis) 215 | from_stream = prod(stream, axis=axis) 216 | assert np.allclose(from_stream, from_numpy) 217 | 218 | 219 | @pytest.mark.parametrize("axis", (0, 1, 2)) 220 | def test_isub_against_numpy(axis): 221 | """Test against numpy.subtract.reduce""" 222 | stream = [np.random.random((8, 16, 2)) for _ in range(11)] 223 | stack = np.stack(stream, axis=-1) 224 | 225 | from_numpy = np.subtract.reduce(stack, axis=axis) 226 | from_stream = last(isub(stream, axis=axis)) 227 | assert np.allclose(from_numpy, from_stream) 228 | 229 | 230 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 231 | def test_iall_against_numpy(axis): 232 | """Test iall against numpy.all""" 233 | stream = [np.zeros((8, 16, 2)) for _ in range(11)] 234 | stream[3][3, 0, 1] = 1 # so that np.all(axis = None) evaluates to False 235 | stack = np.stack(stream, axis=-1) 236 | 237 | from_numpy = np.all(stack, axis=axis) 238 | from_stream = last(iall(stream, axis=axis)) 239 | assert np.allclose(from_numpy, from_stream) 240 | 241 | 242 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 243 | def test_iany_against_numpy(axis): 244 | """Test iany against numpy.any""" 245 | stream = [np.zeros((8, 16, 2)) for _ in range(11)] 246 | stream[3][3, 0, 1] = 1 # so that np.any(axis = None) evaluates to True 247 | stack = np.stack(stream, axis=-1) 248 | 249 | from_numpy = np.any(stack, axis=axis) 250 | from_stream = last(iany(stream, axis=axis)) 251 | assert np.allclose(from_numpy, from_stream) 252 | -------------------------------------------------------------------------------- /npstreams/benchmarks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Reliably benchmarking npstreams performance. 4 | """ 5 | import inspect 6 | import sys 7 | import timeit 8 | from collections import namedtuple 9 | from contextlib import redirect_stdout 10 | from functools import partial 11 | from shutil import get_terminal_size 12 | 13 | import numpy as np 14 | 15 | from .
import __version__ 16 | from .reduce import _check_binary_ufunc 17 | 18 | UFUNC_SETUP = """ 19 | from npstreams import reduce_ufunc, stack 20 | import numpy as np 21 | from numpy import {ufunc.__name__} 22 | 23 | np.random.seed(42056) 24 | 25 | def stream(): 26 | return (np.random.random({shape}) for _ in range(10)) 27 | """ 28 | 29 | FUNC_SETUP = """ 30 | from npstreams import stack 31 | import numpy as np 32 | from numpy import {func.__name__} as np_{func.__name__} 33 | from npstreams import {func.__name__} as ns_{func.__name__} 34 | 35 | np.random.seed(42056) 36 | 37 | def stream(): 38 | return (np.random.random({shape}) for _ in range(10)) 39 | """ 40 | 41 | BenchmarkResults = namedtuple( 42 | "BenchmarkResults", field_names=["numpy_time", "npstreams_time", "shape"] 43 | ) 44 | 45 | 46 | def autotimeit(statement, setup="pass", repeat=3): 47 | """ 48 | Time a statement, automatically determining the number of times to 49 | run the statement so that the total execution time is not too short. 50 | 51 | .. versionadded:: 1.5.2 52 | 53 | Parameters 54 | ---------- 55 | statement : string 56 | Statement to time. The statement will be executed after the `setup` statement. 57 | setup : string, optional 58 | Setup statement executed before timing starts. 59 | repeat : int, optional 60 | Number of timing repetitions to execute. 61 | 62 | Returns 63 | ------- 64 | time : float 65 | Minimal time per execution of `statement` [seconds]. 66 | """ 67 | timer = timeit.Timer(stmt=statement, setup=setup) 68 | number, _ = timer.autorange() 69 | return min(timer.repeat(repeat=repeat, number=number)) / number 70 | 71 | 72 | def benchmark( 73 | funcs=[np.average, np.mean, np.std, np.sum, np.prod], 74 | ufuncs=[np.add, np.multiply, np.power, np.true_divide, np.mod], 75 | shapes=[(4, 4), (8, 8), (16, 16), (64, 64)], 76 | file=None, 77 | ): 78 | """ 79 | Benchmark npstreams against numpy and print the results. 80 | 81 | There are two categories of benchmarks. The first category compares NumPy functions against 82 | npstreams versions of the same functions. The second category compares NumPy universal functions 83 | against dynamically-generated npstreams versions of those same universal functions. 84 | 85 | All benchmarks compare a reduction operation on a stream of arrays of varying sizes. The sequence length is fixed. 86 | 87 | .. versionadded:: 1.5.2 88 | 89 | Parameters 90 | ---------- 91 | funcs : iterable of NumPy functions, optional 92 | NumPy functions to compare. An equivalent must exist in npstreams, e.g. `np.average` and `npstreams.average`. 93 | Functions without equivalents will be skipped. 94 | ufuncs : iterable of NumPy ufunc, optional 95 | NumPy universal functions to compare. Invalid ufuncs (e.g. non-binary ufuncs) will be skipped. 96 | shapes : iterable of tuples, optional 97 | Shapes of arrays to test. Streams of random numbers will be generated with arrays of those shapes. 98 | The sequence lengths are fixed. 99 | file : file-like or None, optional 100 | File to which the benchmark results will be written. If None, sys.stdout will be used.
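
Examples
--------
A minimal usage sketch (the doctest is skipped because timing results vary
between machines); any of the keyword arguments documented above can be
narrowed down to shorten the run:

>>> import numpy as np
>>> from npstreams.benchmarks import benchmark
>>> benchmark(funcs=[np.sum], ufuncs=[np.add], shapes=[(8, 8)])  # doctest: +SKIP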
101 | """ 102 | # Preliminaries 103 | console_width = min(get_terminal_size().columns, 80) 104 | func_test_name = "numpy.{f.__name__} vs npstreams.{f.__name__}".format 105 | ufunc_test_name = ( 106 | "numpy.{f.__name__} vs npstreams.reduce_ufunc(numpy.{f.__name__}, ...)".format 107 | ) 108 | 109 | # Determine justification based on maximal shape functions 110 | sh_just = max(map(lambda s: len(str(s)), shapes)) + 10 111 | 112 | # To make it easy to either write the results in a file or print to stdout, 113 | # We actually redirect stdout. 114 | if file is None: 115 | file = sys.stdout 116 | 117 | with redirect_stdout(file): 118 | # Start benchmarks -------------------------------------------------------- 119 | print( 120 | "".ljust(console_width, "*"), 121 | "npstreams performance benchmark".upper().center(console_width), 122 | "", 123 | " npstreams".ljust(15) + f" {__version__}", 124 | " NumPy".ljust(15) + f" {np.__version__}", 125 | "", 126 | " Speedup is NumPy time divided by npstreams time (Higher is better)", 127 | "".ljust(console_width, "*"), 128 | sep="\n", 129 | ) 130 | 131 | # Determine valid ufuncs and funcs first ---------------------------------- 132 | valid_ufuncs = comparable_ufuncs(ufuncs, file) 133 | valid_funcs = comparable_funcs(funcs, file) 134 | 135 | # Benchmarking functions -------------------------------------------------- 136 | for func in sorted(valid_funcs, key=lambda fn: fn.__name__): 137 | print(func_test_name(f=func).center(console_width), "\n") 138 | 139 | for (np_time, ns_time, shape) in benchmark_func(func, shapes): 140 | speedup = np_time / ns_time 141 | print( 142 | " ", 143 | f"shape = {shape}".ljust(sh_just), 144 | f"speedup = {speedup:.4f}x", 145 | ) 146 | 147 | print("".ljust(console_width, "-")) 148 | 149 | # Benchmarking universal functions ---------------------------------------- 150 | for ufunc in sorted(valid_ufuncs, key=lambda fn: fn.__name__): 151 | print(ufunc_test_name(f=ufunc).center(console_width), "\n") 152 | 153 | for (np_time, ns_time, shape) in benchmark_ufunc(ufunc, shapes): 154 | speedup = np_time / ns_time 155 | print( 156 | " ", 157 | f"shape = {shape}".ljust(sh_just), 158 | f"speedup = {speedup:.4f}x", 159 | ) 160 | 161 | print("".ljust(console_width, "-")) 162 | 163 | 164 | def benchmark_ufunc(ufunc, shapes): 165 | """ 166 | Compare the running time between a NumPy ufunc and the npstreams equivalent. 167 | 168 | Parameters 169 | ---------- 170 | ufunc : NumPy ufunc 171 | 172 | shapes : iterable of tuples, optional 173 | Shapes of arrays to test. Streams of random numbers will be generated with arrays of those shapes. 174 | The sequence lengths are fixed. 175 | 176 | Yields 177 | ------ 178 | results : BenchmarkResults 179 | """ 180 | for shape in shapes: 181 | 182 | numpy_statement = f"{ufunc.__name__}.reduce(stack(stream()), axis = -1)" 183 | npstreams_statement = f"reduce_ufunc(stream(), {ufunc.__name__}, axis = -1)" 184 | 185 | with np.errstate(invalid="ignore"): 186 | np_time = autotimeit( 187 | numpy_statement, UFUNC_SETUP.format(ufunc=ufunc, shape=shape) 188 | ) 189 | ns_time = autotimeit( 190 | npstreams_statement, UFUNC_SETUP.format(ufunc=ufunc, shape=shape) 191 | ) 192 | 193 | yield BenchmarkResults(np_time, ns_time, shape) 194 | 195 | 196 | def benchmark_func(func, shapes): 197 | """ 198 | Compare the running time between a NumPy func and the npstreams equivalent. 199 | 200 | Parameters 201 | ---------- 202 | func : NumPy func 203 | 204 | shapes : iterable of tuples, optional 205 | Shapes of arrays to test. 
Streams of random numbers will be generated with arrays of those shapes. 206 | The sequence lengths are fixed. 207 | 208 | Yields 209 | ------ 210 | results : BenchmarkResults 211 | """ 212 | for shape in shapes: 213 | 214 | numpy_statement = f"np_{func.__name__}(stack(stream()), axis = -1)" 215 | npstreams_statement = f"ns_{func.__name__}(stream(), axis = -1)" 216 | 217 | with np.errstate(invalid="ignore"): 218 | np_time = autotimeit( 219 | numpy_statement, FUNC_SETUP.format(func=func, shape=shape) 220 | ) 221 | ns_time = autotimeit( 222 | npstreams_statement, FUNC_SETUP.format(func=func, shape=shape) 223 | ) 224 | 225 | yield BenchmarkResults(np_time, ns_time, shape) 226 | 227 | 228 | def comparable_ufuncs(ufuncs, file): 229 | """ 230 | Yields ufuncs that can be compared between numpy and npstreams. 231 | 232 | Parameters 233 | ---------- 234 | ufuncs : iterable of NumPy ufunc 235 | NumPy ufuncs to check. Ufuncs that cannot be compared will be skipped. 236 | 237 | Yields 238 | ------ 239 | ufunc : callable 240 | NumPy ufuncs that can be compared with npstreams. 241 | """ 242 | for ufunc in ufuncs: 243 | if not isinstance(ufunc, np.ufunc): 244 | print( 245 | f"Skipping function {ufunc.__name__} as it is not a NumPy Universal Function" 246 | ) 247 | continue 248 | 249 | try: 250 | _check_binary_ufunc(ufunc) 251 | except ValueError: 252 | print( 253 | f"Skipping function {ufunc.__name__} as it is not a valid binary ufunc" 254 | ) 255 | else: 256 | yield ufunc 257 | 258 | 259 | def comparable_funcs(funcs, file): 260 | """ 261 | Yields NumPy functions that have npstreams equivalents. 262 | 263 | Parameters 264 | ---------- 265 | ufuncs : iterable of NumPy functions 266 | NumPy funcs to check. 267 | 268 | Yields 269 | ------ 270 | ufunc : callable 271 | NumPy funcs that have npstreams equivalents. 272 | """ 273 | import npstreams 274 | 275 | npstreams_functions = set( 276 | name for name, value in inspect.getmembers(npstreams, inspect.isfunction) 277 | ) 278 | for func in funcs: 279 | if func.__name__ not in npstreams_functions: 280 | print( 281 | f"Skipping function {func.__name__} as there is no npstreams equivalent" 282 | ) 283 | else: 284 | yield func 285 | 286 | 287 | if __name__ == "__main__": 288 | benchmark() 289 | -------------------------------------------------------------------------------- /npstreams/numerics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Numerics Functions 4 | ------------------ 5 | """ 6 | import numpy as np 7 | 8 | from .reduce import ireduce_ufunc, reduce_ufunc 9 | 10 | 11 | def isum(arrays, axis=-1, dtype=None, ignore_nan=False): 12 | """ 13 | Streaming sum of array elements. 14 | 15 | Parameters 16 | ---------- 17 | arrays : iterable 18 | Arrays to be summed. 19 | axis : int or None, optional 20 | Reduction axis. Default is to sum the arrays in the stream as if 21 | they had been stacked along a new axis, then sum along this new axis. 22 | If None, arrays are flattened before summing. If `axis` is an int larger that 23 | the number of dimensions in the arrays of the stream, arrays are summed 24 | along the new axis. 25 | dtype : numpy.dtype, optional 26 | The type of the yielded array and of the accumulator in which the elements 27 | are summed. The dtype of a is used by default unless a has an integer dtype 28 | of less precision than the default platform integer. 
In that case, if a is 29 | signed then the platform integer is used while if a is unsigned then an 30 | unsigned integer of the same precision as the platform integer is used. 31 | ignore_nan : bool, optional 32 | If True, NaNs are ignored. Default is propagation of NaNs. 33 | 34 | Yields 35 | ------ 36 | online_sum : ndarray 37 | """ 38 | yield from ireduce_ufunc( 39 | arrays, ufunc=np.add, axis=axis, ignore_nan=ignore_nan, dtype=dtype 40 | ) 41 | 42 | 43 | def sum(arrays, axis=-1, dtype=None, ignore_nan=False): 44 | """ 45 | Sum of arrays in a stream. 46 | 47 | Parameters 48 | ---------- 49 | arrays : iterable 50 | Arrays to be summed. 51 | axis : int or None, optional 52 | Reduction axis. Default is to sum the arrays in the stream as if 53 | they had been stacked along a new axis, then sum along this new axis. 54 | If None, arrays are flattened before summing. If `axis` is an int larger that 55 | the number of dimensions in the arrays of the stream, arrays are summed 56 | along the new axis. 57 | dtype : numpy.dtype, optional 58 | The type of the yielded array and of the accumulator in which the elements 59 | are summed. The dtype of a is used by default unless a has an integer dtype 60 | of less precision than the default platform integer. In that case, if a is 61 | signed then the platform integer is used while if a is unsigned then an 62 | unsigned integer of the same precision as the platform integer is used. 63 | ignore_nan : bool, optional 64 | If True, NaNs are ignored. Default is propagation of NaNs. 65 | 66 | Returns 67 | ------- 68 | sum : ndarray 69 | """ 70 | return reduce_ufunc( 71 | arrays, ufunc=np.add, axis=axis, dtype=dtype, ignore_nan=ignore_nan 72 | ) 73 | 74 | 75 | def iprod(arrays, axis=-1, dtype=None, ignore_nan=False): 76 | """ 77 | Streaming product of array elements. 78 | 79 | Parameters 80 | ---------- 81 | arrays : iterable 82 | Arrays to be multiplied. 83 | axis : int or None, optional 84 | Reduction axis. Default is to multiply the arrays in the stream as if 85 | they had been stacked along a new axis, then multiply along this new axis. 86 | If None, arrays are flattened before multiplication. If `axis` is an int larger that 87 | the number of dimensions in the arrays of the stream, arrays are multiplied 88 | along the new axis. 89 | dtype : numpy.dtype, optional 90 | The type of the yielded array and of the accumulator in which the elements 91 | are summed. The dtype of a is used by default unless a has an integer dtype 92 | of less precision than the default platform integer. In that case, if a is 93 | signed then the platform integer is used while if a is unsigned then an 94 | unsigned integer of the same precision as the platform integer is used. 95 | ignore_nan : bool, optional 96 | If True, NaNs are ignored. Default is propagation of NaNs. 97 | 98 | Yields 99 | ------ 100 | online_prod : ndarray 101 | """ 102 | yield from ireduce_ufunc( 103 | arrays, ufunc=np.multiply, axis=axis, dtype=dtype, ignore_nan=ignore_nan 104 | ) 105 | 106 | 107 | def prod(arrays, axis=-1, dtype=None, ignore_nan=False): 108 | """ 109 | Product of arrays in a stream. 110 | 111 | Parameters 112 | ---------- 113 | arrays : iterable 114 | Arrays to be multiplied. 115 | axis : int or None, optional 116 | Reduction axis. Default is to multiply the arrays in the stream as if 117 | they had been stacked along a new axis, then multiply along this new axis. 118 | If None, arrays are flattened before multiplication. 
If `axis` is an int larger than 119 | the number of dimensions in the arrays of the stream, arrays are multiplied 120 | along the new axis. 121 | dtype : numpy.dtype, optional 122 | The type of the yielded array and of the accumulator in which the elements 123 | are multiplied. The dtype of a is used by default unless a has an integer dtype 124 | of less precision than the default platform integer. In that case, if a is 125 | signed then the platform integer is used while if a is unsigned then an 126 | unsigned integer of the same precision as the platform integer is used. 127 | ignore_nan : bool, optional 128 | If True, NaNs are ignored. Default is propagation of NaNs. 129 | 130 | Returns 131 | ------- 132 | product : ndarray 133 | """ 134 | return reduce_ufunc( 135 | arrays, ufunc=np.multiply, axis=axis, dtype=dtype, ignore_nan=ignore_nan 136 | ) 137 | 138 | 139 | def isub(arrays, axis=-1, dtype=None): 140 | """ 141 | Subtract elements in a reduction fashion. Equivalent to ``numpy.subtract.reduce`` on a dense array. 142 | 143 | Parameters 144 | ---------- 145 | arrays : iterable 146 | Arrays to be subtracted. 147 | axis : int, optional 148 | Reduction axis. Since subtraction is not reorderable (unlike a sum, for example), 149 | `axis` must be specified as an int; full reduction (``axis = None``) will raise an exception. 150 | Default is to subtract the arrays in the stream as if they had been stacked along a new axis, 151 | then subtract along this new axis. 152 | If `axis` is an int larger than the number of dimensions in the arrays of the stream, 153 | arrays are subtracted along the new axis. 154 | dtype : numpy.dtype, optional 155 | The type of the yielded array and of the accumulator in which the elements 156 | are combined. The dtype of a is used by default unless a has an integer dtype 157 | of less precision than the default platform integer. In that case, if a is 158 | signed then the platform integer is used while if a is unsigned then an 159 | unsigned integer of the same precision as the platform integer is used. 160 | 161 | Yields 162 | ------ 163 | online_sub : ndarray 164 | 165 | Raises 166 | ------ 167 | ValueError 168 | If `axis` is None. Since subtraction is not reorderable (unlike a sum, for example), 169 | `axis` must be specified as an int. 170 | """ 171 | if axis is None: 172 | raise ValueError( 173 | "Subtraction is not a reorderable operation, and \ 174 | therefore a specific axis must be given." 175 | ) 176 | yield from ireduce_ufunc(arrays, ufunc=np.subtract, axis=axis, dtype=dtype) 177 | 178 | 179 | def iall(arrays, axis=-1): 180 | """ 181 | Test whether all array elements along a given axis evaluate to True. 182 | 183 | Parameters 184 | ---------- 185 | arrays : iterable 186 | Arrays to be reduced. 187 | axis : int or None, optional 188 | Axis along which a logical AND reduction is performed. The default 189 | is to perform a logical AND along the 'stream axis', as if all arrays in ``arrays`` 190 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened 191 | before reduction. 192 | 193 | Yields 194 | ------ 195 | all : ndarray, dtype bool 196 | """ 197 | # TODO: use ``where`` keyword to only check places that are already ``True`` 198 | yield from ireduce_ufunc(arrays, ufunc=np.logical_and, axis=axis) 199 | 200 | 201 | def iany(arrays, axis=-1): 202 | """ 203 | Test whether any array elements along a given axis evaluate to True.
204 | 205 | Parameters 206 | ---------- 207 | arrays : iterable 208 | Arrays to be reduced. 209 | axis : int or None, optional 210 | Axis along which a logical OR reduction is performed. The default 211 | is to perform a logical OR along the 'stream axis', as if all arrays in ``arrays`` 212 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened 213 | before reduction. 214 | 215 | Yields 216 | ------ 217 | any : ndarray, dtype bool 218 | """ 219 | # TODO: use ``where`` keyword to only check places that are not already ``True`` 220 | yield from ireduce_ufunc(arrays, ufunc=np.logical_or, axis=axis) 221 | 222 | 223 | def imax(arrays, axis, ignore_nan=False): 224 | """ 225 | Maximum of a stream of arrays along an axis. 226 | 227 | Parameters 228 | ---------- 229 | arrays : iterable 230 | Arrays to be reduced. 231 | axis : int or None 232 | Axis along which the maximum is found. If ``axis = -1``, the maximum 233 | is found along the 'stream axis', as if all arrays in ``arrays`` 234 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened 235 | before reduction. 236 | ignore_nan : bool, optional 237 | If True, NaNs are ignored. Default is propagation of NaNs. 238 | 239 | Yields 240 | ------ 241 | online_max : ndarray 242 | Cumulative maximum. 243 | """ 244 | ufunc = np.fmax if ignore_nan else np.maximum 245 | yield from ireduce_ufunc(arrays, ufunc, axis) 246 | 247 | 248 | def imin(arrays, axis, ignore_nan=False): 249 | """ 250 | Minimum of a stream of arrays along an axis. 251 | 252 | Parameters 253 | ---------- 254 | arrays : iterable 255 | Arrays to be reduced. 256 | axis : int or None 257 | Axis along which the minimum is found. If ``axis = -1``, the minimum 258 | is found along the 'stream axis', as if all arrays in ``arrays`` 259 | were stacked along a new dimension. If ``axis = None``, arrays in ``arrays`` are flattened 260 | before reduction. 261 | ignore_nan : bool, optional 262 | If True, NaNs are ignored. Default is propagation of NaNs. 263 | 264 | Yields 265 | ------ 266 | online_min : ndarray 267 | Cumulative minimum. 268 | """ 269 | ufunc = np.fmin if ignore_nan else np.minimum 270 | yield from ireduce_ufunc(arrays, ufunc, axis) 271 | -------------------------------------------------------------------------------- /npstreams/reduce.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | General stream reduction 4 | ------------------------ 5 | """ 6 | from functools import lru_cache, partial 7 | from itertools import islice, repeat 8 | from multiprocessing import Pool 9 | 10 | import numpy as np 11 | 12 | from .array_stream import array_stream 13 | from .array_utils import nan_to_num 14 | from .iter_utils import chunked, last, peek, primed 15 | from .parallel import preduce 16 | 17 | identity = lambda i: i 18 | 19 | 20 | @lru_cache(maxsize=128) 21 | def _check_binary_ufunc(ufunc): 22 | """ 23 | Check that ufunc is suitable for ``ireduce_ufunc``. 24 | 25 | Specifically, a binary ``numpy.ufunc`` function is required. Functions 26 | that return a boolean are also not suitable because they cannot be accumulated. 27 | 28 | This function does not return anything. 29 | 30 | Parameters 31 | ---------- 32 | ufunc : callable 33 | Function to check. 34 | 35 | Raises 36 | ------ 37 | TypeError : if ``ufunc`` is not a ``numpy.ufunc`` 38 | ValueError: if ``ufunc`` is not binary or the return type is boolean.
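
Examples
--------
A quick sketch of the expected behavior: ``numpy.add`` is a binary ufunc and
passes silently, while the unary ``numpy.sqrt`` is rejected:

>>> import numpy as np
>>> _check_binary_ufunc(np.add)      # no exception raised
>>> _check_binary_ufunc(np.sqrt)     # doctest: +SKIP
Traceback (most recent call last):
    ...
ValueError: Only binary ufuncs are supported, and sqrt is not one of them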
39 | """ 40 | if not isinstance(ufunc, np.ufunc): 41 | raise TypeError(f"{ufunc.__name__} is not a NumPy Ufunc") 42 | if ufunc.nin != 2: 43 | raise ValueError( 44 | f"Only binary ufuncs are supported, and {ufunc.__name__} is not one of them" 45 | ) 46 | 47 | 48 | @primed 49 | @array_stream 50 | def ireduce_ufunc(arrays, ufunc, axis=-1, dtype=None, ignore_nan=False, **kwargs): 51 | """ 52 | Streaming reduction generator function from a binary NumPy ufunc. Generator 53 | version of `reduce_ufunc`. 54 | 55 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover, 56 | for performance reasons, ufunc must have the same return types as input types. 57 | This precludes the use of ``numpy.greater``, for example. 58 | 59 | Note that performance is much better for the default ``axis = -1``. In such a case, 60 | reduction operations can occur in-place. This also allows to operate in constant-memory. 61 | 62 | Parameters 63 | ---------- 64 | arrays : iterable 65 | Arrays to be reduced. 66 | ufunc : numpy.ufunc 67 | Binary universal function. 68 | axis : int or None, optional 69 | Reduction axis. Default is to reduce the arrays in the stream as if 70 | they had been stacked along a new axis, then reduce along this new axis. 71 | If None, arrays are flattened before reduction. If `axis` is an int larger that 72 | the number of dimensions in the arrays of the stream, arrays are reduced 73 | along the new axis. Note that not all of NumPy Ufuncs support 74 | ``axis = None``, e.g. ``numpy.subtract``. 75 | dtype : numpy.dtype or None, optional 76 | Overrides the dtype of the calculation and output arrays. 77 | ignore_nan : bool, optional 78 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs 79 | are replaced with this identity. An error is raised if ``ufunc`` has no identity 80 | (e.g. ``numpy.maximum.identity`` is ``None``). 81 | kwargs 82 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments 83 | (e.g. ``keepdims``) are not valid for all streaming functions. Also, contrary to NumPy 84 | v. 1.10+, ``casting = 'unsafe`` is the default in npstreams. 85 | 86 | Yields 87 | ------ 88 | reduced : ndarray or scalar 89 | 90 | Raises 91 | ------ 92 | TypeError : if ``ufunc`` is not NumPy ufunc. 93 | ValueError : if ``ignore_nan`` is True but ``ufunc`` has no identity 94 | ValueError : if ``ufunc`` is not a binary ufunc 95 | ValueError : if ``ufunc`` does not have the same input type as output type 96 | """ 97 | kwargs.update({"dtype": dtype, "axis": axis}) 98 | 99 | _check_binary_ufunc(ufunc) 100 | 101 | if ignore_nan: 102 | if ufunc.identity is None: 103 | raise ValueError( 104 | f"Cannot ignore NaNs because {ufunc.__name__} has no identity value" 105 | ) 106 | # TODO: use the ``where`` keyword in ufuncs instead 107 | arrays = map(partial(nan_to_num, fill_value=ufunc.identity, copy=False), arrays) 108 | 109 | # Since ireduce_ufunc is primed, we need to wait here 110 | # Priming is a way to start error checking before actually running 111 | # any computations. 
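    # Illustrative sketch of what priming buys us: a call such as
    #     ireduce_ufunc(stream_of_arrays, np.sqrt)   # hypothetical stream
    # raises ValueError immediately at the call site (np.sqrt is not a
    # binary ufunc), rather than only once the caller first iterates over
    # the returned generator.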
112 | yield 113 | 114 | if kwargs["axis"] == -1: 115 | yield from _ireduce_ufunc_new_axis(arrays, ufunc, **kwargs) 116 | return 117 | 118 | if kwargs["axis"] is None: 119 | yield from _ireduce_ufunc_all_axes(arrays, ufunc, **kwargs) 120 | return 121 | 122 | first, arrays = peek(arrays) 123 | 124 | if kwargs["axis"] >= first.ndim: 125 | kwargs["axis"] = -1 126 | yield from ireduce_ufunc(arrays, ufunc, **kwargs) 127 | return 128 | 129 | yield from _ireduce_ufunc_existing_axis(arrays, ufunc, **kwargs) 130 | 131 | 132 | def reduce_ufunc(arrays, ufunc, axis=-1, dtype=None, ignore_nan=False, **kwargs): 133 | """ 134 | Reduce a stream using a binary NumPy ufunc. Function version of ``ireduce_ufunc``. 135 | 136 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover, 137 | for performance reasons, ufunc must have the same return types as input types. 138 | This precludes the use of ``numpy.greater``, for example. 139 | 140 | Note that performance is much better for the default ``axis = -1``. In such a case, 141 | reduction operations can occur in-place. This also allows to operate in constant-memory. 142 | 143 | Parameters 144 | ---------- 145 | arrays : iterable 146 | Arrays to be reduced. 147 | ufunc : numpy.ufunc 148 | Binary universal function. 149 | axis : int or None, optional 150 | Reduction axis. Default is to reduce the arrays in the stream as if 151 | they had been stacked along a new axis, then reduce along this new axis. 152 | If None, arrays are flattened before reduction. If `axis` is an int larger that 153 | the number of dimensions in the arrays of the stream, arrays are reduced 154 | along the new axis. Note that not all of NumPy Ufuncs support 155 | ``axis = None``, e.g. ``numpy.subtract``. 156 | dtype : numpy.dtype or None, optional 157 | Overrides the dtype of the calculation and output arrays. 158 | ignore_nan : bool, optional 159 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs 160 | are replaced with this identity. An error is raised if ``ufunc`` has no identity (e.g. ``numpy.maximum.identity`` is ``None``). 161 | kwargs 162 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments 163 | (e.g. ``keepdims``) are not valid for all streaming functions. Note that 164 | contrary to NumPy v. 1.10+, ``casting = 'unsafe`` is the default in npstreams. 165 | 166 | Returns 167 | ------- 168 | reduced : ndarray or scalar 169 | 170 | Raises 171 | ------ 172 | TypeError : if ``ufunc`` is not NumPy ufunc. 173 | ValueError : if ``ignore_nan`` is True but ``ufunc`` has no identity 174 | ValueError: if ``ufunc`` is not a binary ufunc 175 | ValueError: if ``ufunc`` does not have the same input type as output type 176 | """ 177 | return last( 178 | ireduce_ufunc( 179 | arrays, ufunc, axis=axis, dtype=dtype, ignore_nan=ignore_nan, **kwargs 180 | ) 181 | ) 182 | 183 | 184 | @array_stream 185 | def preduce_ufunc( 186 | arrays, 187 | ufunc, 188 | axis=-1, 189 | dtype=None, 190 | ignore_nan=False, 191 | processes=1, 192 | ntotal=None, 193 | **kwargs, 194 | ): 195 | """ 196 | Parallel reduction of array streams. 197 | 198 | ``ufunc`` must be a NumPy binary Ufunc (i.e. it takes two arguments). Moreover, 199 | for performance reasons, ufunc must have the same return types as input types. 200 | This precludes the use of ``numpy.greater``, for example. 201 | 202 | Parameters 203 | ---------- 204 | arrays : iterable 205 | Arrays to be reduced. 206 | ufunc : numpy.ufunc 207 | Binary universal function. 
208 | axis : int or None, optional 209 | Reduction axis. Default is to reduce the arrays in the stream as if 210 | they had been stacked along a new axis, then reduce along this new axis. 211 | If None, arrays are flattened before reduction. If `axis` is an int larger that 212 | the number of dimensions in the arrays of the stream, arrays are reduced 213 | along the new axis. Note that not all of NumPy Ufuncs support 214 | ``axis = None``, e.g. ``numpy.subtract``. 215 | dtype : numpy.dtype or None, optional 216 | Overrides the dtype of the calculation and output arrays. 217 | ignore_nan : bool, optional 218 | If True and ufunc has an identity value (e.g. ``numpy.add.identity`` is 0), then NaNs 219 | are replaced with this identity. An error is raised if ``ufunc`` has no identity (e.g. ``numpy.maximum.identity`` is ``None``). 220 | processes : int or None, optional 221 | Number of processes to use. If `None`, maximal number of processes 222 | is used. Default is 1. 223 | kwargs 224 | Keyword arguments are passed to ``ufunc``. Note that some valid ufunc keyword arguments 225 | (e.g. ``keepdims``) are not valid for all streaming functions. Also, contrary to NumPy 226 | v. 1.10+, ``casting = 'unsafe`` is the default in npstreams. 227 | """ 228 | if processes == 1: 229 | return reduce_ufunc(arrays, ufunc, axis, dtype, ignore_nan, **kwargs) 230 | 231 | kwargs.update( 232 | {"ufunc": ufunc, "ignore_nan": ignore_nan, "dtype": dtype, "axis": axis} 233 | ) 234 | reduce = partial(reduce_ufunc, **kwargs) 235 | # return preduce(reduce, arrays, processes = processes, ntotal = ntotal) 236 | 237 | with Pool(processes) as pool: 238 | chunksize = 1 239 | if ntotal is not None: 240 | chunksize = max(1, int(ntotal / pool._processes)) 241 | res = pool.imap(reduce, chunked(arrays, chunksize)) 242 | return reduce(res) 243 | 244 | 245 | def _ireduce_ufunc_new_axis(arrays, ufunc, **kwargs): 246 | """ 247 | Reduction operation for arrays, in the direction of a new axis (i.e. stacking). 248 | 249 | Parameters 250 | ---------- 251 | arrays : iterable 252 | Arrays to be reduced. 253 | ufunc : numpy.ufunc 254 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...) 255 | kwargs 256 | Keyword arguments are passed to ``ufunc``. 257 | 258 | Yields 259 | ------ 260 | reduced : ndarray 261 | """ 262 | arrays = iter(arrays) 263 | first = next(arrays) 264 | 265 | kwargs.pop("axis") 266 | 267 | dtype = kwargs.get("dtype", None) 268 | if dtype is None: 269 | dtype = first.dtype 270 | else: 271 | kwargs["casting"] = "unsafe" 272 | 273 | # If the out parameter was already given 274 | # we create the accumulator from it 275 | # Otherwise, it is a copy of the first array 276 | accumulator = kwargs.pop("out", None) 277 | if accumulator is not None: 278 | accumulator[:] = first 279 | else: 280 | accumulator = np.array(first, copy=True).astype(dtype) 281 | yield accumulator 282 | 283 | for array in arrays: 284 | ufunc(accumulator, array, out=accumulator, **kwargs) 285 | yield accumulator 286 | 287 | 288 | def _ireduce_ufunc_existing_axis(arrays, ufunc, **kwargs): 289 | """ 290 | Reduction operation for arrays, in the direction of an existing axis. 291 | 292 | Parameters 293 | ---------- 294 | arrays : iterable 295 | Arrays to be reduced. 296 | ufunc : numpy.ufunc 297 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...) 298 | kwargs 299 | Keyword arguments are passed to ``ufunc``. The ``out`` parameter is ignored. 
300 | 301 | Yields 302 | ------ 303 | reduced : ndarray 304 | """ 305 | arrays = iter(arrays) 306 | first = next(arrays) 307 | 308 | if kwargs["axis"] not in range(first.ndim): 309 | axis = kwargs["axis"] 310 | raise ValueError(f"Axis {axis} not supported on arrays of shape {first.shape}.") 311 | 312 | # Remove parameters that will not be used. 313 | kwargs.pop("out", None) 314 | 315 | dtype = kwargs.get("dtype") 316 | if dtype is None: 317 | dtype = first.dtype 318 | 319 | axis_reduce = partial(ufunc.reduce, **kwargs) 320 | 321 | accumulator = np.atleast_1d(axis_reduce(first)) 322 | yield accumulator 323 | 324 | # On the first pass of the following loop, accumulator is missing a dimensions 325 | # therefore, the stacking function cannot be 'concatenate' 326 | second = next(arrays) 327 | accumulator = np.stack([accumulator, np.atleast_1d(axis_reduce(second))], axis=-1) 328 | yield accumulator 329 | 330 | # On the second pass, the new dimensions exists, and thus we switch to 331 | # using concatenate. 332 | for array in arrays: 333 | reduced = np.expand_dims( 334 | np.atleast_1d(axis_reduce(array)), axis=accumulator.ndim - 1 335 | ) 336 | accumulator = np.concatenate([accumulator, reduced], axis=accumulator.ndim - 1) 337 | yield accumulator 338 | 339 | 340 | def _ireduce_ufunc_all_axes(arrays, ufunc, **kwargs): 341 | """ 342 | Reduction operation for arrays, over all axes. 343 | 344 | Parameters 345 | ---------- 346 | arrays : iterable 347 | Arrays to be reduced. 348 | ufunc : numpy.ufunc 349 | Binary universal function. Must have a signature of the form ufunc(x1, x2, ...) 350 | kwargs 351 | Keyword arguments are passed to ``ufunc``. The ``out`` parameter is ignored. 352 | 353 | Yields 354 | ------ 355 | reduced : scalar 356 | """ 357 | arrays = iter(arrays) 358 | first = next(arrays) 359 | 360 | kwargs.pop("out", None) 361 | 362 | kwargs["axis"] = None 363 | axis_reduce = partial(ufunc.reduce, **kwargs) 364 | 365 | accumulator = axis_reduce(first) 366 | yield accumulator 367 | 368 | for array in arrays: 369 | accumulator = axis_reduce([accumulator, axis_reduce(array)]) 370 | yield accumulator 371 | -------------------------------------------------------------------------------- /npstreams/tests/test_stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import repeat 4 | from random import randint, random, seed 5 | from warnings import catch_warnings, simplefilter 6 | import pytest 7 | 8 | import numpy as np 9 | 10 | try: 11 | from scipy.stats import sem as scipy_sem 12 | 13 | WITH_SCIPY = True 14 | except ImportError: 15 | WITH_SCIPY = False 16 | 17 | from npstreams import ( 18 | iaverage, 19 | imean, 20 | isem, 21 | istd, 22 | ivar, 23 | last, 24 | ihistogram, 25 | mean, 26 | average, 27 | sem, 28 | std, 29 | var, 30 | ) 31 | 32 | seed(23) 33 | 34 | 35 | def test_average_trivial(): 36 | """Test average() on a stream of zeroes""" 37 | stream = repeat(np.zeros((64, 64), dtype=float), times=5) 38 | for av in average(stream): 39 | assert np.allclose(av, np.zeros_like(av)) 40 | 41 | 42 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 43 | def test_average_vs_numpy(axis): 44 | """Test average vs. 
numpy.average""" 45 | stream = [np.random.random(size=(64, 64)) for _ in range(5)] 46 | stack = np.dstack(stream) 47 | 48 | from_stream = average(stream, axis=axis) 49 | from_numpy = np.average(stack, axis=axis) 50 | assert np.allclose(from_numpy, from_stream) 51 | 52 | 53 | def test_average_weighted_average(): 54 | """Test results of weighted average against numpy.average""" 55 | stream = [np.random.random(size=(16, 16)) for _ in range(5)] 56 | 57 | weights = [random() for _ in stream] 58 | from_average = average(stream, weights=weights) 59 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.array(weights)) 60 | assert np.allclose(from_average, from_numpy) 61 | 62 | weights = [np.random.random(size=stream[0].shape) for _ in stream] 63 | from_average = average(stream, weights=weights) 64 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights)) 65 | assert np.allclose(from_average, from_numpy) 66 | 67 | 68 | def test_average_ignore_nan(): 69 | """Test that NaNs are handled correctly""" 70 | stream = [np.random.random(size=(16, 12)) for _ in range(5)] 71 | for s in stream: 72 | s[randint(0, 15), randint(0, 11)] = np.nan 73 | 74 | with catch_warnings(): 75 | simplefilter("ignore") 76 | from_average = average(stream, ignore_nan=True) 77 | from_numpy = np.nanmean(np.dstack(stream), axis=2) 78 | assert np.allclose(from_average, from_numpy) 79 | 80 | 81 | def test_iaverage_trivial(): 82 | """Test iaverage on stream of zeroes""" 83 | stream = repeat(np.zeros((64, 64), dtype=float), times=5) 84 | for av in iaverage(stream): 85 | assert np.allclose(av, np.zeros_like(av)) 86 | 87 | 88 | def test_iaverage_weighted_average(): 89 | """Test results of weighted iaverage against numpy.average""" 90 | stream = [np.random.random(size=(16, 16)) for _ in range(5)] 91 | 92 | weights = [random() for _ in stream] 93 | from_iaverage = last(iaverage(stream, weights=weights)) 94 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.array(weights)) 95 | assert np.allclose(from_iaverage, from_numpy) 96 | 97 | weights = [np.random.random(size=stream[0].shape) for _ in stream] 98 | from_iaverage = last(iaverage(stream, weights=weights)) 99 | from_numpy = np.average(np.dstack(stream), axis=2, weights=np.dstack(weights)) 100 | assert np.allclose(from_iaverage, from_numpy) 101 | 102 | 103 | def test_iaverage_ignore_nan(): 104 | """Test that NaNs are handled correctly""" 105 | stream = [np.random.random(size=(16, 12)) for _ in range(5)] 106 | for s in stream: 107 | s[randint(0, 15), randint(0, 11)] = np.nan 108 | 109 | with catch_warnings(): 110 | simplefilter("ignore") 111 | from_iaverage = last(iaverage(stream, ignore_nan=True)) 112 | from_numpy = np.nanmean(np.dstack(stream), axis=2) 113 | assert np.allclose(from_iaverage, from_numpy) 114 | 115 | 116 | def test_iaverage_length(): 117 | """Test that the number of yielded elements is the same as source""" 118 | source = (np.zeros((16,)) for _ in range(5)) 119 | avg = list(iaverage(source, axis=0)) 120 | assert len(avg) == 5 121 | 122 | 123 | @pytest.mark.parametrize("dtype", (np.uint8, bool, np.int16, np.float16)) 124 | def test_iaverage_output_dtype(dtype): 125 | """Test that the yielded arrays are always floats""" 126 | source = (np.zeros((16,), dtype=dtype) for _ in range(5)) 127 | avg = last(iaverage(source)) 128 | assert avg.dtype == float 129 | 130 | 131 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 132 | def test_iaverage_output_shape(axis): 133 | """Test output shape""" 134 | source = [np.random.random((16, 12,
5)) for _ in range(10)] 135 | stack = np.stack(source, axis=-1) 136 | 137 | from_numpy = np.average(stack, axis=axis) 138 | out = last(iaverage(source, axis=axis)) 139 | assert from_numpy.shape == out.shape 140 | assert np.allclose(out, from_numpy) 141 | 142 | 143 | def test_mean_trivial(): 144 | """Test mean() on a stream of zeroes""" 145 | stream = repeat(np.zeros((64, 64), dtype=float), times=5) 146 | for av in mean(stream): 147 | assert np.allclose(av, np.zeros_like(av)) 148 | 149 | 150 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 151 | def test_mean_vs_numpy(axis): 152 | """Test mean vs. numpy.mean""" 153 | stream = [np.random.random(size=(64, 64)) for _ in range(5)] 154 | stack = np.dstack(stream) 155 | 156 | from_stream = mean(stream, axis=axis) 157 | from_numpy = np.mean(stack, axis=axis) 158 | assert np.allclose(from_numpy, from_stream) 159 | 160 | 161 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 162 | def test_mean_against_numpy_nanmean(axis): 163 | """Test results against numpy.mean""" 164 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 165 | for arr in source: 166 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 167 | stack = np.stack(source, axis=-1) 168 | 169 | from_numpy = np.nanmean(stack, axis=axis) 170 | out = mean(source, axis=axis, ignore_nan=True) 171 | assert from_numpy.shape == out.shape 172 | assert np.allclose(out, from_numpy) 173 | 174 | 175 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 176 | def test_imean_against_numpy_mean(axis): 177 | """Test results against numpy.mean""" 178 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 179 | stack = np.stack(source, axis=-1) 180 | 181 | from_numpy = np.mean(stack, axis=axis) 182 | out = last(imean(source, axis=axis)) 183 | assert from_numpy.shape == out.shape 184 | assert np.allclose(out, from_numpy) 185 | 186 | 187 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 188 | def test_imean_against_numpy_nanmean(axis): 189 | """Test results against numpy.mean""" 190 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 191 | for arr in source: 192 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 193 | stack = np.stack(source, axis=-1) 194 | 195 | from_numpy = np.nanmean(stack, axis=axis) 196 | out = last(imean(source, axis=axis, ignore_nan=True)) 197 | assert from_numpy.shape == out.shape 198 | assert np.allclose(out, from_numpy) 199 | 200 | 201 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 202 | def test_var_vs_numpy(axis): 203 | """Test that the axis parameter is handled correctly""" 204 | stream = [np.random.random((16, 7, 3)) for _ in range(5)] 205 | stack = np.stack(stream, axis=-1) 206 | 207 | from_numpy = np.var(stack, axis=axis) 208 | from_var = var(stream, axis=axis) 209 | assert from_numpy.shape == from_var.shape 210 | assert np.allclose(from_var, from_numpy) 211 | 212 | 213 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 214 | @pytest.mark.parametrize("ddof", range(4)) 215 | def test_var_ddof(axis, ddof): 216 | """Test that the ddof parameter is equivalent to numpy's""" 217 | stream = [np.random.random((16, 7, 3)) for _ in range(10)] 218 | stack = np.stack(stream, axis=-1) 219 | 220 | with catch_warnings(): 221 | simplefilter("ignore") 222 | 223 | from_numpy = np.var(stack, axis=axis, ddof=ddof) 224 | from_var = var(stream, axis=axis, ddof=ddof) 225 | assert from_numpy.shape == from_var.shape 226 | assert np.allclose(from_var, from_numpy) 227 | 228 | 229 | def test_ivar_first(): 230 | """Test that the first yielded value of 
ivar is an array of zeros""" 231 | stream = repeat(np.random.random(size=(64, 64)), times=5) 232 | first = next(ivar(stream)) 233 | 234 | assert np.allclose(first, np.zeros_like(first)) 235 | 236 | 237 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 238 | def test_ivar_output_shape(axis): 239 | """Test that the axis parameter is handled correctly""" 240 | stream = [np.random.random((16, 7, 3)) for _ in range(5)] 241 | stack = np.stack(stream, axis=-1) 242 | 243 | from_numpy = np.var(stack, axis=axis) 244 | from_ivar = last(ivar(stream, axis=axis)) 245 | assert from_numpy.shape == from_ivar.shape 246 | assert np.allclose(from_ivar, from_numpy) 247 | 248 | 249 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 250 | @pytest.mark.parametrize("ddof", range(4)) 251 | def test_ivar_ddof(axis, ddof): 252 | """Test that the ddof parameter is equivalent to numpy's""" 253 | stream = [np.random.random((16, 7, 3)) for _ in range(10)] 254 | stack = np.stack(stream, axis=-1) 255 | 256 | with catch_warnings(): 257 | simplefilter("ignore") 258 | 259 | from_numpy = np.var(stack, axis=axis, ddof=ddof) 260 | from_ivar = last(ivar(stream, axis=axis, ddof=ddof)) 261 | assert from_numpy.shape == from_ivar.shape 262 | assert np.allclose(from_ivar, from_numpy) 263 | 264 | 265 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 266 | @pytest.mark.parametrize("ddof", range(4)) 267 | def test_std_against_numpy_std(axis, ddof): 268 | stream = [np.random.random((16, 7, 3)) for _ in range(10)] 269 | stack = np.stack(stream, axis=-1) 270 | 271 | with catch_warnings(): 272 | simplefilter("ignore") 273 | 274 | from_numpy = np.std(stack, axis=axis, ddof=ddof) 275 | from_ivar = std(stream, axis=axis, ddof=ddof) 276 | assert from_numpy.shape == from_ivar.shape 277 | assert np.allclose(from_ivar, from_numpy) 278 | 279 | 280 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 281 | @pytest.mark.parametrize("ddof", range(4)) 282 | def test_std_against_numpy_nanstd(axis, ddof): 283 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 284 | for arr in source: 285 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 286 | stack = np.stack(source, axis=-1) 287 | 288 | from_numpy = np.nanstd(stack, axis=axis, ddof=ddof) 289 | from_ivar = std(source, axis=axis, ddof=ddof, ignore_nan=True) 290 | assert from_numpy.shape == from_ivar.shape 291 | assert np.allclose(from_ivar, from_numpy) 292 | 293 | 294 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 295 | @pytest.mark.parametrize("ddof", range(4)) 296 | def test_istd_against_numpy_std(axis, ddof): 297 | stream = [np.random.random((16, 7, 3)) for _ in range(10)] 298 | stack = np.stack(stream, axis=-1) 299 | 300 | with catch_warnings(): 301 | simplefilter("ignore") 302 | 303 | from_numpy = np.std(stack, axis=axis, ddof=ddof) 304 | from_ivar = last(istd(stream, axis=axis, ddof=ddof)) 305 | assert from_numpy.shape == from_ivar.shape 306 | assert np.allclose(from_ivar, from_numpy) 307 | 308 | 309 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 310 | @pytest.mark.parametrize("ddof", range(4)) 311 | def test_istd_against_numpy_nanstd(axis, ddof): 312 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 313 | for arr in source: 314 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 315 | stack = np.stack(source, axis=-1) 316 | 317 | from_numpy = np.nanstd(stack, axis=axis, ddof=ddof) 318 | from_ivar = last(istd(source, axis=axis, ddof=ddof, ignore_nan=True)) 319 | assert from_numpy.shape == from_ivar.shape 320 | assert np.allclose(from_ivar,
from_numpy) 321 | 322 | 323 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable") 324 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 325 | @pytest.mark.parametrize("ddof", range(4)) 326 | def test_sem_against_scipy_no_nans(axis, ddof): 327 | """Test that isem outputs the same as scipy.stats.sem""" 328 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 329 | stack = np.stack(source, axis=-1) 330 | 331 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof) 332 | from_isem = sem(source, axis=axis, ddof=ddof) 333 | assert from_scipy.shape == from_isem.shape 334 | assert np.allclose(from_isem, from_scipy) 335 | 336 | 337 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable") 338 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 339 | @pytest.mark.parametrize("ddof", range(4)) 340 | def test_sem_against_scipy_with_nans(axis, ddof): 341 | """Test that isem outputs the same as scipy.stats.sem when NaNs are ignored.""" 342 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 343 | for arr in source: 344 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 345 | stack = np.stack(source, axis=-1) 346 | 347 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof, nan_policy="omit") 348 | from_isem = sem(source, axis=axis, ddof=ddof, ignore_nan=True) 349 | assert from_scipy.shape == from_isem.shape 350 | assert np.allclose(from_isem, from_scipy) 351 | 352 | 353 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable") 354 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 355 | @pytest.mark.parametrize("ddof", range(4)) 356 | def test_isem_against_scipy_no_nans(axis, ddof): 357 | """Test that isem outputs the same as scipy.stats.sem""" 358 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 359 | stack = np.stack(source, axis=-1) 360 | 361 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof) 362 | from_isem = last(isem(source, axis=axis, ddof=ddof)) 363 | assert from_scipy.shape == from_isem.shape 364 | assert np.allclose(from_isem, from_scipy) 365 | 366 | 367 | @pytest.mark.skipif(not WITH_SCIPY, reason="SciPy is not installed/importable") 368 | @pytest.mark.parametrize("axis", (0, 1, 2, None)) 369 | @pytest.mark.parametrize("ddof", range(4)) 370 | def test_isem_against_scipy_with_nans(axis, ddof): 371 | """Test that isem outputs the same as scipy.stats.sem when NaNs are ignored.""" 372 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 373 | for arr in source: 374 | arr[randint(0, 15), randint(0, 11), randint(0, 4)] = np.nan 375 | stack = np.stack(source, axis=-1) 376 | 377 | from_scipy = scipy_sem(stack, axis=axis, ddof=ddof, nan_policy="omit") 378 | from_isem = last(isem(source, axis=axis, ddof=ddof, ignore_nan=True)) 379 | assert from_scipy.shape == from_isem.shape 380 | assert np.allclose(from_isem, from_scipy) 381 | 382 | 383 | def test_ihistogram_against_numpy_no_weights(): 384 | """Test ihistogram against numpy.histogram with no weights""" 385 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 386 | stack = np.stack(source, axis=-1) 387 | 388 | bins = np.linspace(0, 1, num=10) 389 | from_numpy = np.histogram(stack, bins=bins)[0] 390 | from_ihistogram = last(ihistogram(source, bins=bins)) 391 | 392 | # Since histogram output is int, cannot use allclose 393 | assert np.all(np.equal(from_numpy, from_ihistogram)) 394 | 395 | 396 | def test_ihistogram_trivial_weights(): 397 | """Test ihistogram with weights being all 1s vs. 
weights=None""" 398 | source = [np.random.random((16, 12, 5)) for _ in range(10)] 399 | weights = [np.array([1]) for _ in source] 400 | 401 | bins = np.linspace(0, 1, num=10) 402 | none_weights = last(ihistogram(source, bins=bins, weights=None)) 403 | trivial_weights = last(ihistogram(source, bins=bins, weights=weights)) 404 | 405 | assert np.all(np.equal(none_weights, trivial_weights)) 406 | -------------------------------------------------------------------------------- /npstreams/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Statistical functions 4 | --------------------- 5 | """ 6 | from functools import partial 7 | from itertools import count, repeat, starmap 8 | from operator import truediv 9 | from warnings import catch_warnings, simplefilter 10 | 11 | import numpy as np 12 | 13 | from .array_stream import array_stream 14 | from .array_utils import nan_to_num 15 | from .iter_utils import itercopy, last, peek 16 | from .numerics import isum 17 | 18 | 19 | @array_stream 20 | def _iaverage(arrays, axis=-1, weights=None, ignore_nan=False): 21 | """ 22 | Primitive version of weighted averaging that yields the running sum and running weights sum, 23 | but avoids the costly division at every step. 24 | """ 25 | # Special case: in the easiest case, no need to calculate 26 | # weights and ignore nans. 27 | # This case is pretty common 28 | if (weights is None) and (not ignore_nan) and (axis == -1): 29 | yield from zip(isum(arrays, axis=axis, dtype=float, ignore_nan=False), count(1)) 30 | return 31 | 32 | first, arrays = peek(arrays) 33 | 34 | # We make sure that weights is always an array 35 | # This simplifies the handling of NaNs. 36 | if weights is None: 37 | weights = repeat(1) 38 | weights = map(partial(np.broadcast_to, shape=first.shape), weights) 39 | 40 | # Need to know which array has NaNs, and modify the weights stream accordingly 41 | if ignore_nan: 42 | arrays, arrays2 = itercopy(arrays) 43 | weights = map( 44 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights 45 | ) 46 | 47 | weights1, weights2 = itercopy(weights) 48 | 49 | sum_of_weights = isum(weights1, axis=axis, dtype=float) 50 | weighted_arrays = map(lambda arr, wgt: arr * wgt, arrays, weights2) 51 | weighted_sum = isum(weighted_arrays, axis=axis, ignore_nan=ignore_nan, dtype=float) 52 | 53 | yield from zip(weighted_sum, sum_of_weights) 54 | 55 | 56 | @array_stream 57 | def average(arrays, axis=-1, weights=None, ignore_nan=False): 58 | """ 59 | Average (weighted) of a stream of arrays. This function consumes the 60 | entire stream. 61 | 62 | Parameters 63 | ---------- 64 | arrays : iterable of ndarrays 65 | Arrays to be averaged. This iterable can also a generator. 66 | axis : int, optional 67 | Reduction axis. Default is to average the arrays in the stream as if 68 | they had been stacked along a new axis, then average along this new axis. 69 | If None, arrays are flattened before averaging. If `axis` is an int larger that 70 | the number of dimensions in the arrays of the stream, arrays are averaged 71 | along the new axis. 72 | weights : iterable of ndarray, iterable of floats, or None, optional 73 | Iterable of weights associated with the values in each item of `arrays`. 74 | Each value in an element of `arrays` contributes to the average 75 | according to its associated weight. The weights array can either be a float 76 | or an array of the same shape as any element of `arrays`. 
If ``weights=None``, 77 | then all data in each element of `arrays` are assumed to have a weight equal to one. 78 | ignore_nan : bool, optional 79 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 80 | 81 | Returns 82 | ------- 83 | avg: `~numpy.ndarray`, dtype float 84 | Weighted average. 85 | 86 | See Also 87 | -------- 88 | iaverage : streaming (weighted) average. 89 | numpy.average : (weighted) average of dense arrays 90 | mean : non-weighted average of a stream. 91 | """ 92 | total_sum, total_weight = last(_iaverage(arrays, axis, weights, ignore_nan)) 93 | with catch_warnings(): 94 | simplefilter("ignore", category=RuntimeWarning) 95 | return np.true_divide(total_sum, total_weight) 96 | 97 | 98 | @array_stream 99 | def iaverage(arrays, axis=-1, weights=None, ignore_nan=False): 100 | """ 101 | Streaming (weighted) average of arrays. 102 | 103 | Parameters 104 | ---------- 105 | arrays : iterable of ndarrays 106 | Arrays to be averaged. This iterable can also be a generator. 107 | axis : int, optional 108 | Reduction axis. Default is to average the arrays in the stream as if 109 | they had been stacked along a new axis, then average along this new axis. 110 | If None, arrays are flattened before averaging. If `axis` is an int larger than 111 | the number of dimensions in the arrays of the stream, arrays are averaged 112 | along the new axis. 113 | weights : iterable of ndarray, iterable of floats, or None, optional 114 | Iterable of weights associated with the values in each item of `arrays`. 115 | Each value in an element of `arrays` contributes to the average 116 | according to its associated weight. The weights array can either be a float 117 | or an array of the same shape as any element of `arrays`. If weights=None, 118 | then all data in each element of `arrays` are assumed to have a weight equal to one. 119 | ignore_nan : bool, optional 120 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 121 | 122 | Yields 123 | ------ 124 | avg: `~numpy.ndarray`, dtype float 125 | Weighted average. 126 | 127 | See Also 128 | -------- 129 | imean : streaming array mean (non-weighted average). 130 | """ 131 | # Primitive stream is composed of tuples (running_sum, running_weights) 132 | primitive = _iaverage(arrays, axis, weights, ignore_nan) 133 | yield from map(lambda element: truediv(*element), primitive) 134 | 135 | 136 | @array_stream 137 | def mean(arrays, axis=-1, ignore_nan=False): 138 | """ 139 | Mean of a stream of arrays. This function consumes the 140 | entire stream. 141 | 142 | Parameters 143 | ---------- 144 | arrays : iterable of ndarrays 145 | Arrays to be averaged. This iterable can also be a generator. 146 | axis : int, optional 147 | Reduction axis. Default is to average the arrays in the stream as if 148 | they had been stacked along a new axis, then average along this new axis. 149 | If None, arrays are flattened before averaging. If `axis` is an int larger than 150 | the number of dimensions in the arrays of the stream, arrays are averaged 151 | along the new axis. 152 | ignore_nan : bool, optional 153 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 154 | 155 | Returns 156 | ------- 157 | mean: `~numpy.ndarray`, dtype float 158 | Total mean array.
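
Examples
--------
A small usage sketch: the streaming mean matches ``numpy.mean`` applied to
the equivalent dense stack:

>>> import numpy as np
>>> from npstreams import mean
>>> stream = [np.full((4, 4), fill_value=float(i)) for i in range(5)]
>>> bool(np.allclose(mean(stream), np.mean(np.stack(stream, axis=-1), axis=-1)))
True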
159 | """ 160 | total_sum, total_count = last( 161 | _iaverage(arrays, axis, weights=None, ignore_nan=ignore_nan) 162 | ) 163 | return total_sum / total_count 164 | 165 | 166 | @array_stream 167 | def imean(arrays, axis=-1, ignore_nan=False): 168 | """ 169 | Streaming mean of arrays. Equivalent to `iaverage(arrays, weights=None)`. 170 | 171 | Parameters 172 | ---------- 173 | arrays : iterable of ndarrays 174 | Arrays to be averaged. This iterable can also be a generator. 175 | axis : int, optional 176 | Reduction axis. Default is to average the arrays in the stream as if 177 | they had been stacked along a new axis, then average along this new axis. 178 | If None, arrays are flattened before averaging. If `axis` is an int larger than 179 | the number of dimensions in the arrays of the stream, arrays are averaged 180 | along the new axis. 181 | ignore_nan : bool, optional 182 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 183 | 184 | Yields 185 | ------ 186 | mean: `~numpy.ndarray`, dtype float 187 | Online mean array. 188 | """ 189 | # Primitive stream is composed of tuples (running_sum, running_count) 190 | primitive = _iaverage(arrays, axis, weights=None, ignore_nan=ignore_nan) 191 | yield from map(lambda element: truediv(*element), primitive) 192 | 193 | 194 | @array_stream 195 | def _ivar(arrays, axis=-1, weights=None, ignore_nan=False): 196 | """ 197 | Primitive version of weighted variance that yields the running average, running average of squares, 198 | and running sum of weights, deferring the final variance computation to the caller. 199 | """ 200 | first, arrays = peek(arrays) 201 | 202 | # We make sure that weights is always an array 203 | # This simplifies the handling of NaNs. 204 | if weights is None: 205 | weights = repeat(1) 206 | weights = map(partial(np.broadcast_to, shape=first.shape), weights) 207 | 208 | # Need to know which array has NaNs, and modify the weights stream accordingly 209 | if ignore_nan: 210 | arrays, arrays2 = itercopy(arrays) 211 | weights = map( 212 | lambda arr, wgt: np.logical_not(np.isnan(arr)) * wgt, arrays2, weights 213 | ) 214 | 215 | arrays, arrays2 = itercopy(arrays) 216 | weights, weights2, weights3 = itercopy(weights, 3) 217 | 218 | avgs = iaverage(arrays, axis=axis, weights=weights, ignore_nan=ignore_nan) 219 | avg_of_squares = iaverage( 220 | map(np.square, arrays2), axis=axis, weights=weights2, ignore_nan=ignore_nan 221 | ) 222 | sum_of_weights = isum(weights3, axis=axis, ignore_nan=ignore_nan) 223 | 224 | yield from zip(avgs, avg_of_squares, sum_of_weights) 225 | 226 | 227 | @array_stream 228 | def average_and_var(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 229 | """ 230 | Calculate the simultaneous average and variance of a stream of arrays. This is done in 231 | a single iteration for maximum performance. 232 | 233 | .. versionadded:: 1.6.1 234 | 235 | Parameters 236 | ---------- 237 | arrays : iterable of ndarrays 238 | Arrays to be combined. This iterable can also be a generator. 239 | axis : int, optional 240 | Reduction axis. Default is to combine the arrays in the stream as if 241 | they had been stacked along a new axis, then compute the variance along this new axis. 242 | If None, arrays are flattened. If `axis` is an int larger than 243 | the number of dimensions in the arrays of the stream, variance is computed 244 | along the new axis. 245 | ddof : int, optional 246 | Means Delta Degrees of Freedom. The divisor used in calculations 247 | is ``N - ddof``, where ``N`` represents the number of elements. 248 | weights : iterable of ndarray, iterable of floats, or None, optional 249 | Iterable of weights associated with the values in each item of `arrays`. 250 | Each value in an element of `arrays` contributes to the variance 251 | according to its associated weight. Each weight can be either a float 252 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 253 | then all data in each element of `arrays` are assumed to have a weight equal to one. 254 | ignore_nan : bool, optional 255 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 256 | 257 | Returns 258 | ------- 259 | average : `~numpy.ndarray` 260 | Average, possibly weighted. 261 | var: `~numpy.ndarray` 262 | Variance, possibly weighted. 263 | 264 | Notes 265 | ----- 266 | Since the calculation of the variance requires knowledge of the average, computing both 267 | at once costs no more than computing the variance alone; `var` is in fact a thin wrapper around this function. 268 | 269 | References 270 | ---------- 271 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method. 272 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979)
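Examples
--------
A short sketch (top-level import assumed); both statistics come out of a single pass over the stream:

>>> import numpy as np
>>> from npstreams import average_and_var
>>> stream = [np.array([1.0]), np.array([2.0]), np.array([3.0]), np.array([4.0])]
>>> avg, variance = average_and_var(stream)
>>> avg
array([2.5])
>>> variance
array([1.25])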
273 | """ 274 | # Since the variance calculation requires knowing the average, 275 | # `average_and_var` runs in the exact same time as `var` 276 | avg, sq_avg, swgt = last( 277 | _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan) 278 | ) 279 | variance = (sq_avg - avg**2) * (swgt / (swgt - ddof)) 280 | return avg, variance 281 | 282 | 283 | @array_stream 284 | def var(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 285 | """ 286 | Total variance of a stream of arrays. Weights are also supported. This function 287 | consumes the input stream. 288 | 289 | Parameters 290 | ---------- 291 | arrays : iterable of ndarrays 292 | Arrays to be combined. This iterable can also be a generator. 293 | axis : int, optional 294 | Reduction axis. Default is to combine the arrays in the stream as if 295 | they had been stacked along a new axis, then compute the variance along this new axis. 296 | If None, arrays are flattened. If `axis` is an int larger than 297 | the number of dimensions in the arrays of the stream, variance is computed 298 | along the new axis. 299 | ddof : int, optional 300 | Means Delta Degrees of Freedom. The divisor used in calculations 301 | is ``N - ddof``, where ``N`` represents the number of elements. 302 | weights : iterable of ndarray, iterable of floats, or None, optional 303 | Iterable of weights associated with the values in each item of `arrays`. 304 | Each value in an element of `arrays` contributes to the variance 305 | according to its associated weight. Each weight can be either a float 306 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 307 | then all data in each element of `arrays` are assumed to have a weight equal to one. 308 | ignore_nan : bool, optional 309 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 310 | 311 | Returns 312 | ------- 313 | var: `~numpy.ndarray` 314 | Variance. 315 | 316 | See Also 317 | -------- 318 | ivar : streaming variance 319 | numpy.var : variance calculation for dense arrays. Weights are not supported. 320 | 321 | References 322 | ---------- 323 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method. 324 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979)
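Examples
--------
A short sketch showing the effect of `ddof` (top-level import assumed):

>>> import numpy as np
>>> from npstreams import var
>>> stream = [np.array([1.0]), np.array([2.0]), np.array([3.0]), np.array([4.0])]
>>> var(stream)          # biased estimator (ddof=0)
array([1.25])
>>> var(stream, ddof=1)  # unbiased estimator
array([1.66666667])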
325 | """ 326 | _, variance = average_and_var( 327 | arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan 328 | ) 329 | return variance 330 | 331 | 332 | @array_stream 333 | def ivar(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 334 | """ 335 | Streaming variance of arrays. Weights are also supported. 336 | 337 | Parameters 338 | ---------- 339 | arrays : iterable of ndarrays 340 | Arrays to be combined. This iterable can also be a generator. 341 | axis : int, optional 342 | Reduction axis. Default is to combine the arrays in the stream as if 343 | they had been stacked along a new axis, then compute the variance along this new axis. 344 | If None, arrays are flattened. If `axis` is an int larger than 345 | the number of dimensions in the arrays of the stream, variance is computed 346 | along the new axis. 347 | ddof : int, optional 348 | Means Delta Degrees of Freedom. The divisor used in calculations 349 | is ``N - ddof``, where ``N`` represents the number of elements. 350 | weights : iterable of ndarray, iterable of floats, or None, optional 351 | Iterable of weights associated with the values in each item of `arrays`. 352 | Each value in an element of `arrays` contributes to the variance 353 | according to its associated weight. Each weight can be either a float 354 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 355 | then all data in each element of `arrays` are assumed to have a weight equal to one. 356 | ignore_nan : bool, optional 357 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 358 | 359 | Yields 360 | ------ 361 | var: `~numpy.ndarray` 362 | Variance. 363 | 364 | See Also 365 | -------- 366 | numpy.var : variance calculation for dense arrays. Weights are not supported. 367 | 368 | References 369 | ---------- 370 | .. [#] D. H. D. West, Updating the mean and variance estimates: an improved method. 371 | Communications of the ACM Vol. 22, Issue 9, pp. 532 - 535 (1979) 372 | """ 373 | primitive = _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan) 374 | for avg, sq_avg, swgt in primitive: 375 | yield (sq_avg - avg**2) * (swgt / (swgt - ddof)) 376 | 377 | 378 | @array_stream 379 | def std(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 380 | """ 381 | Total standard deviation of arrays. Weights are also supported. This function 382 | consumes the input stream. 383 | 384 | Parameters 385 | ---------- 386 | arrays : iterable of ndarrays 387 | Arrays to be combined. This iterable can also be a generator. 388 | axis : int, optional 389 | Reduction axis. Default is to combine the arrays in the stream as if 390 | they had been stacked along a new axis, then compute the standard deviation along this new axis. 391 | If None, arrays are flattened. If `axis` is an int larger than 392 | the number of dimensions in the arrays of the stream, standard deviation is computed 393 | along the new axis. 394 | ddof : int, optional 395 | Means Delta Degrees of Freedom. The divisor used in calculations 396 | is ``N - ddof``, where ``N`` represents the number of elements. 397 | weights : iterable of ndarray, iterable of floats, or None, optional 398 | Iterable of weights associated with the values in each item of `arrays`. 399 | Each value in an element of `arrays` contributes to the standard deviation 400 | according to its associated weight. Each weight can be either a float 401 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 402 | then all data in each element of `arrays` are assumed to have a weight equal to one. 403 | ignore_nan : bool, optional 404 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 405 | 406 | Returns 407 | ------- 408 | std: `~numpy.ndarray` 409 | Standard deviation. 410 | 411 | See Also 412 | -------- 413 | istd : streaming standard deviation. 414 | numpy.std : standard deviation calculation of dense arrays. Weights are not supported.
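Examples
--------
A short sketch (top-level import assumed):

>>> import numpy as np
>>> from npstreams import std
>>> stream = [np.array([float(x)]) for x in (2, 4, 4, 4, 5, 5, 7, 9)]
>>> std(stream)
array([2.])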
415 | """ 416 | return np.sqrt( 417 | var(arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan) 418 | ) 419 | 420 | 421 | @array_stream 422 | def istd(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 423 | """ 424 | Streaming standard deviation of arrays. Weights are also supported. 425 | This is equivalent to calling `numpy.std(axis = 2)` on a stack of images. 426 | 427 | Parameters 428 | ---------- 429 | arrays : iterable of ndarrays 430 | Arrays to be combined. This iterable can also be a generator. 431 | axis : int, optional 432 | Reduction axis. Default is to combine the arrays in the stream as if 433 | they had been stacked along a new axis, then compute the standard deviation along this new axis. 434 | If None, arrays are flattened. If `axis` is an int larger than 435 | the number of dimensions in the arrays of the stream, standard deviation is computed 436 | along the new axis. 437 | ddof : int, optional 438 | Means Delta Degrees of Freedom. The divisor used in calculations 439 | is ``N - ddof``, where ``N`` represents the number of elements. 440 | weights : iterable of ndarray, iterable of floats, or None, optional 441 | Iterable of weights associated with the values in each item of `arrays`. 442 | Each value in an element of `arrays` contributes to the standard deviation 443 | according to its associated weight. Each weight can be either a float 444 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 445 | then all data in each element of `arrays` are assumed to have a weight equal to one. 446 | ignore_nan : bool, optional 447 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 448 | 449 | Yields 450 | ------ 451 | std: `~numpy.ndarray` 452 | Standard deviation. 453 | 454 | See Also 455 | -------- 456 | std : total standard deviation. 457 | numpy.std : standard deviation calculation of dense arrays. Weights are not supported. 458 | """ 459 | yield from map( 460 | np.sqrt, 461 | ivar( 462 | arrays=arrays, axis=axis, ddof=ddof, weights=weights, ignore_nan=ignore_nan 463 | ), 464 | ) 465 | 466 | 467 | @array_stream 468 | def sem(arrays, axis=-1, ddof=0, weights=None, ignore_nan=False): 469 | """ 470 | Standard error in the mean (SEM) of a stream of arrays. This function consumes 471 | the entire stream. 472 | 473 | Parameters 474 | ---------- 475 | arrays : iterable of ndarrays 476 | Arrays to be combined. This iterable can also be a generator. 477 | axis : int, optional 478 | Reduction axis. Default is to combine the arrays in the stream as if 479 | they had been stacked along a new axis, then compute the standard error along this new axis. 480 | If None, arrays are flattened. If `axis` is an int larger than 481 | the number of dimensions in the arrays of the stream, standard error is computed 482 | along the new axis. 483 | ddof : int, optional 484 | Means Delta Degrees of Freedom. The divisor used in calculations 485 | is ``N - ddof``, where ``N`` represents the number of elements. 486 | weights : iterable of ndarray, iterable of floats, or None, optional 487 | Iterable of weights associated with the values in each item of `arrays`. 488 | Each value in an element of `arrays` contributes to the standard error 489 | according to its associated weight. Each weight can be either a float 490 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 491 | then all data in each element of `arrays` are assumed to have a weight equal to one. 492 | ignore_nan : bool, optional 493 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 494 | 495 | Returns 496 | ------- 497 | sem: `~numpy.ndarray`, dtype float 498 | Standard error in the mean. 499 | 500 | See Also 501 | -------- 502 | scipy.stats.sem : standard error in the mean of dense arrays.
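Examples
--------
A short sketch (top-level import assumed). `ddof` is passed explicitly
because conventions differ; note that ``scipy.stats.sem`` defaults to ``ddof=1``:

>>> import numpy as np
>>> from npstreams import sem
>>> stream = [np.array([float(x)]) for x in (2, 4, 4, 4, 5, 5, 7, 9)]
>>> sem(stream, ddof=0)  # standard deviation (here 2.0) divided by sqrt(N)
array([0.70710678])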
503 | """ 504 | avg, sq_avg, swgt = last( 505 | _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan) 506 | ) 507 | return np.sqrt((sq_avg - avg**2) * (1 / (swgt - ddof))) 508 | 509 | 510 | @array_stream 511 | def isem(arrays, axis=-1, ddof=1, weights=None, ignore_nan=False): 512 | """ 513 | Streaming standard error in the mean (SEM) of arrays. This is equivalent to 514 | calling `scipy.stats.sem(axis = 2)` on a stack of images. 515 | 516 | Parameters 517 | ---------- 518 | arrays : iterable of ndarrays 519 | Arrays to be combined. This iterable can also be a generator. 520 | axis : int, optional 521 | Reduction axis. Default is to combine the arrays in the stream as if 522 | they had been stacked along a new axis, then compute the standard error along this new axis. 523 | If None, arrays are flattened. If `axis` is an int larger than 524 | the number of dimensions in the arrays of the stream, standard error is computed 525 | along the new axis. 526 | ddof : int, optional 527 | Means Delta Degrees of Freedom. The divisor used in calculations 528 | is ``N - ddof``, where ``N`` represents the number of elements. 529 | weights : iterable of ndarray, iterable of floats, or None, optional 530 | Iterable of weights associated with the values in each item of `arrays`. 531 | Each value in an element of `arrays` contributes to the standard error 532 | according to its associated weight. Each weight can be either a float 533 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 534 | then all data in each element of `arrays` are assumed to have a weight equal to one. 535 | ignore_nan : bool, optional 536 | If True, NaNs are set to zero weight. Default is propagation of NaNs. 537 | 538 | Yields 539 | ------ 540 | sem: `~numpy.ndarray`, dtype float 541 | Standard error in the mean. 542 | 543 | See Also 544 | -------- 545 | scipy.stats.sem : standard error in the mean of dense arrays. 546 | """ 547 | primitive = _ivar(arrays=arrays, axis=axis, weights=weights, ignore_nan=ignore_nan) 548 | for avg, sq_avg, swgt in primitive: 549 | yield np.sqrt((sq_avg - avg**2) * (1 / (swgt - ddof))) 550 | 551 | 552 | @array_stream 553 | def ihistogram(arrays, bins, range=None, weights=None): 554 | """ 555 | Streaming histogram calculation. 556 | 557 | Parameters 558 | ---------- 559 | arrays : iterable of ndarrays 560 | Arrays to be combined. This iterable can also be a generator. Arrays in this stream 561 | can be of any shape; the histogram is computed over the flattened array. 562 | bins : iterable 563 | Bin edges, including the rightmost edge, allowing for non-uniform bin widths. 564 | To determine the appropriate bins automatically, see ``numpy.histogram_bin_edges``. 565 | weights : iterable of ndarray, iterable of floats, or None, optional 566 | Iterable of weights associated with the values in each item of `arrays`. 567 | Each value in an element of `arrays` only contributes its associated weight towards the 568 | bin count (instead of 1). Each weight can be either a float 569 | or an array of the same shape as any element of `arrays`. If ``weights=None``, 570 | then all data in each element of `arrays` are assumed to have a weight equal to one. 571 | 572 | .. versionadded:: 1.6.1 573 | 574 | Yields 575 | ------ 576 | hist : `~numpy.ndarray` 577 | Streamed histogram. 578 | 579 | See Also 580 | -------- 581 | numpy.histogram : 1D histogram of dense arrays. 582 | numpy.histogram_bin_edges : automatic selection of bins
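Examples
--------
A minimal sketch, mirroring the test suite above; ``ihistogram`` and ``last``
are assumed to be importable from the top-level ``npstreams`` namespace:

>>> import numpy as np
>>> from npstreams import ihistogram, last
>>> stream = [np.array([0.0, 1.0]), np.array([2.0, 3.0])]
>>> hist = last(ihistogram(stream, bins=[0, 1, 2, 3, 4]))  # cumulative over the whole stream
>>> np.array_equal(hist, [1, 1, 1, 1])
True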
583 | """ 584 | bins = np.asarray(bins) 585 | first, arrays = peek(arrays) 586 | 587 | if weights is None: 588 | weights = repeat(None) 589 | else: 590 | weights = map(partial(np.broadcast_to, shape=first.shape), weights) 591 | 592 | # np.histogram also returns the bin edges, which we ignore 593 | hist_func = lambda arr, wgt: np.histogram(arr, bins=bins, weights=wgt)[0] 594 | yield from isum(starmap(hist_func, zip(arrays, weights))) 595 | --------------------------------------------------------------------------------