├── .gitignore ├── .travis.yml ├── Changelog ├── LICENSE ├── MANIFEST.in ├── README.rst ├── appveyor.yml ├── datarray ├── LICENSE ├── __init__.py ├── datarray.py ├── print_grid.py ├── testing │ ├── __init__.py │ ├── testlib.py │ ├── tests │ │ └── test_utils.py │ └── utils.py ├── tests │ ├── __init__.py │ ├── test_bugfixes.py │ ├── test_data_array.py │ └── test_print.py └── version.py ├── doc ├── Makefile ├── README.txt ├── devel │ └── make_release.rst ├── doc-requirements.txt └── source │ ├── basic_data_array.rst │ ├── conf.py │ ├── design │ ├── array_axes.svg │ ├── design.rst │ ├── index.rst │ └── issues.rst │ ├── generated │ └── .gitignore │ ├── index.rst │ ├── license.rst │ ├── licenses │ ├── numpydoc_license.rst │ └── pandas_license.rst │ ├── ndarray_methods.rst │ ├── other_projects │ ├── index.rst │ ├── larry_overview.rst │ └── pandas_overview.rst │ └── printing.rst ├── examples └── inference_algs.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tools └── release.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[oc] 2 | *.so 3 | __pycache__/ 4 | # setup.py working directory 5 | build 6 | # setup.py dist directory 7 | dist/ 8 | # Documentation build files 9 | doc/build 10 | # Editor temporary/working/backup files 11 | *$ 12 | .*.sw[nop] 13 | .sw[nop] 14 | *~ 15 | [#]*# 16 | .#* 17 | *.bak 18 | *.tmp 19 | *.tgz 20 | *.rej 21 | *.org 22 | .project 23 | *.diff 24 | .settings/ 25 | # Egg metadata 26 | ./*.egg-info 27 | # The shelf plugin uses this dir 28 | ./.shelf 29 | # Mac droppings 30 | .DS_Store 31 | 32 | # Build products 33 | MANIFEST 34 | *.egg-info/ 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | directories: 5 | - $HOME/.cache/pip 6 | 7 | env: 8 | global: 9 | - DEPENDS="numpy" 10 | - INSTALL_TYPE="setup" 11 | - MANYLINUX_URL=https://nipy.bic.berkeley.edu/manylinux 12 | 13 | python: 14 | - 3.3 15 | - 3.4 16 | - 3.5 17 | 18 | matrix: 19 | include: 20 | - python: 2.6 21 | env: 22 | # Last networkx version that runs on 2.6 23 | - NETWORKX_VER_SPEC="==1.9" 24 | - python: 2.7 25 | env: 26 | - COVERAGE=1 27 | - DOCTESTS=1 28 | - DOC_DOCTEST=1 29 | # Absolute minimum dependencies 30 | - python: 2.7 31 | env: 32 | # Check numpy minimum version in datarray/version.py 33 | - DEPENDS="numpy==1.7.0" 34 | - python: 2.7 35 | env: 36 | - INSTALL_TYPE=sdist 37 | - python: 2.7 38 | env: 39 | - INSTALL_TYPE=wheel 40 | - python: 2.7 41 | env: 42 | - INSTALL_TYPE=requirements 43 | # test against pre-release builds 44 | - python: 3.5 45 | env: 46 | - EXTRA_PIP_FLAGS="--pre" 47 | 48 | before_install: 49 | - virtualenv --python=python venv 50 | - source venv/bin/activate 51 | - pip install -U pip wheel 52 | - pip install -f $MANYLINUX_URL $EXTRA_PIP_FLAGS $DEPENDS 53 | - if [ "${COVERAGE}" == "1" ]; then 54 | pip install coverage; 55 | pip install coveralls; 56 | fi 57 | 58 | install: 59 | - | 60 | if [ "$INSTALL_TYPE" == "setup" ]; then 61 | python setup.py install 62 | elif [ "$INSTALL_TYPE" == "sdist" ]; then 63 | python setup.py egg_info # check egg_info while we're here 64 | python setup.py sdist 65 | pip install -f $MANYLINUX_URL dist/*.tar.gz 66 | elif [ "$INSTALL_TYPE" == "wheel" ]; then 67 | pip install wheel 68 | python setup.py bdist_wheel 69 | pip install -f $MANYLINUX_URL dist/*.whl 70 | elif [ "$INSTALL_TYPE" == "requirements" ]; then 71 | pip install -f 
$MANYLINUX_URL -r requirements.txt 72 | python setup.py install 73 | fi 74 | 75 | script: 76 | - pip install nose 77 | # Change into an innocuous directory and find tests from installation 78 | - mkdir for_testing 79 | - cd for_testing 80 | - if [ "${COVERAGE}" == "1" ]; then 81 | cp ../.coveragerc .; 82 | COVER_ARGS="--with-coverage --cover-package datarray"; 83 | fi 84 | - if [ "${DOCTESTS}" == "1" ]; then 85 | DOCTEST_ARGS="--with-doctest"; 86 | fi 87 | # Run unit tests 88 | - nosetests $COVER_ARGS $DOCTEST_ARGS datarray 89 | # Run example to check for errors 90 | - pip install networkx${NETWORKX_VER_SPEC} 91 | - python ../examples/inference_algs.py 92 | # Run doc doctests 93 | - if [ "${DOC_DOCTEST}" == "1" ]; then 94 | pip install sphinx; 95 | (cd ../doc && make doctest); 96 | fi 97 | 98 | after_success: 99 | - if [ "${COVERAGE}" == "1" ]; then coveralls; fi 100 | -------------------------------------------------------------------------------- /Changelog: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. vim:ft=rst 3 | 4 | .. _changelog: 5 | 6 | ################### 7 | Datarray change log 8 | ################### 9 | 10 | The main authors of datarray are: 11 | 12 | * Fernando Perez (FP); 13 | * Matthew Brett (MB); 14 | * Mike Trumpis (MT); 15 | * Jonathan Terhorst (JT); 16 | * Keith Goodman (KG). 17 | 18 | ******** 19 | Releases 20 | ******** 21 | 22 | * 0.1.0 (TBA) 23 | 24 | * Fixed bug in axis sorting leading to unpredictable errors slicing 25 | DataArrays; 26 | * Added 'any' and 'all' as reduction methods, fixing incorrect retention of 27 | axes with these methods; 28 | * Port to Python >= 3.3 in common codebase with Python 2; 29 | * Move from fperez personal github account to BIDS github organization. 30 | 31 | * 0.0.6 (Wednesday November 10 2010) 32 | 33 | * 0.0.5 (Friday October 8 2010) 34 | 35 | * 0.0.4 (Wednesday October 6 2010) 36 | 37 | * 0.0.3 (Thursday July 29 2010) 38 | 39 | * 0.0.2 (Wednesday July 28 2010) 40 | 41 | * 0.0.1 (Tuesday July 27 2010) 42 | 43 | * Initial release 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Note: the full license file is datarray/LICENSE 2 | 3 | This is so that the license can be installed by distutils along with the real 4 | package for end users. 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | graft datarray 4 | 5 | graft doc 6 | exclude doc/\#* 7 | exclude doc/man/*.1 8 | 9 | # docs subdirs we want to skip 10 | prune doc/build 11 | 12 | global-exclude *~ 13 | global-exclude *.flc 14 | global-exclude *.pyc 15 | global-exclude .dircopy.log 16 | global-exclude .git 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/BIDS/datarray.svg?branch=master 2 | :target: https://travis-ci.org/BIDS/datarray 3 | 4 | ###################################### 5 | Datarray: Numpy arrays with named axes 6 | ###################################### 7 | 8 | Scientists, engineers, mathematicians and statisticians don't just work with 9 | matrices; they often work with structured data, just like you'd find in a 10 | table. 
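
With plain Numpy, which axis plays which role in such a table lives only in
the programmer's head. A small, made-up illustration of the problem::

    >>> import numpy as np
    >>> sales = np.array([[1, 2, 3], [4, 5, 6]])  # rows: days, columns: stores
    >>> sales.sum(axis=0)  # you must remember that axis 0 means "days"
    array([5, 7, 9])
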
However, functionality for this is missing from Numpy, and there are
11 | efforts to create something to fill the void. This is one of those efforts.
12 | 
13 | .. warning::
14 | 
15 |    This code is currently experimental, and its API *will* change! It is meant
16 |    to be a place for the community to understand and develop the right
17 |    semantics and have a prototype implementation that will ultimately
18 |    (hopefully) be folded back into Numpy.
19 | 
20 | Datarray provides a subclass of Numpy ndarrays that support:
21 | 
22 | - individual dimensions (axes) being labeled with meaningful descriptions
23 | - labeled 'ticks' along each axis
24 | - indexing and slicing by named axis
25 | - indexing on any axis with the tick labels instead of only integers
26 | - reduction operations (like .sum, .mean, etc) supporting named axis arguments
27 |   instead of only integer indices.
28 | 
29 | *********
30 | Prior Art
31 | *********
32 | 
33 | In no particular order:
34 | 
35 | * `xarray <https://github.com/pydata/xarray>`_ - very close in spirit to
36 |   this package, xarray implements named ND array axes and tick labels. It
37 |   integrates with (and depends on) Pandas. If you are doing production work,
38 |   and don't mind the pandas dependency, please use xarray rather than this
39 |   package. Xarray used to be called "xray".
40 | 
41 | * `pandas <http://pandas.pydata.org>`_ is based around a number of
42 |   DataFrame-esque datatypes.
43 | 
44 | * `Tabular `_ implements a
45 |   spreadsheet-inspired datatype, with rows/columns, csv/etc. IO, and fancy
46 |   tabular operations.
47 | 
48 | * `scikits.statsmodels `_ sounded as
49 |   though it had some features we'd like to eventually see implemented on top of
50 |   something such as datarray, and `Skipper `_
51 |   seemed pretty interested in something like this himself.
52 | 
53 | * `scikits.timeseries `_ also has a
54 |   time-series-specific object that's somewhat reminiscent of labeled arrays.
55 | 
56 | * `pydataframe `_ is supposed to be a
57 |   clone of R's data.frame.
58 | 
59 | * `larry `_, or "labeled array," often comes up
60 |   in discussions alongside pandas.
61 | 
62 | * `divisi `_ includes labeled sparse and
63 |   dense arrays.
64 | 
65 | * `pymvpa `_ provides a Dataset class
66 |   encapsulating the data together with length-matched sets of attributes
67 |   for the first two (samples and features) dimensions. Dataset is not a
68 |   subclass of numpy array, to allow other data structures (e.g. sparse
69 |   matrices).
70 | 
71 | * `ptsa `_ subclasses
72 |   ndarray to provide per-dimension attributes, aiming to ease slicing and
73 |   indexing by the values of those attributes.
74 | 
75 | *************
76 | Project Goals
77 | *************
78 | 
79 | 1. Get something akin to this in the numpy core;
80 | 2. Stick to basic functionality such that projects like scikits.statsmodels can
81 |    use it as a base datatype;
82 | 3. Make an interface that allows for simple, pretty manipulation that doesn't
83 |    introduce confusion;
84 | 4. Oh, and make sure that the base numpy array is still accessible.
85 | 
86 | ****
87 | Code
88 | ****
89 | 
90 | You can find our sources and single-click downloads:
91 | 
92 | * `Main repository`_ on Github;
93 | * Documentation_ for the current release;
94 | * Download the `current trunk`_ as a tar/zip file;
95 | * Downloads of all `available releases`_.
96 | 
97 | The latest released version is always available from `pypi
98 | <https://pypi.python.org/pypi/datarray>`_.
99 | 
100 | *******
101 | Support
102 | *******
103 | 
104 | Please put up issues on the `datarray issue tracker
105 | <https://github.com/BIDS/datarray/issues>`_.
106 | 
107 | ..
_main repository: http://github.com/bids/datarray 108 | .. _Documentation: http://bids.github.com/datarray 109 | .. _current trunk: http://github.com/bids/datarray/archives/master 110 | .. _available releases: http://github.com/bids/datarray/releases 111 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # vim ft=yaml 2 | # CI on Windows via appveyor 3 | # Largely from: 4 | # https://github.com/ogrisel/python-appveyor-demo/blob/master/appveyor.yml 5 | 6 | environment: 7 | 8 | matrix: 9 | 10 | - PYTHON: "C:\\Python27" 11 | PYTHON_VERSION: "2.7.x" # currently 2.7.10 12 | PYTHON_ARCH: "32" 13 | 14 | - PYTHON: "C:\\Python27-x64" 15 | PYTHON_VERSION: "2.7.x" # currently 2.7.10 16 | PYTHON_ARCH: "64" 17 | 18 | - PYTHON: "C:\\Python34" 19 | PYTHON_VERSION: "3.4.x" # currently 3.4.3 20 | PYTHON_ARCH: "32" 21 | 22 | - PYTHON: "C:\\Python34-x64" 23 | PYTHON_VERSION: "3.4.x" # currently 3.4.3 24 | PYTHON_ARCH: "64" 25 | 26 | - PYTHON: "C:\\Python35" 27 | PYTHON_VERSION: "3.5.x" # currently 3.5.0 28 | PYTHON_ARCH: "32" 29 | 30 | - PYTHON: "C:\\Python35-x64" 31 | PYTHON_VERSION: "3.5.x" # currently 3.5.0 32 | PYTHON_ARCH: "64" 33 | 34 | install: 35 | - cmd: echo "Using cmd" 36 | # Prepend newly installed Python to the PATH of this build (this cannot be 37 | # done from inside the powershell script as it would require to restart 38 | # the parent CMD process). 39 | - SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH% 40 | # Check that we have the expected version and architecture for Python 41 | - python --version 42 | - python -c "import struct; print(struct.calcsize('P') * 8)" 43 | # Upgrade pip 44 | - python -m pip install --upgrade pip 45 | 46 | build_script: 47 | # Install with dependencies 48 | - pip install nose numpy 49 | - pip install . 50 | 51 | test_script: 52 | # Run the project tests 53 | - mkdir tmp_for_test 54 | - cd tmp_for_test 55 | - nosetests datarray 56 | - cd .. 57 | -------------------------------------------------------------------------------- /datarray/LICENSE: -------------------------------------------------------------------------------- 1 | ======= 2 | License 3 | ======= 4 | 5 | The ``datarray`` package is distributed under a Simplified BSD license. Parts 6 | of NumPy, larry and numpydoc, which all have BSD licenses, are included in 7 | datarray. 8 | 9 | datarray license 10 | ---------------- 11 | 12 | Copyright (c) 2009-2016, NumPy Developers. 13 | All rights reserved. 14 | 15 | Redistribution and use in source and binary forms, with or without 16 | modification, are permitted provided that the following conditions are 17 | met: 18 | 19 | * Redistributions of source code must retain the above copyright 20 | notice, this list of conditions and the following disclaimer. 21 | 22 | * Redistributions in binary form must reproduce the above 23 | copyright notice, this list of conditions and the following 24 | disclaimer in the documentation and/or other materials provided 25 | with the distribution. 26 | 27 | * Neither the name of the NumPy Developers nor the names of any 28 | contributors may be used to endorse or promote products derived 29 | from this software without specific prior written permission. 
30 | 31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 32 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 33 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 34 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 35 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 37 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 38 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 39 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 40 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 41 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 42 | 43 | 44 | la license 45 | ---------- 46 | 47 | Copyright (c) 2008, 2009, 2010, Archipel Asset Management AB. 48 | All rights reserved. 49 | 50 | Redistribution and use in source and binary forms, with or without 51 | modification, are permitted provided that the following conditions are met: 52 | 53 | * Redistributions of source code must retain the above copyright notice, 54 | this list of conditions and the following disclaimer. 55 | 56 | * Redistributions in binary form must reproduce the above copyright 57 | notice, this list of conditions and the following disclaimer in the 58 | documentation and/or other materials provided with the distribution. 59 | 60 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 61 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 62 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 63 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 64 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 65 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 66 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 67 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 68 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 69 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 70 | POSSIBILITY OF SUCH DAMAGE. 71 | 72 | 73 | NumPy license 74 | ------------- 75 | 76 | Copyright (c) 2005-2009, NumPy Developers. 77 | All rights reserved. 78 | 79 | Redistribution and use in source and binary forms, with or without 80 | modification, are permitted provided that the following conditions are 81 | met: 82 | 83 | * Redistributions of source code must retain the above copyright 84 | notice, this list of conditions and the following disclaimer. 85 | 86 | * Redistributions in binary form must reproduce the above 87 | copyright notice, this list of conditions and the following 88 | disclaimer in the documentation and/or other materials provided 89 | with the distribution. 90 | 91 | * Neither the name of the NumPy Developers nor the names of any 92 | contributors may be used to endorse or promote products derived 93 | from this software without specific prior written permission. 94 | 95 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 96 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 97 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 98 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
99 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
100 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
101 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
102 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
103 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
104 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
105 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
106 | 
--------------------------------------------------------------------------------
/datarray/__init__.py:
--------------------------------------------------------------------------------
1 | """Arrays with rich geometric semantics.
2 | """
3 | #-----------------------------------------------------------------------------
4 | # Imports
5 | #-----------------------------------------------------------------------------
6 | # Stdlib
7 | from __future__ import print_function
8 | 
9 | import distutils.version as v
10 | 
11 | # Third-party
12 | import numpy as np
13 | # datarray uses the __array_prepare__ method introduced in numpy 1.4.0
14 | if v.LooseVersion(np.__version__) < v.LooseVersion('1.4'):
15 |     raise ImportError('Numpy version >= 1.4 is required to use datarray')
16 | 
17 | # Our own
18 | try:
19 |     from .testing.testlib import test
20 | except ImportError:
21 |     print("No datarray unit testing available.")
22 | 
23 | from .version import __version__
24 | from .datarray import DataArray
25 | 
--------------------------------------------------------------------------------
/datarray/print_grid.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions for pretty-printing tabular data, such as a DataArray, as a grid.
3 | """
4 | import sys
5 | if sys.version_info[0] < 3:
6 |     range = xrange
7 | 
8 | import numpy as np
9 | 
10 | class GridDataFormatter(object):
11 |     """
12 |     A GridDataFormatter takes an ndarray of objects and represents them as
13 |     equal-length strings. It is flexible about what string length to use,
14 |     and can make suggestions about the string length based on the data it
15 |     will be asked to render.
16 | 
17 |     Each GridDataFormatter instance specifies:
18 | 
19 |     - `min_width`, the smallest acceptable width
20 |     - `standard_width`, a reasonable width when putting many items on the
21 |       screen
22 |     - `max_width`, the width it prefers if space is not limited
23 | 
24 |     This top-level class specifies reasonable defaults for a formatter, and
25 |     subclasses refine it for particular data types.
26 |     """
27 |     def __init__(self, data=None):
28 |         self.data = data
29 | 
30 |     def min_width(self):
31 |         return 1
32 | 
33 |     def standard_width(self):
34 |         return min(9, self.max_width())
35 | 
36 |     def max_width(self):
37 |         if self.data is None:
38 |             # no information, so just use all the space we're given
39 |             return 100
40 |         return max([len(str(val)) for val in self.data.flat])
41 | 
42 |     def format(self, value, width=None):
43 |         """
44 |         Formats a given value to a fixed width.
45 |         """
46 |         if width is None: width = self.standard_width()
47 |         return '{0:<{width}}'.format(value, width=width)[:width]
48 | 
49 |     def format_all(self, values, width=None):
50 |         """
51 |         Formats an array of values to a fixed width, returning a string array.
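
        A small illustration (the values here are invented for the example)::

            >>> import numpy as np
            >>> f = GridDataFormatter()
            >>> out = f.format_all(np.array([['a', 'bb'], ['ccc', 'd']]), width=3)
            >>> out.shape
            (2, 2)
            >>> [str(s) for s in out.flat]
            ['a  ', 'bb ', 'ccc', 'd  ']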
52 | """ 53 | if width is None: width = self.standard_width() 54 | out = np.array([self.format(value, width) for value in values.flat]) 55 | return out.reshape(values.shape) 56 | 57 | class FloatFormatter(GridDataFormatter): 58 | """ 59 | Formats floating point numbers either in standard or exponential notation, 60 | whichever fits better and represents the numbers better in the given amount 61 | of space. 62 | """ 63 | def __init__(self, data, sign=False, strip_zeros=True): 64 | GridDataFormatter.__init__(self, data) 65 | flat = data.flatten() 66 | absolute = np.abs(flat.compress((flat != 0) & ~np.isnan(flat) & ~np.isinf(flat))) 67 | if sign: self.sign = '+' 68 | else: self.sign = ' ' 69 | self.strip_zeros = strip_zeros 70 | if len(absolute): 71 | self.max_val = np.max(absolute) 72 | self.min_val = np.min(absolute) 73 | self.leading_digits = max(1, int(np.log10(self.max_val)) + 1) 74 | self.leading_zeros = max(0, int(np.ceil(-np.log10(self.min_val)))) 75 | else: 76 | self.max_val = self.min_val = 0 77 | self.leading_digits = 1 78 | self.leading_zeros = 0 79 | self.large_exponent = (self.leading_digits >= 101) or (self.leading_zeros >= 100) 80 | 81 | def min_width(self): 82 | return min(self._min_width_standard(), self._min_width_exponential()) 83 | 84 | def _min_width_standard(self): 85 | # 1 character for sign 86 | # enough room for all the leading digits 87 | # 1 character for decimal point 88 | # enough room for all the leading zeros 89 | # 1 more digit 90 | return self.leading_digits + self.leading_zeros + 3 91 | 92 | def _min_width_exponential(self): 93 | # enough room for -3.1e+nn or -3.1e+nnn 94 | return self.large_exponent + 8 95 | 96 | def standard_width(self): 97 | return self.min_width() + 2 98 | 99 | def max_width(self): 100 | return min(self.leading_digits + 8, 16) 101 | 102 | def format(self, value, width=None): 103 | if width is None: width = self.standard_width() 104 | if self._use_exponential_format(width): 105 | return self._format_exponential(value, width) 106 | else: 107 | return self._format_standard(value, width) 108 | 109 | def _format_exponential(self, value, width): 110 | precision = max(1, width - 7 - self.large_exponent) 111 | return '{0:<{sign}{width}.{precision}e}'.format(value, 112 | width=width, 113 | sign=self.sign, 114 | precision=precision) 115 | 116 | def _format_standard(self, value, width): 117 | precision = max(1, width - 2 - self.leading_digits) 118 | result = '{0:>{sign}{width}.{precision}f}'.format(value, width=width, 119 | sign=self.sign, 120 | precision=precision) 121 | if self.strip_zeros: 122 | return '{0:<{width}}'.format(result.rstrip('0'), width=width) 123 | else: return result 124 | 125 | def _use_exponential_format(self, width): 126 | """ 127 | The FloatFormatter will use exponential format if the standard format 128 | cannot accurately represent all the numbers in the given width. 129 | 130 | This criterion favors standard format more than NumPy's arrayprint. 131 | """ 132 | return (width < self._min_width_standard()) 133 | 134 | def format_all(self, values, width=None): 135 | """ 136 | Formats an array of values to a fixed width, returning a string array. 
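
        A rough sketch of the width-driven switch between notations (the
        values are chosen arbitrarily)::

            >>> import numpy as np
            >>> f = FloatFormatter(np.array([0.001, 100.0]))
            >>> f._min_width_standard()  # 1 sign + 3 digits + point + 3 zeros + 1
            9
            >>> f._use_exponential_format(8)  # too narrow for standard notation
            True
            >>> f._use_exponential_format(9)
            False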
137 | """ 138 | if width is None: width = self.standard_width() 139 | if self._use_exponential_format(width): 140 | formatter = self._format_exponential 141 | else: 142 | formatter = self._format_standard 143 | 144 | out = np.array([formatter(value, width) for value in values.flat]) 145 | return out.reshape(values.shape) 146 | 147 | class IntFormatter(FloatFormatter): 148 | """ 149 | The IntFormatter tries to just print all the digits of the ints, but falls 150 | back on being an exponential FloatFormatter if there isn't room. 151 | """ 152 | def _min_width_standard(self): 153 | return self.leading_digits + 1 154 | 155 | def standard_width(self): 156 | return self._min_width_standard() 157 | 158 | def _format_standard(self, value, width): 159 | return '{0:>{sign}{width}d}'.format(value, width=width, sign=self.sign) 160 | 161 | class BoolFormatter(GridDataFormatter): 162 | """ 163 | The BoolFormatter prints 'True' and 'False' if there is room, and 164 | otherwise prints 'T' and '-' ('T' and 'F' are too visually similar). 165 | """ 166 | def standard_width(self): 167 | return 5 168 | 169 | def max_width(self): 170 | return 5 171 | 172 | def format(self, value, width=5): 173 | if width < 5: 174 | if value: return 'T' 175 | else: return '-' 176 | else: 177 | if value: return ' True' 178 | else: return 'False' 179 | 180 | class StrFormatter(GridDataFormatter): 181 | """ 182 | A StrFormatter's behavior is almost entirely defined by the default. 183 | When it must truncate strings, it insists on showing at least 3 184 | characters. 185 | """ 186 | def min_width(self): 187 | return min(3, self.max_width()) 188 | 189 | class ComplexFormatter(GridDataFormatter): 190 | """ 191 | A ComplexFormatter uses two FloatFormatters side by side. This can make 192 | its min_width fairly large. 193 | """ 194 | def __init__(self, data): 195 | GridDataFormatter.__init__(self, data) 196 | self.real_format = FloatFormatter(data, strip_zeros=False) 197 | self.imag_format = FloatFormatter(data, strip_zeros=False, 198 | sign=True) 199 | 200 | def min_width(self): 201 | return max(self.real_format.min_width(), 202 | self.imag_format.min_width())*2 + 1 203 | 204 | def standard_width(self): 205 | return max(self.real_format.standard_width(), 206 | self.imag_format.standard_width())*2 + 1 207 | 208 | def max_width(self): 209 | return max(self.real_format.max_width(), 210 | self.imag_format.max_width())*2 211 | 212 | def format(self, value, width=None): 213 | #TODO: optimize 214 | if width is None: width = self.standard_width() 215 | part_width = (width-1)//2 216 | real_part = self.real_format.format(value.real, part_width) 217 | imag_part = self.imag_format.format(value.imag, part_width) 218 | result = '{0}{1}j'.format(real_part, imag_part) 219 | return '{0:<{width}}'.format(result, width=width) 220 | 221 | 222 | # Formatters for numpy dtype kinds 223 | _KIND2FORMAT = dict(b = BoolFormatter, 224 | u = IntFormatter, 225 | i = IntFormatter, 226 | f = FloatFormatter, 227 | c = ComplexFormatter) 228 | 229 | 230 | def get_formatter(arr): 231 | """ 232 | Get a formatter for this array's data type, and prime it on this array. 233 | """ 234 | return _KIND2FORMAT.get(arr.dtype.kind, StrFormatter)(arr) 235 | 236 | 237 | def grid_layout(arr, width=75, height=10): 238 | """ 239 | Given a 2-D non-empty array, turn it into a list of lists of strings to be 240 | joined. 
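    Also returns, as a second value, the sub-array of cells actually shown,
    since the input may be truncated to fit into `width` and `height`.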
241 | 242 | This uses plain lists instead of a string array, because certain 243 | formatting tricks might want to join columns, resulting in a ragged- 244 | shaped array. 245 | """ 246 | # get the maximum possible amount we'd be able to display 247 | array_sample = arr[:height, :width//2] 248 | formatter = get_formatter(arr) 249 | 250 | # first choice: show the whole array at full width 251 | cell_width = formatter.max_width() 252 | columns_shown = arr.shape[1] 253 | column_ellipsis = False 254 | 255 | if (cell_width+1) * columns_shown > width+1: 256 | # second choice: show the whole array at at least standard width 257 | standard_width = formatter.standard_width() 258 | cell_width = (width+1) // (columns_shown) - 1 259 | if cell_width < standard_width: 260 | # third choice: show at least 5 columns at standard width 261 | column_ellipsis = True 262 | cell_width = standard_width 263 | columns_shown = (width-3) // (cell_width+1) 264 | if columns_shown < 5: 265 | # fourth choice: as many columns as possible at minimum width 266 | cell_width = formatter.min_width() 267 | columns_shown = max(1, (width-3) // (cell_width+1)) 268 | cells_shown = arr[:height, :columns_shown] 269 | layout = formatter.format_all(cells_shown, cell_width) 270 | 271 | ungrid = [list(row) for row in layout] 272 | 273 | if column_ellipsis: 274 | ungrid[0].append('...') 275 | 276 | if height < arr.shape[0]: # row ellipsis 277 | ungrid.append(['...']) 278 | 279 | return ungrid, cells_shown 280 | 281 | def labeled_layout(arr, width=75, height=10, row_label_width=9): 282 | """ 283 | Given a 2-D non-empty array that may have labeled axes, rows, or columns, 284 | render the array as strings to be joined and attach the axes in visually 285 | appropriate places. 286 | 287 | Returns a list of lists of strings to be joined. 288 | """ 289 | inner_width, inner_height = width, height 290 | if arr.axes[0].labels: 291 | inner_width = width - row_label_width-1 292 | if arr.axes[1].labels: 293 | inner_height -= 1 294 | row_header = (arr.axes[0].labels and arr.axes[0].name) 295 | col_header = (arr.axes[1].labels and arr.axes[1].name) 296 | if row_header or col_header: 297 | inner_height -= 2 298 | 299 | layout, cells_shown = grid_layout(arr, inner_width, inner_height) 300 | cell_width = len(layout[0][0]) 301 | label_formatter = StrFormatter() 302 | 303 | if arr.axes[1].labels: 304 | # use one character less than available, to make axes more visually 305 | # separate 306 | 307 | col_label_layout = [label_formatter.format(str(name)[:cell_width-1], 308 | cell_width) for name in cells_shown.axes[1].labels] 309 | layout = [col_label_layout] + layout 310 | 311 | if arr.axes[0].labels: 312 | layout = [[' '*row_label_width] + row for row in layout] 313 | labels = cells_shown.axes[0].labels 314 | offset = 0 315 | if arr.axes[1].labels: offset = 1 316 | for r in range(cells_shown.shape[0]): 317 | layout[r+offset][0] = label_formatter.format(str(labels[r]), row_label_width) 318 | 319 | if row_header or col_header: 320 | header0 = [] 321 | header1 = [] 322 | if row_header: 323 | header0.append(label_formatter.format(row_header, row_label_width)) 324 | header1.append('-' * row_label_width) 325 | elif arr.axes[0].labels: 326 | header0.append(' ' * row_label_width) 327 | header1.append(' ' * row_label_width) 328 | if col_header: 329 | # We can use all remaining columns. How wide are they? 
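        # (measured by joining the already-formatted cells of the first row,
        # so the merged column header spans exactly the data columns)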
330 |         offset = 0
331 |         if arr.axes[0].labels: offset = 1
332 |         merged_width = len(' '.join(layout[0][offset:]))
333 |         header0.append(label_formatter.format(col_header, merged_width))
334 |         header1.append('-' * merged_width)
335 |         layout = [header0, header1] + layout
336 | 
337 |     return layout
338 | 
339 | def layout_to_string(layout):
340 |     return '\n'.join([' '.join(row) for row in layout])
341 | 
342 | def array_to_string(arr, width=75, height=10):
343 |     """
344 |     Get a 2-D text representation of a NumPy array.
345 |     """
346 |     assert arr.ndim <= 2
347 |     while arr.ndim < 2:
348 |         arr = arr[np.newaxis, ...]
349 |     return layout_to_string(grid_layout(arr, width, height)[0])  # grid_layout returns (layout, cells_shown)
350 | 
351 | def datarray_to_string(arr, width=75, height=10):
352 |     """
353 |     Get a 2-D text representation of a datarray.
354 |     """
355 |     assert arr.ndim <= 2
356 |     while arr.ndim < 2:
357 |         arr = arr[np.newaxis, ...]
358 |     return layout_to_string(labeled_layout(arr, width, height))
359 | 
360 | 
--------------------------------------------------------------------------------
/datarray/testing/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .utils import *
3 | 
--------------------------------------------------------------------------------
/datarray/testing/testlib.py:
--------------------------------------------------------------------------------
1 | """Module defining the main test entry point exposed at the top level.
2 | """
3 | #-----------------------------------------------------------------------------
4 | # Imports
5 | #-----------------------------------------------------------------------------
6 | 
7 | # Stdlib
8 | import sys
9 | 
10 | # Third-party
11 | import nose
12 | import nose.plugins.builtin
13 | from nose.core import TestProgram
14 | 
15 | #-----------------------------------------------------------------------------
16 | # Functions and classes
17 | #-----------------------------------------------------------------------------
18 | 
19 | def test(doctests=True, extra_argv=None, **kw):
20 |     """Run the datarray test suite using nose.
21 | 
22 |     Parameters
23 |     ----------
24 |     doctests : bool, optional (default True)
25 |        If true, also run the doctests in all docstrings.
26 | 
27 |     kw : dict
28 |        Any other keywords are passed directly to nose.TestProgram(), which
29 |        itself is a subclass of unittest.TestProgram().
30 |     """
31 |     # We construct our own argv manually, so we must set argv[0] ourselves
32 |     argv = [ 'nosetests',
33 |              # Name the package to actually test, in this case datarray
34 |              'datarray',
35 | 
36 |              # extra info in tracebacks
37 |              '--detailed-errors',
38 | 
39 |              # We add --exe because of setuptools' imbecility (it blindly does
40 |              # chmod +x on ALL files). Nose does the right thing and it tries
41 |              # to avoid executables, setuptools unfortunately forces our hand
42 |              # here. This has been discussed on the distutils list and the
43 |              # setuptools devs refuse to fix this problem!
44 |              '--exe',
45 |              ]
46 | 
47 |     if doctests:
48 |         argv.append('--with-doctest')
49 | 
50 |     if extra_argv is not None:
51 |         argv.extend(extra_argv)
52 | 
53 |     # Now nose can run
54 |     TestProgram(argv=argv, exit=False, **kw)
55 | 
56 | 
57 | # Tell nose that the test() function itself isn't a test, otherwise we get a
58 | # recursive loop inside nose.
59 | test.__test__ = False 60 | -------------------------------------------------------------------------------- /datarray/testing/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | "Tests of datarray unit test utilities" 2 | 3 | import numpy as np 4 | from numpy.testing import assert_raises 5 | 6 | from datarray.datarray import DataArray 7 | from datarray.testing.utils import assert_datarray_equal 8 | 9 | def test_assert_datarray_equal(): 10 | # Test assert_datarray_equal 11 | 12 | x = DataArray([1, 2]) 13 | y = DataArray([1, 2]) 14 | assert_datarray_equal(x, y, "Should not raise assertion") 15 | y = DataArray([1, 3]) 16 | assert_raises(AssertionError, assert_datarray_equal, x, y) 17 | y = DataArray([1, 2, 3]) 18 | assert_raises(AssertionError, assert_datarray_equal, x, y) 19 | y = DataArray([1, 2], 'a') 20 | assert_raises(AssertionError, assert_datarray_equal, x, y) 21 | y = DataArray([1, 2], [('a', ['a', 'b'])]) 22 | assert_raises(AssertionError, assert_datarray_equal, x, y) 23 | 24 | x = DataArray([1, 2], 'a') 25 | y = DataArray([1, 2], 'a') 26 | assert_datarray_equal(x, y, "Should not raise assertion") 27 | y = DataArray([1, 2], 'b') 28 | assert_raises(AssertionError, assert_datarray_equal, x, y) 29 | y = DataArray([1, 2], [('b', ['a', 'b'])]) 30 | assert_raises(AssertionError, assert_datarray_equal, x, y) 31 | 32 | x = DataArray([1, 2], 'a') 33 | y = DataArray([1, 2], [('a', None)]) 34 | assert_datarray_equal(x, y, "Should not raise assertion") 35 | 36 | x = DataArray([[1, 2], [3, 4]], [('ax1', ['a', 'b']), ('ax2', ['a', 'b'])]) 37 | y = DataArray([[1, 2], [3, 4]], [('ax1', ['a', 'b']), ('ax2', ['a', 'b'])]) 38 | assert_datarray_equal(x, y, "Should not raise assertion") 39 | y = DataArray([[1, 2], [3, 4]], [('ax1', ['X', 'b']), ('ax2', ['a', 'b'])]) 40 | assert_raises(AssertionError, assert_datarray_equal, x, y) 41 | y = DataArray([[1, 2], [3, 4]], [('ax1', ['a', 'b']), ('ax2', None)]) 42 | assert_raises(AssertionError, assert_datarray_equal, x, y) 43 | y = DataArray([[9, 2], [3, 4]], [('ax1', ['a', 'b']), ('ax2', ['a', 'b'])]) 44 | assert_raises(AssertionError, assert_datarray_equal, x, y) 45 | 46 | x = DataArray([1, np.nan]) 47 | y = DataArray([1, np.nan]) 48 | assert_datarray_equal(x, y, "Should not raise assertion") 49 | 50 | x = DataArray([1, 2], 'a') 51 | y = 1 52 | assert_raises(AssertionError, assert_datarray_equal, x, y) 53 | y = np.array([1, 2]) 54 | assert_raises(AssertionError, assert_datarray_equal, x, y) 55 | 56 | x = 1 57 | y = 2 58 | assert_raises(AssertionError, assert_datarray_equal, x, y) 59 | x = np.array([1]) 60 | y = np.array([2]) 61 | assert_raises(AssertionError, assert_datarray_equal, x, y) 62 | -------------------------------------------------------------------------------- /datarray/testing/utils.py: -------------------------------------------------------------------------------- 1 | """datarray unit testing utilities""" 2 | #----------------------------------------------------------------------------- 3 | # Imports 4 | #----------------------------------------------------------------------------- 5 | 6 | # Third-party 7 | import numpy as np 8 | from numpy.testing import assert_, assert_equal, assert_array_equal 9 | 10 | # Our own 11 | from datarray.datarray import DataArray 12 | 13 | __all__ = ['assert_datarray_equal'] 14 | 15 | #----------------------------------------------------------------------------- 16 | # Functions and classes 17 | 
#-----------------------------------------------------------------------------
18 | 
19 | def assert_datarray_equal(x, y, err_msg='', verbose=True):
20 |     """
21 |     Raise an AssertionError if two datarrays are not equal.
22 | 
23 |     Given two datarrays, assert that the shapes are equal, axes are equal, and
24 |     all elements of the datarrays are equal. Given two scalars assert equality.
25 |     In contrast to the standard usage in numpy, NaNs are compared like numbers;
26 |     no assertion is raised if both objects have NaNs in the same positions.
27 | 
28 |     The usual caution for verifying equality with floating point numbers is
29 |     advised.
30 | 
31 |     Parameters
32 |     ----------
33 |     x : {datarray, scalar}
34 |         If you are testing a datarray method, for example, then this is the
35 |         datarray (or scalar) returned by the method.
36 |     y : {datarray, scalar}
37 |         This datarray represents the expected result. If `x` is not equal to
38 |         `y`, then an AssertionError is raised.
39 |     err_msg : str
40 |         If `x` is not equal to `y`, then the string `err_msg` will be added to
41 |         the top of the AssertionError message.
42 |     verbose : bool
43 |         If True, the conflicting values are appended to the error message.
44 | 
45 |     Returns
46 |     -------
47 |     None
48 | 
49 |     Raises
50 |     ------
51 |     AssertionError
52 |         If actual and desired datarrays are not equal.
53 | 
54 |     Examples
55 |     --------
56 |     If the two datarrays are equal then None is returned:
57 | 
58 |     >>> from datarray.testing import assert_datarray_equal
59 |     >>> from datarray.datarray import DataArray
60 |     >>> x = DataArray([1, 2])
61 |     >>> y = DataArray([1, 2])
62 |     >>> assert_datarray_equal(x, y)
63 | 
64 |     If the two datarrays are not equal then an AssertionError is raised:
65 | 
66 |     >>> x = DataArray([1, 2], ('time',))
67 |     >>> y = DataArray([1, 2], ('distance',))
68 |     >>> assert_datarray_equal(x, y)
69 |     Traceback (most recent call last):
70 |       File "<stdin>", line 1, in <module>
71 |       File "datarray/testing/utils.py", line 133, in assert_datarray_equal
72 |         raise AssertionError, err_msg
73 |     AssertionError:
74 | 
75 |     ----------
76 |     AXIS NAMES
77 |     ----------
78 | 
79 |     Items are not equal:
80 |     item=0
81 | 
82 |     ACTUAL: 'time'
83 |     DESIRED: 'distance'
84 | 
85 |     """
86 |     # Initialize
87 |     fail = []
88 | 
89 |     # Function to make section headings
90 |     def heading(text):
91 |         line = '-' * len(text)
92 |         return '\n\n' + line + '\n' + text + '\n' + line + '\n'
93 | 
94 |     # The assert depends on the type of x and y
95 |     if np.isscalar(x) and np.isscalar(y):
96 | 
97 |         # Both x and y are scalars
98 |         try:
99 |             assert_equal(x, y)
100 |         except AssertionError as err:
101 |             fail.append(heading('SCALARS') + str(err))
102 | 
103 |     elif (type(x) is np.ndarray) and (type(y) is np.ndarray):
104 | 
105 |         # Both x and y are plain ndarrays
106 |         try:
107 |             assert_array_equal(x, y)
108 |         except AssertionError as err:
109 |             fail.append(heading('ARRAYS') + str(err))
110 | 
111 |     elif (type(x) == DataArray) + (type(y) == DataArray) == 1:
112 | 
113 |         # Only one of x and y is a datarray; test failed
114 |         try:
115 |             assert_equal(type(x), type(y))
116 |         except AssertionError as err:
117 |             fail.append(heading('TYPE') + str(err))
118 | 
119 |     else:
120 | 
121 |         # Both x and y are datarrays
122 | 
123 |         # shape
124 |         try:
125 |             assert_equal(x.shape, y.shape)
126 |         except AssertionError as err:
127 |             fail.append(heading('SHAPE') + str(err))
128 | 
129 |         # axis names
130 |         try:
131 |             assert_equal(x.names, y.names)
132 |         except AssertionError as err:
133 |             fail.append(heading('AXIS NAMES') + str(err))
134 | 
135 |         # labels
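        # (labels are compared axis by axis, so a failing comparison names
        # the offending axis in its error heading)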
136 |         for ax in range(x.ndim):
137 |             try:
138 |                 assert_equal(x.axes[ax].labels, y.axes[ax].labels)
139 |             except AssertionError as err:
140 |                 fail.append(heading('LABELS ALONG AXIS = %d' % ax) + str(err))
141 | 
142 |         # axes
143 |         for ax in range(x.ndim):
144 |             try:
145 |                 assert_equal(x.axes[ax], y.axes[ax])
146 |             except AssertionError as err:
147 |                 fail.append(heading('AXIS OBJECT ALONG AXIS = %d' % ax) + str(err))
148 |                 fail.append('x: ' + str(x.axes[ax]))
149 |                 fail.append('y: ' + str(y.axes[ax]))
150 | 
151 |         # data
152 |         try:
153 |             assert_array_equal(x.base, y.base)
154 |         except AssertionError as err:
155 |             fail.append(heading('ARRAY') + str(err))
156 | 
157 |     # Did the test pass?
158 |     if len(fail) > 0:
159 |         # No
160 |         if verbose:
161 |             err_msgs = ''.join(fail)
162 |             err_msgs = err_msgs.replace('\n', '\n\t')
163 |             if len(err_msg):
164 |                 err_msg = heading("TEST: " + err_msg) + err_msgs
165 |             else:
166 |                 err_msg = err_msgs
167 |             raise AssertionError(err_msg)
168 |         else:
169 |             raise AssertionError
170 | 
171 | 
--------------------------------------------------------------------------------
/datarray/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BIDS/datarray/1d53044a838874609b824ad6eeeb1b4e819f417b/datarray/tests/__init__.py
--------------------------------------------------------------------------------
/datarray/tests/test_bugfixes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from datarray.datarray import Axis, DataArray, NamedAxisError, \
4 |     _pull_axis, _reordered_axes
5 | 
6 | from datarray.testing.utils import assert_datarray_equal
7 | import datarray.print_grid as print_grid
8 | 
9 | import nose.tools as nt
10 | import numpy.testing as npt
11 | 
12 | def test_full_reduction():
13 |     # issue #2
14 |     nt.assert_equal(DataArray([1, 2, 3]).sum(axis=0),6)
15 | 
16 | def test_bug3():
17 |     "Bug 3"
18 |     x = np.array([1,2,3])
19 |     y = DataArray(x, 'x')
20 |     nt.assert_equal( x.sum(), y.sum() )
21 |     nt.assert_equal( x.max(), y.max() )
22 | 
23 | def test_bug5():
24 |     "Bug 5: Support 0d arrays"
25 |     A = DataArray(10)
26 |     # Empty tuples evaluate to false
27 |     nt.assert_false(tuple(A.axes))
28 |     nt.assert_equal(len(A.axes), 0)
29 |     nt.assert_raises(IndexError, lambda: A.axes[0])
30 |     nt.assert_false(A.names)
31 | 
32 | def test_1d_label_indexing():
33 |     # issue #18
34 |     cap_ax_spec = 'capitals', ['washington', 'london', 'berlin', 'paris', 'moscow']
35 |     caps = DataArray(np.arange(5),[cap_ax_spec])
36 |     caps.axes.capitals["washington"]
37 | 
38 | def test_bug22():
39 |     "Bug 22: DataArray not accepting arrays as ticks"
40 |     A = DataArray([1, 2], [('time', ['a', 'b'])])
41 |     B = DataArray([1, 2], [('time', np.array(['a', 'b']))])
42 |     assert_datarray_equal(A, B)
43 | 
44 | def test_bug26():
45 |     "Bug 26: check that axes names are computed on demand."
46 | a = DataArray([1,2,3]) 47 | nt.assert_true(a.axes[0].name is None) 48 | a.axes[0].name = "a" 49 | nt.assert_equal(a.axes[0].name, "a") 50 | 51 | def test_bug34(): 52 | "Bug 34: datetime.date ticks not handled by datarray_to_string" 53 | from datarray.print_grid import datarray_to_string 54 | from datetime import date as D 55 | A = DataArray([[1,2],[3,4]], [('row', ('a', D(2010,1,1))),('col', 'cd')]) 56 | exp_out = """row col 57 | --------- ------------------- 58 | c d 59 | a 1 2 60 | 2010-01-0 3 4""" 61 | nt.assert_equal(datarray_to_string(A), exp_out) 62 | # Output for unsigned integers 63 | B = A.astype(np.uint32) 64 | nt.assert_equal(datarray_to_string(B), exp_out) 65 | 66 | 67 | def test_bug35(): 68 | "Bug 35" 69 | txt_array = DataArray(['a','b'], axes=['dummy']) 70 | #calling datarray_to_string on string arrays used to fail 71 | print_grid.datarray_to_string(txt_array) 72 | #because get_formatter returned the class not an instance 73 | assert isinstance(print_grid.get_formatter(txt_array), 74 | print_grid.StrFormatter) 75 | 76 | def test_bug38(): 77 | "Bug 38: DataArray.__repr__ should parse as a single entity" 78 | # Calling repr() on an ndarray prepends array (instead of np.array) 79 | arys = ( 80 | DataArray(np.random.randint(0, 10000, size=(1,2,3,4,5)), 'abcde'), 81 | DataArray(np.random.randint(0, 10000, size=(3,3,3))), # Try with missing axes 82 | DataArray(np.random.randint(0, 10000, (2,4,5,6)), # Try with ticks 83 | ('a', ('b', ('b1','b2','b3','b4')), 'c', 'd')), 84 | ) 85 | # Load `array` into namespace for `eval` 86 | array = np.array 87 | for A in arys: 88 | assert_datarray_equal(A, eval(repr(A))) 89 | 90 | def test_bug44(): 91 | "Bug 44" 92 | # In instances where axis=None, the operation runs 93 | # on the flattened array. Here it makes sense to return 94 | # the op on the underlying np.ndarray. 
95 | A = [[1,2,3],[4,5,6]] 96 | x = DataArray(A, 'xy').std() 97 | y = np.std(A) 98 | nt.assert_equal( x.sum(), y.sum() ) 99 | 100 | -------------------------------------------------------------------------------- /datarray/tests/test_data_array.py: -------------------------------------------------------------------------------- 1 | '''Tests for DataArray and friend''' 2 | 3 | import sys 4 | PY3 = sys.version_info[0] >= 3 5 | 6 | import numpy as np 7 | 8 | from datarray.datarray import (Axis, DataArray, NamedAxisError, AxesManager, 9 | _pull_axis, _reordered_axes) 10 | 11 | import nose.tools as nt 12 | import numpy.testing as npt 13 | 14 | DA = DataArray(np.random.randn(4, 2, 6), 'xyz') 15 | YZ = AxesManager(DA, (Axis('y', 0, None), Axis('z', 1, None))) 16 | XZ = AxesManager(DA, (Axis('x', 0, None), Axis('z', 1, None))) 17 | XY = AxesManager(DA, (Axis('x', 0, None), Axis('y', 1, None))) 18 | AXES_REMOVED = dict(x=YZ, y=XZ, z=XY) 19 | 20 | 21 | def test_axis_equal(): 22 | ax1 = Axis('aname', 0, None) 23 | ax2 = Axis('aname', 0, None) 24 | nt.assert_equal(ax1, ax2) 25 | # The array to which the axis points does not matter in comparison 26 | ax3 = Axis('aname', 0, np.arange(10)) 27 | nt.assert_equal(ax1, ax3) 28 | # but the index does 29 | ax4 = Axis('aname', 1, None) 30 | nt.assert_not_equal(ax1, ax4) 31 | # so does the name 32 | ax5 = Axis('anothername', 0, None) 33 | nt.assert_not_equal(ax1, ax5) 34 | # and obviously both 35 | nt.assert_not_equal(ax4, ax5) 36 | # Try with labels 37 | ax6 = Axis('same', 0, None, labels=['a', 'b']) 38 | ax7 = Axis('same', 0, None, labels=['a', 'b']) 39 | nt.assert_equal(ax6, ax7) 40 | ax8 = Axis('same', 0, None, labels=['a', 'xx']) 41 | nt.assert_not_equal(ax6, ax8) 42 | 43 | def test_bad_labels1(): 44 | d = np.zeros(5) 45 | # bad labels length 46 | nt.assert_raises(ValueError, DataArray, d, axes=[('a', 'uvw')]) 47 | 48 | def test_bad_labels2(): 49 | d = np.zeros(5) 50 | # uniqueness error 51 | nt.assert_raises(ValueError, DataArray, d, axes=[('a', ['u']*5)]) 52 | 53 | def test_bad_labels3(): 54 | d = np.zeros(5) 55 | # type error 56 | nt.assert_raises(ValueError, DataArray, d, axes=[('a', [1, 1, 1, 1, 1])]) 57 | 58 | def test_basic(): 59 | adata = [2,3] 60 | a = DataArray(adata, 'x', float) 61 | nt.assert_equal(a.names, ('x',)) 62 | nt.assert_equal(a.dtype, np.dtype(float)) 63 | b = DataArray([[1,2],[3,4],[5,6]], 'xy') 64 | nt.assert_equal(b.names, ('x','y')) 65 | # integer slicing 66 | b0 = b.axes.x[0] 67 | npt.assert_equal(b0, [1,2]) 68 | # slice slicing 69 | b1 = b.axes.x[1:] 70 | npt.assert_equal(b1, [[3,4], [5,6]]) 71 | 72 | def test_bad_axes_axes(): 73 | d = np.random.randn(3,2) 74 | nt.assert_raises(NamedAxisError, DataArray, d, axes='xx') 75 | 76 | def test_combination(): 77 | narr = DataArray(np.zeros((1,2,3)), axes=('a','b','c')) 78 | n3 = DataArray(np.ones((1,2,3)), axes=('x','b','c')) 79 | nt.assert_raises(NamedAxisError, np.add, narr, n3) 80 | # addition of scalar 81 | res = narr + 2 82 | nt.assert_true(isinstance(res, DataArray)) 83 | nt.assert_equal(res.axes, narr.axes) 84 | # addition of matching size array, with matching names 85 | res = narr + narr 86 | nt.assert_equal(res.axes, narr.axes) 87 | 88 | def test_label_change(): 89 | a = DataArray([1,2,3]) 90 | nt.assert_equal(a.names, (None,)) 91 | a.axes[0].name = "test" 92 | nt.assert_equal(a.names, ("test",)) 93 | 94 | def test_1d(): 95 | adata = [2,3] 96 | a = DataArray(adata, 'x', int) 97 | # Verify scalar extraction 98 | nt.assert_true(np.isscalar(a.axes.x[0])) 99 | 
nt.assert_equal(np.dtype(a.axes.x[0]), np.dtype(np.int)) 100 | # Verify indexing of axis 101 | nt.assert_equals(a.axes.x.index, 0) 102 | # Iteration checks 103 | for i,val in enumerate(a.axes.x): 104 | nt.assert_equals(val, adata[i]) 105 | nt.assert_true(np.isscalar(val)) 106 | nt.assert_equal(np.dtype(val), np.dtype(np.int)) 107 | 108 | def test_2d(): 109 | b = DataArray([[1,2],[3,4],[5,6]], 'xy') 110 | nt.assert_equals(b.names, ('x', 'y')) 111 | # Check row named slicing 112 | rs = b.axes.x[0] 113 | npt.assert_equal(rs, [1,2]) 114 | nt.assert_equal(rs.names, ('y',)) 115 | nt.assert_equal(tuple(rs.axes), (Axis('y', 0, rs),)) 116 | # Now, check that when slicing a row, we get the right names in the output 117 | nt.assert_equal(b.axes.x[1:].names, ('x','y')) 118 | # Check column named slicing 119 | cs = b.axes.y[1] 120 | npt.assert_equal(cs, [2, 4, 6]) 121 | nt.assert_equal(cs.names, ('x',)) 122 | nt.assert_equal(tuple(cs.axes), (Axis('x', 0, cs),)) 123 | # What happens if we do normal slicing? 124 | rs = b[0] 125 | npt.assert_equal(rs, [1, 2]) 126 | nt.assert_equal(rs.names, ('y',)) 127 | nt.assert_equal(tuple(rs.axes), (Axis('y', 0, rs),)) 128 | 129 | def test__pull_axis(): 130 | a = Axis('x', 0, None) 131 | b = Axis('y', 1, None) 132 | c = Axis('z', 2, None) 133 | t_pos = Axis('y', 1, None) 134 | t_neg = Axis('x', 5, None) 135 | axes = [a, b, c] 136 | nt.assert_true(t_pos in axes) 137 | nt.assert_false(t_neg in axes) 138 | nt.assert_equal(axes, _pull_axis(axes, t_neg)) 139 | nt.assert_equal(axes[:-1], _pull_axis(axes, c)) 140 | new_axes = [a, Axis('z', 1, None)] 141 | nt.assert_equal(new_axes, _pull_axis(axes, t_pos)) 142 | 143 | def test__reordered_axes(): 144 | a = Axis('x', 0, None) 145 | b = Axis('y', 1, None) 146 | c = Axis('z', 2, None) 147 | res = _reordered_axes([a,b,c], (1,2,0)) 148 | names_inds = [(ax.name, ax.index) for ax in res] 149 | nt.assert_equal(set(names_inds), set([('y',0),('z',1),('x',2)])) 150 | 151 | def test_axis_set_name(): 152 | a = DataArray(np.arange(20).reshape(2,5,2), 'xyz') 153 | a.axes[0].set_name('u') 154 | nt.assert_equal(a.axes[0].name, 'u', 'name change failed') 155 | nt.assert_equal(a.axes.u, a.axes[0], 'name remapping failed') 156 | nt.assert_equal(a.axes.u.index, 0, 'name remapping failed') 157 | 158 | def test_array_set_name(): 159 | a = DataArray(np.arange(20).reshape(2,5,2), 'xyz') 160 | a.set_name(0, 'u') 161 | nt.assert_equal(a.axes[0].name, 'u', 'name change failed') 162 | nt.assert_equal(a.axes.u, a.axes[0], 'name remapping failed') 163 | nt.assert_equal(a.axes.u.index, 0, 'name remapping failed') 164 | 165 | def test_axis_make_slice(): 166 | p_arr = np.random.randn(2,4,5) 167 | ax_spec = 'capitals', ['washington', 'london', 'berlin', 'paris', 'moscow'] 168 | d_arr = DataArray(p_arr, [None, None, ax_spec]) 169 | a = d_arr.axes.capitals 170 | sl = a.make_slice( slice('london', 'moscow') ) 171 | should_be = ( slice(None), slice(None), slice(1,4) ) 172 | nt.assert_equal(should_be, sl, 'slicing tuple from labels not correct') 173 | sl = a.make_slice( slice(1,4) ) 174 | nt.assert_equal(should_be, sl, 'slicing tuple from idx not correct') 175 | 176 | # also test with the slicing syntax 177 | def test_labels_slicing(): 178 | p_arr = np.random.randn(2,4,5) 179 | ax_spec = 'capitals', ['washington', 'london', 'berlin', 'paris', 'moscow'] 180 | d_arr = DataArray(p_arr, [None, None, ax_spec]) 181 | a = d_arr.axes.capitals 182 | sub_arr = d_arr.axes.capitals['washington'::2] 183 | nt.assert_equal(sub_arr.axes.capitals.labels, 184 | a.labels[0::2]) 185 | 
nt.assert_true((sub_arr == d_arr[:,:,0::2]).all()) 186 | 187 | # -- Tests for reshaping ----------------------------------------------------- 188 | 189 | def test_flatten_and_ravel(): 190 | "Test the functionality of ravel() and flatten() methods" 191 | d = DataArray(np.arange(20).reshape(4,5), 'xy') 192 | df = d.flatten() 193 | nt.assert_true(type(df) is np.ndarray, 'Type error in flatten') 194 | nt.assert_true(df.shape == (20,), 'Wrong shape in flatten') 195 | df[:4] = 0 196 | nt.assert_false((d[0,:4] == 0).all(), 'Copy not made in flatten') 197 | 198 | dr = d.ravel() 199 | nt.assert_true(type(dr) is np.ndarray, 'Type error in ravel') 200 | nt.assert_true(dr.shape == (20,), 'Wrong shape in ravel') 201 | dr[:4] = 0 202 | nt.assert_true((d[0,:4] == 0).all(), 'View not made in ravel') 203 | 204 | def test_squeeze(): 205 | "Test squeeze method" 206 | d = DataArray(np.random.randn(3,2,9), 'xyz') 207 | d2 = d[None,:,None,:,:,None] 208 | nt.assert_true(d2.shape == (1,3,1,2,9,1), 'newaxis slicing failed') 209 | d3 = d.squeeze() 210 | nt.assert_true(d3.shape == d.shape, 211 | 'squeezing length-1 dimensions failed') 212 | nt.assert_true(d3.names == d.names, 'Axes got lost in squeeze') 213 | 214 | def test_reshape(): 215 | d = DataArray(np.random.randn(3,4,5), 'xyz') 216 | new_shape = (1,3,1,4,5) 217 | # Test padding the shape 218 | d2 = d.reshape(new_shape) 219 | new_labels = (None, 'x', None, 'y', 'z') 220 | nt.assert_true(d2.names == new_labels, 221 | 'Array with inserted dimensions has wrong labels') 222 | nt.assert_true(d2.shape == new_shape, 'New shape wrong') 223 | 224 | # Test trimming the shape 225 | d3 = d2.reshape(d.shape) 226 | nt.assert_true(d3.names == d.names, 227 | 'Array with removed dimensions has wrong labels') 228 | nt.assert_true(d3.shape == d.shape, 'New shape wrong') 229 | 230 | # Test a combo of padding and trimming 231 | d4 = d2.reshape(3,4,1,5,1) 232 | new_labels = ('x', 'y', None, 'z', None) 233 | nt.assert_true( 234 | d4.names == new_labels, 235 | 'Array with inserted and removed dimensions has wrong labels') 236 | nt.assert_true(d4.shape == (3, 4, 1, 5, 1), 'New shape wrong') 237 | 238 | def test_reshape_corners(): 239 | "Test some corner cases for reshape" 240 | d = DataArray(np.random.randn(3,4,5), 'xyz') 241 | d2 = d.reshape(-1) 242 | nt.assert_true(d2.shape == (60,), 'Flattened shape wrong') 243 | nt.assert_true(type(d2) is np.ndarray, 'Flattened type wrong') 244 | 245 | d2 = d.reshape(60) 246 | nt.assert_true(d2.shape == (60,), 'Flattened shape wrong') 247 | nt.assert_true(type(d2) is np.ndarray, 'Flattened type wrong') 248 | 249 | def test_axis_as_index(): 250 | narr = DataArray(np.array([[1, 2, 3], [4, 5, 6]]), axes=('a', 'b')) 251 | npt.assert_array_equal(np.sum(narr, axis=narr.axes.a), [5, 7, 9]) 252 | 253 | # -- Tests for redefined methods --------------------------------------------- 254 | 255 | def test_transpose(): 256 | b = DataArray([[1,2],[3,4],[5,6]], 'xy') 257 | bt = b.T 258 | c = DataArray([ [1,3,5], [2,4,6] ], 'yx') 259 | nt.assert_true(bt.axes.x.index == 1 and bt.axes.y.index == 0) 260 | nt.assert_true(bt.shape == (2,3)) 261 | nt.assert_true((bt==c).all()) 262 | 263 | def test_swapaxes(): 264 | n_arr = np.random.randn(2,4,3) 265 | a = DataArray(n_arr, 'xyz') 266 | b = a.swapaxes('x', 'z') 267 | c = DataArray(n_arr.transpose(2,1,0), 'zyx') 268 | nt.assert_true((c==b).all(), 'data not equal in swapaxes test') 269 | for ax1, ax2 in zip(b.axes, c.axes): 270 | nt.assert_true(ax1==ax2, 'axes not equal in swapaxes test') 271 | 272 | # -- Tests for wrapped 
ndarray methods --------------------------------------- 273 | 274 | reductions = ['mean', 'var', 'std', 'min', 275 | 'max', 'sum', 'prod', 'ptp', 'any', 'all', 276 | 'argmax', 'argmin'] 277 | accumulations = ['cumprod', 'cumsum'] 278 | 279 | methods = reductions + accumulations 280 | 281 | def check_data_axes(d_arr, op, axis, exp_axes, *args, **kwargs): 282 | """ Check data and axes correct after operation `op` 283 | """ 284 | from datarray.datarray import _names_to_numbers 285 | super_opr = getattr(np.ndarray, op) 286 | axis_idx = _names_to_numbers(d_arr.axes, [axis])[0] 287 | d1 = super_opr(np.asarray(d_arr), axis_idx, *args, **kwargs) 288 | opr = getattr(d_arr, op) 289 | d_arr_out = opr(axis, *args, **kwargs) 290 | nt.assert_equal(d_arr_out.axes, exp_axes) 291 | d2 = np.asarray(d_arr_out) 292 | npt.assert_equal(d1.shape, d2.shape) 293 | npt.assert_array_equal(d1, d2) 294 | 295 | 296 | def test_wrapped_ops_data(): 297 | a = DataArray(np.random.randn(4,2,6), 'xyz') 298 | for m in methods: 299 | check_data_axes(a, m, 'x', YZ if m in reductions else DA.axes) 300 | check_data_axes(a, m, 'y', XZ if m in reductions else DA.axes) 301 | check_data_axes(a, m, 'z', XY if m in reductions else DA.axes) 302 | 303 | 304 | def test_reductions_keepdims(): 305 | names = 'xyz' 306 | a = np.arange(24).reshape((2, 3, 4)) 307 | da = DataArray(a, names) 308 | for idx, name in enumerate(names): 309 | axes_removed = AXES_REMOVED[name] 310 | # Test keepdims as kwarg 311 | for method in reductions: 312 | check_data_axes(da, method, name, axes_removed) 313 | if method not in ('ptp', 'argmin', 'argmax'): 314 | # Reductions taking keepdims argument 315 | check_data_axes(da, method, name, DA.axes, keepdims=True) 316 | # Test the individual functions with positional args 317 | dt = np.dtype(float) 318 | out = np.mean(da, axis=name) 319 | kd_out = DataArray(np.mean(a, axis=idx, keepdims=True), names) 320 | # Functions with signature axis, dtype, out, keepdims 321 | for method in ('mean', 'sum', 'prod', 'all', 'any'): 322 | check_data_axes(da, method, name, axes_removed, dt, out) 323 | check_data_axes(da, method, name, DA.axes, dt, kd_out, True) 324 | # Signature axis, out, dtype, ddof, keepdims 325 | for method in ('var', 'std'): 326 | check_data_axes(da, method, name, axes_removed, dt, out, 0) 327 | check_data_axes(da, method, name, DA.axes, dt, kd_out, 0, True) 328 | # Signature axis, out, keepdims 329 | for method in ('min', 'max'): 330 | check_data_axes(da, method, name, axes_removed, out) 331 | check_data_axes(da, method, name, DA.axes, kd_out, True) 332 | # Test reductions not using keepdims 333 | out_int = out.astype(np.intp) # argmin/max have integer output 334 | for method in ('argmin', 'argmax'): 335 | check_data_axes(da, method, name, axes_removed, out_int) 336 | check_data_axes(da, 'ptp', name, axes_removed, out) 337 | 338 | 339 | # -- Tests for slicing with "newaxis" ---------------------------------------- 340 | def test_newaxis_slicing(): 341 | b = DataArray([[1,2],[3,4],[5,6]], 'xy') 342 | b2 = b[np.newaxis] 343 | nt.assert_true(b2.shape == (1,) + b.shape) 344 | nt.assert_true(b2.axes[0].name == None) 345 | 346 | b2 = b[:,np.newaxis] 347 | nt.assert_true(b2.shape == (3,1,2)) 348 | nt.assert_true((b2[:,0,:]==b).all()) 349 | 350 | # -- Testing broadcasting features ------------------------------------------- 351 | def test_broadcast(): 352 | b = DataArray([[1,2],[3,4],[5,6]], 'xy') 353 | a = DataArray([1,0], 'y') 354 | # both of these should work 355 | c = b + a 356 | nt.assert_true(c.names == ('x', 'y'), 
'simple broadcast failed') 357 | c = a + b 358 | nt.assert_true(c.names == ('x', 'y'), 359 | 'backwards simple broadcast failed') 360 | 361 | a = DataArray([1, 1, 1], 'x') 362 | # this should work too 363 | c = a[:,np.newaxis] + b 364 | nt.assert_true(c.names == ('x', 'y'), 'forward broadcast1 failed') 365 | c = b + a[:,np.newaxis] 366 | nt.assert_true(c.names == ('x', 'y'), 'forward broadcast2 failed') 367 | 368 | b = DataArray(np.random.randn(3,2,4), ['x', None, 'y']) 369 | a = DataArray(np.random.randn(2,4), [None, 'y']) 370 | # this should work 371 | c = b + a 372 | nt.assert_true(c.names == ('x', None, 'y'), 373 | 'broadcast with unlabeled dimensions failed') 374 | # and this 375 | a = DataArray(np.random.randn(2,1), [None, 'y']) 376 | c = b + a 377 | nt.assert_true( 378 | c.names == ('x', None, 'y'), 379 | 'broadcast with matched name, but singleton dimension failed') 380 | # check that labeled Axis names the resulting Axis 381 | b = DataArray(np.random.randn(3,2,4), ['x', 'z', 'y']) 382 | a = DataArray(np.random.randn(2,4), [None, 'y']) 383 | # this should work 384 | c = b + a 385 | nt.assert_true(c.names == ('x', 'z', 'y'), 386 | 'broadcast with unlabeled dimensions failed') 387 | 388 | 389 | # -- Testing slicing failures ------------------------------------------------ 390 | @nt.raises(NamedAxisError) 391 | def test_broadcast_fails1(): 392 | a = DataArray( np.random.randn(5,6), 'yz' ) 393 | b = DataArray( np.random.randn(5,6), 'xz' ) 394 | c = a + b 395 | 396 | @nt.raises(ValueError) 397 | def test_broadcast_fails2(): 398 | a = DataArray( np.random.randn(2,5,6), 'xy' ) # last axis is unlabeled 399 | b = DataArray( np.random.randn(2,6,6), 'xy' ) 400 | # this should fail simply because the dimensions are not matched 401 | c = a + b 402 | 403 | @nt.raises(IndexError) 404 | def test_indexing_fails(): 405 | "Ensure slicing non-existent dimension fails" 406 | a = DataArray( np.random.randn(2,5,6), 'xy' ) 407 | a[:2,:1,:2,:5] 408 | 409 | @nt.raises(IndexError) 410 | def test_ambiguous_ellipsis_fails(): 411 | a = DataArray( np.random.randn(2,5,6), 'xy' ) 412 | a[...,0,...] 413 | 414 | def test_ellipsis_slicing(): 415 | a = DataArray( np.random.randn(2,5,6), 'xy' ) 416 | nt.assert_true((a[...,0] == a[:,:,0]).all(), 417 | 'slicing with ellipsis failed') 418 | nt.assert_true((a[0,...] == a[0]).all(), 419 | 'slicing with ellipsis failed') 420 | nt.assert_true((a[0,...,0] == a[0,:,0]).all(), 421 | 'slicing with ellipsis failed') 422 | 423 | def test_shifty_axes(): 424 | arr = np.random.randn(2,5,6) 425 | a = DataArray( arr, 'xy' ) 426 | # slicing out the "x" Axis triggered the unlabeled axis to change 427 | # name from "_2" to "_1".. 
make sure that this change is mapped 428 | b = a[0,:2] 429 | nt.assert_true((b == arr[0,:2]).all(), 'shifty axes strike again!') 430 | 431 | # -- Testing utility functions ----------------------------------------------- 432 | from datarray.datarray import _expand_ellipsis, _make_singleton_axes 433 | 434 | def test_ellipsis_expansion(): 435 | slicing = ( slice(2), Ellipsis, 2 ) 436 | fixed = _expand_ellipsis(slicing, 4) 437 | should_be = ( slice(2), slice(None), slice(None), 2 ) 438 | nt.assert_true(fixed==should_be, 'wrong slicer1') 439 | fixed = _expand_ellipsis(slicing, 2) 440 | should_be = ( slice(2), 2 ) 441 | nt.assert_true(fixed==should_be, 'wrong slicer2') 442 | 443 | def test_singleton_axis_prep(): 444 | b = DataArray( np.random.randn(5,6), 'xz' ) 445 | slicing = ( None, ) 446 | shape, axes, key = _make_singleton_axes(b, slicing) 447 | 448 | key_should_be = (slice(None), ) # should be trimmed 449 | shape_should_be = (1,5,6) 450 | ax_should_be = [ Axis(l, i, b) for i, l in enumerate((None, 'x', 'z')) ] 451 | 452 | nt.assert_true(key_should_be==key, 'key translated poorly') 453 | nt.assert_true(shape_should_be==shape, 'shape computed poorly') 454 | nt.assert_true(all([a1==a2 for a1,a2 in zip(ax_should_be, axes)]), 455 | 'axes computed poorly') 456 | 457 | def test_singleton_axis_prep2(): 458 | # a little more complicated 459 | b = DataArray( np.random.randn(5,6), 'xz' ) 460 | slicing = ( 0, None ) 461 | shape, axes, key = _make_singleton_axes(b, slicing) 462 | 463 | key_should_be = (0, ) # should be trimmed 464 | shape_should_be = (5,1,6) 465 | ax_should_be = [ Axis(l, i, b) for i, l in enumerate(('x', None, 'z')) ] 466 | 467 | nt.assert_true(key_should_be==key, 'key translated poorly') 468 | nt.assert_true(shape_should_be==shape, 'shape computed poorly') 469 | nt.assert_true(all([a1==a2 for a1,a2 in zip(ax_should_be, axes)]), 470 | 'axes computed poorly') 471 | 472 | # -- Test binary operations -------------------------------------------------- 473 | 474 | def test_label_mismatch(): 475 | dar1 = DataArray([1, 2], [('time', ['A1', 'B1'])]) 476 | dar2 = DataArray([1, 2], [('time', ['A2', 'B2'])]) 477 | nt.assert_raises(NamedAxisError, dar1.__add__, dar2) 478 | nt.assert_raises(NamedAxisError, dar1.__sub__, dar2) 479 | nt.assert_raises(NamedAxisError, dar1.__mul__, dar2) 480 | nt.assert_raises(NamedAxisError, dar1.__floordiv__, dar2) 481 | nt.assert_raises(NamedAxisError, dar1.__truediv__, dar2) 482 | if not PY3: 483 | nt.assert_raises(NamedAxisError, dar1.__div__, dar2) 484 | 485 | # -- Test DataArray.axes 486 | class TestAxesManager(object): 487 | def setUp(self): 488 | self.axes_spec = ('date', ('stocks', ('aapl', 'ibm', 'goog', 'msft')), 'metric') 489 | self.A = DataArray(np.random.randn(200, 4, 10), axes=self.axes_spec) 490 | 491 | def test_axes_name_collision(self): 492 | "Test .axes object for attribute collisions with axis names" 493 | A = DataArray(np.arange(6).reshape([1,2,3]), 494 | ('_arr', '_axes', '_namemap')) 495 | nt.assert_true(A.axes[0] is A.axes('_arr') is A.axes._arr) 496 | nt.assert_true(A.axes[1] is A.axes('_axes') is A.axes._axes) 497 | nt.assert_true(A.axes[2] is A.axes('_namemap') is A.axes._namemap) 498 | 499 | # Try to invoke some methods that use these attributes internally 500 | B = A[np.newaxis, ...] 
501 | nt.assert_equal(B.shape, (1,1,2,3)) 502 | nt.assert_true(np.all(A + A == 2*A)) 503 | 504 | def test_axes_numeric_access(self): 505 | for i,spec in enumerate(self.axes_spec): 506 | try: 507 | name,labels = spec 508 | except ValueError: 509 | name,labels = spec,None 510 | nt.assert_true(self.A.axes[i] == Axis(name=name, index=i, 511 | parent_arr=self.A, labels=labels)) 512 | 513 | def test_axes_attribute_access(self): 514 | for spec in self.axes_spec: 515 | try: 516 | name,labels = spec 517 | except ValueError: 518 | name,labels = spec,None 519 | nt.assert_true(getattr(self.A.axes, name) is self.A.axes(name)) 520 | 521 | def test_equality(self): 522 | B = DataArray(np.random.randn(200, 4, 10), axes=self.axes_spec) 523 | nt.assert_true(self.A.axes == B.axes) 524 | # What if axes differ by labels only? 525 | D = DataArray(np.random.randn(200, 4, 10), axes=('date', 'stocks', 'metric')) 526 | nt.assert_false(self.A.axes == D.axes) 527 | -------------------------------------------------------------------------------- /datarray/tests/test_print.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datarray.datarray import DataArray 3 | from datarray.print_grid import datarray_to_string 4 | 5 | def test_2d_datarray_to_string(): 6 | grid_string = """ 7 | country year 8 | --------- ------------------------------------------------- 9 | 1994 1998 2002 2006 2010 10 | Netherlan 0. 0.142857 0.285714 0.428571 0.571429 11 | Uruguay 0.714286 0.857143 1. 1.142857 1.285714 12 | Germany 1.428571 1.571429 1.714286 1.857143 2. 13 | Spain 2.142857 2.285714 2.428571 2.571429 2.714286 14 | """.strip() 15 | 16 | test_array = np.arange(20).reshape((4, 5)) / 7.0 17 | row_spec = 'country', ['Netherlands', 'Uruguay', 'Germany', 'Spain'] 18 | col_spec = 'year', list(map(str, [1994, 1998, 2002, 2006, 2010])) 19 | 20 | d_arr = DataArray(test_array, [row_spec, col_spec]) 21 | assert datarray_to_string(d_arr) == grid_string 22 | 23 | 24 | def test_1d_datarray_to_string(): 25 | grid_string = """ 26 | country 27 | --------------------------------------- 28 | Netherla Uruguay Germany Spain 29 | 0. 0.714286 1.428571 2.142857 30 | """.strip() 31 | 32 | test_array = np.arange(20).reshape((4, 5)) / 7.0 33 | row_spec = 'country', ['Netherlands', 'Uruguay', 'Germany', 'Spain'] 34 | col_spec = 'year', list(map(str, [1994, 1998, 2002, 2006, 2010])) 35 | 36 | d_arr = DataArray(test_array, [row_spec, col_spec]) 37 | assert datarray_to_string(d_arr.axes.year['1994']) == grid_string 38 | 39 | -------------------------------------------------------------------------------- /datarray/version.py: -------------------------------------------------------------------------------- 1 | """datarray version information""" 2 | 3 | # Format expected by setup.py and doc/source/conf.py: string of form 4 | # "X.Y.Zextra" 5 | _version_major = 0 6 | _version_minor = 2 7 | _version_micro = 0 8 | _version_extra = 'dev' # development 9 | # _version_extra = '' # release 10 | __version__ = "%s.%s.%s%s" % (_version_major, 11 | _version_minor, 12 | _version_micro, 13 | _version_extra) 14 | 15 | 16 | CLASSIFIERS = ["Development Status :: 3 - Alpha", 17 | "Environment :: Console", 18 | "Intended Audience :: Science/Research", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python", 22 | "Topic :: Scientific/Engineering"] 23 | 24 | description = "NumPy arrays with named axes and named indices." 
25 | 26 | # Note: this long_description is actually a copy/paste from the top-level 27 | # README.rst, so that it shows up nicely on PyPI. So please remember to edit 28 | # it only in one place and sync it correctly. I (MB) edit both in vim windows 29 | # and use vim diff mode to push the changes from one to the other. 30 | long_description = """ 31 | ###################################### 32 | Datarray: Numpy arrays with named axes 33 | ###################################### 34 | 35 | Scientists, engineers, mathematicians and statisticians don't just work with 36 | matrices; they often work with structured data, just like you'd find in a 37 | table. However, functionality for this is missing from Numpy, and there are 38 | efforts to create something to fill the void. This is one of those efforts. 39 | 40 | .. warning:: 41 | 42 | This code is currently experimental, and its API *will* change! It is meant 43 | to be a place for the community to understand and develop the right 44 | semantics and have a prototype implementation that will ultimately 45 | (hopefully) be folded back into Numpy. 46 | 47 | Datarray provides a subclass of Numpy ndarrays that supports: 48 | 49 | - individual dimensions (axes) being labeled with meaningful descriptions 50 | - labeled 'ticks' along each axis 51 | - indexing and slicing by named axis 52 | - indexing on any axis with the tick labels instead of only integers 53 | - reduction operations (like .sum, .mean, etc) support named axis arguments 54 | instead of only integer indices. 55 | 56 | ********* 57 | Prior Art 58 | ********* 59 | 60 | In no particular order: 61 | 62 | * `xarray `_ - very close in spirit to 63 | this package, xarray implements named ND array axes and tick labels. It 64 | integrates with (and depends on) Pandas. If you are doing production work, 65 | and don't mind the pandas dependency, please use xarray rather than this 66 | package. Xarray used to be called "xray". 67 | 68 | * `pandas `_ is based around a number of 69 | DataFrame-esque datatypes. 70 | 71 | * `Tabular `_ implements a 72 | spreadsheet-inspired datatype, with rows/columns, csv/etc. IO, and fancy 73 | tabular operations. 74 | 75 | * `scikits.statsmodels `_ sounded as 76 | though it had some features we'd like to eventually see implemented on top of 77 | something such as datarray, and `Skipper `_ 78 | seemed pretty interested in something like this himself. 79 | 80 | * `scikits.timeseries `_ also has a 81 | time-series-specific object that's somewhat reminiscent of labeled arrays. 82 | 86 | * `pydataframe `_ is supposed to be a 87 | clone of R's data.frame. 88 | 89 | * `larry `_, or "labeled array," often comes up 90 | in discussions alongside pandas. 91 | 92 | * `divisi `_ includes labeled sparse and 93 | dense arrays. 94 | 95 | * `pymvpa `_ provides a Dataset class 96 | encapsulating the data together with length-matched sets of attributes 97 | for the first two dimensions (samples and features). Dataset is not a 98 | subclass of numpy array, to allow other data structures (e.g. sparse 99 | matrices). 100 | 101 | * `ptsa `_ subclasses 102 | ndarray to provide per-dimension attributes, aiming to ease slicing/indexing 103 | based on the values of the axis attributes. 104 | 105 | ************* 106 | Project Goals 107 | ************* 108 | 109 | 1. Get something akin to this in the numpy core; 110 | 2.
Stick to basic functionality such that projects like scikits.statsmodels can 111 | use it as a base datatype; 112 | 3. Make an interface that allows for simple, pretty manipulation that doesn't 113 | introduce confusion; 114 | 4. Oh, and make sure that the base numpy array is still accessible. 115 | 116 | **** 117 | Code 118 | **** 119 | 120 | You can find our sources and single-click downloads: 121 | 122 | * `Main repository`_ on Github; 123 | * Documentation_ for the current release; 124 | * Download the `current trunk`_ as a tar/zip file; 125 | * Downloads of all `available releases`_. 126 | 127 | The latest released version is always available from `pypi 128 | `_. 129 | 130 | ******* 131 | Support 132 | ******* 133 | 134 | Please put up issues on the `datarray issue tracker 135 | `_. 136 | 137 | .. _main repository: http://github.com/bids/datarray 138 | .. _Documentation: http://bids.github.com/datarray 139 | .. _current trunk: http://github.com/bids/datarray/archives/master 140 | .. _available releases: http://github.com/bids/datarray/releases 141 | """ 142 | 143 | 144 | NAME = 'datarray' 145 | MAINTAINER = "Numpy Developers" 146 | MAINTAINER_EMAIL = "numpy-discussion@scipy.org" 147 | DESCRIPTION = description 148 | LONG_DESCRIPTION = long_description 149 | URL = "http://github.com/bids/datarray" 150 | DOWNLOAD_URL = "http://github.com/bids/datarray/archives/master" 151 | LICENSE = "Simplified BSD" 152 | CLASSIFIERS = CLASSIFIERS 153 | AUTHOR = "Datarray developers" 154 | AUTHOR_EMAIL = "numpy-discussion@scipy.org" 155 | PLATFORMS = "OS Independent" 156 | MAJOR = _version_major 157 | MINOR = _version_minor 158 | MICRO = _version_micro 159 | ISRELEASED = False 160 | VERSION = __version__ 161 | PACKAGES = ["datarray", "datarray/tests", "datarray/testing"] 162 | PACKAGE_DATA = {'datarray': ['LICENSE']} 163 | REQUIRES = ["numpy (>=1.7)"] 164 | INSTALL_REQUIRES = ["numpy>=1.7"] 165 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest all 16 | 17 | all: html 18 | 19 | help: 20 | @echo "Please use \`make <target>' where <target> is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " gh-pages to make the docs in Github-pages form" 23 | @echo " dirhtml to make HTML files named index.html in directories" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " changes to make an overview of all changed/added/deprecated items" 30 | @echo " linkcheck to check all external links for integrity" 31 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 32 | 33 | clean: 34 | -rm -rf $(BUILDDIR)/* source/generated/* 35 | 36 | apidocs: 37 | sphinx-apidoc -f -o source/generated ../datarray ../datarray/tests 38 | 39 | html: apidocs 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | pickle: 50 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 51 | @echo 52 | @echo "Build finished; now you can process the pickle files." 53 | 54 | json: 55 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 56 | @echo 57 | @echo "Build finished; now you can process the JSON files." 58 | 59 | htmlhelp: 60 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 61 | @echo 62 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 63 | ".hhp project file in $(BUILDDIR)/htmlhelp." 64 | 65 | qthelp: 66 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 67 | @echo 68 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 69 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 70 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DataArrayDocs.qhcp" 71 | @echo "To view the help file:" 72 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DataArrayDocs.qhc" 73 | 74 | latex: 75 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 76 | @echo 77 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 78 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 79 | "run these through (pdf)latex." 80 | 81 | changes: 82 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 83 | @echo 84 | @echo "The overview file is in $(BUILDDIR)/changes." 85 | 86 | linkcheck: 87 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 88 | @echo 89 | @echo "Link check complete; look for any errors in the above output " \ 90 | "or in $(BUILDDIR)/linkcheck/output.txt." 91 | 92 | doctest: 93 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 94 | @echo "Testing of doctests in the sources finished, look at the " \ 95 | "results in $(BUILDDIR)/doctest/output.txt."
96 | 97 | github: html 98 | touch $(BUILDDIR)/html/.nojekyll 99 | ghp-import $(BUILDDIR)/html/ 100 | git push -u origin gh-pages 101 | @echo 102 | @echo "Published to Github" 103 | -------------------------------------------------------------------------------- /doc/README.txt: -------------------------------------------------------------------------------- 1 | ====================== 2 | datarray Documentation 3 | ====================== 4 | 5 | This is the top level build directory for the datarray documentation. All 6 | of the documentation is written using Sphinx_, a Python documentation 7 | system built on top of reST_. In order to build the documentation, 8 | you must have Sphinx v1.0 or greater installed. 9 | 10 | This directory contains: 11 | 12 | * Makefile - the build script to build the HTML or PDF docs. Type 13 | ``make help`` for a list of options. 14 | 15 | * source - the directory containing the reST source 16 | 17 | * source/links_names.inc - reST document with hyperlink targets for common 18 | links used throughout the documentation 19 | 20 | * source/conf.py - the sphinx configuration. 21 | 22 | * source/_static - used by the sphinx build system. 23 | 24 | * source/_templates - used by the sphinx build system. 25 | 26 | Building the documentation 27 | -------------------------- 28 | 29 | You should first install the documentation dependencies. From this directory:: 30 | 31 | pip install -r doc-requirements.txt 32 | 33 | Then:: 34 | 35 | make html 36 | 37 | .. Since this README.txt is not processed by Sphinx during the 38 | .. documentation build, I've included the links directly so it is at 39 | .. least a valid reST doc. 40 | 41 | .. _Sphinx: http://sphinx.pocoo.org/ 42 | .. _reST: http://docutils.sourceforge.net/rst.html 43 | .. _numpy: http://www.scipy.org/NumPy 44 | -------------------------------------------------------------------------------- /doc/devel/make_release.rst: -------------------------------------------------------------------------------- 1 | .. _release-guide: 2 | 3 | ********************************** 4 | Guide to making a datarray release 5 | ********************************** 6 | 7 | A guide for developers making a datarray release. 8 | 9 | .. _release-checklist: 10 | 11 | Release checklist 12 | ================= 13 | 14 | * Review the open list of `datarray issues`_. Check whether there are 15 | outstanding issues that can be closed, and whether there are any issues that 16 | should delay the release. Label them ! 17 | 18 | * Review and update the release notes. Review and update the :file:`Changelog` 19 | file. Get a partial list of contributors with something like:: 20 | 21 | git shortlog -ns 0.6.0.. 22 | 23 | where ``0.6.0`` was the last release tag name. 24 | 25 | Then manually go over ``git shortlog 0.6.0..`` to make sure the release 26 | notes are as complete as possible and that every contributor was recognized; 27 | 28 | * Use the opportunity to update the ``.mailmap`` file if there are any 29 | duplicate authors listed from ``git shortlog -ns``; 30 | 31 | * Add any new authors to the ``AUTHORS`` file. 
Add any new entries to the 32 | ``THANKS`` file; 33 | 34 | * Check the copyright years in ``doc/source/conf.py`` and 35 | ``datarray/LICENSE``; 36 | 37 | * Check that the ``README.rst`` text is the same as the text in the 38 | ``long_description`` field in ``version.py``; 39 | 40 | * If you have travis-ci_ building set up you might want to push the code in its 41 | current state to a branch that will build, e.g:: 42 | 43 | git branch -D pre-release-test # in case branch already exists 44 | git co -b pre-release-test 45 | 46 | * Clean:: 47 | 48 | git clean -fxd 49 | 50 | * Make sure all tests pass on your local machine (from the datarray root 51 | directory):: 52 | 53 | nosetests --with-doctest datarray 54 | 55 | Do this on a Python 2 and Python 3 setup. 56 | 57 | * Consider running the same tests after installing into a virtualenv, to test 58 | that installing works correctly:: 59 | 60 | mkvirtualenv datarray-test 61 | pip install nose wheel 62 | git clean -fxd 63 | python setup.py install 64 | mkdir for_test 65 | cd for_test 66 | nosetests --with-doctest datarray 67 | 68 | * Check the documentation doctests:: 69 | 70 | cd doc 71 | make doctest 72 | cd .. 73 | 74 | * The release should now be ready. 75 | 76 | Doing the release 77 | ================= 78 | 79 | * Edit :file:`datarray/version.py` to set ``_version_*`` strings to the 80 | version you want. Make ``_version_extra`` be the empty string for the 81 | release; 82 | 83 | * Check you are getting the version / package name that you want by doing:: 84 | 85 | git clean -fxd 86 | python setup.py sdist --formats=gztar,zip 87 | python setup.py bdist_wheel 88 | 89 | and checking the output filenames in ``dist/``; 90 | 91 | * Make a signed tag for the release with tag of form ``0.6.0``:: 92 | 93 | git tag -s -m 'Fifth public release' 0.6.0 94 | 95 | * Once everything looks good, upload the source release to PyPi, using `twine 96 | `_:: 97 | 98 | twine upload dist/datarray* 99 | 100 | * Remember you'll need your ``~/.pypirc`` file set up right for this to work. 101 | See `setuptools intro`_. The file should look something like this:: 102 | 103 | [distutils] 104 | index-servers = 105 | pypi 106 | 107 | [pypi] 108 | username:your.pypi.username 109 | password:your-password 110 | 111 | [server-login] 112 | username:your.pypi.username 113 | password:your-password 114 | 115 | * Check how everything looks on pypi - the description, the packages. 116 | 117 | * Push the tag with ``git push origin 0.6.0`` 118 | 119 | * Push the documentation up to github with:: 120 | 121 | cd doc 122 | make github 123 | 124 | * Edit ``datarray/version.py`` to set to the next upcoming version. Set 125 | ``_version_extra`` to ``dev``. Commit and push. 126 | 127 | * Announce to the mailing lists. 128 | 129 | .. datarray code stuff 130 | .. _datarray github: http://github.com/bids/datarray 131 | .. _datarray pypi: http://pypi.python.org/pypi/datarray 132 | .. _datarray issues: http://github.com/bids/datarray/issues 133 | .. 
_datarray travis-ci: https://travis-ci.org/bids/datarray 134 | -------------------------------------------------------------------------------- /doc/doc-requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements for building docs 2 | # Use with: 3 | # pip install -r doc-requirements.txt 4 | 5 | -r ../requirements.txt 6 | sphinx>=1.3 7 | ghp-import 8 | -------------------------------------------------------------------------------- /doc/source/basic_data_array.rst: -------------------------------------------------------------------------------- 1 | .. testsetup:: 2 | 3 | import numpy as np 4 | from datarray import DataArray 5 | 6 | ============ 7 | DataArrays 8 | ============ 9 | 10 | .. _init_ufuncs: 11 | 12 | 13 | Basic DataArray Creation And Mixing 14 | =================================== 15 | 16 | DataArrays are constructed with array-like sequences and axis names: 17 | 18 | .. doctest:: 19 | 20 | >>> narr = DataArray(np.zeros((1,2,3)), axes=('a', 'b', 'c')) 21 | >>> narr.names 22 | ('a', 'b', 'c') 23 | >>> narr.axes.a 24 | Axis(name='a', index=0, labels=None) 25 | >>> narr.axes.b 26 | Axis(name='b', index=1, labels=None) 27 | >>> narr.axes.c 28 | Axis(name='c', index=2, labels=None) 29 | >>> narr.shape 30 | (1, 2, 3) 31 | 32 | Not all axes must necessarily be explicitly named, since None is a valid axis 33 | name: 34 | 35 | .. doctest:: 36 | 37 | >>> narr2 = DataArray(np.zeros((1,2,3)), axes=('a', None, 'b' )) 38 | >>> narr2.names 39 | ('a', None, 'b') 40 | 41 | If no name is given for an axis, None is implicitly assumed. So trailing axes 42 | without names will be named as None: 43 | 44 | .. doctest:: 45 | 46 | >>> narr2 = DataArray(np.zeros((1,2,3,2)), axes=('a','b' )) 47 | >>> narr2.names 48 | ('a', 'b', None, None) 49 | 50 | Combining named and unnamed arrays: 51 | 52 | .. doctest:: 53 | 54 | >>> narr = DataArray(np.zeros((1,2,3)), axes='abc') 55 | >>> res = narr + 5 # OK 56 | >>> res = narr + np.zeros((1,2,3)) # OK 57 | >>> n2 = DataArray(np.ones((1,2,3)), axes=('a','b','c')) 58 | >>> res = narr + n2 # OK 59 | 60 | >>> n3 = DataArray(np.ones((1,2,3)), axes=('x','b','c')) 61 | 62 | >>> res = narr + n3 63 | Traceback (most recent call last): 64 | ... 65 | NamedAxisError: Axis names are incompatible for a binary operation: ('a', 'b', 'c'), ('x', 'b', 'c') 66 | 67 | 68 | Now, what about matching names, but different indices for the names? 69 | 70 | .. doctest:: 71 | 72 | >>> n4 = DataArray(np.ones((2,1,3)), axes=('b','a','c')) 73 | >>> res = narr + n4 # is this OK? 74 | Traceback (most recent call last): 75 | ... 76 | NamedAxisError: Axis names are incompatible for a binary operation: ('a', 'b', 'c'), ('b', 'a', 'c') 77 | 78 | The names and the positions have to be the same, and the above example should 79 | raise an error. At least for now we will raise an error, and review later. 80 | 81 | With "labels" 82 | ------------- 83 | 84 | Constructing a DataArray such that an Axis has labels, for example: 85 | 86 | .. doctest:: 87 | 88 | >>> cap_ax_spec = 'capitals', ['washington', 'london', 'berlin', 'paris', 'moscow'] 89 | >>> time_ax_spec = 'time', ['0015', '0615', '1215', '1815'] 90 | >>> time_caps = DataArray(np.arange(4*5).reshape(4,5), [time_ax_spec, cap_ax_spec]) 91 | >>> time_caps.axes 92 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']), Axis(name='capitals', index=1, labels=['washington', 'london', 'berlin', 'paris', 'moscow'])) 93 | 94 | ..
_slicing: 95 | 96 | Slicing 97 | ======= 98 | 99 | A DataArray with simple named axes can be sliced many ways. 100 | 101 | Per Axis: 102 | 103 | .. doctest:: 104 | 105 | >>> narr = DataArray(np.zeros((1,2,3)), axes=('a','b','c')) 106 | >>> narr.axes.a 107 | Axis(name='a', index=0, labels=None) 108 | >>> narr.axes.a[0] 109 | DataArray(array([[ 0., 0., 0.], 110 | [ 0., 0., 0.]]), 111 | ('b', 'c')) 112 | >>> narr.axes.a[0].axes 113 | (Axis(name='b', index=0, labels=None), Axis(name='c', index=1, labels=None)) 114 | 115 | By normal "numpy" slicing: 116 | 117 | .. doctest:: 118 | 119 | >>> narr[0].shape 120 | (2, 3) 121 | >>> narr[0].axes 122 | (Axis(name='b', index=0, labels=None), Axis(name='c', index=1, labels=None)) 123 | >>> narr.axes.a[0].axes == narr[0,:].axes 124 | True 125 | 126 | Also, slicing with ``newaxis`` is implemented: 127 | 128 | .. doctest:: 129 | 130 | >>> arr = np.arange(24).reshape((3,2,4)) 131 | >>> b = DataArray(arr, ['x', 'y', 'z']) 132 | >>> b[:,:,np.newaxis].shape 133 | (3, 2, 1, 4) 134 | >>> b[:,:,np.newaxis].names 135 | ('x', 'y', None, 'z') 136 | 137 | I can also slice with ``newaxis`` at each Axis. The effect of this is always 138 | to insert an unnamed Axis with length-1 at the original index of the named 139 | Axis: 140 | 141 | .. doctest:: 142 | 143 | >>> b.axes 144 | (Axis(name='x', index=0, labels=None), Axis(name='y', index=1, labels=None), Axis(name='z', index=2, labels=None)) 145 | >>> b.axes.y[np.newaxis].names 146 | ('x', None, 'y', 'z') 147 | >>> b.axes.y[np.newaxis].shape 148 | (3, 1, 2, 4) 149 | 150 | Slicing and labels 151 | ------------------ 152 | 153 | It is also possible to use labels in any of the slicing syntax above: 154 | 155 | .. doctest:: 156 | 157 | >>> time_caps #doctest: +NORMALIZE_WHITESPACE 158 | DataArray(array([[ 0, 1, 2, 3, 4], 159 | [ 5, 6, 7, 8, 9], 160 | [10, 11, 12, 13, 14], 161 | [15, 16, 17, 18, 19]]), 162 | (('time', ('0015', '0615', '1215', '1815')), ('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')))) 163 | >>> time_caps.axes.capitals['berlin'::-1] #doctest: +NORMALIZE_WHITESPACE 164 | DataArray(array([[ 2, 1, 0], 165 | [ 7, 6, 5], 166 | [12, 11, 10], 167 | [17, 16, 15]]), 168 | (('time', ('0015', '0615', '1215', '1815')), ('capitals', ('berlin', 'london', 'washington')))) 169 | >>> time_caps.axes.time['0015':'1815'] #doctest: +NORMALIZE_WHITESPACE 170 | DataArray(array([[ 0, 1, 2, 3, 4], 171 | [ 5, 6, 7, 8, 9], 172 | [10, 11, 12, 13, 14]]), 173 | (('time', ('0015', '0615', '1215')), ('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')))) 174 | >>> time_caps[:, 'london':3] #doctest: +NORMALIZE_WHITESPACE 175 | DataArray(array([[ 1, 2], 176 | [ 6, 7], 177 | [11, 12], 178 | [16, 17]]), 179 | (('time', ('0015', '0615', '1215', '1815')), ('capitals', ('london', 'berlin')))) 180 | 181 | The .start and .stop attributes of the slice object can be either None, an 182 | integer index, or a valid label. They may even be mixed. *The .step attribute, 183 | however, must be None or a nonzero integer.* 184 | 185 | **Historical note: previously integer labels clobbered indices.** For example:: 186 | 187 | >>> centered_data = DataArray(np.random.randn(6), [ ('c_idx', range(-3,3)) ]) 188 | >>> centered_data.axes.c_idx.make_slice( slice(0, 6, None) ) 189 | (slice(3, 6, None),) 190 | 191 | .. note:: 192 | 193 | The code above doesn't currently (as of Nov/2010) run, because integer 194 | labels haven't been implemented. See ticket gh-40.
195 | 196 | make_slice() first tries to look up the key parameters as labels, and then sees 197 | if the key parameters can be used as simple indices. Thus 0 is found as index 198 | 3, and 6 is passed through as index 6. 199 | 200 | Possible resolution 1 201 | ~~~~~~~~~~~~~~~~~~~~~ 202 | 203 | "larry" would make this distinction:: 204 | 205 | >>> centered_data.axes.c_idx[ [0]:[2] ] 206 | >>> < returns underlying array from [3:5] > 207 | >>> centered_data.axes.c_idx[ 0:2 ] 208 | >>> < returns underlying array from [0:2] > 209 | 210 | And I believe mixing of labels and integers is valid also. 211 | 212 | Possible resolution 2 (the winner) 213 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 214 | 215 | Do not allow integer labels -- cast to float perhaps 216 | 217 | **Note**: this will be the solution. When validating labels on an Axis, ensure 218 | that none of them ``isinstance(t, int)`` 219 | 220 | 221 | Possible resolution 3 222 | ~~~~~~~~~~~~~~~~~~~~~ 223 | 224 | Restrict access to tick-based slicing to another special slicing object. 225 | 226 | .. _broadcasting: 227 | 228 | Broadcasting 229 | ============ 230 | 231 | What about broadcasting between two named arrays, where the broadcasting 232 | adds an axis? All ordinary NumPy rules for shape compatibility apply. 233 | Additionally, DataArray imposes axis name consistency rules. 234 | 235 | The broadcasted DataArray below, "a", takes on dummy dimensions that are taken 236 | to be compatible with the larger DataArray: 237 | 238 | .. doctest:: 239 | 240 | >>> b = DataArray(np.ones((3,3)), axes=('x','y')) 241 | >>> a = DataArray(np.ones((3,)), axes=('y',)) 242 | >>> res = 2*b - a 243 | >>> res # doctest: +NORMALIZE_WHITESPACE 244 | DataArray(array([[ 1., 1., 1.], 245 | [ 1., 1., 1.], 246 | [ 1., 1., 1.]]), 247 | ('x', 'y')) 248 | 249 | When there are unnamed dimensions, they also must be consistently oriented 250 | across arrays when broadcasting: 251 | 252 | .. doctest:: 253 | 254 | >>> b = DataArray(np.arange(24).reshape(3,2,4), ['x', None, 'y']) 255 | >>> a = DataArray(np.arange(8).reshape(2,4), [None, 'y']) 256 | >>> res = a + b 257 | >>> res 258 | DataArray(array([[[ 0, 2, 4, 6], 259 | [ 8, 10, 12, 14]], 260 | 261 | [[ 8, 10, 12, 14], 262 | [16, 18, 20, 22]], 263 | 264 | [[16, 18, 20, 22], 265 | [24, 26, 28, 30]]]), 266 | ('x', None, 'y')) 267 | 268 | We already know that if the dimension names don't match, this won't be allowed 269 | (even though the shapes are correct): 270 | 271 | .. doctest:: 272 | 273 | >>> b = DataArray(np.ones((3,3)), axes=('x','y')) 274 | >>> a = DataArray(np.ones((3,)), axes=('x',)) 275 | >>> res = 4*b - a 276 | Traceback (most recent call last): 277 | ... 278 | NamedAxisError: Axis names are incompatible for a binary operation: ('x', 'y'), ('x',) 279 | 280 | But a numpy idiom for padding dimensions helps us in this case: 281 | 282 | .. doctest:: 283 | 284 | >>> res = 2*b - a[:,None] 285 | >>> res # doctest: +NORMALIZE_WHITESPACE 286 | DataArray(array([[ 1., 1., 1.], 287 | [ 1., 1., 1.], 288 | [ 1., 1., 1.]]), 289 | ('x', 'y')) 290 | 291 | In other words, this scenario is also a legal combination: 292 | 293 | ..
doctest:: 294 | 295 | >>> a2 = a[:,None] 296 | >>> a2.names 297 | ('x', None) 298 | >>> b + a2 # doctest: +NORMALIZE_WHITESPACE 299 | DataArray(array([[ 2., 2., 2.], 300 | [ 2., 2., 2.], 301 | [ 2., 2., 2.]]), 302 | ('x', 'y')) 303 | 304 | The rule for dimension compatibility is that any two axes match if one of the following is true 305 | 306 | * their (name, length) pairs are equal 307 | * their dimensions are broadcast-compatible, and their axes are equal 308 | * their dimensions are broadcast-compatible, and their axes are 309 | non-conflicting (ie, one or both are None) 310 | 311 | **Question** -- what about this situation: 312 | 313 | .. doctest:: 314 | 315 | >>> b = DataArray(np.ones((3,3)), axes=('x','y')) 316 | >>> a = DataArray(np.ones((3,1)), axes=('x','y')) 317 | >>> a+b # doctest: +NORMALIZE_WHITESPACE 318 | DataArray(array([[ 2., 2., 2.], 319 | [ 2., 2., 2.], 320 | [ 2., 2., 2.]]), 321 | ('x', 'y')) 322 | 323 | The broadcasting rules currently allow this combination. I'm inclined to allow 324 | it. Even though the axes are different lengths in ``a`` and ``b``, and 325 | therefore *might* be considered different logical axes, there is no actual 326 | information collision from ``a.axes.y``. 327 | 328 | .. _iteration: 329 | 330 | Iteration 331 | ========= 332 | 333 | seems to work: 334 | 335 | .. doctest:: 336 | 337 | >>> for foo in time_caps: 338 | ... print foo 339 | ... print foo.axes 340 | ... 341 | DataArray([0 1 2 3 4], 342 | (('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')),)) 343 | (Axis(name='capitals', index=0, labels=['washington', 'london', 'berlin', 'paris', 'moscow']),) 344 | DataArray([5 6 7 8 9], 345 | (('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')),)) 346 | (Axis(name='capitals', index=0, labels=['washington', 'london', 'berlin', 'paris', 'moscow']),) 347 | DataArray([10 11 12 13 14], 348 | (('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')),)) 349 | (Axis(name='capitals', index=0, labels=['washington', 'london', 'berlin', 'paris', 'moscow']),) 350 | DataArray([15 16 17 18 19], 351 | (('capitals', ('washington', 'london', 'berlin', 'paris', 'moscow')),)) 352 | (Axis(name='capitals', index=0, labels=['washington', 'london', 'berlin', 'paris', 'moscow']),) 353 | 354 | >>> for foo in time_caps.T: 355 | ... print foo 356 | ... print foo.axes 357 | ... 358 | DataArray([ 0 5 10 15], 359 | (('time', ('0015', '0615', '1215', '1815')),)) 360 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']),) 361 | DataArray([ 1 6 11 16], 362 | (('time', ('0015', '0615', '1215', '1815')),)) 363 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']),) 364 | DataArray([ 2 7 12 17], 365 | (('time', ('0015', '0615', '1215', '1815')),)) 366 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']),) 367 | DataArray([ 3 8 13 18], 368 | (('time', ('0015', '0615', '1215', '1815')),)) 369 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']),) 370 | DataArray([ 4 9 14 19], 371 | (('time', ('0015', '0615', '1215', '1815')),)) 372 | (Axis(name='time', index=0, labels=['0015', '0615', '1215', '1815']),) 373 | 374 | Or even more conveniently: 375 | 376 | .. doctest:: 377 | 378 | >>> for foo in time_caps.axes.capitals: 379 | ... print foo 380 | ... 
381 | DataArray([ 0 5 10 15], 382 | (('time', ('0015', '0615', '1215', '1815')),)) 383 | DataArray([ 1 6 11 16], 384 | (('time', ('0015', '0615', '1215', '1815')),)) 385 | DataArray([ 2 7 12 17], 386 | (('time', ('0015', '0615', '1215', '1815')),)) 387 | DataArray([ 3 8 13 18], 388 | (('time', ('0015', '0615', '1215', '1815')),)) 389 | DataArray([ 4 9 14 19], 390 | (('time', ('0015', '0615', '1215', '1815')),)) 391 | 392 | .. _transposition: 393 | 394 | Transposition of Axes 395 | ===================== 396 | 397 | Transposition of a DataArray preserves the dimension names, and updates the 398 | corresponding indices: 399 | 400 | .. doctest:: 401 | 402 | >>> b = DataArray(np.zeros((3, 2, 4)), axes=['x', None, 'y']) 403 | >>> b.shape 404 | (3, 2, 4) 405 | >>> b.axes 406 | (Axis(name='x', index=0, labels=None), Axis(name=None, index=1, labels=None), Axis(name='y', index=2, labels=None)) 407 | >>> b.T.shape 408 | (4, 2, 3) 409 | >>> b.T.axes 410 | (Axis(name='y', index=0, labels=None), Axis(name=None, index=1, labels=None), Axis(name='x', index=2, labels=None)) 411 | 412 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # DataArray Docs documentation build configuration file, created by 4 | # sphinx-quickstart on Fri May 28 11:07:18 2010. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # sys.path.append(os.path.abspath('.')) 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | needs_sphinx = '1.3' 23 | 24 | # We load the release info into a dict by explicit execution 25 | # Use exec on contents for Python 3 compatibility 26 | rel = {} 27 | ver_file = os.path.join('..', '..', 'datarray', 'version.py') 28 | with open(ver_file, 'rt') as fobj: 29 | exec(fobj.read(), rel) 30 | 31 | # -- General configuration ----------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be extensions 34 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.doctest', 36 | 'sphinx.ext.napoleon', 37 | # Only uncomment intersphinx if we really start using it, and in 38 | # that case it should probably be conditionally added only for 39 | # release builds, because it makes network lookups on every build 40 | # and can make the process annoyingly slow. 41 | #'sphinx.ext.intersphinx', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix of source filenames. 48 | source_suffix = '.rst' 49 | 50 | # The encoding of source files. 51 | #source_encoding = 'utf-8' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # General information about the project. 
57 | project = u'DataArray Docs' 58 | copyright = u'2010-2016, %(MAINTAINER)s <%(AUTHOR_EMAIL)s>' % rel 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = rel['__version__'] 66 | # The full version, including alpha/beta/rc tags. 67 | release = version 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | #language = None 72 | 73 | # There are two options for replacing |today|: either, you set today to some 74 | # non-false value, then it is used: 75 | #today = '' 76 | # Else, today_fmt is used as the format for a strftime call. 77 | #today_fmt = '%B %d, %Y' 78 | 79 | # List of documents that shouldn't be included in the build. 80 | #unused_docs = [] 81 | 82 | # List of directories, relative to source directory, that shouldn't be searched 83 | # for source files. 84 | exclude_trees = [] 85 | 86 | # The reST default role (used for this markup: `text`) to use for all documents. 87 | #default_role = None 88 | 89 | # If true, '()' will be appended to :func: etc. cross-reference text. 90 | #add_function_parentheses = True 91 | 92 | # If true, the current module name will be prepended to all description 93 | # unit titles (such as .. function::). 94 | #add_module_names = True 95 | 96 | # If true, sectionauthor and moduleauthor directives will be shown in the 97 | # output. They are ignored by default. 98 | #show_authors = False 99 | 100 | # The name of the Pygments (syntax highlighting) style to use. 101 | pygments_style = 'sphinx' 102 | 103 | # A list of ignored prefixes for module index sorting. 104 | #modindex_common_prefix = [] 105 | 106 | 107 | # -- Options for HTML output --------------------------------------------------- 108 | 109 | # The theme to use for HTML and HTML Help pages. 110 | html_theme = 'alabaster' 111 | 112 | # Theme options are theme-specific and customize the look and feel of a theme 113 | # further. For a list of options available for each theme, see the 114 | # documentation. 115 | #html_theme_options = {} 116 | 117 | # Add any paths that contain custom themes here, relative to this directory. 118 | #html_theme_path = [] 119 | 120 | # The name for this set of Sphinx documents. If None, it defaults to 121 | # " v documentation". 122 | #html_title = None 123 | 124 | # A shorter title for the navigation bar. Default is the same as html_title. 125 | #html_short_title = None 126 | 127 | # The name of an image file (relative to this directory) to place at the top 128 | # of the sidebar. 129 | #html_logo = None 130 | 131 | # The name of an image file (within the static path) to use as favicon of the 132 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 133 | # pixels large. 134 | #html_favicon = None 135 | 136 | # Add any paths that contain custom static files (such as style sheets) here, 137 | # relative to this directory. They are copied after the builtin static files, 138 | # so a file named "default.css" will overwrite the builtin "default.css". 139 | #html_static_path = ['_static'] 140 | html_static_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 
144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_use_modindex = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, an OpenSearch description file will be output, and all pages will 170 | # contain a <link> tag referring to it. The value of this option must be the 171 | # base URL from which the finished HTML is served. 172 | #html_use_opensearch = '' 173 | 174 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 175 | #html_file_suffix = '' 176 | 177 | # Output file base name for HTML help builder. 178 | htmlhelp_basename = 'DataArrayDocsdoc' 179 | 180 | 181 | # -- Options for LaTeX output -------------------------------------------------- 182 | 183 | # The paper size ('letter' or 'a4'). 184 | #latex_paper_size = 'letter' 185 | 186 | # The font size ('10pt', '11pt' or '12pt'). 187 | #latex_font_size = '10pt' 188 | 189 | # Grouping the document tree into LaTeX files. List of tuples 190 | # (source start file, target name, title, author, documentclass [howto/manual]). 191 | latex_documents = [ 192 | ('index', 'DataArrayDocs.tex', u'DataArray Docs Documentation', 193 | u'Mike Trumpis, Fernando Pérez, Kilian Koepseel', 'manual'), 194 | ] 195 | 196 | # The name of an image file (relative to this directory) to place at the top of 197 | # the title page. 198 | #latex_logo = None 199 | 200 | # For "manual" documents, if this is true, then toplevel headings are parts, 201 | # not chapters. 202 | #latex_use_parts = False 203 | 204 | # Additional stuff for the LaTeX preamble. 205 | #latex_preamble = '' 206 | 207 | # Documents to append as an appendix to all manuals. 208 | #latex_appendices = [] 209 | 210 | # If false, no module index is generated. 211 | #latex_use_modindex = True 212 | 213 | 214 | # Example configuration for intersphinx: refer to the Python standard library. 215 | intersphinx_mapping = {'http://docs.python.org/': None} 216 | -------------------------------------------------------------------------------- /doc/source/design/array_axes.svg: -------------------------------------------------------------------------------- (SVG figure source omitted: nothing readable beyond the image/svg+xml markup survived extraction) -------------------------------------------------------------------------------- /doc/source/design/design.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | DataArray: some design notes 3 | ============================== 4 | 5 | A DataArray is a subclass of the basic Numpy ndarray object that provides an 6 | explicit mechanism for attaching information to the *axes* of the underlying 7 | numpy array.
This is achieved by attaching an Axis object to each dimension of 8 | the array; an Axis object has an optional *name* as well as optional *labels* 9 | (think of them as tick labels in a figure). 10 | 11 | With Axis objects attached to an array, it becomes possible to manipulate the 12 | array by named axis, to slice an axis by named label, etc. These features 13 | complement the rich semantics that numpy has for the *contents* of an array, 14 | encapsulated in its dtype machinery for structured/record arrays. 15 | 16 | Arrays with named / labeled axes 17 | ================================ 18 | 19 | ndarrays extended to have an explicit "hypercross" of axes, each with 20 | names (possibly defaulted). 21 | 22 | * for methods in which an "axis" is denoted, an axis name may be used 23 | 24 | * indexing/slicing along a named axis returns that slicing, at that axis, 25 | along with slice(None) slicing along all other axes 26 | 27 | * for all arithmetic/binary-op matters under which dimension numbers and 28 | lengths must match, the hypercrosses must also be consistent 29 | 30 | * broadcasting will "inherit" labels from the super-hyper-cross 31 | (see np.broadcast) 32 | 33 | * padding dimensions will insert "dummy" dimensions, eg:: 34 | 35 | a = datarray( np.random.randn(10,10), ('time', 'temp') ) 36 | a[:,None,:].axes --> ('time', None, 'temp') 37 | 38 | * axes may be transposed 39 | 40 | Arrays with named axes, whose named axes have ticks 41 | =================================================== 42 | 43 | each named axis has tick labels 44 | 45 | * numpy, fancy and slice-like indexing on each axis:: 46 | 47 | x.named_axis[...] 48 | --> does any kind of numpy indexing on the axis 49 | x.named_axis.at( *args ) 50 | --> returns essentially "fancy" indexing along the axis, at valid ticks in args 51 | x.named_axis.t_slice( start, stop, [step]) 52 | --> where arguments are valid ticks, performs a slicing-like operation along the axis 53 | 54 | * mixed indexing on the array:: 55 | 56 | x.at( *args ) 57 | --> len(args) <= x.ndim -- for each indexing spec in args, perform that indexing 58 | on the enumerated axes 59 | x.t_slice( *args ) 60 | --> same as above, but perform t_slice slicing on the enumerated axes 61 | 62 | (my thoughts on) What Is The DataArray? 63 | ======================================= 64 | 65 | * 1st and foremost, **an ndarray**, in N dimensions, with any dtype 66 | * has means to locate data more descriptively (i.e., with custom names 67 | for dimensions/axes, and custom names for indices along any axis) 68 | 69 | :: 70 | 71 | >>> darr = DataArray(np.random.randn(2,3,4), ('ex', 'why', 'zee')) 72 | >>> darr.sum(axis='ex') 73 | DataArray([[-0.39052695, -2.07493873, 1.19664474, 0.36681094], 74 | [-1.04287781, 0.5767191 , -0.35425298, 1.10468356], 75 | [ 0.08331866, -0.36532857, 0.12905265, -1.94559672]]) 76 | ('why', 'zee') 77 | >>> for subarr in darr.axis.why: 78 | ... print subarr.shape, subarr.labels 79 | ... 80 | (2, 4) ('ex', 'zee') 81 | (2, 4) ('ex', 'zee') 82 | (2, 4) ('ex', 'zee') 83 | 84 | * An axis "label" can always stand in for an axis number; an index 85 | "tick" can (in some TBD sense) stand in for an integer index 86 | * it is, if anything, **more restrictive** in operations; for example 87 | 88 | :: 89 | 90 | >>> ndarr_ones = np.ones((10,10,10)) 91 | >>> ndarr_twos = np.ones((10,10,10))*2 92 | >>> ndarr_3s = ndarr_ones + ndarr_twos # OK!
93 | >>> darr_abc = DataArray(ndarr_ones, ('a', 'b', 'c')) 94 | >>> darr_bac = DataArray(ndarr_twos, ('b', 'a', 'c')) 95 | >>> darr_wtf = darr_abc + darr_bac # BAD! frames are rotated 96 | 97 | (and my very own thoughts on) What The DataArray Is Not 98 | ======================================================= 99 | 100 | Unions And Intersections 101 | ------------------------ 102 | 103 | DataArray may broadcast with certain union rules for adapting 104 | metadata, but it does not do any data union/intersection rule for 105 | operations. For example, the result of adding an array with axes ('a', 'c') with an 106 | array with axis 'c' takes on information from the "superset" of 107 | axes. This is analogous to ndarray taking on shape information from 108 | the superset of shapes. 109 | 110 | :: 111 | 112 | >>> darr_abc[:,0,:] 113 | DataArray([[ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], 114 | ... 115 | [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]) 116 | ('a', 'c') 117 | >>> darr_bac[0,0] 118 | DataArray([ 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]) 119 | ('c',) 120 | >>> darr_abc[:,0,:] + darr_bac[0,0] 121 | DataArray([[ 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.], 122 | ... 123 | [ 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]]) 124 | ('a', 'c') 125 | 126 | But it will not fill or trim any dimension to fit the shape of a 127 | fellow operand's array (it seems this violation is simply caught at the C-level of an ndarray):: 128 | 129 | >>> darr_abc[:,0,:] + darr_bac[0,0,:5] 130 | ------------------------------------------------------------ 131 | Traceback (most recent call last): 132 | File "<stdin>", line 1, in <module> 133 | ValueError: shape mismatch: objects cannot be broadcast to a single shape 134 | 135 | For me, this looks like the **domain of utility functions** (or 136 | possibly utility methods that yield new DataArrays). 137 | 138 | Namespace 139 | --------- 140 | 141 | It would be good practice to keep all the dynamically generated 142 | DataArray attributes (e.g., Axis labels) removed from the top-level 143 | array attribute list. This is what we currently have as "axis". 144 | 145 | It might(?) be a good idea to put all future special purpose methods 146 | under that object too. 147 | 148 | 149 | Lessons Learned 150 | =============== 151 | 152 | "Smart" Indexing 153 | ---------------- 154 | 155 | The smart indexing implemented by Larry is very full featured. I believe the 156 | design of using lists to separate labels from integers in mixed indexing is a 157 | good choice (and necessary). However, I think it illustrates the potential 158 | confusion created by mixed indexing and is a good argument for discouraging/not 159 | allowing it. 160 | 161 | "Smart" Arithmetic 162 | ------------------ 163 | 164 | * Larry makes attempts to align its arrays when performing arithmetic, so as to 165 | operate on identical coordinates. 166 | * It also might introduce intersections between arrays. 167 | * It does not broadcast 168 | 169 | Ideas 170 | ===== 171 | 172 | Axis Slicing 173 | ------------ 174 | 175 | Use Case: chained axis slicing 176 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 177 | 178 | slicing on an axis returns a new DataArray:: 179 | 180 | arr = DataArray(np.random.randn(10,10), labels=('time', 'freq')) 181 | arr.axis.time[:5] --> new DataArray with (time, freq) axes 182 | 183 | However, slicing on the special slicing object "aix" returns a new Special 184 | Tuple (stuple).
185 | 186 | Stuple: 187 | 188 | * is len-N, for ND arrays 189 | * only one entry is (potentially) not ``slice(None)`` 190 | * has knowledge of its own index 191 | * has knowledge of other axes (static or dynamically generated attributes) 192 | * can be composed with other stuples in a special way (??) -- 193 | 194 | :: 195 | 196 | s1 --> ( slice(0,4), slice(None) ) 197 | s2 --> ( slice(None), slice(3,10) ) 198 | s1 s2 --> ( slice(0,4), slice(3,10) ) 199 | 200 | * can be given a "parent" stuple when constructed, into which the new stuple 201 | merges its own slicing in ``__getitem__`` 202 | 203 | Constructor prototype:: 204 | 205 | def __init__(self, *args, parent=None, index=None, name=None) ?? 206 | 207 | To chain slicing, the syntax would be like this:: 208 | 209 | arr.aix.time[:4].freq[3:8] 210 | --OR-- 211 | arr[ arr.aix.time[:4].freq[3:8] ] 212 | 213 | Chaining an axis on itself **will not** be implemented yet (possibly ever):: 214 | 215 | arr.aix.time[:4].time[:2] --> raise error 216 | 217 | 218 | ============================================ 219 | The May 2011 DataArray summit at Enthought 220 | ============================================ 221 | 222 | How to handle datarray indexing 223 | =============================== 224 | 225 | This document is a summary of the syntax and semantics that were agreed upon at 226 | the Data Array summit held at Enthought in May 2011. 227 | 228 | The DataArray object will have a .axes attribute which exhibits the following 229 | behaviour:: 230 | 231 | >>> a = DataArray( ..., axes=('date', ('stocks', ('aapl', 'ibm', 'goog', 'msft')), 'metric')) 232 | 233 | # get the axis object 234 | >>> a.axes.stocks 235 | 236 | # the same as a[:,0:2,:] 237 | >>> a.axes.stocks['aapl':'goog'] 238 | 239 | # get the nth axis object (particularly if not named) 240 | >>> a.axes[n] 241 | 242 | # get an "axes indexer" object for the indicated objects. 243 | >>> a.axes('stocks', 'date') 244 | 245 | This indexer object returns something that is meant to be indexed with as many 246 | dimensions as it was passed arguments, but that will, upon indexing, return 247 | arrays with dimensions ordered just like the original underlying array. 248 | 249 | The information is all available at the point where you are constructing 250 | the slicer, so you don't need to go rummaging around the code to find the 251 | correct order of the axes from where the array was originally defined. It also 252 | potentially permits you to use underlying arrays with different axis orders in 253 | the same code unambiguously. 254 | 255 | There was also the thought that, with numerical arguments, this would fill a 256 | hole in the current numpy API for arbitrary re-ordering of axes in a view for 257 | slicing (essentially a super-generalized transpose-ish sort of thing). 258 | 259 | The result of the slicing operation retains the original ordering, but the 260 | slices provided to a.axes()[] need to match the order of the arguments to 261 | a.axes. So in other words, when you do:: 262 | 263 | >>> tslicer = a.axes('t') 264 | 265 | then:: 266 | 267 | >>> tslicer['a':'z'] 268 | 269 | returns an array with axes x, y, z, t in that order, but sliced as:: 270 | 271 | a[:,:,:,'a':'z'] 272 | 273 | When you have:: 274 | 275 | xyslicer = a.axes('x', 'y') 276 | yxslicer = a.axes('y', 'x') 277 | 278 | then I would expect to do:: 279 | 280 | xyslicer[x1:x2, y1:y2] 281 | 282 | but:: 283 | 284 | yxslicer[y1:y2, x1:x2] 285 | 286 | However, these are two equivalent ways of writing ``a[x1:x2, y1:y2, :, :]``.
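For illustration only, here is a minimal sketch of the remapping such an indexer has to perform. It is written against a plain numpy ndarray with a hypothetical ``AxesIndexer`` helper, not the API agreed at the summit, just to make the reordering concrete::

    import numpy as np

    class AxesIndexer(object):
        # Remap slices given in axis-name order back to the parent
        # array's native axis order; untouched axes get slice(None).
        def __init__(self, arr, names, order):
            self.arr = arr      # the underlying ndarray
            self.names = names  # native axis-name order
            self.order = order  # the order the caller will slice in

        def __getitem__(self, key):
            if not isinstance(key, tuple):
                key = (key,)
            full = [slice(None)] * self.arr.ndim
            for name, k in zip(self.order, key):
                full[self.names.index(name)] = k
            return self.arr[tuple(full)]

    a = np.arange(24).reshape(2, 3, 4)
    yx = AxesIndexer(a, ['x', 'y', 'z'], ['y', 'x'])
    # yx[1:3, 0] slices 'y' then 'x', but the result is a[0, 1:3, :],
    # i.e. dimensions come back in the array's original order.
    assert (yx[1:3, 0] == a[0, 1:3, :]).all()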
287 | If explicit transposition of the returned data is desired, it can be done
288 | with::
289 | 
290 |     >>> a.transpose('stocks','date').axes('stocks','date')[...]
291 | 
292 |     # Now, actually do the slicing: equivalent to a[100, 0:2, :]
293 |     >>> a.axes('stocks', 'date')['aapl':'goog',100]
294 | 
295 |     # can supply an axis number as well
296 |     >>> a.axes(1, 'date')['aapl':'goog',100:200]
297 | 
298 | In addition, axes can have the notion of an index mapper which allows indexing and
299 | slicing by labels or values other than strings and integers. To use these, you
300 | have to supply a keyword argument to the axes call::
301 | 
302 |     # add a datetime.date -> index map
303 |     >>> date_mapper = DictMapper(...)
304 |     >>> a = DataArray( ..., axes=(('date', date_mapper), ... ))
305 | 
306 |     # do mapped indexing XXX - this might not have been the final decision
307 |     >>> a.axes('stocks', 'date', mapped=True)['aapl':'goog', datetime.date(2011, 1, 1):datetime.date(2011, 5, 14)]
308 | 
309 | 
310 | The exact semantics of mapping are yet to be determined, but the thought is that
311 | there would be standard mappers to do things like interpolation and mapped integer
312 | indexing.
313 | 
314 | Other notes
315 | -----------
316 | 
317 | * Axis names can only be strings that are valid Python identifiers.
318 | * Labels can only be strings, and must be unique.
319 | * All other indexing cases are handled by mapping (however that will work).
320 | * Axes can have arbitrary aliases which do not have to be unique.
321 | * An axis can have an associated array of the same length as the set of labels
322 |   for additional data storage.
323 | 
--------------------------------------------------------------------------------
/doc/source/design/index.rst:
--------------------------------------------------------------------------------
1 | ======
2 | Design
3 | ======
4 | 
5 | Contents:
6 | 
7 | .. toctree::
8 |    :maxdepth: 1
9 | 
10 |    design.rst
11 |    issues.rst
12 | 
13 | 
14 | Indices and tables
15 | ==================
16 | 
17 | * :ref:`genindex`
18 | * :ref:`modindex`
19 | * :ref:`search`
20 | 
--------------------------------------------------------------------------------
/doc/source/design/issues.rst:
--------------------------------------------------------------------------------
1 | ======================================
2 |  Issues, open questions and todo list
3 | ======================================
4 | 
5 | Questions and issues about the datarray prototype.
6 | 
7 | .. contents::
8 | 
9 | 
10 | Labels
11 | ======
12 | 
13 | Labels are a relatively new addition to datarrays. The names of a datarray
14 | identify the axes of the array. The labels of a datarray identify the elements
15 | along an axis. Both names and labels are optional.
16 | 
17 | Axis._label_dict is not updated when labels are changed
18 | -------------------------------------------------------
19 | 
20 | Example::
21 | 
22 |     >> dar = DataArray([1, 2], [('time', ['A', 'B'])])
23 |     >> dar.axis.time._label_dict
24 |     {'A': 0, 'B': 1}
25 |     >> dar.axis.time.labels[0] = 'X'
26 |     >> dar.axis.time.labels
27 |     ['X', 'B']
28 |     >> dar.axis.time._label_dict
29 |     {'A': 0, 'B': 1}
30 | 
31 | Possible solutions:
32 | 
33 | #. Don't allow labels to be changed
34 | #. Only allow labels to be changed through a method that also updates _label_dict
35 | #. Don't store _label_dict, create on the fly as needed
36 | 
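A toy sketch of option #3, with a pared-down stand-in for Axis (not the real
class), showing that a computed mapping can never go stale::

    class Axis(object):
        def __init__(self, name, index, labels=None):
            self.name = name
            self.index = index
            self.labels = labels

        @property
        def _label_dict(self):
            # Rebuilt on every access instead of stored at construction.
            return dict(zip(self.labels, range(len(self.labels))))

    ax = Axis('time', 0, ['A', 'B'])
    assert ax._label_dict == {'A': 0, 'B': 1}
    ax.labels[0] = 'X'                          # mutate the labels...
    assert ax._label_dict == {'X': 0, 'B': 1}   # ...the mapping follows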
37 | pandas, I believe, makes the labels immutable (#1). larry allows the labels to
38 | be changed and calculates the mapping dict on the fly (#3).
39 | 
40 | 
41 | Can I have labels without axis names?
42 | -------------------------------------
43 | 
44 | I'd like to use labels without names. At the moment that is not possible::
45 | 
46 |     >>> DataArray([1, 2], [(None, ('a', 'b'))])
47 | 
48 |     ValueError: labels only supported when Axis has a name
49 | 
50 | Well, it is possible::
51 | 
52 |     >>> dar = DataArray([1, 2], [('tmp', ('a', 'b'))])
53 |     >>> dar.set_name(0, None)
54 |     >>> dar.axes
55 |     (Axis(name=None, index=0, labels=('a', 'b')),)
56 | 
57 | 
58 | Add a labels input parameter?
59 | -----------------------------
60 | 
61 | What do you think of adding a ``labels`` parameter to DataArray?
62 | 
63 | Current behavior::
64 | 
65 |     >>> dar = DataArray([[1, 2], [3, 4]], (('row', ['A','B']), ('col', ['C', 'D'])))
66 |     >>> dar.axes
67 |     (Axis(name='row', index=0, labels=['A', 'B']),
68 |      Axis(name='col', index=1, labels=['C', 'D']))
69 | 
70 | Proposed labels as separate input parameter::
71 | 
72 |     >>> DataArray([[1, 2], [3, 4]], names=('row', 'col'), labels=[['A', 'B'], ['C', 'D']])
73 | 
74 | I think this would make it easier for new users to construct a DataArray with
75 | labels just from looking at the DataArray signature. It would match the
76 | signature of Axis. My use case is to use labels only and not named axes (at
77 | first), so::
78 | 
79 |     >>> DataArray([[1, 2], [3, 4]], labels=[['A', 'B'], ['C', 'D']])
80 | 
81 | instead of the current::
82 | 
83 |     >>> DataArray([[1, 2], [3, 4]], ((None, ['A','B']), (None, ['C', 'D'])))
84 | 
85 | It might also cause fewer typos (parentheses matching) at the command line.
86 | 
87 | Having separate names and labels input parameters would also leave the option
88 | open to allow any hashable object, like a tuple, to be used as a name.
89 | Currently tuples have a special meaning, the (names, labels) tuple.
90 | 
91 | Create Axis._label_dict when needed?
92 | ------------------------------------
93 | 
94 | How about creating Axis._label_dict on the fly when needed (but not saving it)?
95 | 
96 | **Pros**
97 | 
98 | - Faster datarray creation (it does look like you get _label_dict for free
99 |   since you need to check that the labels are unique anyway, but set()
100 |   is faster)
101 | - Faster datarray copy
102 | - Use less memory
103 | - Easier to archive
104 | - Simplify Axis
105 | - Prevent user from doing ``dar.axes[0]._label_dict['a'] = 10``
106 | - Catches (on calls to ``make_slice`` and ``keep``) user mischief like
107 |   dar.axes[0].labels = ('a', 'a')
108 | - No need to update Axis._label_dict when user changes labels
109 | 
110 | **Cons**
111 | 
112 | - Slower ``make_slice``
113 | - Slower ``keep``
114 | 
115 | 
116 | Axis, axes
117 | ==========
118 | 
119 | Datarrays were created from the need to name the axes of a numpy array.
120 | 
121 | datarray1 + datarray2 = which axes?
122 | -----------------------------------
123 | 
124 | Which axes are returned by binary operations?
125 | 
126 | Make two datarrays::
127 | 
128 |     >> dar1 = DataArray([1, 2], [('time', ['A1', 'B1'])])
129 |     >> dar2 = DataArray([1, 2], [('time', ['A2', 'B2'])])
130 | 
131 | ``dar1`` on the left-hand side::
132 | 
133 |     >> dar12 = dar1 + dar2
134 |     >> dar12.axes
135 |     (Axis(name='time', index=0, labels=['A1', 'B1']),)
136 | 
137 | ``dar1`` on the right-hand side::
138 | 
139 |     >> dar21 = dar2 + dar1
140 |     >> dar21.axes
141 |     (Axis(name='time', index=0, labels=['A2', 'B2']),)
142 | 
143 | So a binary operation returns the axes from the left-hand side? No. It seems the
144 | leftmost non-None axes are used::
145 | 
146 |     >> dar3 = DataArray([1, 2])
147 |     >> dar31 = dar3 + dar1
148 |     >> dar31.axes
149 |     (Axis(name='time', index=0, labels=['A1', 'B1']),)
150 | 
151 | So a binary operation may return parts of both axes::
152 | 
153 |     >> dar1 = DataArray([[1, 2], [3, 4]], [None, ('col', ['A', 'B'])])
154 |     >> dar2 = DataArray([[1, 2], [3, 4]], [('row', ['a', 'b']), None])
155 |     >> dar12 = dar1 + dar2
156 |     >> dar12.axes
157 | 
158 |     (Axis(name='row', index=0, labels=['a', 'b']),
159 |      Axis(name='col', index=1, labels=['A', 'B']))
160 | 
161 | Is that the intended behavior?
162 | 
163 | Why does Axis.__eq__ require the index to be equal?
164 | ---------------------------------------------------
165 | 
166 | Example::
167 | 
168 |     >> dar1 = DataArray([[1, 2], [3, 4]], [('row', ['r0', 'r1']), ('col', ['c0', 'c1'])])
169 |     >> dar2 = DataArray([[1, 2], [3, 4]], [('col', ['c0', 'c1']), ('row', ['r0', 'r1'])])
170 |     >> dar1.axes[0] == dar2.axes[1]
171 |     False
172 | 
173 | Axis, axis, axes
174 | ----------------
175 | 
176 | The functions, classes, and methods that take care of axes are:
177 | 
178 | - Axis (class)
179 | - DataArray.axis (meth)
180 | - DataArray.axes (meth)
181 | - _reordered_axes (func)
182 | - _expand_ellipsis (func)
183 | - _make_singleton_axes (func)
184 | 
185 | I find having both DataArray.axis and DataArray.axes confusing at first. I
186 | wonder if it would simplify things if there were only:
187 | 
188 | - Axes (class)
189 | - DataArray.axes (instance of Axes)
190 | 
191 | That would consolidate everything in the Axes class. For example, in
192 | DataArray.__getitem__ this::
193 | 
194 |     if isinstance(key, tuple):
195 |         old_shape = self.shape
196 |         old_axes = self.axes
197 |         new_shape, new_axes, key = _make_singleton_axes(self, key)
198 |         # Will undo this later
199 |         self.shape = new_shape
200 |         _set_axes(self, new_axes)
201 |         # data is accessed recursively, starting with
202 |         # the full array
203 |         arr = self
204 | 
205 |         # We must copy the names of the axes
206 |         # before looping through the elements of key,
207 |         # as the index of a given axis may change.
208 |         names = [a.name for a in self.axes]
209 | 
210 |         # If an Axis gets sliced out entirely, then any following
211 |         # unnamed Axis in the array will spontaneously change name.
212 |         # So anticipate the name change here.
213 |         reduction = 0
214 |         adjustments = []
215 |         for k in key:
216 |             adjustments.append(reduction)
217 |             if not isinstance(k, slice):
218 |                 # reduce the idx # on the remaining default names
219 |                 reduction -= 1
220 | 
221 |         names = [n if a.name else '_%d'%(a.index+r)
222 |                  for n, a, r in zip(names, self.axes, adjustments)]
223 | 
224 |         for slice_or_int, name in zip(key, names):
225 |             arr = arr.axis[name][slice_or_int]
226 | 
227 |         # restore old shape and axes
228 |         self.shape = old_shape
229 |         _set_axes(self, old_axes)
230 | 
231 | could be replaced with::
232 | 
233 |     if isinstance(key, tuple):
234 |         self.axes = self.axes[key]
235 | 
236 | So it would pull out the axes logic from DataArray and place it in Axes.
237 | 
238 | Should DataArray.axes be a list instead of a tuple?
239 | ---------------------------------------------------
240 | 
241 | Why not make DataArray.axes a list instead of a tuple? Then the user could move
242 | an axis from one datarray to another, pop an Axis, etc.
243 | 
244 | 
245 | Can axis names be anything besides None or str?
246 | -----------------------------------------------
247 | 
248 | from http://projects.scipy.org/numpy/wiki/NdarrayWithNamedAxes: "Axis names
249 | (the name of a dimension) must be valid Python identifiers." I don't know
250 | what that means.
251 | 
252 | It would be nice if axis names could be anything hashable like str,
253 | datetime.date(), int, tuple.
254 | 
255 | But names must be strings to do indexing like this::
256 | 
257 |     >>> dar = DataArray([[1, 2], [3, 4]], (('row', ['A','B']), ('col', ['C', 'D'])))
258 |     >>> dar.axis.row['A']
259 |     DataArray([1, 2])
260 |     ('col',)
261 | 
262 | One way to make it work would be to rewrite the above as::
263 | 
264 |     >>> dar.axis['row']['A']
265 |     DataArray([1, 2])
266 |     ('col',)
267 | 
268 | which would also make it easier to loop through the axes by name::
269 | 
270 |     >>> for axisname in ['row', 'col']:
271 |     ....:     dar.axis[axisname][idx]
272 |     ....:     ...
273 | 
274 | 
275 | Performance
276 | ===========
277 | 
278 | Performance is not the primary concern during the prototype phase of datarray.
279 | But some attention to performance issues will help guide the development of
280 | datarrays.
281 | 
282 | How long does it take to create a datarray?
283 | -------------------------------------------
284 | 
285 | Set up data::
286 | 
287 |     >> import numpy as np
288 |     >> N = 100
289 |     >> arr = np.random.rand(N, N)
290 |     >> idx1 = map(str, range(N))
291 |     >> idx2 = map(str, range(N))
292 | 
293 | Time the creation of a datarray::
294 | 
295 |     >> from datarray import DataArray
296 |     >> import datarray
297 |     >> names = [('row', idx1), ('col', idx2)]
298 |     >> timeit datarray.DataArray(arr, names)
299 |     1000 loops, best of 3: 160 us per loop
300 | 
301 | Time the creation of a pandas DataMatrix. A DataMatrix is also a subclass
302 | of numpy's ndarray, but it has been optimized, so it should be a proxy for how
303 | fast a datarray can become::
304 | 
305 |     >> import pandas
306 |     >> timeit pandas.DataMatrix(arr, idx1, idx2)
307 |     10000 loops, best of 3: 50.7 us per loop
308 | 
309 | larry is not a subclass of numpy's ndarray; I think that is one reason it is
310 | faster to create::
311 | 
312 |     >> import la
313 |     >> name = [idx1, idx2]
314 |     >> timeit la.larry(arr, name)
315 |     100000 loops, best of 3: 13.5 us per loop
316 |     >> timeit la.larry(arr, name, integrity=False)
317 |     1000000 loops, best of 3: 1.25 us per loop
318 | 
319 | Also, both datarray and DataMatrix make a mapping dictionary when the data
320 | object is created---that takes time. larry makes a mapping dictionary on the
321 | fly, when needed.
322 | 
323 | Why is the time to create a datarray important? Because even an operation as
324 | simple as ``dar1 + dar2`` creates a datarray.
325 | 
326 | Direct access to array?
327 | -----------------------
328 | 
329 | Names and labels add overhead. Sometimes, after aligning my datarrays, I would
330 | like to work directly with the numpy arrays. Is there a way to do that with
331 | datarrays?
332 | 
333 | For example, with a named array, larry_, the underlying numpy array is always
334 | accessible as the attribute ``x``::
335 | 
336 |     >>> import la
337 |     >>> lar = la.larry([1, 2, 3])
338 |     >>> lar.x
339 |     array([1, 2, 3])
340 |     >>> lar.x = myfunc(lar.x)
341 | 
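Worth noting (a hedged aside, not a settled answer): because DataArray is an
ndarray subclass, ``np.asarray`` should hand back a plain-ndarray *view*
rather than a copy, so writes propagate::

    import numpy as np
    from datarray import DataArray

    dar = DataArray([1, 2, 3], [('row', ['r0', 'r1', 'r2'])])
    raw = np.asarray(dar)          # or dar.view(np.ndarray)
    raw[0] = 9                     # write through the plain view...
    assert dar[0] == 9             # ...and the datarray sees it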
342 | .. _larry: http://github.com/kwgoodman/la
343 | 
344 | This might be one solution (base)::
345 | 
346 |     >> from datarray import DataArray
347 |     >> x = DataArray([[1,2],[3,4]], [('row', ['r1', 'r2']), ('col', ['c1', 'c2'])])
348 |     >> timeit x + x
349 |     10000 loops, best of 3: 61.4 us per loop
350 |     >> timeit x.base + x.base
351 |     100000 loops, best of 3: 2.16 us per loop
352 | 
353 | and::
354 | 
355 |     >> x = DataArray([1, 2])
356 |     >> x.base[0] = 9
357 |     >> x
358 | 
359 |     DataArray([9, 2])
360 |     (None,)
361 | 
362 | But base is not guaranteed to be a view. What's another solution? Could create
363 | an attribute at init time, but that slows down init.
364 | 
365 | 
366 | Alignment
367 | =========
368 | 
369 | Datarray may not handle alignment directly. But some users of datarrays would
370 | like an easy way to align datarrays.
371 | 
372 | Support for alignment?
373 | ----------------------
374 | 
375 | Will datarray provide any support for those who want binary operations between
376 | two datarrays to join names or labels using various join methods?
377 | 
378 | A use case from larry_:
379 | 
380 | By default, binary operations between two larrys use an inner join of the
381 | names (the intersection of the names)::
382 | 
383 |     >>> lar1 = larry([1, 2])
384 |     >>> lar2 = larry([1, 2, 3])
385 |     >>> lar1 + lar2
386 |     name_0
387 |         0
388 |         1
389 |     x
390 |     array([2, 4])
391 | 
392 | The sum of two larrys using an outer join (union of the names)::
393 | 
394 |     >>> la.add(lar1, lar2, join='outer')
395 |     name_0
396 |         0
397 |         1
398 |         2
399 |     x
400 |     array([  2.,   4.,  NaN])
401 | 
402 | The available join methods are inner, outer, left, right, and list. If the
403 | join method is specified as a list then the first element in the list is the
404 | join method for axis=0, the second element is the join method for axis=1, and
405 | so on.
406 | 
407 | How can datarrays be aligned?
408 | -----------------------------
409 | 
410 | What's an outer join (or inner, left, right) along an axis of two datarrays if
411 | one datarray has labels and the other doesn't?
412 | 
413 | Background:
414 | 
415 | It is often useful to align two datarrays before performing binary operations
416 | such as +, -, \*, /. Two datarrays are aligned when both datarrays have the same
417 | names and labels along all axes.
418 | 
419 | Aligned::
420 | 
421 |     >> dar1 = DataArray([1, 2])
422 |     >> dar2 = DataArray([3, 4])
423 |     >> dar1.axes == dar2.axes
424 |     True
425 | 
426 | Unaligned::
427 | 
428 |     >> dar1 = DataArray([1, 2], names=("time",))
429 |     >> dar2 = DataArray([3, 4], names=("distance",))
430 |     >> dar1.axes == dar2.axes
431 |     False
432 | 
433 | Unaligned, but compares as aligned, since Axis.__eq__ doesn't (yet) check for
434 | equality of labels::
435 | 
436 |     >> dar1 = DataArray([1, 2], names=[("time", ['A', 'B'])])
437 |     >> dar2 = DataArray([1, 2], names=[("time", ['A', 'different'])])
438 |     >> dar1.axes == dar2.axes
439 |     True
440 | 
441 | Let's say we make an add function with user control of the join method::
442 | 
443 |     >>> add(dar1, dar2, join='outer')
444 | 
445 | Since datarray allows empty axis names (None) and labels (None), what does an
446 | outer join mean if dar1 has labels but dar2 doesn't::
447 | 
448 |     >>> dar1 = DataArray([1, 2], names=[("time", ['A', 'B'])])
449 |     >>> dar2 = DataArray([1, 2], names=[("time",)])
450 | 
451 | What would the following return?
452 | ::
453 | 
454 |     >>> add(dar1, dar2, join='outer')
455 | 
456 | larry requires all axes to have labels; if none are given, the labels default
457 | to range(n).
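For concreteness, a toy sketch of what an outer-join ``add`` could do for two
1-D labeled operands (``outer_add`` is hypothetical; it mirrors the larry
example above, leaving NaN where only one operand has the label)::

    import numpy as np

    def outer_add(x, xlabels, y, ylabels):
        # The union of the labels defines the result axis.
        union = sorted(set(xlabels) | set(ylabels))
        out = np.empty(len(union))
        out.fill(np.nan)                    # missing positions stay NaN
        for i, lab in enumerate(union):
            if lab in xlabels and lab in ylabels:
                out[i] = x[xlabels.index(lab)] + y[ylabels.index(lab)]
        return union, out

    labels, summed = outer_add(np.array([1., 2.]), [0, 1],
                               np.array([1., 2., 3.]), [0, 1, 2])
    # labels -> [0, 1, 2]; summed -> array([  2.,   4.,  nan])

How such a function should treat a label-less operand is exactly the open
question above.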
458 | 
459 | datarray.reshape
460 | ----------------
461 | 
462 | Reshape operations scramble names and labels. Some numpy functions and
463 | array methods use reshape. Should reshape convert a datarray to an array?
464 | 
465 | Looks like datarray will need unit tests for every numpy function and array
466 | method.
467 | 
468 | 
469 | Misc
470 | ====
471 | 
472 | Miscellaneous observations on datarrays.
473 | 
474 | How do I save a datarray in HDF5 using h5py?
475 | --------------------------------------------
476 | 
477 | h5py, which stores data in HDF5 format, can only
478 | save numpy arrays.
479 | 
480 | What are the parts of a datarray that need to be saved? And can they be stored
481 | as numpy arrays?
482 | 
483 | A datarray can be broken down into the following components:
484 | 
485 | - data (store directly as numpy array)
486 | - names (store as object array since it contains None and str, and convert
487 |   back on load?)
488 | - labels (each axis stored as numpy array with axis number stored as HDF5
489 |   Dataset attribute, but then labels along any one axis must be homogeneous
490 |   in dtype)
491 | - Dictionary of label index mappings (ignore, recreate on load)
492 | 
493 | (I need to write a function that saves an Axis object to HDF5.)
494 | 
495 | If I don't save Axis._label_dict, would I have to worry about a user changing
496 | the mapping?
497 | ::
498 | 
499 |     >>> dar.axes[0]
500 |     Axis(name='one', index=0, labels=('a', 'b'))
501 |     >>> dar.axes[0]._label_dict
502 |     {'a': 0, 'b': 1}
503 |     >>> dar.axes[0]._label_dict['a'] = 10
504 |     >>> dar.axes[0]._label_dict
505 |     {'a': 10, 'b': 1}
506 | 
507 | 
508 | Can names and labels be changed?
509 | --------------------------------
510 | 
511 | Labels can be changed::
512 | 
513 |     >>> dar = DataArray([1, 2], [('row', ['A','B'])])
514 |     >>> dar.axes
515 |     (Axis(name='row', index=0, labels=['A', 'B']),)
516 |     >>> dar.axes[0].labels[0] = 'CHANGED'
517 |     >>> dar.axes
518 |     (Axis(name='row', index=0, labels=['CHANGED', 'B']),)
519 | 
520 | But Axis._label_dict is not updated when the user changes labels.
521 | 
522 | And so can names::
523 | 
524 |     >>> dar.set_name(0, 'new name')
525 |     >>> dar
526 |     DataArray([1, 2])
527 |     ('new name',)
528 | 
529 | Fancy Indexing
530 | --------------
531 | 
532 | It's not implemented at all yet.
533 | 
534 | .. _name_updates:
535 | 
536 | Changing Names on DataArrays
537 | ============================
538 | 
539 | Tricky Attributes
540 | -----------------
541 | 
542 | * .names -- currently a mutable list of Axis.name attributes
543 | * .axes -- currently a mutable list of Axis objects
544 | * .axis -- a key-to-attribute dictionary
545 | 
546 | Need an event-ful way to change an Axis's label, such that all the above
547 | attributes are updated.
548 | 
549 | **Proposed solution**:
550 | 
551 | 1. use a set_label() method. This will consequently update the parent array's
552 |    (names, axes, axis) attributes.
553 | 2. make the mutable lists into *tuples* to deny write access.
554 | 3. make the KeyStruct ``.axis`` have write-once access.
555 | 
556 | .. _todo:
557 | 
558 | ToDo
559 | ====
560 | 
561 | * Support DataArray instances with mixed axes: simple ones with no values
562 |   and 'fancy' ones with data in them. Syntax?
563 | 
564 |   ``a = DataArray.from_names(data, axes=['a','b','c'])``
565 | 
566 |   ``b = DataArray(data, axes=[('a',['1','2','3']), ('b',['one','two']), ('c',['red','black'])])``
567 | 
568 |   ``c = DataArray(data, axes=[('a',['1','2','3']), ('b',None), ('c',['red','black'])])``
569 | 
570 | * Can a, b, and c be combined in binary operations, given the different tick
571 |   combinations?
572 | * How to handle complicated reshaping (not flattening, or padding/trimming with
573 |   1s)
574 | * Units support (Darren's)
575 | * Jagged arrays? Kilian's suggestion. Drop the base array altogether, and
576 |   access data via the .axis objects alone.
577 | * "Enum dtype", could be useful for event selection.
578 | * "Ordered factors"? Something R supports.
579 | * How many axis classes?
580 | 
581 | * Allowing non-string axis names?
582 | 
583 |   - At least they must be hashable...
584 |   - Serialization?
585 | 
586 | 
587 | * Allowing multiple names per axis?
588 | 
589 | 
590 | * Rob Speer's proposal for purely top-level, 'magical' attributes?
591 | 
592 | 
593 | * Finish the semantics of .lix indexing, especially with regard to what it
594 |   should do when integer labels are present.
595 | 
596 | * What should a.axis.x[object] do: .lix-style indexing or pure numpy indexing?
597 | 
598 | Indexing semantics possibilities
599 | --------------------------------
600 | 
601 | 1. .lix: Integers are always labels. a.lix[3:10] means labels 3 and 10 MUST exist.
602 | 
603 | 2. .nix: Integers are never treated as labels.
604 | 
605 | 3. .awful_ix: 1, then 2.
606 | 
607 | 
608 | Axis API
609 | --------
610 | If a is an axis from an array: a = x.axis.a
611 | 
612 | - a.at(key): return the slice at that key, with one less dimension than x
613 | - a.keep(keys): join slices for given keys, dims=dims(x)
614 | - a.drop(keys): like keep, but the opposite
615 | 
616 | a[i] valid cases:
617 | 
618 | - i: integer => normal numpy scalar indexing, one less dim than x
619 | - i: slice: numpy view slicing. same dims as x, must recover the labels
620 | - i: list/array: numpy fancy indexing, as long as the index list is 1d only.
621 | 
--------------------------------------------------------------------------------
/doc/source/generated/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | 
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | =======================================================
2 | Welcome to the documentation for the Datarray prototype
3 | =======================================================
4 | 
5 | Datarray is a prototype implementation of a numpy ndarray subclass with named
6 | axes and optionally labeled ticks on said axes.
7 | 
8 | The datarray code is being collaboratively developed and hosted at:
9 | 
10 | http://github.com/BIDS/datarray
11 | 
12 | The documentation for various releases and a current build can be found here:
13 | 
14 | http://bids.github.com/datarray
15 | 
16 | Contents:
17 | 
18 | ..
toctree:: 19 | :maxdepth: 1 20 | 21 | basic_data_array 22 | ndarray_methods 23 | printing 24 | design/index 25 | other_projects/index 26 | license 27 | API docs 28 | 29 | Indices and tables 30 | ================== 31 | 32 | * :ref:`genindex` 33 | * :ref:`modindex` 34 | * :ref:`search` 35 | -------------------------------------------------------------------------------- /doc/source/license.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../datarray/LICENSE 2 | 3 | Other Licenses 4 | -------------- 5 | 6 | Throughout the writing of datarray we have relied heavily on Pandas, as well as 7 | using numpydoc. These are the relevant licenses: 8 | 9 | :doc:`Pandas ` 10 | 11 | :doc:`Numpydoc ` 12 | 13 | .. toctree:: 14 | :hidden: 15 | 16 | licenses/numpydoc_license 17 | licenses/pandas_license 18 | -------------------------------------------------------------------------------- /doc/source/licenses/numpydoc_license.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Numpydoc License 3 | ================== 4 | 5 | The files: 6 | 7 | - numpydoc.py 8 | - autosummary.py 9 | - autosummary_generate.py 10 | - docscrape.py 11 | - docscrape_sphinx.py 12 | - phantom_import.py 13 | 14 | have the following license:: 15 | 16 | Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen 17 | 18 | Redistribution and use in source and binary forms, with or without 19 | modification, are permitted provided that the following conditions are 20 | met: 21 | 22 | 1. Redistributions of source code must retain the above copyright 23 | notice, this list of conditions and the following disclaimer. 24 | 2. Redistributions in binary form must reproduce the above copyright 25 | notice, this list of conditions and the following disclaimer in 26 | the documentation and/or other materials provided with the 27 | distribution. 28 | 29 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 30 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 32 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, 33 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 35 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 37 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 38 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | 41 | ------------------------------------------------------------------------------- 42 | The files 43 | - compiler_unparse.py 44 | - comment_eater.py 45 | - traitsdoc.py 46 | have the following license: 47 | 48 | This software is OSI Certified Open Source Software. 49 | OSI Certified is a certification mark of the Open Source Initiative. 50 | 51 | Copyright (c) 2006, Enthought, Inc. 52 | All rights reserved. 53 | 54 | Redistribution and use in source and binary forms, with or without 55 | modification, are permitted provided that the following conditions are met: 56 | 57 | * Redistributions of source code must retain the above copyright notice, this 58 | list of conditions and the following disclaimer. 
59 | * Redistributions in binary form must reproduce the above copyright notice, 60 | this list of conditions and the following disclaimer in the documentation 61 | and/or other materials provided with the distribution. 62 | * Neither the name of Enthought, Inc. nor the names of its contributors may 63 | be used to endorse or promote products derived from this software without 64 | specific prior written permission. 65 | 66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 67 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 68 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 69 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 70 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 71 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 72 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 73 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 74 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 75 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 | 77 | 78 | ------------------------------------------------------------------------------- 79 | The files 80 | - only_directives.py 81 | - plot_directive.py 82 | originate from Matplotlib (http://matplotlib.sf.net/) which has 83 | the following license: 84 | 85 | Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 86 | 87 | 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 88 | 89 | 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 90 | 91 | 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 92 | 93 | 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 94 | 95 | 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 96 | 97 | 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 98 | 99 | 7. 
Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 100 | 101 | 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. 102 | 103 | -------------------------------------------------------------------------------- /doc/source/licenses/pandas_license.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | The Pandas License 3 | ==================== 4 | 5 | :: 6 | 7 | Copyright (c) 2008-2009 AQR Capital Management, LLC 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are 12 | met: 13 | 14 | * Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | * Redistributions in binary form must reproduce the above 18 | copyright notice, this list of conditions and the following 19 | disclaimer in the documentation and/or other materials provided 20 | with the distribution. 21 | 22 | * Neither the name of the copyright holder nor the names of any 23 | contributors may be used to endorse or promote products derived 24 | from this software without specific prior written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 30 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | -------------------------------------------------------------------------------- /doc/source/ndarray_methods.rst: -------------------------------------------------------------------------------- 1 | .. testsetup:: 2 | 3 | import numpy as np 4 | 5 | ======= 6 | Methods 7 | ======= 8 | 9 | Here is a list of the ``array`` methods: 10 | 11 | .. 
we got the method names with 12 | 13 | >>> a = np.random.randn(3,4) 14 | >>> filter(lambda x: type(getattr(a,x))==type(a.min), dir(a)) 15 | 16 | * '__array__', 17 | * :ref:`'__array_prepare__',` 18 | * :ref:`'__array_wrap__',` 19 | * '__copy__', 20 | * '__deepcopy__', 21 | * :ref:`'__new__',` 22 | * '__reduce__', 23 | * '__reduce_ex__', 24 | * '__setstate__', 25 | * 'all', 26 | * 'any', 27 | * :ref:`'argmax', ` 28 | * :ref:`'argmin', ` 29 | * :ref:`'argsort',` 30 | * 'astype', 31 | * 'byteswap', 32 | * :ref:`'choose',` 33 | * 'clip', 34 | * 'compress', 35 | * 'conj', 36 | * 'conjugate', 37 | * 'copy', 38 | * :ref:`'cumprod',` 39 | * :ref:`'cumsum',` 40 | * :ref:`'diagonal',` 41 | * 'dump', 42 | * 'dumps', 43 | * 'fill', 44 | * :ref:`'flatten',` 45 | * 'getfield', 46 | * 'item', 47 | * 'itemset', 48 | * :ref:`'max',` 49 | * :ref:`'mean',` 50 | * :ref:`'min',` 51 | * 'newbyteorder', 52 | * 'nonzero', 53 | * :ref:`'prod',` 54 | * :ref:`'ptp',` 55 | * 'put', 56 | * :ref:`'ravel',` 57 | * :ref:`'repeat',` 58 | * :ref:`'reshape',` 59 | * :ref:`'resize',` 60 | * 'round', 61 | * :ref:`'searchsorted',` 62 | * 'setfield', 63 | * 'setflags', 64 | * :ref:`'sort',` 65 | * :ref:`'squeeze',` 66 | * :ref:`'std',` 67 | * :ref:`'sum',` 68 | * :ref:`'swapaxes',` 69 | * :ref:`'take',` 70 | * 'tofile', 71 | * 'tolist', 72 | * 'tostring', 73 | * 'trace', 74 | * :ref:`'transpose',` 75 | * :ref:`'var',` 76 | * 'view'] 77 | 78 | .. _sorting_methods: 79 | 80 | Sorting 81 | ------- 82 | 83 | sort() and argsort() 84 | 85 | These methods default to sorting the flattened array (returning an 86 | ndarray). If given an axis keyword, then it is possible to preserve 87 | the axes meta-data *only if* there are no ticks on the sorted 88 | Axis. Otherwise, an ndarray is returned. 89 | 90 | .. _explicitly_redef: 91 | 92 | Explicitly overloaded 93 | --------------------- 94 | 95 | These methods do not fit into a simple pattern, and are explicitly overloaded 96 | in the DataArray class definition. 97 | 98 | .. _wrapped_reduction: 99 | 100 | Regular reductions (eg, min) 101 | ---------------------------- 102 | 103 | These methods are wrapped in a generic runner that pays attention to which axis 104 | is being trimmed out (if only one), and then sets the remaining axes on the 105 | resulting array. It also allows for the translation of Axis-name to Axis-index. 106 | 107 | .. _wrapped_reduction_special: 108 | 109 | Special reductions (eg, argmin) 110 | ------------------------------- 111 | 112 | These methods are currently wrapped as a generic reduction. 113 | 114 | These methods return an index, or an array of indices into the array in 115 | question. That significantly changes the model of the array in question. Should 116 | the return type here NOT be DataArray? 117 | 118 | .. _incomplete_reductions: 119 | 120 | Accumulations 121 | ------------- 122 | 123 | These methods are wrapped in a generic accumulator. 124 | 125 | These methods have the property of taking an "axis" keyword argument, and yet 126 | not eliminating that axis. They also default to working on the flattened array 127 | if the axis parameter is left unspecified. 128 | 129 | .. _wtf_methods: 130 | 131 | Not-applicable methods 132 | ---------------------- 133 | 134 | Possibly N/A methods? 135 | 136 | .. _reshaping_methods: 137 | 138 | Reshapes 139 | -------- 140 | 141 | Reshaping is prickly.. I've already implemented certain slicing 142 | mechanisms that can insert unlabeled axes with length-1. This seems 143 | legitimate. 
Also squeezing out length-1 seems legitimate (**even if 144 | the Axis is labeled?**). 145 | 146 | The reshaping currently only trims or pads the array shape with 1s, or 147 | flattens the array entirely (returning an ndarray). 148 | 149 | -------------------------------------------------------------------------------- /doc/source/other_projects/index.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Other projects 3 | ================ 4 | 5 | The following are closely related projects that have heavily influenced the 6 | design of datarray. Both Larry and Pandas target slightly higher level 7 | problems than datarray, but the intended outcome is for datarray to provide 8 | a base object on which projects like these can more easily build their 9 | domain-specific tools with a common foundation. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | larry_overview.rst 15 | pandas_overview.rst 16 | -------------------------------------------------------------------------------- /doc/source/other_projects/larry_overview.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Larray (aka Larry) 3 | ==================== 4 | 5 | Overview 6 | ^^^^^^^^ 7 | 8 | Larray offers the notion of "ticks", but the axes themselves are not named. The 9 | model seems to be something like *data with coordinates* 10 | 11 | Importantly, 12 | 13 | * Pure Python implementation 14 | * Is **not** an ndarray 15 | 16 | * therefore, lots of redefined functionality 17 | * also lots of presumed intention of data (shuffling labels, group means, ...) 18 | * not lightweight 19 | 20 | * Does **not** offer named axes 21 | * **Only one (class of) dtype!!** 22 | * Can do n-D 23 | * Good mixed indexing 24 | 25 | 26 | Construction 27 | ************ 28 | 29 | Larrays can be constructed from an array-like object and tick names for each 30 | axis. Alternatively, Larrays can be constructed from a number of 31 | data-with-coordinates representations. 32 | 33 | 34 | Here's how to create a larry using **fromtuples** (note the cast to float, and 35 | the filled-in NaN):: 36 | 37 | >>> data = [('a', 'a', 1), ('a', 'b', 2), ('b', 'a', 3)] 38 | >>> larry.fromtuples(data) 39 | label_0 40 | a 41 | b 42 | label_1 43 | a 44 | b 45 | x 46 | array([[ 1., 2.], 47 | [ 3., NaN]]) 48 | 49 | Here are examples of **fromdict** and **fromlist**:: 50 | 51 | >>> data = {('a', 'c'): 1, ('a', 'd'): 2, ('b', 'c'): 3, ('b', 'd'): 4} 52 | >>> larry.fromdict(data) 53 | label_0 54 | a 55 | b 56 | label_1 57 | c 58 | d 59 | x 60 | array([[ 1., 2.], 61 | [ 3., 4.]]) 62 | 63 | >>> data = [[1, 2, 3, 4], [('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd')]] 64 | >>> larry.fromlist(data) 65 | label_0 66 | a 67 | b 68 | label_1 69 | c 70 | d 71 | x 72 | array([[ 1., 2.], 73 | [ 3., 4.]]) 74 | 75 | Indexing 76 | ******** 77 | 78 | Indexing using the bracket syntax arr[ ] seems to return you exactly 79 | what numpy would slice out of the underlying array. All slicing works, with the 80 | exception of "fancy" indexing, and ellipsis indexing, and the use of 81 | **np.newaxis**. 82 | 83 | There is also a smart slicer riding along with the larrays that can slice with 84 | label information. It seems to nicely blend labels and regular integer slicing. 
85 | To disambiguate possible integer labels and integer indexing, labels always
86 | must be enclosed in a list::
87 | 
88 |     >>> arr = la.larry(np.arange(6).reshape(2,3), [ ['u', 'v'], [2,5,3], ])
89 |     >>> arr
90 |     label_0
91 |         u
92 |         v
93 |     label_1
94 |         2
95 |         5
96 |         3
97 |     x
98 |     array([[0, 1, 2],
99 |            [3, 4, 5]])
100 |     >>> arr.lix[['u']]
101 |     label_0
102 |         2
103 |         5
104 |         3
105 |     x
106 |     array([0, 1, 2])
107 |     >>> arr.lix[['u'],2:5]
108 |     2
109 |     >>> arr.lix[['u'],[2]:[5]]
110 |     0
111 |     >>> arr.lix[['u'],[2]:[3]]
112 |     label_0
113 |         2
114 |         5
115 |     x
116 |     array([0, 1])
117 | 
118 | 
119 | Binary Operations (arithmetic)
120 | ******************************
121 | 
122 | Binary operations are not, in general, numpy-thonic.
123 | 
124 | Alignment
125 | ---------
126 | 
127 | Larray seems to want to only make binary operations on data with identical
128 | coordinates. Furthermore, it will re-align the data if necessary. Therefore,
129 | this example is OK::
130 | 
131 |     >>> y1 = larry([1, 2], [['a', 'z']])
132 |     >>> y2 = larry([1, 2], [['z', 'a']])
133 | 
134 | What is ``y1 + y2``?
135 | ::
136 | 
137 |     >>> y1 + y2
138 |     label_0
139 |         a
140 |         z
141 |     x
142 |     array([3, 3])
143 | 
144 | But this fails::
145 | 
146 |     >>> z1 = larry([1, 2], [['a', 'b']])
147 |     >>> z2 = larry([3, 4], [['c', 'd']])
148 | 
149 |     >>> z1 + z2
150 |     Traceback (most recent call last):
151 |       File "<stdin>", line 1, in <module>
152 |       File "la/la/deflarry.py", line 494, in __add__
153 |         x, y, label = self.__align(other)
154 |       File "la/la/deflarry.py", line 731, in __align
155 |         raise IndexError, 'A dimension has no matching labels'
156 |     IndexError: A dimension has no matching labels
157 | 
158 | Intersections and Broadcasting
159 | ------------------------------
160 | 
161 | Binary ops can introduce an implicit intersection operation, for example (this
162 | would be illegal code in numpy)::
163 | 
164 |     >>> arr = la.larry(np.arange(6).reshape(2,3), [ ['u', 'v'], ['x','y','z']])
165 |     >>> arr2 = la.larry(np.arange(9).reshape(3,3), [ ['u', 'v', 'w'], ['x', 'y', 'z']] )
166 |     >>> arr2 + arr
167 |     label_0
168 |         u
169 |         v
170 |     label_1
171 |         x
172 |         y
173 |         z
174 |     x
175 |     array([[ 0,  2,  4],
176 |            [ 6,  8, 10]])
177 | 
178 | 
179 | According to the matched-coordinates operation rule, broadcasting does not happen::
180 | 
181 |     >>> arr3 = la.larry([4,5,6], [['x','y','z']])
182 |     >>> arr3 + arr
183 |     ------------------------------------------------------------
184 |     Traceback (most recent call last):
185 |       File "<stdin>", line 1, in <module>
186 |       File "/Users/mike/usr/lib/python2.5/site-packages/la/deflarry.py", line 583, in __add__
187 |         x, y, label = self.__align(other)
188 |       File "/Users/mike/usr/lib/python2.5/site-packages/la/deflarry.py", line 820, in __align
189 |         raise IndexError, msg
190 |     IndexError: Binary operation on two larrys with different dimension
191 | 
--------------------------------------------------------------------------------
/doc/source/other_projects/pandas_overview.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Pandas
3 | ========
4 | 
5 | Overview
6 | ^^^^^^^^
7 | 
8 | Pandas provides timeseries and stack-of-timeseries objects. They
9 | seem heavily geared towards financial data. Despite the fact of
10 | **being** an ndarray, Pandas objects seem to be a specialized
11 | alternative to ndarrays rather than an augmentation of them.
12 | 
13 | Features
14 | 
15 | * **Is** an ndarray
16 | * axes are not named
17 | * Is dict-like, with respect to its indices (ticks)
18 | * If ticks are indices, semantics of indexing are ambiguous
19 | * Separate objects for 1D and 2D, no support for n>2
20 | 
21 | 
22 | Indexing
23 | ********
24 | 
25 | Point-indexing syntax can use ticks or integer indices. Range indexing
26 | only works with integers, but uses the same syntax.
27 | 
28 | Semantic Ambiguity
29 | ------------------
30 | 
31 | Integer tick values interfere with integer indexing, for example::
32 | 
33 |     >>> t = pandas.Series.fromValue(1.0, range(5,0,-1), 'i')
34 |     >>> t[:] = np.random.randint(100, size=5)
35 |     >>> t
36 |     5    23
37 |     4    62
38 |     3    66
39 |     2    91
40 |     1    91
41 |     >>> t[2] = 0
42 |     >>> t
43 |     5    23
44 |     4    62
45 |     3    66
46 |     2    0
47 |     1    91
48 | 
49 | 
50 | 
51 | Binary Operations
52 | *****************
53 | 
54 | Alignment
55 | ---------
56 | 
57 | If data is partially aligned, missing data is filled with NaNs. This
58 | introduces a union with respect to the "range" of the data. This also
59 | will **cast** the data to floating point.::
60 | 
61 |     >>> t.dtype
62 |     dtype('int32')
63 |     >>> t - t[:3]
64 |     5    0.0
65 |     4    0.0
66 |     3    0.0
67 |     2    NaN
68 |     1    NaN
69 | 
--------------------------------------------------------------------------------
/doc/source/printing.rst:
--------------------------------------------------------------------------------
1 | Printing Datarrays
2 | ==================
3 | 
4 | One of the most important ways to understand what's going on in a labeled
5 | array is to be able to see a pretty text representation of it. In Divisi2 I
6 | stole the __str__ method from PySparse to accomplish this, but NumPy arrays are
7 | more varied than PySparse (where everything is two-dimensional and made of
8 | floats).
9 | 
10 | Can we build on NumPy's str?
11 | ----------------------------
12 | 
13 | NumPy has provided somewhat-pretty text representations for a long time, but
14 | the code in numpy.core.arrayprint is
15 | 
16 | - difficult to extend
17 | - undocumented
18 | - kind of spaghetti, frankly
19 | - largely untouched for the last 13 years!
20 | 
21 | Its output can be aesthetically suboptimal in some cases. When printing large
22 | arrays of floats, for example, it will wrap every line like this::
23 | 
24 |     [[  0.00000000e+00   1.00000000e-04   2.00000000e-04 ...,   4.70000000e-03
25 |         4.80000000e-03   4.90000000e-03]
26 |      [  5.00000000e-03   5.10000000e-03   5.20000000e-03 ...,   9.70000000e-03
27 |         9.80000000e-03   9.90000000e-03]
28 |      [  1.00000000e-02   1.01000000e-02   1.02000000e-02 ...,   1.47000000e-02
29 |         1.48000000e-02   1.49000000e-02]
30 |      ...,
31 |      [  3.35000000e-01   3.35100000e-01   3.35200000e-01 ...,   3.39700000e-01
32 |         3.39800000e-01   3.39900000e-01]
33 |      [  3.40000000e-01   3.40100000e-01   3.40200000e-01 ...,   3.44700000e-01
34 |         3.44800000e-01   3.44900000e-01]
35 |      [  3.45000000e-01   3.45100000e-01   3.45200000e-01 ...,   3.49700000e-01
36 |         3.49800000e-01   3.49900000e-01]]
37 | 
38 | The user can understand what that means, but it'll be hard to stick labels on.
39 | 
40 | My conclusion is that it will be better to build this representation from the
41 | ground up.
42 | 
43 | The 2D pretty-printer
44 | ---------------------
45 | Screens are 2D, so everything is a variant of the 2D case. What we need is a
46 | class designed for printing strings in a grid. This class will then:
47 | 
48 | - Find a formatter for the dtype of the matrix (the "cell formatter").
49 | - Make an array (a string array? might as well) of equal-width string
50 |   representations
51 | - Attach row and column labels as the first row and column of the array
52 | - Join together everything into a correctly-aligned, multi-line string
53 | 
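A toy sketch of those four steps with a fixed cell width (no formatter
negotiation yet; ``format_grid`` is illustrative only)::

    import numpy as np

    def format_grid(arr, row_labels, col_labels, width=8):
        # One fixed-width string per cell; labels become the first
        # row and column; everything is joined with single spaces.
        cells = [[('%g' % v).rjust(width) for v in row] for row in arr]
        lines = [' '.join([' ' * width] + [c.rjust(width) for c in col_labels])]
        for label, row in zip(row_labels, cells):
            lines.append(' '.join([label.rjust(width)] + row))
        return '\n'.join(lines)

    print(format_grid(np.arange(6).reshape(2, 3) / 10.0,
                      ['row0', 'row1'], ['c0', 'c1', 'c2']))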
54 | The width of each cell is a negotiation between the grid formatter and the cell
55 | formatter:
56 | 
57 | - Cell: I can print these floats in 5 to 15 characters. More characters is
58 |   better, of course.
59 | - Grid: I'll give you 7.
60 | - Cell: Stingy bastard.
61 | 
62 | Maybe this could be accomplished with "small", "medium", and "large" options
63 | for each formatter, allowing us to reuse arrayprint formatters:
64 | 
65 | - float: large = high precision, medium = lower precision, small = lower
66 |   precision and suppress_small
67 | - int: large = max number of digits, medium/small = exponential notation
68 | - str: large = maximum length, medium = truncate
69 | - bool: large = ' True'/'False', medium/small = 'T'/'-' (to be visually
70 |   distinct)
71 | 
72 | Brackets are _not_ printed (it's too hard to work them in with the labels).
73 | 
74 | The 1D pretty-printer
75 | ---------------------
76 | It's the 2D printer with only one row.
77 | 
78 | The 3D pretty-printer
79 | ---------------------
80 | When people work with n-dimensional labeled data and n>2, what they often do
81 | is flatten it out into 2 dimensions. The rows are single data points, and the
82 | columns are all the indices followed by the value. Show a few of these from the
83 | beginning of the matrix, dots, and a few of these from the end of the matrix.
84 | 
85 | Then put all those back into the grid-maker.
86 | 
87 | If there are more than 30 or so dimensions, we are sad.
88 | 
--------------------------------------------------------------------------------
/examples/inference_algs.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function
2 | 
3 | import sys
4 | if sys.version_info[0] < 3:  # Use range iterator for Python 2
5 |     range = xrange
6 | from functools import reduce
7 | 
8 | import operator
9 | from itertools import combinations
10 | 
11 | import networkx as nx
12 | import numpy as np
13 | 
14 | from datarray import DataArray
15 | 
16 | from numpy.testing import assert_almost_equal
17 | 
18 | 
19 | def test_pearl_network():
20 |     """ From Russell and Norvig, "Artificial Intelligence, A Modern Approach,"
21 |     Section 15.1, originally from Pearl.
22 | 
23 |     "Consider the following situation. You have a new burglar alarm installed
24 |     at home. It is fairly reliable at detecting a burglary, but also responds
25 |     on occasion to minor earthquakes. You also have two neighbors, John and
26 |     Mary, who have promised to call you at work when they hear the alarm. John
27 |     always calls when he hears the alarm, but sometimes confuses the telephone
28 |     ringing with the alarm and calls then, too. Mary on the other hand, likes
29 |     rather loud music and sometimes misses the alarm altogether. Given the
30 |     evidence of who has or has not called, we would like to estimate the
31 |     probability of a burglary.
32 | 
33 |     Burglary         Earthquake
34 | 
35 |        \                /
36 |       _\|             |/_
37 | 
38 |             Alarm
39 | 
40 |        /                \
41 |      |/_                _\|
42 | 
43 |     Johncalls         Marycalls
44 | 
45 |     This test function uses four different algorithms to calculate
46 | 
47 |     P(burglary | johncalls = 1, marycalls = 1)
48 | 
49 |     In increasing order of sophistication:
50 |     1. Simple (calculate joint distribution and marginalize)
51 |     2.
Elimination (strategically marginalize over one variable at a time) 52 | 3. Sum-product algorithm on factor graph 53 | 4. Junction tree algorithm 54 | """ 55 | burglary = DataArray([.999,.001], axes=["burglary"]) 56 | earthquake = DataArray([.998,.002], axes=["earthquake"]) 57 | alarm = DataArray([ [[.05,.95], [.06,.94]], 58 | [[.71,.29], [.999,.001]] ], 59 | ["burglary","earthquake","alarm"]) 60 | 61 | johncalls = DataArray([[.10,.90],[.95,.05]],["alarm","johncalls"]) 62 | marycalls = DataArray([[.30,.70],[.01,.99]],["alarm","marycalls"]) 63 | 64 | cpts = [burglary, earthquake, alarm, johncalls, marycalls] 65 | 66 | evidence = {"johncalls":0, "marycalls":0} 67 | 68 | margs1,lik1 = calc_marginals_simple(cpts,evidence) 69 | p_burglary,lik2 = digraph_eliminate(cpts,evidence,["burglary"]) 70 | margs3, lik3 = calc_marginals_sumproduct(cpts, evidence, 'burglary') 71 | 72 | # TODO: This version is disabled until I can dig up the reference to figure 73 | # out how it works. -jt 74 | # margs4,lik4 = calc_marginals_jtree(cpts,evidence) 75 | 76 | # Check that all four calculations give the same p(burglary) and 77 | # likelihood, up to numerical error 78 | for (marg,lik) in \ 79 | [(p_burglary, lik2), (margs3["burglary"], lik3)]: # , (margs4["burglary"],lik4)]: 80 | assert_almost_equal(marg,margs1["burglary"]) 81 | assert_almost_equal(lik,lik1) 82 | 83 | print("p(burglary) = %s" % margs1["burglary"].__array__()) 84 | print("likelihood of observations = %.3f" % lik1) 85 | 86 | ####### DataArray utilities ################ 87 | 88 | def match_shape(x,yshape,axes): 89 | """ 90 | Creates a view v on x with the same number of dimensions as y. 91 | The axes of x are copied into the axes of v specified by the axes argument. 92 | 93 | Example 94 | --------- 95 | >>> x = np.arange(3) 96 | >>> match_shape(x,(2,3,2),(1,)) 97 | array([[[0, 0], 98 | [1, 1], 99 | [2, 2]], 100 | 101 | [[0, 0], 102 | [1, 1], 103 | [2, 2]]]) 104 | 105 | """ 106 | if isinstance(axes,int): axes = [axes] 107 | assert len(x.shape) == len(axes) 108 | assert all(xsize == yshape[yax] for xsize,yax in zip(x.shape,axes)) 109 | strides = np.zeros(len(yshape), dtype=np.intp) 110 | for yax,xstride in zip(axes,x.strides): 111 | strides[yax] = xstride 112 | return np.ndarray.__new__(np.ndarray, strides=strides, shape=yshape, buffer=x, dtype=x.dtype) 113 | 114 | def multiply_potentials(*DAs): 115 | """ 116 | Multiply DataArrays in the way that we multiply functions, 117 | e.g. h(i,j,k,l) = f(i,j,k) g(k,l) 118 | 119 | Parameters 120 | ------------- 121 | DA1,DA2,... 
: DataArrays with variable names as axis labels 122 | 123 | Returns 124 | --------- 125 | product 126 | 127 | example 128 | --------- 129 | >>> f_of_a = DataArray([1, 2],"a") 130 | >>> g_of_b = DataArray([1,-1],"b") 131 | >>> multiply_potentials(f_of_a, g_of_b) 132 | DataArray([[ 1, -1], 133 | [ 2, -2]]) 134 | ('a', 'b') 135 | >>> multiply_potentials(f_of_a, f_of_a) 136 | DataArray([1, 4]) 137 | ('a',) 138 | 139 | 140 | """ 141 | if len(DAs) == 0: return 1 142 | 143 | full_names, full_shape = [],[] 144 | for axis,size in zip(_sum(list(DA.axes) for DA in DAs), _sum(DA.shape for DA in DAs)): 145 | if axis.name not in full_names: 146 | full_names.append(axis.name) 147 | full_shape.append(size) 148 | 149 | return DataArray( 150 | _prod(match_shape(DA.copy(), full_shape, 151 | [full_names.index(axis.name) for axis in DA.axes]) for DA in DAs), 152 | axes=full_names) 153 | 154 | def sum_over_axes(DA, axis_names): 155 | Out = DA 156 | for axname in axis_names: 157 | Out = Out.sum(axis=axname) 158 | return Out 159 | 160 | def set_slices(DA,**axes2inds): 161 | """ 162 | return a copy of DataArray DA, where several slices are taken along named axes, 163 | specified by keys ax1=ind1, ax2=ind2, etc. 164 | """ 165 | Out = DA 166 | for (ax,ind) in axes2inds.items(): 167 | Out = Out.axis[ax][ind:(ind+1)] 168 | return Out 169 | 170 | def sum_over_other_axes(DA, kept_axis_name): 171 | "sum all axes of DataArray DA except for ax" 172 | return sum_over_axes(DA, 173 | [axname for axname in DA.names if axname != kept_axis_name]) 174 | 175 | def _sum(seq): return reduce(operator.add, seq) 176 | def _prod(seq): return reduce(operator.mul, seq) 177 | 178 | ####### Simple marginalization ############# 179 | 180 | def calc_marginals_simple(cpts,evidence): 181 | """ 182 | Calculate the marginal probabilities the simple simple way. Calculate joint 183 | distribution of all variables and then marginalize. This algorithm becomes 184 | inefficient when there are a lot of variables, and the joint distribution 185 | becomes high-dimensional. 186 | 187 | Parameters 188 | ----------- 189 | cpts : a list of DataArray. Gives conditional probability of variable with axis=-1 190 | evidence : a dictionary of variable -> value 191 | 192 | Returns 193 | -------- 194 | marginals : dictionary of variable -> prob_table 195 | likelihood : likelihood of observations in the model 196 | """ 197 | joint_dist = multiply_potentials(*cpts) 198 | joint_dist = joint_dist.axes.johncalls[evidence['johncalls']].axes.marycalls[evidence['marycalls']] 199 | return (dict((ax.name, normalize(sum_over_other_axes(joint_dist, ax.name))) 200 | for ax in joint_dist.axes), 201 | joint_dist.sum()) 202 | 203 | 204 | ############# Elimination ############# 205 | 206 | def digraph_eliminate(cpts,evidence,query_list): 207 | """ 208 | Use elimination algorithm to find joint distribution over variables in 209 | query_list, given evidence. 
210 | 211 | Parameters 212 | ------------ 213 | cpts : a list of DataArray with variable names for axis names 214 | evidence : a dictionary of observed variables (strings) -> values 215 | query_list : a list of variables (strings) 216 | 217 | Returns 218 | -------- 219 | marginals : dictionary of variable -> prob_table 220 | likelihood : likelihood of observations in the model 221 | """ 222 | 223 | # find the directed graphical model 224 | DG = cpts2digraph(cpts) 225 | # use postorder (leaves to root) from depth-first search as elimination order 226 | rvs = nx.dfs_postorder_nodes(DG) 227 | 228 | # modify elimination list so query nodes are at the end 229 | rvs_elim = [rv for rv in rvs if rv not in query_list] + query_list 230 | for rv in rvs_elim: 231 | # find potentials that reference that node 232 | pots_here = [cpt for cpt in cpts if rv in cpt.names] 233 | # remove them from cpts 234 | cpts = [cpt for cpt in cpts if rv not in cpt.names] 235 | # Find joint probability distribution of this variable and the ones coupled to it 236 | product_pot = multiply_potentials(*pots_here) 237 | # if node is in query set, we don't sum over it 238 | if rv not in query_list: 239 | # if node is in evidence set, take slice 240 | if rv in evidence: product_pot = product_pot.axes(rv)[evidence[rv]] 241 | # otherwise, sum over it 242 | else: product_pot = product_pot.sum(axis=rv) 243 | 244 | # add resulting product potential to cpts 245 | cpts.append(product_pot) 246 | 247 | assert len(cpts) == 1 248 | unnormed_prob = cpts[0] 249 | likelihood = unnormed_prob.sum() 250 | return unnormed_prob/likelihood, likelihood 251 | 252 | def cpts2digraph(cpts): 253 | """ 254 | Each cpt has axes a_1,a_2,...a_k and represents p(a_k | a_1,...a_{k-1}). 255 | Use cpts to construct directed graph corresponding to these conditional 256 | probability dists. 257 | """ 258 | G = nx.DiGraph() 259 | for cpt in cpts: 260 | names = [ax.name for ax in cpt.axes] 261 | target = names[-1] 262 | G.add_edges_from((source, target) for source in names[:-1]) 263 | return G 264 | 265 | ############# Sum-product ############# 266 | 267 | def calc_marginals_sumproduct(cpts, evidence, target_node): 268 | """ 269 | Construct the factor graph. Then use the sum-product algorithm to calculate 270 | marginals for all variables. 271 | 272 | Parameters 273 | ------------ 274 | cpts : a list of DataArray with variable names for axis labels 275 | evidence : a dictionary of observed variables (strings) -> values 276 | target_node : str 277 | Target node from which to calculate likelihood 278 | 279 | Returns 280 | -------- 281 | marginals : dictionary of variable -> prob_table 282 | likelihood : likelihood of observations in the model 283 | """ 284 | 285 | # In this implementation, we use evidence by using an evidence potential, 286 | # which equals 1 at the observed value and zero everywhere else. 287 | # Alternatively, we could take slices of cpts. This is the strategy used in 288 | # the junction tree algorithm below. 
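    # For example (illustrative numbers; see one_hot below): with evidence
    # {'johncalls': 0} on a two-state variable, the evidence potential is
    # DataArray([1., 0.], ['johncalls']), which zeroes out the probability
    # mass on the unobserved value when multiplied in.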
289 | 
290 |     G,names2tables = make_factor_graph(cpts,evidence)
291 |     messages = {}
292 |     # (source,target) for edges in directed spanning tree resulting from depth
293 |     # first search
294 |     message_pairs = dfs_edges(G)
295 | 
296 |     # message passing inward from leaves (actually we don't need to send
297 |     # messages up from some leaves because cpt is normalized)
298 |     for (parent,child) in message_pairs:
299 |         m = make_message(child,parent,G,messages,names2tables)
300 |         messages[(child,parent)] = m
301 | 
302 |     # message passing outward from root
303 |     for (parent,child) in reversed(message_pairs):
304 |         m = make_message(parent,child,G,messages,names2tables)
305 |         messages[(parent,child)] = m
306 | 
307 |     # calculate marginals
308 |     marginals = {}
309 |     potentials = {}
310 |     for node in G.nodes():
311 |         potential = multiply_potentials(*[messages[(src,node)] for src in G.neighbors(node)])
312 |         marginals[node] = normalize(potential)
313 |         potentials[node] = potential
314 | 
315 |     return marginals, potentials[target_node].sum()
316 | 
317 | def make_message(src,targ,G,messages,names2tables):
318 |     """
319 |     Collect messages coming to src from all nodes other than targ and multiply them.
320 |     If targ is a factor node, this product is the message.
321 |     If targ is a variable node, marginalize the product over all variables other than targ.
322 |     """
323 |     # collect messages incoming to src
324 |     incoming_msgs = [messages[(neighb,src)] for neighb in G.neighbors(src) if neighb != targ]
325 |     if isvar2factor(src,targ): return multiply_potentials(names2tables[src],*incoming_msgs)
326 |     return sum_over_other_axes(multiply_potentials(names2tables[src],*incoming_msgs),targ)
327 | 
328 | def isvar2factor(src,targ):
329 |     "True if the message goes from a variable node to a factor node (factor nodes are tuples)."
330 |     return isinstance(targ,tuple)
331 | 
332 | def make_factor_graph(cpts,evidence):
333 |     G = nx.Graph()
334 | 
335 |     names2factors = dict((tuple(cpt.names), cpt) for cpt in cpts)
336 |     G.add_nodes_from(names2factors.keys())
337 |     for (name,factor) in names2factors.items():
338 |         for axname in factor.names:
339 |             G.add_edge(name, axname)
340 | 
341 |     names2factors.update(
342 |         dict((name,
343 |               DataArray(np.ones(size) if name not in evidence
344 |                         else one_hot(size,evidence[name]),[name]))
345 |              for cpt in cpts
346 |              for (name,size) in zip(cpt.names,cpt.shape)))
347 | 
348 |     return G, names2factors
349 | 
350 | def one_hot(size,val):
351 |     "out[val] = 1, out[i] = 0 for i != val"
352 |     out = np.zeros(size)
353 |     out[val] = 1
354 |     return out
355 | 
356 | def dfs_edges(G):
357 |     """
358 |     Return (source,target) pairs for the edges of the directed spanning tree
359 |     produced by depth-first search, with deeper targets listed first.
360 |     """
361 |     DG = nx.dfs_tree(G, source=None)
362 |     return [(src,targ) for targ in nx.dfs_postorder_nodes(DG) for src in DG.predecessors(targ)]
363 | 
364 | 
365 | ############# Junction tree #############
366 | 
367 | ## Applying the junction tree algorithm to a directed graphical model requires several steps:
368 | ## 1. Moralize the directed graph.
369 | ## 2. Add edges to obtain a triangulated graph. It is hard to find the best triangulation
370 | ##    (i.e., the one that adds as few edges as possible), so we use a greedy heuristic.
371 | ## 3. Form a clique tree for the triangulated graph. Assign potentials to cliques.
372 | ## 4. Apply the Hugin algorithm to the clique tree.
373 | 
374 | 
375 | def calc_marginals_jtree(potentials, evidence):
376 |     """
377 |     Use the Hugin algorithm to find marginals and the data likelihood.
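    Hugin propagation makes two sweeps over the junction tree: an inward pass
    that collects evidence from the leaves toward a root, and an outward pass
    that distributes it back, rescaling each clique by the ratio of its new to
    its old separator potential (see hugin() below).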
378 |     """
379 |     JT, names2factors = make_jtree_from_factors(potentials)
380 |     pots = hugin(JT, names2factors, evidence)
381 | 
382 |     # Each random variable appears in many cliques and separators. After propagation,
383 |     # each of these potentials is a joint distribution, so they all give the same marginals.
384 |     rv2marg = {}
385 |     for pot in pots.values():
386 |         for rv in pot.labels:
387 |             if rv not in rv2marg:
388 |                 rv2marg[rv] = normalize(sum_over_other_axes(pot,rv))
389 | 
390 |     return rv2marg, pot.sum()  # any propagated potential sums to the data likelihood
391 | 
392 | def hugin(JT,names2factors,evidence):
393 | 
394 |     # initialize potentials, taking slices to incorporate evidence
395 |     potentials = dict([(name,use_evidence(factor,evidence))
396 |                        for (name,factor) in names2factors.items()])
397 | 
398 |     message_pairs = dfs_edges(JT)
399 |     # iterate over edges of clique tree
400 |     for (pred,succ) in message_pairs:
401 |         sep = tuple(set(pred).intersection(succ))
402 |         sepname = (pred,succ)
403 |         # update separator
404 |         potentials[sepname] = sum_over_axes(potentials[succ],set(succ).difference(sep))
405 |         # update predecessor clique
406 |         potentials[pred] = multiply_potentials(potentials[pred],potentials[sepname])
407 | 
408 |     for (pred,succ) in reversed(message_pairs):
409 |         sep = tuple(set(pred).intersection(succ))
410 |         sepname = (pred,succ)
411 |         # update separator
412 |         oldsep = potentials[sepname]
413 |         potentials[sepname] = sum_over_axes(potentials[pred],set(pred).difference(sep))
414 |         # update successor clique
415 |         potentials[succ] = multiply_potentials(potentials[succ],1/oldsep,potentials[sepname])
416 | 
417 |     return potentials
418 | 
419 | def use_evidence(potential,ev_dict):
420 |     "Take slices of potential at all variables appearing in ev_dict."
421 |     obs_dict = dict((label,ev_dict[label]) for label in potential.labels if label in ev_dict)
422 |     return set_slices(potential,**obs_dict) if len(obs_dict) > 0 else potential
423 | 
424 | def triangulate_min_fill(G):
425 |     """
426 |     Return a triangulated copy of undirected graph G, built by greedy elimination. (NB:
427 |     despite the name, this eliminates the minimum-degree node at each step, not the true
428 |     min-fill node.) Elimination always triangulates: connecting an eliminated node's neighbors chords every cycle through it.
429 |     """
430 |     G_elim = nx.Graph(G.edges())
431 |     added_edges = []
432 |     for _ in range(G.number_of_nodes()):
433 |         nodes,degrees = zip(*G_elim.degree().items())
434 |         min_deg_node = nodes[np.argmin(degrees)]
435 |         new_edges = [(n1,n2) for (n1,n2) in
436 |                      combinations(G_elim.neighbors(min_deg_node),2) if not
437 |                      G_elim.has_edge(n1,n2)]
438 |         added_edges.extend(new_edges)
439 |         G_elim.remove_node(min_deg_node)
440 |         G_elim.add_edges_from(new_edges)
441 | 
442 |     return nx.Graph(G.edges() + added_edges)
443 | 
444 | def make_jtree_from_tri_graph(G):
445 |     """Return the junction tree (an undirected networkx Graph) for triangulated graph G."""
446 | 
447 |     # clique graph
448 |     CG = nx.Graph()
449 |     # A maximum weight spanning tree of the clique graph is guaranteed to be a junction tree
450 |     # (i.e., it satisfies the running intersection property), where weight is the size of
451 |     # the intersection between adjacent cliques.
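    # Note (added comment): networkx computes *minimum* spanning trees, so the edge
    # weights below are negated -- the minimum spanning tree under weight
    # -len(intersection) is exactly the maximum weight spanning tree under
    # +len(intersection).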
452 |     CG.add_weighted_edges_from((tuple(c1),tuple(c2),-c1c2)
453 |                                for (c1,c2) in combinations(nx.find_cliques(G),2)
454 |                                for c1c2 in [len(set(c1).intersection(set(c2)))] if c1c2 > 0)
455 |     JT = nx.Graph(nx.minimum_spanning_tree(CG))  # min spanning tree of negated weights = max weight spanning tree
456 |     for src,targ in JT.edges():
457 |         JT[src][targ]["sep"] = tuple(set(src).intersection(set(targ)))
458 | 
459 |     return JT
460 | 
461 | def make_jtree_from_factors(factors):
462 |     """
463 |     Make junction tree and assign factors to cliques.
464 |     1. Moralize
465 |     2. Triangulate
466 |     3. Take MST of clique graph to get junction tree
467 |     4. Assign factors to cliques and multiply them to get clique potentials
468 | 
469 |     Parameters
470 |     ----------
471 |     factors : list of DataArray
472 | 
473 |     Returns
474 |     -------
475 |     JT : junction tree (undirected graph), with nodes labeled by tuples, e.g. ("A","B","C")
476 |     clique2pot : dictionary of cliques (i.e., node labels) -> DataArray
477 |     """
478 |     VarGraph = moral_graph_from_factors(factors)
479 |     TriangulatedGraph = triangulate_min_fill(VarGraph)
480 |     JT = make_jtree_from_tri_graph(TriangulatedGraph)
481 |     clique2potlist = dict((node,[]) for node in JT.nodes())
482 |     for factor in factors:
483 |         varset = set(factor.labels)
484 |         for clique in JT:
485 |             if varset.issubset(set(clique)):
486 |                 clique2potlist[clique].append(factor)
487 |                 break  # assign each factor to exactly one clique
488 |     clique2pot = dict((clique,multiply_potentials(*potlist)) for (clique,potlist) in clique2potlist.items())
489 |     # todo: make sure all cliques have a potential
490 |     return JT,clique2pot
491 | 
492 | def moral_graph_from_factors(factors):
493 |     G = nx.Graph()  # connect every pair of variables that share a factor
494 |     for factor in factors:
495 |         for label1,label2 in combinations(factor.names, 2):
496 |             G.add_edge(label1,label2)
497 | 
498 |     return G
499 | 
500 | def normalize(arr):
501 |     return arr/arr.sum()
502 | 
503 | if __name__ == "__main__":
504 |     test_pearl_network()
505 |     #import doctest
506 |     #doctest.testmod()
507 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements for datarray package
2 | # Use with:
3 | #     pip install -r requirements.txt
4 | 
5 | numpy>=1.7
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [wheel]
2 | # Build wheels compatible with Python 2 and Python 3
3 | universal = 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Setup file for the Python datarray package."""
3 | 
4 | import os
5 | 
6 | # BEFORE importing distutils, remove MANIFEST. distutils doesn't properly
7 | # update it when the contents of directories change.
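8 | # (With a stale MANIFEST, sdist can reuse an out-of-date file list, so the
9 | # resulting tarball may silently miss newly added files.)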
10 | if os.path.exists('MANIFEST'): os.remove('MANIFEST')
11 | 
12 | # Commit to setuptools; importing it patches distutils so that options like install_requires work
13 | import setuptools
14 | 
15 | from distutils.core import setup
16 | 
17 | # Get version and release info, which is all stored in datarray/version.py
18 | ver_file = os.path.join('datarray', 'version.py')
19 | # Use exec on the file contents for Python 3 compatibility
20 | with open(ver_file, 'rt') as fobj:
21 |     exec(fobj.read())
22 | 
23 | opts = dict(name=NAME,
24 |             maintainer=MAINTAINER,
25 |             maintainer_email=MAINTAINER_EMAIL,
26 |             description=DESCRIPTION,
27 |             long_description=LONG_DESCRIPTION,
28 |             url=URL,
29 |             download_url=DOWNLOAD_URL,
30 |             license=LICENSE,
31 |             classifiers=CLASSIFIERS,
32 |             author=AUTHOR,
33 |             author_email=AUTHOR_EMAIL,
34 |             platforms=PLATFORMS,
35 |             version=VERSION,
36 |             packages=PACKAGES,
37 |             package_data=PACKAGE_DATA,
38 |             requires=REQUIRES,
39 |             install_requires=INSTALL_REQUIRES,
40 |             zip_safe=False,
41 |             )
42 | 
43 | 
44 | # Now call the actual setup function
45 | if __name__ == '__main__':
46 |     setup(**opts)
47 | 
--------------------------------------------------------------------------------
/tools/release.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Simple release script for datarray.
3 | 
4 | Ensure that you've built the docs and pushed those first (after verifying them
5 | manually).
6 | """
7 | from __future__ import print_function
8 | 
9 | import os
10 | import sys
11 | from subprocess import call
12 | 
13 | sh = lambda s: call(s, shell=True)
14 | 
15 | cwd = os.getcwd()
16 | if not os.path.isfile('setup.py'):
17 |     os.chdir('..')
18 |     if not os.path.isfile('setup.py'):
19 |         print("This script must be run from top-level datarray or tools dir.")
20 |         sys.exit(1)
21 | 
22 | 
23 | sh('./setup.py register')
24 | sh('./setup.py sdist --formats=gztar,zip upload')
--------------------------------------------------------------------------------