├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── pyproject.toml ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── split_dataset ├── __init__.py ├── blocks.py └── split_dataset.py └── tests ├── __init__.py ├── test_blocks.py └── test_split_dataset.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * Split Dataset version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: push 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: Set up Python 11 | uses: actions/setup-python@v2 12 | with: 13 | python-version: 3.8 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install .[dev] 18 | - name: Lint 19 | run: | 20 | black . --check 21 | flake8 . 22 | isort . 
--check 23 | test: 24 | needs: lint 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Set up Python 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: 3.8 32 | - name: Install dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install .[dev] 36 | - name: Test 37 | run: pytest --cov 38 | - name: Coveralls 39 | env: 40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 41 | run: | 42 | pip install coveralls 43 | coveralls --service=github 44 | 45 | deploy: 46 | needs: test 47 | runs-on: ubuntu-latest 48 | if: startsWith(github.ref, 'refs/tags/v') 49 | steps: 50 | - uses: actions/checkout@v2 51 | - name: Set up Python 52 | uses: actions/setup-python@v1 53 | with: 54 | python-version: "3.x" 55 | - name: Install dependencies 56 | run: | 57 | python -m pip install --upgrade pip 58 | pip install -U setuptools setuptools_scm wheel twine 59 | - name: Build and publish 60 | env: 61 | TWINE_USERNAME: __token__ 62 | TWINE_PASSWORD: ${{ secrets.TWINE_API_KEY }} 63 | run: | 64 | git tag 65 | python setup.py sdist bdist_wheel 66 | twine upload dist/* 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # IDE settings 105 | .vscode/ 106 | 107 | # Idea 108 | .idea/ 109 | \.idea/ 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | A package for HDF5-based chunked arrays 5 | Copyright (C) 2020 Vilim Stih & Luigi Petrucco @portugueslab 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | 20 | Also add information on how to contact you by electronic and paper mail. 21 | 22 | You should also get your employer (if you work as a programmer) or school, 23 | if any, to sign a "copyright disclaimer" for the program, if necessary. 24 | For more information on this, and how to apply and follow the GNU GPL, see 25 | . 26 | 27 | The GNU General Public License does not permit incorporating your program 28 | into proprietary programs. If your program is a subroutine library, you 29 | may consider it more useful to permit linking proprietary applications with 30 | the library. If this is what you want to do, use the GNU Lesser General 31 | Public License instead of this License. But first, please read 32 | . 
33 | 34 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | 4 | recursive-include tests * 5 | recursive-exclude * __pycache__ 6 | recursive-exclude * *.py[co] 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Python Version](https://img.shields.io/pypi/pyversions/split_dataset.svg)](https://pypi.org/project/split_dataset) 3 | [![PyPI](https://img.shields.io/pypi/v/split_dataset.svg)]( 4 | https://pypi.python.org/pypi/split_dataset) 5 | [![Tests](https://img.shields.io/github/workflow/status/portugueslab/split_dataset/tests)]( 6 | https://github.com/portugueslab/split_dataset/actions) 7 | [![Coverage Status](https://coveralls.io/repos/github/portugueslab/split_dataset/badge.svg?branch=master)](https://coveralls.io/github/portugueslab/split_dataset?branch=master) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) 9 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 10 | 11 | 12 | 13 | A minimal package for saving and reading large HDF5-based chunked arrays. 14 | 15 | This package has been developed in the [`Portugues lab`](http://www.portugueslab.com) for volumetric calcium imaging data. `split_dataset` is extensively used in the calcium imaging analysis package [`fimpy`](https://github.com/portugueslab/fimpy); the microscope control libraries [`sashimi`](https://github.com/portugueslab/sashimi) and [`brunoise`](https://github.com/portugueslab/brunoise) save files as split datasets. 16 | 17 | [`napari-split-dataset`](https://github.com/portugueslab/napari-split-dataset) supports the visualization of SplitDatasets in `napari`. 18 | 19 | ## Why use split datasets? 20 | Split datasets are numpy-like arrays saved over multiple h5 files. The concept of split datasets is similar to that of e.g. [zarr arrays](https://zarr.readthedocs.io/en/stable/); however, relying on h5 files allows for partial reading even within the same file, which is crucial for visualizing volumetric time series, the main application `split_dataset` has been developed for (see [this discussion](https://github.com/zarr-developers/zarr-python/issues/521) on the limitations of zarr arrays). 21 | 22 | # Structure of a split dataset 23 | A split dataset is a folder containing multiple, numbered h5 files (one file per chunk) and a metadata json file with information on the shape of the full dataset and of its chunks. 24 | The h5 files are saved using the [flammkuchen](https://github.com/portugueslab/flammkuchen) library (formerly [deepdish](https://deepdish.readthedocs.io/en/latest/)). Each file contains a dictionary with the data under the `stack` keyword. 25 | 26 | `SplitDataset` objects can then be instantiated from the dataset path, and numpy-style indexing can be used to load data as numpy arrays. Any number of dimensions and block sizes are supported in principle; the package has been used mainly with 3D and 4D arrays.
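For illustration, a split dataset folder could look like the sketch below (the folder name and shapes are made-up examples); individual chunks can also be opened directly with `flammkuchen`:

```python
# Hypothetical layout of a split dataset folder:
#
#   my_dataset/
#   ├── stack_metadata.json   # shape of the full dataset, shape of the blocks, etc.
#   ├── 0000.h5               # one chunk per file
#   ├── 0001.h5
#   └── 0002.h5

import flammkuchen as fl

# Each chunk stores its data under a "stack" key (older files may use e.g. "stack_4D"),
# so a single chunk can be inspected without going through a SplitDataset object:
chunk = fl.load("my_dataset/0000.h5", "/stack")
print(chunk.shape)
```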
27 | 28 | 29 | 30 | ## Minimal example 31 | ```python 32 | # Open an existing split dataset via a SplitDataset object: 33 | from split_dataset import SplitDataset 34 | ds = SplitDataset(path_to_dataset) 35 | 36 | # Retrieve data in an interval: 37 | data_array = ds[n_start:n_end, :, :, :] 38 | ``` 39 | 40 | ## Creating split datasets 41 | New split datasets can be created with the `split_dataset.save_to_split_dataset` function, provided that the original data is fully loaded in memory. Alternatively, e.g. for time acquisitions, a split dataset can be saved one chunk at a time: it is enough to save correctly formatted .h5 files with `flammkuchen`, together with the corresponding json metadata file describing the full split dataset shape (this is [what happens in sashimi](https://github.com/portugueslab/sashimi/blob/01046f2f24483ab702be379843a1782ababa7d2d/sashimi/processes/streaming_save.py#L186)). 42 | 43 | 44 | # TODO 45 | * provide utilities for partial saving of split datasets 46 | * support for more advanced indexing (step and vector indexing) 47 | * support for cropping a `SplitDataset` 48 | * support for resolution and frequency metadata 49 | 50 | 51 | # History 52 | 53 | ### 0.4.0 (2021-03-23) 54 | * Added support to use a `SplitDataset` as data in a `napari` layer. 55 | 56 | ... 57 | 58 | ### 0.1.0 (2020-05-06) 59 | * First release on PyPI. 60 | 61 | 62 | Credits 63 | ------- 64 | 65 | Part of this package was inspired by [Cookiecutter](https://github.com/audreyr/cookiecutter) and [this](https://github.com/audreyr/cookiecutter-pypackage) template. 66 | 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | target-version = ['py36', 'py37', 'py38'] 3 | skip-string-normalization = false 4 | exclude = ''' 5 | ( 6 | /( 7 | \.eggs 8 | | \.git 9 | | \.hg 10 | | \.mypy_cache 11 | | \.tox 12 | | \.venv 13 | | _build 14 | | buck-out 15 | | build 16 | | dist 17 | | examples 18 | )/ 19 | ) 20 | ''' 21 | 22 | [tool.isort] 23 | multi_line_output = 3 24 | include_trailing_comma = true 25 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip 2 | bump2version 3 | wheel 4 | flake8 5 | coverage 6 | Sphinx 7 | twine 8 | black 9 | isort 10 | 11 | pytest 12 | pytest-runner 13 | pytest-cov 14 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.4.3 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version="{current_version}" 8 | replace = version="{new_version}" 9 | 10 | [bumpversion:file:split_dataset/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | ignore = E203, W503 19 | max-line-length = 88 20 | exclude = __init__.py 21 | 22 | [aliases] 23 | test = pytest 24 | 25 | [tool:pytest] 26 | collect_ignore = ['setup.py'] 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4
| 5 | from setuptools import find_packages, setup 6 | 7 | with open("README.md") as readme_file: 8 | readme = readme_file.read() 9 | 10 | 11 | requirements = ["flammkuchen", "numpy"] 12 | 13 | with open("requirements_dev.txt") as f: 14 | requirements_dev = f.read().splitlines() 15 | 16 | setup( 17 | author="Vilim Stih & Luigi Petrucco @portugueslab", 18 | author_email="luigi.petrucco@gmail.com", 19 | python_requires=">=3.5", 20 | classifiers=[ 21 | "Development Status :: 2 - Pre-Alpha", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 24 | "Natural Language :: English", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.5", 27 | "Programming Language :: Python :: 3.6", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | ], 31 | description="A package for HDF5-based chunked arrays", 32 | install_requires=requirements, 33 | extras_require=dict(dev=requirements_dev), 34 | license="GNU General Public License v3", 35 | long_description=readme, 36 | long_description_content_type="text/markdown", 37 | include_package_data=True, 38 | keywords="split_dataset", 39 | name="split_dataset", 40 | packages=find_packages(include=["split_dataset", "split_dataset.*"]), 41 | test_suite="tests", 42 | url="https://github.com/portugueslab/split_dataset", 43 | version="0.4.3", 44 | zip_safe=False, 45 | ) 46 | -------------------------------------------------------------------------------- /split_dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for Split Dataset.""" 2 | 3 | __author__ = """Vilim Stih & Luigi Petrucco @portugueslab""" 4 | __version__ = "0.4.3" 5 | 6 | from split_dataset.blocks import Blocks 7 | from split_dataset.split_dataset import ( 8 | EmptySplitDataset, 9 | SplitDataset, 10 | save_to_split_dataset, 11 | ) 12 | -------------------------------------------------------------------------------- /split_dataset/blocks.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | from typing import Optional, Tuple, Union 3 | 4 | import numpy as np 5 | 6 | 7 | def _drop_ith(xs, dim_to_drop): 8 | return tuple(x for i, x in enumerate(xs) if i != dim_to_drop) 9 | 10 | 11 | class BlockIterator: 12 | def __init__(self, blocks, slices=True): 13 | self.blocks = blocks 14 | self.current_block = 0 15 | self.slices = slices 16 | 17 | def __iter__(self): 18 | return self 19 | 20 | def __next__(self): 21 | if self.current_block == self.blocks.n_blocks: 22 | raise StopIteration 23 | else: 24 | idx = self.blocks.linear_to_cartesian(self.current_block) 25 | self.current_block += 1 26 | if self.slices: 27 | return ( 28 | idx, 29 | tuple( 30 | slice(s, e) 31 | for s, e in zip( 32 | self.blocks.block_starts[idx], self.blocks.block_ends[idx] 33 | ) 34 | ), 35 | ) 36 | else: 37 | return ( 38 | idx, 39 | tuple( 40 | (s, e) 41 | for s, e in zip( 42 | self.blocks.block_starts[idx], self.blocks.block_ends[idx] 43 | ) 44 | ), 45 | ) 46 | 47 | 48 | def _make_iterable(input_var, n_rep=1): 49 | try: 50 | iter(input_var) 51 | return input_var 52 | except TypeError: 53 | return (input_var,) * n_rep 54 | 55 | 56 | class Blocks: 57 | """ 58 | Blocks have two indexing systems: 59 | - linear: 60 | - cartesian: gives the position of the block in the general block tiling. 
61 | """ 62 | 63 | def __init__( 64 | self, 65 | shape_full: Tuple, 66 | shape_block: Optional[Tuple] = None, 67 | dim_split: Optional[int] = None, 68 | blocks_number: Optional[int] = None, 69 | padding: Union[int, Tuple] = 0, 70 | crop: Optional[Tuple] = None, 71 | ): 72 | """Make a block structure. It can be defined using block size or number 73 | of blocks (number of blocks if specified will overwrite size). 74 | For example, one split over the 2nd and 3rd dimensions of a 100x20x40x10 block 75 | can equivalently defined as: 76 | BlockSplitter((100,20,40,10), block_size=(10,10,20,30)) 77 | BlockSplitter((100,20,40,10), blocks_number=(1, 2, 2, 1)) 78 | BlockSplitter((100,20,40,10), dim_split=(1,2), block_size=(10,20)) 79 | BlockSplitter((100,20,40,10), dim_split=(1,2), blocks_number=(2,2)) 80 | 81 | :param shape_full: dimensions of the whole stack 82 | :param dim_split: dimension along which to split (if undefined, start 83 | counting from the first dimension) 84 | :param shape_block: size of blocks along each dimension 85 | :param blocks_number: number of blocks along each dimension 86 | :param padding: amount of overlap between blocks 87 | :param crop: iterable of tuples giving the amount of cropping in 88 | each dimension 89 | """ 90 | self._shape_full = shape_full 91 | 92 | if crop is None: 93 | crop = ((0, 0),) * len(shape_full) if shape_full is not None else None 94 | 95 | self._crop = crop 96 | self.shape_cropped = shape_full 97 | 98 | self.starts = None 99 | self.block_starts = None 100 | self.block_ends = None 101 | 102 | self.update_stack_dims() 103 | 104 | # Define shape block and padding allowing multiple input types. 105 | 106 | # Initialize block size as full stack size and 0 padding: 107 | self._shape_block = list(self.shape_cropped) 108 | self._padding = [0 for _ in range(len(self.shape_cropped))] 109 | 110 | if not dim_split: 111 | dim_split = [j for j, d in enumerate(shape_block) if d is not None] 112 | 113 | # Make tuple if single numbers 114 | self.dim_split = _make_iterable(dim_split) 115 | shape_block = _make_iterable(shape_block, max(self.dim_split) + 1) 116 | pad_amount = _make_iterable(padding, max(self.dim_split) + 1) 117 | 118 | if blocks_number: # define from required number of blocks 119 | shape_block = [] 120 | blocks_number = _make_iterable(blocks_number, len(self.dim_split)) 121 | for dim, n in zip(self.dim_split, blocks_number): 122 | shape_block.append(int(np.ceil(self.shape_cropped[dim] / n))) 123 | 124 | for dim in self.dim_split: 125 | self._shape_block[dim] = min(shape_block[dim], self.shape_cropped[dim]) 126 | self._padding[dim] = pad_amount[dim] 127 | 128 | # set property: 129 | self.shape_block = tuple(self._shape_block) 130 | 131 | @property 132 | def n_blocks(self): 133 | return np.product(self.block_starts.shape[:-1]) 134 | 135 | @property 136 | def n_dims(self): 137 | return len(self.shape_cropped) 138 | 139 | @property 140 | def shape_full(self): 141 | return self._shape_full 142 | 143 | @shape_full.setter 144 | def shape_full(self, value): 145 | self._shape_full = value 146 | self.update_stack_dims() 147 | self.update_block_structure() 148 | 149 | @property 150 | def crop(self): 151 | return self._crop 152 | 153 | @crop.setter 154 | def crop(self, value): 155 | if value is None: 156 | value = ((0, 0),) * len(self.shape_full) 157 | self._crop = value 158 | self.update_stack_dims() 159 | self.update_block_structure() 160 | 161 | @property 162 | def shape_block(self): 163 | return self._shape_block 164 | 165 | @shape_block.setter 166 | def 
shape_block(self, value): 167 | self._shape_block = value 168 | self.update_block_structure() 169 | 170 | @property 171 | def padding(self): 172 | return self._padding 173 | 174 | @padding.setter 175 | def padding(self, value): 176 | self._padding = value 177 | self.update_block_structure() 178 | 179 | def update_stack_dims(self): 180 | """Update stack dimensions and cropping, if shape_full or cropping 181 | is changed. 182 | :return: 183 | """ 184 | 185 | if self.shape_full is not None: 186 | self.shape_cropped = tuple( 187 | d - cl - ch for d, (cl, ch) in zip(self.shape_full, self.crop) 188 | ) 189 | self.starts = tuple(cl for cl, ch in self.crop) 190 | 191 | def update_block_structure(self): 192 | """ 193 | Update the Blocks structure, e.g. when block 194 | shape or padding are changed. 195 | """ 196 | # Cartesian product for generating a list of indexes on every split 197 | # dimension (i.e., dimensions where int(np.ceil(stack_size / block_size) 198 | # is != 1). 199 | # For example, splitting one time in 2nd and 3rd dims, 200 | # idx_blocks = (0, 0, 0, 0), (0, 0, 1, 0), (0, 1, 0, 0), (0, 1, 1, 0). 201 | 202 | # block_starts and block_ends will be arrays of shape 203 | # (n_blocks_dim0, n_blocks_dim1, n_blocks_dim2 ..., shape_full) 204 | # by addressing the N-1 dimensions with the index of the block we 205 | # will get a vector with the starting position of the block on all 206 | # original dimensions of the full stack. 207 | if self.shape_block is not None: 208 | self.block_starts = np.empty( 209 | tuple( 210 | int(np.ceil((stack_size - pad_size) / block_size)) 211 | for stack_size, block_size, pad_size in zip( 212 | self.shape_cropped, self.shape_block, self.padding 213 | ) 214 | ) 215 | + (len(self.shape_cropped),), 216 | dtype=np.int32, 217 | ) 218 | self.block_ends = np.empty_like(self.block_starts) 219 | for idx_blocks in product( 220 | *(range(s) for s in self.block_starts.shape[:-1]) 221 | ): 222 | self.block_starts[idx_blocks + (slice(None),)] = [ 223 | st + i_bd * bs 224 | for i_bd, bs, st in zip(idx_blocks, self.shape_block, self.starts) 225 | ] 226 | self.block_ends[idx_blocks + (slice(None),)] = [ 227 | min(maxdim + st, (i_bd + 1) * bs + pd + st) 228 | for i_bd, bs, pd, maxdim, st in zip( 229 | idx_blocks, 230 | self.shape_block, 231 | self.padding, 232 | self.shape_cropped, 233 | self.starts, 234 | ) 235 | ] 236 | 237 | def slices(self, as_tuples=False): 238 | return BlockIterator(self, slices=not as_tuples) 239 | 240 | def linear_to_cartesian(self, lin_idx): 241 | """ 242 | Convert block linear index into cartesian index. 243 | Example: in a 3D stack split in 2x2x3 blocks, 244 | 245 | self.linear_to_cartesian(0) = (0,0,0) # first block 246 | bs.linear_to_cartesian(11) = (1,1,2) # last block 247 | :param lin_idx: block linear index (int) 248 | :return: block cartesian index (tuple of ints) 249 | """ 250 | return np.unravel_index(lin_idx, self.block_starts.shape[:-1]) 251 | 252 | def cartesian_to_linear(self, ca_idx): 253 | """ 254 | Convert block cartesian index in linear index. 
255 | Example: in a 3D stack split in 2x2x3 blocks 256 | 257 | self.cartesian_to_linear0,0,0) = 0 # first block 258 | bs.cartesian_to_linear(1,1,2) = 11 # last block 259 | 260 | :param ca_idx: block cartesian index (tuple of ints) 261 | :return: block linear index (int) 262 | """ 263 | return np.ravel_multi_index(ca_idx, self.block_starts.shape[:-1]) 264 | 265 | def __getitem__(self, item): 266 | """ 267 | :param item: 268 | :return: 269 | """ 270 | # TODO make less brittle, support also indexing by tuples 271 | 272 | # TODO decide what should be returned: slices are tricky 273 | # with multiprocessing 274 | if isinstance(item, int): 275 | idx = self.linear_to_cartesian(item) 276 | return tuple( 277 | slice(s, e) 278 | for s, e in zip(self.block_starts[idx], self.block_ends[idx]) 279 | ) 280 | 281 | def neighbour_blocks(self, i_block, dims=None): 282 | """ 283 | Return neighbouring blocks across given dimensions 284 | :param i_block: 285 | :param dims: 286 | :return: 287 | """ 288 | block_idx = self.linear_to_cartesian(i_block) 289 | act_dims = np.ones(self.n_dims, dtype=bool) 290 | if dims is not None: 291 | act_dims[dims] = True 292 | 293 | neighbors = [] 294 | for idx_neighbour in product( 295 | *[ 296 | ( 297 | range( 298 | max(block_idx[i_dim] - 1, 0), 299 | min(block_idx[i_dim] + 1, self.block_starts.shape[i_dim]), 300 | ) 301 | if act_dims[i_dim] 302 | else [block_idx[i_dim]] 303 | ) 304 | for i_dim in range(self.n_dims) 305 | ] 306 | ): 307 | if idx_neighbour != block_idx: 308 | neighbors.append(idx_neighbour) 309 | if neighbors: 310 | return np.ravel_multi_index( 311 | np.stack(neighbors, 1), self.block_starts.shape[:-1] 312 | ) 313 | else: 314 | return np.array([]) 315 | 316 | def blocks_to_take(self, start_take, end_take): 317 | """ 318 | Find which blocks to take to cover the range: 319 | :param start_take: starting points in the N dims (tuple) 320 | :param end_take: ending points in the N dims (tuple) 321 | :return: tuple of tuples with the extremes of blocks to take in N dims; 322 | starting index of data in the first block; 323 | ending index of data in the last block. 
324 | """ 325 | # n_dims = len(start_take) 326 | block_slices = [] 327 | take_block_s_idx = [] 328 | take_block_e_idx = [] 329 | for i_dim, (start, end) in enumerate(zip(start_take, end_take)): 330 | axis_index = tuple( 331 | 0 if i != i_dim else slice(None) for i in range(self.n_dims) 332 | ) + (i_dim,) 333 | s = max( 334 | 0, 335 | min( 336 | np.searchsorted(self.block_starts[axis_index], start) - 1, 337 | len(self.block_starts[axis_index]) - 1, 338 | ), 339 | ) 340 | e = np.searchsorted(self.block_starts[axis_index], end) 341 | block_start = start - self.block_starts[axis_index][s] 342 | block_end = end - self.block_starts[axis_index][e - 1] 343 | 344 | block_slices.append((s, e)) 345 | take_block_s_idx.append(block_start) 346 | take_block_e_idx.append(block_end) 347 | return block_slices, take_block_s_idx, take_block_e_idx 348 | 349 | @staticmethod 350 | def block_to_slices(block): 351 | return tuple(slice(lb, rb) for lb, rb in block) 352 | 353 | def centres(self): 354 | return (self.block_ends + self.block_starts) / 2 355 | 356 | def block_containing_coords(self, coords): 357 | """ 358 | Find the linear index of a block containing the given coordinates 359 | 360 | :param coords: a tuple of the coordinates 361 | :return: 362 | """ 363 | dims = [] 364 | for ic, c in enumerate(coords): 365 | # Create a tuple with the starting points on current dimension 366 | # for all the blocks: 367 | starts = self.block_starts[ 368 | tuple(slice(None) if i == ic else 0 for i in range(self.n_dims)) + (ic,) 369 | ] 370 | 371 | # find in which position our guy should be ordered, correcting 372 | # for 0 value: 373 | dims.append(max((np.searchsorted(starts, c)) - 1, 0)) 374 | return dims 375 | 376 | def drop_dim(self, dim_to_drop): 377 | """ 378 | Return a new BlockSplitter object with a dimension dropped, 379 | useful for getting spatial from spatio-temporal blocks. 380 | 381 | :param dim_to_drop: dimension to be dropped (int) 382 | :return: new BlockSplitter object 383 | """ 384 | return Blocks( 385 | _drop_ith(self.shape_full, dim_to_drop), 386 | shape_block=_drop_ith(self.shape_block, dim_to_drop), 387 | padding=_drop_ith(self.padding, dim_to_drop), 388 | crop=_drop_ith(self.crop, dim_to_drop), 389 | ) 390 | 391 | def serialize(self): 392 | """ 393 | Returns a dictionary with a complete description of the 394 | BlockSplitter, e.g. to save its structure as json file. 395 | :return: 396 | """ 397 | # TODO it should be possible to initialize the BlockSplitter from 398 | # this dictionary! 
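        # The serialized dictionary is also the starting point for the
        # stack_metadata.json file written by EmptySplitDataset.finalize(), which
        # overrides "shape_full" (with the cropped shape) and the crop fields and
        # adds "resolution" and "axis_order" before saving; "padding" keeps the
        # per-dimension overlap between blocks.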
399 | return dict( 400 | shape_full=self.shape_full, 401 | shape_block=self.shape_block, 402 | crop_start=tuple(c[0] for c in self.crop), 403 | crop_end=tuple(c[1] for c in self.crop), 404 | padding=self.padding, 405 | ) 406 | -------------------------------------------------------------------------------- /split_dataset/split_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | from itertools import product 4 | from pathlib import Path 5 | 6 | import flammkuchen as fl 7 | import numpy as np 8 | 9 | from split_dataset.blocks import Blocks 10 | 11 | 12 | # TODO this should probably be done as a constructor of the SplitDataset 13 | def save_to_split_dataset( 14 | data, 15 | root_name, 16 | block_size=None, 17 | crop=None, 18 | padding=0, 19 | prefix="", 20 | compression="blosc", 21 | ): 22 | """Function to save block of data into a split_dataset.""" 23 | 24 | new_name = prefix + ("_cropped" if crop is not None else "") 25 | padding = ( 26 | data.padding if padding is not None and isinstance(data, Blocks) else padding 27 | ) 28 | blocks = EmptySplitDataset( 29 | shape_full=data.shape, 30 | shape_block=data.shape_block if block_size is None else block_size, 31 | crop=crop, 32 | padding=padding, 33 | root=root_name, 34 | name=new_name, 35 | ) 36 | for filename, (idxs, slices) in zip(blocks.files, blocks.slices()): 37 | fl.save( 38 | str(blocks.root / filename), 39 | {"stack_{}D".format(len(blocks.shape_cropped)): data[slices]}, 40 | compression=compression, 41 | ) 42 | 43 | return blocks.finalize() 44 | 45 | 46 | class SplitDataset(Blocks): 47 | """ 48 | Manages datasets split over multiple h5 file across arbitrary dimensions. 49 | To do so, uses the BlockSplitter class functions, and define blocks as 50 | files. 51 | 52 | """ 53 | 54 | def __init__(self, root, prefix=None): 55 | """ 56 | :param root: The directory containing the files 57 | :param prefix: The class assumes individual file names to be xxxx.h5. 58 | If there is a prefix to this, for example if the files are stack_xxxx.h5 59 | this has to be passed to the object as a string, in this 60 | particular case it would be prefix="stack_" 61 | """ 62 | 63 | # Load information about stack and splitting. 
Use the json metadata 64 | # file if possible: 65 | self.root = Path(root) 66 | try: 67 | stack_meta_f = next(self.root.glob("*stack_metadata.json")) 68 | 69 | with open(str(stack_meta_f), "r") as f: 70 | block_metadata = json.load(f) 71 | except StopIteration: 72 | last_data_f = sorted(list(self.root.glob("{}*.h5".format(prefix))))[-1] 73 | block_metadata = fl.load(str(last_data_f), "/stack_metadata") 74 | 75 | # Ugly keyword fix to handle transition to new json system: 76 | for new_k, old_k in zip( 77 | ["shape_block", "shape_full"], ["block_size", "full_size"] 78 | ): 79 | block_metadata[new_k] = block_metadata.pop(old_k) 80 | 81 | # By putting this here, we generate the proper stack_metadata 82 | # file when we open old version data (int conversion for some 83 | # weird format problem with flammkuchen dictionary): 84 | clean_metadata = dict() 85 | _save_metadata_json(block_metadata, self.root) 86 | for k in block_metadata.keys(): 87 | if isinstance(block_metadata[k], tuple): 88 | clean_metadata[k] = tuple( 89 | int(n) if n is not None else None for n in block_metadata[k] 90 | ) 91 | else: 92 | clean_metadata[k] = block_metadata[k] 93 | with open(str(), "w") as f: 94 | json.dump(clean_metadata, f) 95 | 96 | # Start the parent BlockSplitter: 97 | super().__init__( 98 | shape_full=block_metadata["shape_full"], 99 | shape_block=block_metadata["shape_block"], 100 | ) 101 | 102 | if prefix is None: 103 | files = sorted(self.root.glob("*[0-9]*.h5")) 104 | else: 105 | files = sorted(self.root.glob("*{}_[0-9]*.h5".format(prefix))) 106 | 107 | self.files = np.array(files).reshape(self.block_starts.shape[:-1]) 108 | 109 | # If available, read resolution 110 | try: 111 | self.resolution = block_metadata["resolution"] 112 | except KeyError: 113 | self.resolution = (1, 1, 1) 114 | # TODO check this 115 | self.shape = self.shape_cropped 116 | 117 | @property 118 | def ndim(self): 119 | return len(self.shape) 120 | 121 | @property 122 | def dtype(self): 123 | px = fl.load( 124 | str(self.files.flatten()[0]), 125 | "/" + self.data_key, 126 | sel=(0,) * len(self.shape), 127 | ) 128 | return px.dtype 129 | 130 | @property 131 | def data_key(self): 132 | """To migrate smoothly to removal of stack_ND key in favour of only stack""" 133 | return [k for k in fl.meta(self.files.flatten()[0]).keys() if "stack" in k][0] 134 | 135 | def __getitem__(self, item): 136 | """ 137 | Implement usage of the H5SplitDataset as normal numpy array. 
138 | :param item: 139 | :return: 140 | """ 141 | # Lot of input munging to emulate indexing in numpy array 142 | if np.any(self.padding) != 0: 143 | raise ValueError( 144 | "Indexing in datasets with overlap (padding) is" 145 | " not supported, merge them first with an" 146 | " appropriate merging function" 147 | ) 148 | 149 | if isinstance(item, int): 150 | item = (slice(item, item + 1),) 151 | 152 | if isinstance(item, slice): 153 | item = (item,) 154 | 155 | if isinstance(item, tuple): 156 | # Take care of the case when only the first few dimensions 157 | # are specified: 158 | if len(item) < len(self.shape): 159 | item = item + (None,) * (len(self.shape) - len(item)) 160 | 161 | # Loop over dimensions creating a list of starting and ending 162 | # points 163 | 164 | starts = [] 165 | ends = [] 166 | singletons = np.zeros(len(item), dtype=bool) 167 | for i_dim, (dim_slc, dim_full) in enumerate(zip(item, self.shape)): 168 | # i_dim: index of current dimension 169 | # dim_slc: slice/index for current dimension 170 | # fd: length of dataset on current dimension 171 | 172 | # If nothing specified, start from 0 and finish at end: 173 | if dim_slc is None: 174 | starts.append(0) 175 | ends.append(dim_full) 176 | 177 | # If a slice is specified: 178 | elif isinstance(dim_slc, slice): 179 | if dim_slc.start is None: 180 | starts.append(0) 181 | else: 182 | if dim_slc.start >= 0: 183 | starts.append(dim_slc.start) 184 | else: 185 | starts.append(max(0, dim_full + dim_slc.start)) 186 | 187 | if dim_slc.stop is None: 188 | ends.append(dim_full) 189 | else: 190 | if dim_slc.stop >= 0: 191 | ends.append(min(dim_slc.stop, dim_full)) 192 | else: 193 | ends.append(max(0, dim_full + dim_slc.stop)) 194 | elif type(dim_slc) in [int, np.int32, np.int64]: 195 | singletons[i_dim] = True 196 | if dim_slc >= 0: 197 | if dim_slc > dim_full - 1: 198 | raise IndexError( 199 | "Indexes {} out of dimensions {}!".format( 200 | item, self.shape 201 | ) 202 | ) 203 | starts.append(dim_slc) 204 | ends.append(dim_slc + 1) 205 | else: 206 | if -dim_slc > dim_full: 207 | raise IndexError( 208 | "Indexes {} out of dimensions {}!".format( 209 | item, self.shape 210 | ) 211 | ) 212 | starts.append(dim_full + dim_slc) 213 | ends.append(dim_full + dim_slc + 1) 214 | else: 215 | raise IndexError("Unsupported indexing") 216 | else: 217 | raise IndexError("Unsupported indexing") 218 | 219 | file_slices, take_block_s_idx, take_block_e_idx = self.blocks_to_take( 220 | starts, ends 221 | ) 222 | output_size = tuple(e - s for s, e in zip(starts, ends)) 223 | 224 | output = None 225 | 226 | # A lot of indexing tricks to achieve multidimensional generality 227 | for f_idx in product(*(range(s, e) for s, e in file_slices)): 228 | abs_idx = [ri - s for ri, (s, e) in zip(f_idx, file_slices)] 229 | sel_slices = tuple( 230 | slice(0 if ci != s else si, None if ci < e - 1 else ei) 231 | for ci, (s, e), si, ei in zip( 232 | f_idx, file_slices, take_block_s_idx, take_block_e_idx 233 | ) 234 | ) 235 | arr = fl.load( 236 | str(self.files[f_idx]), 237 | "/" + self.data_key, 238 | sel=fl.aslice[sel_slices], 239 | ) 240 | 241 | if output is None: 242 | output = np.empty(output_size, arr.dtype) 243 | 244 | output_sel_tuple = tuple( 245 | slice( 246 | 0 if st_idx == 0 else bs - first_idx + (st_idx - 1) * (bs), 247 | (0 if st_idx == 0 else bs - first_idx + (st_idx - 1) * (bs)) + sz, 248 | ) 249 | for st_idx, bs, first_idx, sz in zip( 250 | abs_idx, self.shape_block, take_block_s_idx, arr.shape 251 | ) 252 | ) 253 | output[output_sel_tuple] = arr 254 
| 255 | if output is None: 256 | raise IndexError( 257 | "Trying to index the split dataset outside of bounds, between " 258 | + str(starts) 259 | + " and " 260 | + str(ends) 261 | ) 262 | 263 | output_sel = tuple(0 if singleton else slice(None) for singleton in singletons) 264 | 265 | return output[output_sel] 266 | 267 | def apply_crop(self, crop): 268 | """Take out the data with a crop""" 269 | # TODO there is the crop atrribute, which is a lazy crop, this should actually 270 | # return a non-cropped dataset 271 | ds_cropped = EmptySplitDataset( 272 | shape_full=self.shape, 273 | shape_block=self.shape_block, 274 | padding=self.padding, 275 | crop=crop, 276 | root=self.root.parent, 277 | name=self.root.name + "_cropped", 278 | ) 279 | # the slices iterator does not return just the slices, but also the indicesS 280 | for (i_slice, block_slices), file_name in zip( 281 | ds_cropped.slices(), ds_cropped.files 282 | ): 283 | fl.save( 284 | str(self.root / file_name), 285 | {"stack": self[block_slices]}, 286 | ) 287 | 288 | ds_cropped.finalize() 289 | 290 | 291 | class EmptySplitDataset(Blocks): 292 | """Class to initialize an empty dataset for which we have to save metadata 293 | after filling its blocks. 294 | """ 295 | 296 | def __init__(self, root, name, *args, resolution=None, **kwargs): 297 | """ 298 | :param root: folder where the stack will be saved; 299 | :param name: name of the dataset, for the folder name; 300 | :param resolution: resolution of the stack, in microns; 301 | """ 302 | super().__init__(*args, **kwargs) 303 | self.root = Path(root) / name 304 | if not self.root.is_dir(): 305 | self.root.mkdir(parents=True) 306 | else: 307 | warnings.warn("Existing directory") 308 | 309 | self.files = ["{:04d}.h5".format(i) for i in range(self.n_blocks)] 310 | self.resolution = resolution 311 | 312 | def save_block_data(self, n, data, verbose=False): 313 | """Optional method to save data in a block. Often we don't use it, 314 | as we directly save data in the parallelized function. Might be good to 315 | find ways of centralizing saving here? 
316 | :param n: n of the block we are saving in; 317 | :param data: data to be pured in the block; 318 | :param verbose: 319 | :return: 320 | """ 321 | fname = "{:04d}.h5".format(n) 322 | if verbose: 323 | print("Saving ", str(self.root / fname)) 324 | 325 | if data.shape != self.shape_block: 326 | print(" - data has different dimension from block!") 327 | 328 | to_save = {"stack": data} 329 | 330 | fl.save(str(self.root / fname), to_save, compression="blosc") 331 | 332 | def finalize(self): 333 | n_dims = len(self.shape_block) 334 | block_dict = self.serialize() 335 | block_dict["shape_full"] = self.shape_cropped 336 | block_dict["crop_start"] = (0,) * n_dims 337 | block_dict["crop_end"] = (0,) * n_dims 338 | block_dict["resolution"] = ( 339 | self.resolution if self.resolution is not None else (1,) * n_dims 340 | ) 341 | 342 | block_dict["axis_order"] = "tzyx" if n_dims == 4 else "zyx" 343 | 344 | _save_metadata_json(block_dict, self.root) 345 | return SplitDataset(self.root) 346 | 347 | 348 | def _save_metadata_json(dictionary, root): 349 | """Save json file preventing type failures for stack shapes 350 | :param path: path for saving 351 | :param dictionary: dictionary to be saved 352 | :return: 353 | """ 354 | METADATA_FILENAME = "stack_metadata.json" 355 | for k in dictionary.keys(): 356 | if type(dictionary[k]) is tuple: 357 | # funny fix for variable type mysterious error: 358 | if type(dictionary[k][0]) == np.int64 or type(dictionary[k][0]) == int: 359 | dictionary[k] = tuple([int(i) for i in dictionary[k]]) 360 | 361 | json.dump(dictionary, open(root / METADATA_FILENAME, "w")) 362 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for split_dataset.""" 2 | -------------------------------------------------------------------------------- /tests/test_blocks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from split_dataset import Blocks 4 | 5 | 6 | def test_cartesian_blocks(): 7 | test_size = (20, 20) 8 | a = np.ones(test_size) 9 | blocks = Blocks(test_size, shape_block=(3, 7), padding=(1, 2)) 10 | for idx, block in blocks.slices(): 11 | a[block] = 0 12 | np.testing.assert_array_equal(a, np.zeros(test_size)) 13 | 14 | 15 | def test_dropped_dimension(): 16 | test_size = (5, 15, 20) 17 | blocks = Blocks( 18 | test_size, shape_block=(3, 7), padding=(1, 2), crop=((1, 1), (0, 0), (0, 0)) 19 | ) 20 | np.testing.assert_equal(blocks.drop_dim(1).shape_full, (5, 20)) 21 | -------------------------------------------------------------------------------- /tests/test_split_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Tests for `split_dataset` package.""" 4 | 5 | import shutil 6 | import tempfile 7 | import unittest 8 | 9 | import numpy as np 10 | 11 | from split_dataset import save_to_split_dataset 12 | 13 | 14 | class TestSplitDataset(unittest.TestCase): 15 | def setUp(self): 16 | self.test_dir = tempfile.mkdtemp() 17 | 18 | def tearDown(self): 19 | shutil.rmtree(self.test_dir) 20 | 21 | def test_SplitDataset(self): 22 | dims = [(10, 3, 3, 3), (5, 5, 5), (5, 5), (1, 5, 5, 5)] 23 | block_sizes = [(2, None, None, None), (1, None, 3), (2, None), (None, 2, 5, 5)] 24 | all_slices = [ 25 | [(slice(3, 8), slice(None))], 26 | [(slice(0, 1),), (slice(0, 2), slice(0, 1), slice(None))], 27 | [slice(0, 
2)], 28 | [ 29 | (slice(0, 1),), 30 | (slice(0, 2), slice(0, 1), slice(None)), 31 | (0, slice(0, 2), slice(0, 1)), 32 | ], 33 | ] 34 | 35 | for i, (di, bs, slices) in enumerate(zip(dims, block_sizes, all_slices)): 36 | test_data = np.arange(np.product(di)).reshape(di) 37 | 38 | sd = save_to_split_dataset( 39 | test_data, 40 | block_size=bs, 41 | root_name=self.test_dir, 42 | prefix="te{:02d}".format(i), 43 | ) 44 | for sl in slices: 45 | a = sd[sl] 46 | b = test_data[sl] 47 | np.testing.assert_equal( 48 | a, 49 | b, 50 | err_msg="Testing " 51 | + str(di) 52 | + " " 53 | + str(sl) 54 | + " of shape " 55 | + str(a.shape) 56 | + " and shape" 57 | + str(b.shape), 58 | ) 59 | --------------------------------------------------------------------------------
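
As a rough usage sketch based on the `EmptySplitDataset` API defined in `split_dataset/split_dataset.py` above (the shapes, block size and temporary folder are arbitrary example values, not part of the repository): a split dataset can be written one chunk at a time and then finalized into a readable `SplitDataset`.

```python
import shutil
import tempfile

import numpy as np

from split_dataset import EmptySplitDataset, SplitDataset

tmp_dir = tempfile.mkdtemp()

# Arbitrary example data: a small (t, z, y, x) stack.
full_shape = (10, 4, 6, 6)
data = np.arange(np.prod(full_shape)).reshape(full_shape)

# Declare the layout of the dataset first; here each block spans 2 time points.
new_ds = EmptySplitDataset(
    root=tmp_dir,
    name="example_ds",
    shape_full=full_shape,
    shape_block=(2, 4, 6, 6),
)

# Fill one block (= one .h5 file) at a time, e.g. as data arrives from an acquisition.
for i_block, (_, block_slices) in enumerate(new_ds.slices()):
    new_ds.save_block_data(i_block, data[block_slices])

# Write the stack_metadata.json file and reopen the folder as a SplitDataset.
ds = new_ds.finalize()
assert isinstance(ds, SplitDataset)
np.testing.assert_array_equal(ds[3:7, :, :, :], data[3:7, :, :, :])

shutil.rmtree(tmp_dir)
```

In a real time-lapse acquisition, the loop body would receive each chunk from the instrument instead of slicing a pre-existing array; this is essentially the streaming-save pattern that `sashimi` uses.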