├── .editorconfig
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── main.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── pyproject.toml
├── requirements_dev.txt
├── setup.cfg
├── setup.py
├── split_dataset
│   ├── __init__.py
│   ├── blocks.py
│   └── split_dataset.py
└── tests
    ├── __init__.py
    ├── test_blocks.py
    └── test_split_dataset.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 |
17 | [LICENSE]
18 | insert_final_newline = false
19 |
20 | [Makefile]
21 | indent_style = tab
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | * Split Dataset version:
2 | * Python version:
3 | * Operating System:
4 |
5 | ### Description
6 |
7 | Describe what you were trying to get done.
8 | Tell us what happened, what went wrong, and what you expected to happen.
9 |
10 | ### What I Did
11 |
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on: push
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v2
10 | - name: Set up Python
11 | uses: actions/setup-python@v2
12 | with:
13 | python-version: 3.8
14 | - name: Install dependencies
15 | run: |
16 | python -m pip install --upgrade pip
17 | pip install .[dev]
18 | - name: Lint
19 | run: |
20 | black . --check
21 | flake8 .
22 | isort . --check
23 | test:
24 | needs: lint
25 | runs-on: ubuntu-latest
26 | steps:
27 | - uses: actions/checkout@v2
28 | - name: Set up Python
29 | uses: actions/setup-python@v2
30 | with:
31 | python-version: 3.8
32 | - name: Install dependencies
33 | run: |
34 | python -m pip install --upgrade pip
35 | pip install .[dev]
36 | - name: Test
37 | run: pytest --cov
38 | - name: Coveralls
39 | env:
40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
41 | run: |
42 | pip install coveralls
43 | coveralls --service=github
44 |
45 | deploy:
46 | needs: test
47 | runs-on: ubuntu-latest
48 | if: startsWith(github.ref, 'refs/tags/v')
49 | steps:
50 | - uses: actions/checkout@v2
51 | - name: Set up Python
52 | uses: actions/setup-python@v1
53 | with:
54 | python-version: "3.x"
55 | - name: Install dependencies
56 | run: |
57 | python -m pip install --upgrade pip
58 | pip install -U setuptools setuptools_scm wheel twine
59 | - name: Build and publish
60 | env:
61 | TWINE_USERNAME: __token__
62 | TWINE_PASSWORD: ${{ secrets.TWINE_API_KEY }}
63 | run: |
64 | git tag
65 | python setup.py sdist bdist_wheel
66 | twine upload dist/*
67 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # dotenv
84 | .env
85 |
86 | # virtualenv
87 | .venv
88 | venv/
89 | ENV/
90 |
91 | # Spyder project settings
92 | .spyderproject
93 | .spyproject
94 |
95 | # Rope project settings
96 | .ropeproject
97 |
98 | # mkdocs documentation
99 | /site
100 |
101 | # mypy
102 | .mypy_cache/
103 |
104 | # IDE settings
105 | .vscode/
106 |
107 | # Idea
108 | .idea/
109 | \.idea/
110 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | A package for HDF5-based chunked arrays
5 | Copyright (C) 2020 Vilim Stih & Luigi Petrucco @portugueslab
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 |
20 | Also add information on how to contact you by electronic and paper mail.
21 |
22 | You should also get your employer (if you work as a programmer) or school,
23 | if any, to sign a "copyright disclaimer" for the program, if necessary.
24 | For more information on this, and how to apply and follow the GNU GPL, see
25 | <https://www.gnu.org/licenses/>.
26 |
27 | The GNU General Public License does not permit incorporating your program
28 | into proprietary programs. If your program is a subroutine library, you
29 | may consider it more useful to permit linking proprietary applications with
30 | the library. If this is what you want to do, use the GNU Lesser General
31 | Public License instead of this License. But first, please read
32 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
33 |
34 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 |
4 | recursive-include tests *
5 | recursive-exclude * __pycache__
6 | recursive-exclude * *.py[co]
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://pypi.org/project/split_dataset)
3 | [](
4 | https://pypi.python.org/pypi/split_dataset)
5 | [](
6 | https://github.com/portugueslab/split_dataset/actions)
7 | [](https://coveralls.io/github/portugueslab/split_dataset?branch=master)
8 | [](https://github.com/python/black)
9 | [](https://www.gnu.org/licenses/gpl-3.0)
10 |
11 |
12 |
13 | A minimal package for saving and reading large HDF5-based chunked arrays.
14 |
15 | This package has been developed in the [`Portugues lab`](http://www.portugueslab.com) for volumetric calcium imaging data. `split_dataset` is extensively used in the calcium imaging analysis package [`fimpy`](https://github.com/portugueslab/fimpy). The microscope control libraries [`sashimi`](https://github.com/portugueslab/sashimi) and [`brunoise`](https://github.com/portugueslab/brunoise) save files as split datasets.
16 |
17 | [`napari-split-dataset`](https://github.com/portugueslab/napari-split-dataset) supports the visualization of `SplitDataset`s in `napari`.
18 |
19 | ## Why use split datasets?
20 | Split datasets are numpy-like arrays saved over multiple h5 files. The concept of split datasets is similar to that of, e.g., [zarr arrays](https://zarr.readthedocs.io/en/stable/); however, relying on h5 files allows for partial reading even within the same file, which is crucial for visualizing volumetric time series, the main application `split_dataset` has been developed for (see [this discussion](https://github.com/zarr-developers/zarr-python/issues/521) on the limitations of zarr arrays).
21 |
22 | # Structure of a split dataset
23 | A split dataset is a folder containing multiple, numbered h5 files (one file per chunk) and a json metadata file with information on the shape of the full dataset and of its chunks.
24 | The h5 files are saved using the [flammkuchen](https://github.com/portugueslab/flammkuchen) library (a fork of [deepdish](https://deepdish.readthedocs.io/en/latest/)). Each file contains a dictionary with the data under the `stack` keyword.
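
For instance, a single chunk can be inspected directly with `flammkuchen` (a sketch; the folder and file names are placeholders following the default `0000.h5`, `0001.h5`, ... numbering):

```python
import flammkuchen as fl

# Load one chunk of a split dataset directly (hypothetical path):
chunk = fl.load("my_dataset/0000.h5")

# Each file stores a dictionary; the array lives under the "stack" key
# (older datasets may use a "stack_ND" key instead, e.g. "stack_4D"):
block_data = chunk["stack"]
```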
25 |
26 | `SplitDataset` objects can then be instantiated from the dataset path, and numpy-style indexing can be used to load data as numpy arrays. Any number of dimensions and block sizes is supported in principle; the package has been used mainly with 3D and 4D arrays.
27 |
28 |
29 |
30 | ## Minimal example
31 | ```python
32 | # Load a SplitDataset via a SplitDataset object:
33 | from split_dataset import SplitDataset
34 | ds = SplitDataset(path_to_dataset)
35 |
36 | # Retrieve data in an interval:
37 | data_array = ds[n_start:n_end, :, :, :]
38 | ```
39 |
40 | ## Creating split datasets
41 | New split datasets can be created with the `split_dataset.save_to_split_dataset` function, provided that the original data is fully loaded in memory. Alternatively, e.g. for time acquisitions, a split dataset can be saved one chunk at a time: it is enough to save correctly formatted .h5 files with `flammkuchen`, together with the corresponding json metadata file describing the full split dataset shape (this is [what happens in sashimi](https://github.com/portugueslab/sashimi/blob/01046f2f24483ab702be379843a1782ababa7d2d/sashimi/processes/streaming_save.py#L186)).
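
A minimal sketch of the first approach (the array shape, block size and destination path are arbitrary placeholders):

```python
import numpy as np

from split_dataset import save_to_split_dataset

# A (t, z, y, x) stack fully loaded in memory:
data = np.zeros((100, 10, 64, 64), dtype=np.float32)

# Save it as a split dataset, chunked over time (10 timepoints per file):
ds = save_to_split_dataset(
    data,
    root_name="/path/to/destination",
    block_size=(10, None, None, None),
)

# The returned SplitDataset can be indexed right away:
first_volume = ds[0, :, :, :]
```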
42 |
43 |
44 | # TODO
45 | * provide utilities for partial saving of split datasets
46 | * support for more advanced indexing (support for step and vector indexing)
47 | * support for cropping a `SplitDataset`
48 | * support for resolution and frequency metadata
49 |
50 |
51 | # History
52 |
53 | ### 0.4.0 (2021-03-23)
54 | * Added support to use a `SplitDataset` as data in a `napari` layer.
55 |
56 | ...
57 |
58 | ### 0.1.0 (2020-05-06)
59 | * First release on PyPI.
60 |
61 |
62 | Credits
63 | -------
64 |
65 | Part of this package was inspired by [Cookiecutter](https://github.com/audreyr/cookiecutter) and [this](https://github.com/audreyr/cookiecutter-pypackage) template.
66 |
70 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | target-version = ['py36', 'py37', 'py38']
3 | skip-string-normalization = false
4 | exclude = '''
5 | (
6 | /(
7 | \.eggs
8 | | \.git
9 | | \.hg
10 | | \.mypy_cache
11 | | \.tox
12 | | \.venv
13 | | _build
14 | | buck-out
15 | | build
16 | | dist
17 | | examples
18 | )/
19 | )
20 | '''
21 |
22 | [tool.isort]
23 | multi_line_output = 3
24 | include_trailing_comma = true
25 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | pip
2 | bump2version
3 | wheel
4 | flake8
5 | coverage
6 | Sphinx
7 | twine
8 | black
9 | isort
10 |
11 | pytest
12 | pytest-runner
13 | pytest-cov
14 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.4.3
3 | commit = True
4 | tag = True
5 |
6 | [bumpversion:file:setup.py]
7 | search = version="{current_version}"
8 | replace = version="{new_version}"
9 |
10 | [bumpversion:file:split_dataset/__init__.py]
11 | search = __version__ = "{current_version}"
12 | replace = __version__ = "{new_version}"
13 |
14 | [bdist_wheel]
15 | universal = 1
16 |
17 | [flake8]
18 | ignore = E203, W503
19 | max-line-length = 88
20 | exclude = __init__.py
21 |
22 | [aliases]
23 | test = pytest
24 |
25 | [tool:pytest]
26 | collect_ignore = ['setup.py']
27 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """The setup script."""
4 |
5 | from setuptools import find_packages, setup
6 |
7 | with open("README.md") as readme_file:
8 | readme = readme_file.read()
9 |
10 |
11 | requirements = ["flammkuchen", "numpy"]
12 |
13 | with open("requirements_dev.txt") as f:
14 | requirements_dev = f.read().splitlines()
15 |
16 | setup(
17 | author="Vilim Stih & Luigi Petrucco @portugueslab",
18 | author_email="luigi.petrucco@gmail.com",
19 | python_requires=">=3.5",
20 | classifiers=[
21 | "Development Status :: 2 - Pre-Alpha",
22 | "Intended Audience :: Developers",
23 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
24 | "Natural Language :: English",
25 | "Programming Language :: Python :: 3",
26 | "Programming Language :: Python :: 3.5",
27 | "Programming Language :: Python :: 3.6",
28 | "Programming Language :: Python :: 3.7",
29 | "Programming Language :: Python :: 3.8",
30 | ],
31 | description="A package for HDF5-based chunked arrays",
32 | install_requires=requirements,
33 | extras_require=dict(dev=requirements_dev),
34 | license="GNU General Public License v3",
35 | long_description=readme,
36 | long_description_content_type="text/markdown",
37 | include_package_data=True,
38 | keywords="split_dataset",
39 | name="split_dataset",
40 | packages=find_packages(include=["split_dataset", "split_dataset.*"]),
41 | test_suite="tests",
42 | url="https://github.com/portugueslab/split_dataset",
43 | version="0.4.3",
44 | zip_safe=False,
45 | )
46 |
--------------------------------------------------------------------------------
/split_dataset/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for Split Dataset."""
2 |
3 | __author__ = """Vilim Stih & Luigi Petrucco @portugueslab"""
4 | __version__ = "0.4.3"
5 |
6 | from split_dataset.blocks import Blocks
7 | from split_dataset.split_dataset import (
8 | EmptySplitDataset,
9 | SplitDataset,
10 | save_to_split_dataset,
11 | )
12 |
--------------------------------------------------------------------------------
/split_dataset/blocks.py:
--------------------------------------------------------------------------------
1 | from itertools import product
2 | from typing import Optional, Tuple, Union
3 |
4 | import numpy as np
5 |
6 |
7 | def _drop_ith(xs, dim_to_drop):
8 | return tuple(x for i, x in enumerate(xs) if i != dim_to_drop)
9 |
10 |
11 | class BlockIterator:
12 | def __init__(self, blocks, slices=True):
13 | self.blocks = blocks
14 | self.current_block = 0
15 | self.slices = slices
16 |
17 | def __iter__(self):
18 | return self
19 |
20 | def __next__(self):
21 | if self.current_block == self.blocks.n_blocks:
22 | raise StopIteration
23 | else:
24 | idx = self.blocks.linear_to_cartesian(self.current_block)
25 | self.current_block += 1
26 | if self.slices:
27 | return (
28 | idx,
29 | tuple(
30 | slice(s, e)
31 | for s, e in zip(
32 | self.blocks.block_starts[idx], self.blocks.block_ends[idx]
33 | )
34 | ),
35 | )
36 | else:
37 | return (
38 | idx,
39 | tuple(
40 | (s, e)
41 | for s, e in zip(
42 | self.blocks.block_starts[idx], self.blocks.block_ends[idx]
43 | )
44 | ),
45 | )
46 |
47 |
48 | def _make_iterable(input_var, n_rep=1):
49 | try:
50 | iter(input_var)
51 | return input_var
52 | except TypeError:
53 | return (input_var,) * n_rep
54 |
55 |
56 | class Blocks:
57 | """
58 | Blocks have two indexing systems:
59 | - linear: gives the position of the block as a single integer index (from 0 to n_blocks - 1).
60 | - cartesian: gives the position of the block in the general block tiling.
61 | """
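# A usage sketch (the shapes and the `full_stack` array are hypothetical):
#
#     blocks = Blocks((100, 20, 40, 10), shape_block=(10, 20, 40, 10))
#     for cartesian_idx, slices in blocks.slices():
#         chunk = full_stack[slices]  # numpy-style slicing of each block
#     blocks.linear_to_cartesian(0)  # -> (0, 0, 0, 0), cartesian index of the first block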
62 |
63 | def __init__(
64 | self,
65 | shape_full: Tuple,
66 | shape_block: Optional[Tuple] = None,
67 | dim_split: Optional[int] = None,
68 | blocks_number: Optional[int] = None,
69 | padding: Union[int, Tuple] = 0,
70 | crop: Optional[Tuple] = None,
71 | ):
72 | """Make a block structure. It can be defined using the block size or the number
73 | of blocks (if both are specified, the number of blocks overrides the block size).
74 | For example, a split over the 2nd and 3rd dimensions of a 100x20x40x10 stack
75 | can be equivalently defined as:
76 | Blocks((100,20,40,10), shape_block=(100,10,20,10))
77 | Blocks((100,20,40,10), blocks_number=(1, 2, 2, 1))
78 | Blocks((100,20,40,10), dim_split=(1,2), shape_block=(10,20))
79 | Blocks((100,20,40,10), dim_split=(1,2), blocks_number=(2,2))
80 |
81 | :param shape_full: dimensions of the whole stack
82 | :param dim_split: dimension(s) along which to split (if undefined, inferred
83 | from the dimensions for which shape_block is specified)
84 | :param shape_block: size of blocks along each dimension
85 | :param blocks_number: number of blocks along each dimension
86 | :param padding: amount of overlap between blocks
87 | :param crop: iterable of tuples giving the amount of cropping in
88 | each dimension
89 | """
90 | self._shape_full = shape_full
91 |
92 | if crop is None:
93 | crop = ((0, 0),) * len(shape_full) if shape_full is not None else None
94 |
95 | self._crop = crop
96 | self.shape_cropped = shape_full
97 |
98 | self.starts = None
99 | self.block_starts = None
100 | self.block_ends = None
101 |
102 | self.update_stack_dims()
103 |
104 | # Define shape block and padding allowing multiple input types.
105 |
106 | # Initialize block size as full stack size and 0 padding:
107 | self._shape_block = list(self.shape_cropped)
108 | self._padding = [0 for _ in range(len(self.shape_cropped))]
109 |
110 | if not dim_split:
111 | dim_split = [j for j, d in enumerate(shape_block) if d is not None]
112 |
113 | # Make tuple if single numbers
114 | self.dim_split = _make_iterable(dim_split)
115 | shape_block = _make_iterable(shape_block, max(self.dim_split) + 1)
116 | pad_amount = _make_iterable(padding, max(self.dim_split) + 1)
117 |
118 | if blocks_number: # define from required number of blocks
119 | shape_block = []
120 | blocks_number = _make_iterable(blocks_number, len(self.dim_split))
121 | for dim, n in zip(self.dim_split, blocks_number):
122 | shape_block.append(int(np.ceil(self.shape_cropped[dim] / n)))
123 |
124 | for dim in self.dim_split:
125 | self._shape_block[dim] = min(shape_block[dim], self.shape_cropped[dim])
126 | self._padding[dim] = pad_amount[dim]
127 |
128 | # set property:
129 | self.shape_block = tuple(self._shape_block)
130 |
131 | @property
132 | def n_blocks(self):
133 | return np.prod(self.block_starts.shape[:-1])
134 |
135 | @property
136 | def n_dims(self):
137 | return len(self.shape_cropped)
138 |
139 | @property
140 | def shape_full(self):
141 | return self._shape_full
142 |
143 | @shape_full.setter
144 | def shape_full(self, value):
145 | self._shape_full = value
146 | self.update_stack_dims()
147 | self.update_block_structure()
148 |
149 | @property
150 | def crop(self):
151 | return self._crop
152 |
153 | @crop.setter
154 | def crop(self, value):
155 | if value is None:
156 | value = ((0, 0),) * len(self.shape_full)
157 | self._crop = value
158 | self.update_stack_dims()
159 | self.update_block_structure()
160 |
161 | @property
162 | def shape_block(self):
163 | return self._shape_block
164 |
165 | @shape_block.setter
166 | def shape_block(self, value):
167 | self._shape_block = value
168 | self.update_block_structure()
169 |
170 | @property
171 | def padding(self):
172 | return self._padding
173 |
174 | @padding.setter
175 | def padding(self, value):
176 | self._padding = value
177 | self.update_block_structure()
178 |
179 | def update_stack_dims(self):
180 | """Update stack dimensions and cropping, if shape_full or cropping
181 | is changed.
182 | :return:
183 | """
184 |
185 | if self.shape_full is not None:
186 | self.shape_cropped = tuple(
187 | d - cl - ch for d, (cl, ch) in zip(self.shape_full, self.crop)
188 | )
189 | self.starts = tuple(cl for cl, ch in self.crop)
190 |
191 | def update_block_structure(self):
192 | """
193 | Update the Blocks structure, e.g. when block
194 | shape or padding are changed.
195 | """
196 | # Cartesian product for generating a list of indexes on every split
197 | # dimension (i.e., dimensions where int(np.ceil(stack_size / block_size)
198 | # is != 1).
199 | # For example, splitting one time in 2nd and 3rd dims,
200 | # idx_blocks = (0, 0, 0, 0), (0, 0, 1, 0), (0, 1, 0, 0), (0, 1, 1, 0).
201 |
202 | # block_starts and block_ends will be arrays of shape
203 | # (n_blocks_dim0, n_blocks_dim1, n_blocks_dim2 ..., shape_full)
204 | # by addressing the N-1 dimensions with the index of the block we
205 | # will get a vector with the starting position of the block on all
206 | # original dimensions of the full stack.
207 | if self.shape_block is not None:
208 | self.block_starts = np.empty(
209 | tuple(
210 | int(np.ceil((stack_size - pad_size) / block_size))
211 | for stack_size, block_size, pad_size in zip(
212 | self.shape_cropped, self.shape_block, self.padding
213 | )
214 | )
215 | + (len(self.shape_cropped),),
216 | dtype=np.int32,
217 | )
218 | self.block_ends = np.empty_like(self.block_starts)
219 | for idx_blocks in product(
220 | *(range(s) for s in self.block_starts.shape[:-1])
221 | ):
222 | self.block_starts[idx_blocks + (slice(None),)] = [
223 | st + i_bd * bs
224 | for i_bd, bs, st in zip(idx_blocks, self.shape_block, self.starts)
225 | ]
226 | self.block_ends[idx_blocks + (slice(None),)] = [
227 | min(maxdim + st, (i_bd + 1) * bs + pd + st)
228 | for i_bd, bs, pd, maxdim, st in zip(
229 | idx_blocks,
230 | self.shape_block,
231 | self.padding,
232 | self.shape_cropped,
233 | self.starts,
234 | )
235 | ]
236 |
237 | def slices(self, as_tuples=False):
238 | return BlockIterator(self, slices=not as_tuples)
239 |
240 | def linear_to_cartesian(self, lin_idx):
241 | """
242 | Convert block linear index into cartesian index.
243 | Example: in a 3D stack split into 2x2x3 blocks:
244 |
245 | self.linear_to_cartesian(0) = (0,0,0) # first block
246 | self.linear_to_cartesian(11) = (1,1,2) # last block
247 | :param lin_idx: block linear index (int)
248 | :return: block cartesian index (tuple of ints)
249 | """
250 | return np.unravel_index(lin_idx, self.block_starts.shape[:-1])
251 |
252 | def cartesian_to_linear(self, ca_idx):
253 | """
254 | Convert block cartesian index into linear index.
255 | Example: in a 3D stack split into 2x2x3 blocks:
256 |
257 | self.cartesian_to_linear((0,0,0)) = 0 # first block
258 | self.cartesian_to_linear((1,1,2)) = 11 # last block
259 |
260 | :param ca_idx: block cartesian index (tuple of ints)
261 | :return: block linear index (int)
262 | """
263 | return np.ravel_multi_index(ca_idx, self.block_starts.shape[:-1])
264 |
265 | def __getitem__(self, item):
266 | """
267 | :param item:
268 | :return:
269 | """
270 | # TODO make less brittle, support also indexing by tuples
271 |
272 | # TODO decide what should be returned: slices are tricky
273 | # with multiprocessing
274 | if isinstance(item, int):
275 | idx = self.linear_to_cartesian(item)
276 | return tuple(
277 | slice(s, e)
278 | for s, e in zip(self.block_starts[idx], self.block_ends[idx])
279 | )
280 |
281 | def neighbour_blocks(self, i_block, dims=None):
282 | """
283 | Return neighbouring blocks across given dimensions
284 | :param i_block:
285 | :param dims:
286 | :return:
287 | """
288 | block_idx = self.linear_to_cartesian(i_block)
289 | act_dims = np.ones(self.n_dims, dtype=bool)
290 | if dims is not None:
291 | act_dims = np.isin(np.arange(self.n_dims), dims)  # only vary the requested dimensions
292 |
293 | neighbors = []
294 | for idx_neighbour in product(
295 | *[
296 | (
297 | range(
298 | max(block_idx[i_dim] - 1, 0),
299 | min(block_idx[i_dim] + 2, self.block_starts.shape[i_dim]),
300 | )
301 | if act_dims[i_dim]
302 | else [block_idx[i_dim]]
303 | )
304 | for i_dim in range(self.n_dims)
305 | ]
306 | ):
307 | if idx_neighbour != block_idx:
308 | neighbors.append(idx_neighbour)
309 | if neighbors:
310 | return np.ravel_multi_index(
311 | np.stack(neighbors, 1), self.block_starts.shape[:-1]
312 | )
313 | else:
314 | return np.array([])
315 |
316 | def blocks_to_take(self, start_take, end_take):
317 | """
318 | Find which blocks to take to cover the range:
319 | :param start_take: starting points in the N dims (tuple)
320 | :param end_take: ending points in the N dims (tuple)
321 | :return: tuple of tuples with the extremes of blocks to take in N dims;
322 | starting index of data in the first block;
323 | ending index of data in the last block.
324 | """
325 | # n_dims = len(start_take)
326 | block_slices = []
327 | take_block_s_idx = []
328 | take_block_e_idx = []
329 | for i_dim, (start, end) in enumerate(zip(start_take, end_take)):
330 | axis_index = tuple(
331 | 0 if i != i_dim else slice(None) for i in range(self.n_dims)
332 | ) + (i_dim,)
333 | s = max(
334 | 0,
335 | min(
336 | np.searchsorted(self.block_starts[axis_index], start) - 1,
337 | len(self.block_starts[axis_index]) - 1,
338 | ),
339 | )
340 | e = np.searchsorted(self.block_starts[axis_index], end)
341 | block_start = start - self.block_starts[axis_index][s]
342 | block_end = end - self.block_starts[axis_index][e - 1]
343 |
344 | block_slices.append((s, e))
345 | take_block_s_idx.append(block_start)
346 | take_block_e_idx.append(block_end)
347 | return block_slices, take_block_s_idx, take_block_e_idx
348 |
349 | @staticmethod
350 | def block_to_slices(block):
351 | return tuple(slice(lb, rb) for lb, rb in block)
352 |
353 | def centres(self):
354 | return (self.block_ends + self.block_starts) / 2
355 |
356 | def block_containing_coords(self, coords):
357 | """
358 | Find the linear index of a block containing the given coordinates
359 |
360 | :param coords: a tuple of the coordinates
361 | :return:
362 | """
363 | dims = []
364 | for ic, c in enumerate(coords):
365 | # Create a tuple with the starting points on current dimension
366 | # for all the blocks:
367 | starts = self.block_starts[
368 | tuple(slice(None) if i == ic else 0 for i in range(self.n_dims)) + (ic,)
369 | ]
370 |
371 | # find in which position our guy should be ordered, correcting
372 | # for 0 value:
373 | dims.append(max((np.searchsorted(starts, c)) - 1, 0))
374 | return dims
375 |
376 | def drop_dim(self, dim_to_drop):
377 | """
378 | Return a new Blocks object with a dimension dropped,
379 | useful for getting spatial from spatio-temporal blocks.
380 |
381 | :param dim_to_drop: dimension to be dropped (int)
382 | :return: new Blocks object
383 | """
384 | return Blocks(
385 | _drop_ith(self.shape_full, dim_to_drop),
386 | shape_block=_drop_ith(self.shape_block, dim_to_drop),
387 | padding=_drop_ith(self.padding, dim_to_drop),
388 | crop=_drop_ith(self.crop, dim_to_drop),
389 | )
390 |
391 | def serialize(self):
392 | """
393 | Returns a dictionary with a complete description of the
394 | Blocks object, e.g. to save its structure as a json file.
395 | :return:
396 | """
397 | # TODO it should be possible to initialize the BlockSplitter from
398 | # this dictionary!
399 | return dict(
400 | shape_full=self.shape_full,
401 | shape_block=self.shape_block,
402 | crop_start=tuple(c[0] for c in self.crop),
403 | crop_end=tuple(c[1] for c in self.crop),
404 | padding=self.padding,
405 | )
406 |
--------------------------------------------------------------------------------
/split_dataset/split_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import warnings
3 | from itertools import product
4 | from pathlib import Path
5 |
6 | import flammkuchen as fl
7 | import numpy as np
8 |
9 | from split_dataset.blocks import Blocks
10 |
11 |
12 | # TODO this should probably be done as a constructor of the SplitDataset
13 | def save_to_split_dataset(
14 | data,
15 | root_name,
16 | block_size=None,
17 | crop=None,
18 | padding=0,
19 | prefix="",
20 | compression="blosc",
21 | ):
22 | """Save an array (or an existing SplitDataset) into a new split dataset on disk."""
23 |
24 | new_name = prefix + ("_cropped" if crop is not None else "")
25 | padding = (
26 | data.padding if padding is not None and isinstance(data, Blocks) else padding
27 | )
28 | blocks = EmptySplitDataset(
29 | shape_full=data.shape,
30 | shape_block=data.shape_block if block_size is None else block_size,
31 | crop=crop,
32 | padding=padding,
33 | root=root_name,
34 | name=new_name,
35 | )
36 | for filename, (idxs, slices) in zip(blocks.files, blocks.slices()):
37 | fl.save(
38 | str(blocks.root / filename),
39 | {"stack_{}D".format(len(blocks.shape_cropped)): data[slices]},
40 | compression=compression,
41 | )
42 |
43 | return blocks.finalize()
44 |
45 |
46 | class SplitDataset(Blocks):
47 | """
48 | Manages datasets split over multiple h5 files across arbitrary dimensions.
49 | To do so, it uses the Blocks class machinery and maps each block to a
50 | file.
51 |
52 | """
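# A usage sketch (the path is hypothetical; see __init__ for the prefix convention):
#
#     ds = SplitDataset("/data/experiment/ds_folder", prefix="stack_")
#     first_volume = ds[0, :, :, :]  # loads only the chunks needed for this selection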
53 |
54 | def __init__(self, root, prefix=None):
55 | """
56 | :param root: The directory containing the files
57 | :param prefix: The class assumes individual file names of the form xxxx.h5.
58 | If the files have a prefix, for example stack_xxxx.h5,
59 | it has to be passed to the object as a string; in this
60 | particular case it would be prefix="stack_".
61 | """
62 |
63 | # Load information about stack and splitting. Use the json metadata
64 | # file if possible:
65 | self.root = Path(root)
66 | try:
67 | stack_meta_f = next(self.root.glob("*stack_metadata.json"))
68 |
69 | with open(str(stack_meta_f), "r") as f:
70 | block_metadata = json.load(f)
71 | except StopIteration:
72 | last_data_f = sorted(list(self.root.glob("{}*.h5".format(prefix))))[-1]
73 | block_metadata = fl.load(str(last_data_f), "/stack_metadata")
74 |
75 | # Ugly keyword fix to handle transition to new json system:
76 | for new_k, old_k in zip(
77 | ["shape_block", "shape_full"], ["block_size", "full_size"]
78 | ):
79 | block_metadata[new_k] = block_metadata.pop(old_k)
80 |
81 | # By putting this here, we generate the proper stack_metadata
82 | # file when we open old version data (int conversion for some
83 | # weird format problem with flammkuchen dictionary):
84 | clean_metadata = dict()
85 | _save_metadata_json(block_metadata, self.root)
86 | for k in block_metadata.keys():
87 | if isinstance(block_metadata[k], tuple):
88 | clean_metadata[k] = tuple(
89 | int(n) if n is not None else None for n in block_metadata[k]
90 | )
91 | else:
92 | clean_metadata[k] = block_metadata[k]
93 | with open(str(self.root / "stack_metadata.json"), "w") as f:
94 | json.dump(clean_metadata, f)
95 |
96 | # Start the parent BlockSplitter:
97 | super().__init__(
98 | shape_full=block_metadata["shape_full"],
99 | shape_block=block_metadata["shape_block"],
100 | )
101 |
102 | if prefix is None:
103 | files = sorted(self.root.glob("*[0-9]*.h5"))
104 | else:
105 | files = sorted(self.root.glob("*{}_[0-9]*.h5".format(prefix)))
106 |
107 | self.files = np.array(files).reshape(self.block_starts.shape[:-1])
108 |
109 | # If available, read resolution
110 | try:
111 | self.resolution = block_metadata["resolution"]
112 | except KeyError:
113 | self.resolution = (1, 1, 1)
114 | # TODO check this
115 | self.shape = self.shape_cropped
116 |
117 | @property
118 | def ndim(self):
119 | return len(self.shape)
120 |
121 | @property
122 | def dtype(self):
123 | px = fl.load(
124 | str(self.files.flatten()[0]),
125 | "/" + self.data_key,
126 | sel=(0,) * len(self.shape),
127 | )
128 | return px.dtype
129 |
130 | @property
131 | def data_key(self):
132 | """To migrate smoothly to removal of stack_ND key in favour of only stack"""
133 | return [k for k in fl.meta(self.files.flatten()[0]).keys() if "stack" in k][0]
134 |
135 | def __getitem__(self, item):
136 | """
137 | Implement numpy-style indexing on the SplitDataset.
138 | :param item:
139 | :return:
140 | """
141 | # Lot of input munging to emulate indexing in numpy array
142 | if np.any(self.padding):
143 | raise ValueError(
144 | "Indexing in datasets with overlap (padding) is"
145 | " not supported, merge them first with an"
146 | " appropriate merging function"
147 | )
148 |
149 | if isinstance(item, int):
150 | item = (slice(item, item + 1),)
151 |
152 | if isinstance(item, slice):
153 | item = (item,)
154 |
155 | if isinstance(item, tuple):
156 | # Take care of the case when only the first few dimensions
157 | # are specified:
158 | if len(item) < len(self.shape):
159 | item = item + (None,) * (len(self.shape) - len(item))
160 |
161 | # Loop over dimensions creating a list of starting and ending
162 | # points
163 |
164 | starts = []
165 | ends = []
166 | singletons = np.zeros(len(item), dtype=bool)
167 | for i_dim, (dim_slc, dim_full) in enumerate(zip(item, self.shape)):
168 | # i_dim: index of current dimension
169 | # dim_slc: slice/index for current dimension
170 | # dim_full: length of the dataset along the current dimension
171 |
172 | # If nothing specified, start from 0 and finish at end:
173 | if dim_slc is None:
174 | starts.append(0)
175 | ends.append(dim_full)
176 |
177 | # If a slice is specified:
178 | elif isinstance(dim_slc, slice):
179 | if dim_slc.start is None:
180 | starts.append(0)
181 | else:
182 | if dim_slc.start >= 0:
183 | starts.append(dim_slc.start)
184 | else:
185 | starts.append(max(0, dim_full + dim_slc.start))
186 |
187 | if dim_slc.stop is None:
188 | ends.append(dim_full)
189 | else:
190 | if dim_slc.stop >= 0:
191 | ends.append(min(dim_slc.stop, dim_full))
192 | else:
193 | ends.append(max(0, dim_full + dim_slc.stop))
194 | elif type(dim_slc) in [int, np.int32, np.int64]:
195 | singletons[i_dim] = True
196 | if dim_slc >= 0:
197 | if dim_slc > dim_full - 1:
198 | raise IndexError(
199 | "Indexes {} out of dimensions {}!".format(
200 | item, self.shape
201 | )
202 | )
203 | starts.append(dim_slc)
204 | ends.append(dim_slc + 1)
205 | else:
206 | if -dim_slc > dim_full:
207 | raise IndexError(
208 | "Indexes {} out of dimensions {}!".format(
209 | item, self.shape
210 | )
211 | )
212 | starts.append(dim_full + dim_slc)
213 | ends.append(dim_full + dim_slc + 1)
214 | else:
215 | raise IndexError("Unsupported indexing")
216 | else:
217 | raise IndexError("Unsupported indexing")
218 |
219 | file_slices, take_block_s_idx, take_block_e_idx = self.blocks_to_take(
220 | starts, ends
221 | )
222 | output_size = tuple(e - s for s, e in zip(starts, ends))
223 |
224 | output = None
225 |
226 | # A lot of indexing tricks to achieve multidimensional generality
227 | for f_idx in product(*(range(s, e) for s, e in file_slices)):
228 | abs_idx = [ri - s for ri, (s, e) in zip(f_idx, file_slices)]
229 | sel_slices = tuple(
230 | slice(0 if ci != s else si, None if ci < e - 1 else ei)
231 | for ci, (s, e), si, ei in zip(
232 | f_idx, file_slices, take_block_s_idx, take_block_e_idx
233 | )
234 | )
235 | arr = fl.load(
236 | str(self.files[f_idx]),
237 | "/" + self.data_key,
238 | sel=fl.aslice[sel_slices],
239 | )
240 |
241 | if output is None:
242 | output = np.empty(output_size, arr.dtype)
243 |
244 | output_sel_tuple = tuple(
245 | slice(
246 | 0 if st_idx == 0 else bs - first_idx + (st_idx - 1) * (bs),
247 | (0 if st_idx == 0 else bs - first_idx + (st_idx - 1) * (bs)) + sz,
248 | )
249 | for st_idx, bs, first_idx, sz in zip(
250 | abs_idx, self.shape_block, take_block_s_idx, arr.shape
251 | )
252 | )
253 | output[output_sel_tuple] = arr
254 |
255 | if output is None:
256 | raise IndexError(
257 | "Trying to index the split dataset outside of bounds, between "
258 | + str(starts)
259 | + " and "
260 | + str(ends)
261 | )
262 |
263 | output_sel = tuple(0 if singleton else slice(None) for singleton in singletons)
264 |
265 | return output[output_sel]
266 |
267 | def apply_crop(self, crop):
268 | """Save a cropped copy of the dataset as a new split dataset."""
269 | # TODO there is the crop attribute, which is a lazy crop; this should actually
270 | # return a non-cropped dataset
271 | ds_cropped = EmptySplitDataset(
272 | shape_full=self.shape,
273 | shape_block=self.shape_block,
274 | padding=self.padding,
275 | crop=crop,
276 | root=self.root.parent,
277 | name=self.root.name + "_cropped",
278 | )
279 | # the slices iterator does not return just the slices, but also the indices
280 | for (i_slice, block_slices), file_name in zip(
281 | ds_cropped.slices(), ds_cropped.files
282 | ):
283 | fl.save(
284 | str(ds_cropped.root / file_name),  # save the chunk into the new (cropped) dataset folder
285 | {"stack": self[block_slices]},
286 | )
287 |
288 | return ds_cropped.finalize()
289 |
290 |
291 | class EmptySplitDataset(Blocks):
292 | """Class to initialize an empty split dataset whose blocks will be filled later;
293 | metadata is saved by calling finalize() after the blocks have been written.
294 | """
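# A typical workflow (a sketch; shapes, paths and the `source_array` name are placeholders):
#
#     new_ds = EmptySplitDataset(
#         root="/dest/folder",
#         name="my_stack",
#         shape_full=(100, 64, 64),
#         shape_block=(10, 64, 64),
#     )
#     for i, (_, block_slices) in enumerate(new_ds.slices()):
#         new_ds.save_block_data(i, source_array[block_slices])
#     ds = new_ds.finalize()  # writes stack_metadata.json and returns a SplitDataset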
295 |
296 | def __init__(self, root, name, *args, resolution=None, **kwargs):
297 | """
298 | :param root: folder where the stack will be saved;
299 | :param name: name of the dataset, for the folder name;
300 | :param resolution: resolution of the stack, in microns;
301 | """
302 | super().__init__(*args, **kwargs)
303 | self.root = Path(root) / name
304 | if not self.root.is_dir():
305 | self.root.mkdir(parents=True)
306 | else:
307 | warnings.warn("Existing directory")
308 |
309 | self.files = ["{:04d}.h5".format(i) for i in range(self.n_blocks)]
310 | self.resolution = resolution
311 |
312 | def save_block_data(self, n, data, verbose=False):
313 | """Optional method to save data in a block. Often we don't use it,
314 | as we directly save data in the parallelized function. Might be good to
315 | find ways of centralizing saving here?
316 | :param n: n of the block we are saving in;
317 | :param data: data to be saved in the block;
318 | :param verbose:
319 | :return:
320 | """
321 | fname = "{:04d}.h5".format(n)
322 | if verbose:
323 | print("Saving ", str(self.root / fname))
324 |
325 | if data.shape != self.shape_block:
326 | print(" - data has different dimensions from the block!")
327 |
328 | to_save = {"stack": data}
329 |
330 | fl.save(str(self.root / fname), to_save, compression="blosc")
331 |
332 | def finalize(self):
333 | n_dims = len(self.shape_block)
334 | block_dict = self.serialize()
335 | block_dict["shape_full"] = self.shape_cropped
336 | block_dict["crop_start"] = (0,) * n_dims
337 | block_dict["crop_end"] = (0,) * n_dims
338 | block_dict["resolution"] = (
339 | self.resolution if self.resolution is not None else (1,) * n_dims
340 | )
341 |
342 | block_dict["axis_order"] = "tzyx" if n_dims == 4 else "zyx"
343 |
344 | _save_metadata_json(block_dict, self.root)
345 | return SplitDataset(self.root)
346 |
347 |
348 | def _save_metadata_json(dictionary, root):
349 | """Save json file preventing type failures for stack shapes
350 | :param dictionary: dictionary to be saved
351 | :param root: folder where the stack_metadata.json file will be saved
352 | :return:
353 | """
354 | METADATA_FILENAME = "stack_metadata.json"
355 | for k in dictionary.keys():
356 | if type(dictionary[k]) is tuple:
357 | # json cannot serialize numpy integer types, so cast them to plain int:
358 | if type(dictionary[k][0]) == np.int64 or type(dictionary[k][0]) == int:
359 | dictionary[k] = tuple([int(i) for i in dictionary[k]])
360 |
361 | json.dump(dictionary, open(root / METADATA_FILENAME, "w"))
362 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit test package for split_dataset."""
2 |
--------------------------------------------------------------------------------
/tests/test_blocks.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from split_dataset import Blocks
4 |
5 |
6 | def test_cartesian_blocks():
7 | test_size = (20, 20)
8 | a = np.ones(test_size)
9 | blocks = Blocks(test_size, shape_block=(3, 7), padding=(1, 2))
10 | for idx, block in blocks.slices():
11 | a[block] = 0
12 | np.testing.assert_array_equal(a, np.zeros(test_size))
13 |
14 |
15 | def test_dropped_dimension():
16 | test_size = (5, 15, 20)
17 | blocks = Blocks(
18 | test_size, shape_block=(3, 7), padding=(1, 2), crop=((1, 1), (0, 0), (0, 0))
19 | )
20 | np.testing.assert_equal(blocks.drop_dim(1).shape_full, (5, 20))
21 |
--------------------------------------------------------------------------------
/tests/test_split_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Tests for `split_dataset` package."""
4 |
5 | import shutil
6 | import tempfile
7 | import unittest
8 |
9 | import numpy as np
10 |
11 | from split_dataset import save_to_split_dataset
12 |
13 |
14 | class TestSplitDataset(unittest.TestCase):
15 | def setUp(self):
16 | self.test_dir = tempfile.mkdtemp()
17 |
18 | def tearDown(self):
19 | shutil.rmtree(self.test_dir)
20 |
21 | def test_SplitDataset(self):
22 | dims = [(10, 3, 3, 3), (5, 5, 5), (5, 5), (1, 5, 5, 5)]
23 | block_sizes = [(2, None, None, None), (1, None, 3), (2, None), (None, 2, 5, 5)]
24 | all_slices = [
25 | [(slice(3, 8), slice(None))],
26 | [(slice(0, 1),), (slice(0, 2), slice(0, 1), slice(None))],
27 | [slice(0, 2)],
28 | [
29 | (slice(0, 1),),
30 | (slice(0, 2), slice(0, 1), slice(None)),
31 | (0, slice(0, 2), slice(0, 1)),
32 | ],
33 | ]
34 |
35 | for i, (di, bs, slices) in enumerate(zip(dims, block_sizes, all_slices)):
36 | test_data = np.arange(np.prod(di)).reshape(di)
37 |
38 | sd = save_to_split_dataset(
39 | test_data,
40 | block_size=bs,
41 | root_name=self.test_dir,
42 | prefix="te{:02d}".format(i),
43 | )
44 | for sl in slices:
45 | a = sd[sl]
46 | b = test_data[sl]
47 | np.testing.assert_equal(
48 | a,
49 | b,
50 | err_msg="Testing "
51 | + str(di)
52 | + " "
53 | + str(sl)
54 | + " of shape "
55 | + str(a.shape)
56 | + " and shape "
57 | + str(b.shape),
58 | )
59 |
--------------------------------------------------------------------------------